In [2]:
import numpy as np
import csv

def read_header(file_path):
    """Return the first row of a CSV file as a list of column names.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        list[str]: The fields of the first record in the file.

    Raises:
        OSError: If the file cannot be opened.
        StopIteration: If the file contains no rows at all.
    """
    # newline='' leaves line-ending handling to the csv module, as its
    # documentation requires; without it, a line break embedded in a quoted
    # field is rewritten by universal-newline translation before parsing.
    with open(file_path, 'r', newline='') as f:
        header = next(csv.reader(f))
    return header

# Column names (first CSV row) of the test features, the training labels and
# the training features, read in that order.
x_test_header, y_train_header, x_train_header = (
    read_header(csv_path)
    for csv_path in ('dataset/x_test.csv', 'dataset/y_train.csv', 'dataset/x_train.csv')
)

# Numeric contents of x_test, y_train and x_train; the header row of each file
# is skipped so that only data rows are parsed.
x_test, y_train, x_train = (
    np.genfromtxt(csv_path, delimiter=',', skip_header=1)
    for csv_path in ('dataset/x_test.csv', 'dataset/y_train.csv', 'dataset/x_train.csv')
)

# Work on copies so the arrays loaded above stay untouched by the cleaning steps.
x_train_cleaned, x_test_cleaned = x_train.copy(), x_test.copy()


# Names of the columns that are removed from the feature matrices regardless
# of their values. Each name appears exactly once; the list is only used for
# membership tests against the CSV header.
columns_to_drop_by_name = [
    "FMONTH", "IDATE", "IMONTH", "IDAY", "IYEAR", "SEQNO", "_PSU", "LADULT", "CTELENUM",
    "PVTRESD1", "COLGHOUS", "STATERES", "CELLFON3", "CTELNUM1", "CELLFON2", "CADULT",
    "PVTRESD2", "CCLGHOUS", "CSTATE", "LANDLINE", "HHADULT", "NUMHHOL2", "NUMPHON2",
    "CPDEMO1", "PREGNANT", "BLIND", "EXRACT11", "EXRACT21", "HIVTST6", "HIVTSTD3",
    "PAINACT2", "QLMENTL2", "QLSTRES2", "QLHLTH2", "CAREGIV1", "CRGVREL1", "CRGVLNG1",
    "CRGVHRS1", "CRGVPRB1", "CRGVPERS", "CRGVHOUS", "CRGVMST2", "CRGVEXPT", "VIDFCLT2",
    "VIREDIF3", "VIPRFVS2", "VINOCRE2", "VIEYEXM2", "VIINSUR2", "VICTRCT4", "VIGLUMA2",
    "VIMACDG2", "CIMEMLOS", "CDHOUSE", "CDASSIST", "CDHELP", "CDSOCIAL", "CDDISCUS",
    "HOWLONG", "LASTPAP2", "HPLSTTST", "PROFEXAM", "LENGEXAM", "LSTBLDS3", "HADSGCO1",
    "LASTSIG3", "PCPSAAD2", "PCPSADI1", "PCPSARE1", "PSATIME", "PCPSARS1", "PCPSADE1",
    "PCDMDECN", "SCNTPAID", "SCNTWRK1", "SXORIENT", "TRNSGNDR", "RCSGENDR", "RCSRLTN2",
    "CASTHDX2", "CASTHNO2", "QSTVER", "QSTLANG", "EXACTOT1", "EXACTOT2", "_STSTR",
    "_STRWT", "_RAWRAKE", "_WT2RAKE", "_CHISPNC", "_CRACE1", "_CPRACE", "_CLLCPWT",
    "_DUALUSE", "_DUALCOR", "_LLCPWT", "_PRACE1", "FEETCHK2", "ASTHMAGE",
    "ASATTACK", "ASERVIST", "ASDRVIST", "ASRCHKUP", "ASACTLIM", "ASYMPTOM", "ASNOSLEP",
    "ASTHMED3", "ASINHALR", "HPVADVC2", "HPVADSHT", "_AGE_G", "HTIN4", "_CHLDCNT",
    "_DRNKWEK", "FTJUDA1_", "FRUTDA1_", "BEANDAY_", "GRENDAY_", "ORNGDAY_", "VEGEDA1_",
    "_MISFRTN", "_MISVEGN", "_FRTRESP", "_VEGRESP", "_FRT16", "_VEG23", "_FRUITEX",
    "_VEGETEX", "PAMISS1_", "_PA150R2", "_PA300R2", "_PA30021", "_PASTRNG", "_PAREC1",
    "_PASTAE1", "_LMTACT1", "_LMTWRK1", "_LMTSCL1", "_RFSEAT3", "_RACE", "_RACEG21",
    "_RACE_G1", "_AGEG5YR", "_AGE65YR",
]

# Automatically detected columns: a column is treated as constant when its
# non-NaN training values contain exactly one distinct value.
# NOTE(review): a column that is entirely NaN has zero distinct values and is
# therefore kept by this rule — confirm that this is intended.
columns_to_drop_by_index = []
for i in range(x_train_cleaned.shape[1]):
    column = x_train_cleaned[:, i]
    unique_values = np.unique(column[~np.isnan(column)])
    if len(unique_values) == 1:
        columns_to_drop_by_index.append(i)

# The drop positions are derived from the training header and then applied to
# the test data as well, so both files must list the same columns in the same
# order; otherwise the wrong test columns would be removed silently.
if x_test_header != x_train_header:
    raise ValueError(
        "x_test and x_train headers differ; dropping columns by position "
        "would remove the wrong columns from x_test"
    )

# Union of the constant columns and the manually listed names (resolved to
# positions through the training header). Sets keep every membership test
# O(1) and remove indices selected by both criteria.
names_to_drop = set(columns_to_drop_by_name)
columns_to_drop_indices = sorted(
    set(columns_to_drop_by_index)
    | {i for i, col in enumerate(x_train_header) if col in names_to_drop}
)

# Drop these columns in both x_train and x_test
x_train_cleaned = np.delete(x_train_cleaned, columns_to_drop_indices, axis=1)
x_test_cleaned = np.delete(x_test_cleaned, columns_to_drop_indices, axis=1)

# Drop the same positions from the headers so names stay aligned with the data
indices_to_drop = set(columns_to_drop_indices)
x_train_header_cleaned = [col for i, col in enumerate(x_train_header) if i not in indices_to_drop]
x_test_header_cleaned = [col for i, col in enumerate(x_test_header) if i not in indices_to_drop]



def _write_csv(path, header, rows):
    """Create the CSV file at `path` containing `header` followed by all `rows`."""
    with open(path, 'w', newline='') as handle:
        out = csv.writer(handle)
        out.writerow(header)
        out.writerows(rows)


# Persist both cleaned matrices under dataset/, each preceded by its reduced header.
_write_csv('dataset/x_train_cleaned.csv', x_train_header_cleaned, x_train_cleaned)
_write_csv('dataset/x_test_cleaned.csv', x_test_header_cleaned, x_test_cleaned)

# Shape of the training matrix before and after the columns were dropped.
print(x_train.shape, x_train_cleaned.shape, sep='\n')


(328135, 322)
(328135, 184)


In [3]:
# Show the training column names that remain after the drop step.
print(x_train_header_cleaned)

['Id', '_STATE', 'DISPCODE', 'NUMADULT', 'NUMMEN', 'NUMWOMEN', 'GENHLTH', 'PHYSHLTH', 'MENTHLTH', 'POORHLTH', 'HLTHPLN1', 'PERSDOC2', 'MEDCOST', 'CHECKUP1', 'BPHIGH4', 'BPMEDS', 'BLOODCHO', 'CHOLCHK', 'TOLDHI2', 'CVDSTRK3', 'ASTHMA3', 'ASTHNOW', 'CHCSCNCR', 'CHCOCNCR', 'CHCCOPD1', 'HAVARTH3', 'ADDEPEV2', 'CHCKIDNY', 'DIABETE3', 'DIABAGE2', 'SEX', 'MARITAL', 'EDUCA', 'RENTHOM1', 'VETERAN3', 'EMPLOY1', 'CHILDREN', 'INCOME2', 'INTERNET', 'WEIGHT2', 'HEIGHT3', 'QLACTLM2', 'USEEQUIP', 'DECIDE', 'DIFFWALK', 'DIFFDRES', 'DIFFALON', 'SMOKE100', 'SMOKDAY2', 'STOPSMK2', 'LASTSMK2', 'USENOW3', 'ALCDAY5', 'AVEDRNK2', 'DRNK3GE5', 'MAXDRNKS', 'FRUITJU1', 'FRUIT1', 'FVBEANS', 'FVGREEN', 'FVORANG', 'VEGETAB1', 'EXERANY2', 'EXEROFT1', 'EXERHMM1', 'EXEROFT2', 'EXERHMM2', 'STRENGTH', 'LMTJOIN3', 'ARTHDIS2', 'ARTHSOCL', 'JOINPAIN', 'SEATBELT', 'FLUSHOT6', 'FLSHTMY2', 'IMFVPLAC', 'PNEUVAC3', 'WHRTST10', 'PDIABTST', 'PREDIAB1', 'INSULIN', 'BLDSUGAR', 'DOCTDIAB', 'CHKHEMO3', 'FEETCHK', 'EYEEXAM', 'DIABEYE', 