In [1]:
import numpy as np
from helpers import *

In [2]:
def drop_feature(data: np.array, feature_to_drop: str | list[str], features: list[str], feature_index: dict):
    """
    Drop a feature for all the samples.
    :param data: np.array of shape (N, D)
    :param feature_to_drop: name(s) of the feature(s) to drop
    :param features: list(str) of the names of the features
    :param feature_index: dict(str : int) linking the name of the features to their index
    """

    # Get the index of the feature to drop
    ids_feature_to_drop = [feature_index[feature] for feature in feature_to_drop]

    # Drop the column corresponding to the feature to drop and update the features list
    data = np.delete(data, ids_feature_to_drop, axis=1)
    features = [f for f in features if f not in feature_to_drop]

    # Update the feature_index dictionary
    assert len(features) == data.shape[1]
    feature_index = {feature: index for index, feature in enumerate(features)}

    print(f"Removed {len(feature_to_drop)} features: {feature_to_drop}")

    return data, features, feature_index

In [3]:
def drop_feature_threshold(data: np.array, features: list[str], feature_index: dict, threshold=0.9):
    """
    Remove every feature whose share of NaN values is strictly greater than threshold.
    The actual removal is delegated to drop_feature, whose result is returned unchanged.
    :param data: np.array of shape (N, D)
    :param features: list(str) of the names of the features
    :param feature_index: dict(str : int) linking the name of the features to their index
    :param threshold: fraction of NaN values (e.g. 0.9 for 90%) above which a feature is dropped
    :return: whatever drop_feature returns for the selected features
    """

    n_samples = data.shape[0]

    # Fraction of missing values in each column
    nan_fraction = np.sum(np.isnan(data), axis=0) / n_samples

    # Column positions whose NaN fraction exceeds the threshold
    sparse_columns = np.nonzero(nan_fraction > threshold)[0]

    # Translate the positions back to names through feature_index
    names_to_drop = [name for name, column in feature_index.items() if column in sparse_columns]

    # Every selected column must have been matched to exactly one name
    assert len(names_to_drop) == len(sparse_columns)

    return drop_feature(data, names_to_drop, features, feature_index)


In [4]:
def load_train_data(sub_sample=False, path_x_train="data/x_train.csv", path_y_train="data/y_train.csv",
                    sub_sample_step=50):
    """
    Load the training features, the training targets and the feature names from two CSV files.
    :param sub_sample: if True, keep only one sample out of every sub_sample_step
    :param path_x_train: path of the features CSV file; its first row holds the column names
    :param path_y_train: path of the targets CSV file; its first row is skipped as a header
    :param sub_sample_step: stride used to sub-sample the rows when sub_sample is True
    :return: tuple (x_train, y_train, features_names)
    :raises ValueError: if sub_sample is True and sub_sample_step is smaller than 1
    """

    # Fail before the (slow) parsing of the CSV files rather than after it
    if sub_sample and sub_sample_step < 1:
        raise ValueError(f"sub_sample_step must be >= 1, got {sub_sample_step}")

    # First row only: the names of the columns, read as strings
    features_names = np.genfromtxt(
        path_x_train,
        delimiter=",",
        dtype=str,
        max_rows=1
    )

    # Everything after the header row, parsed with genfromtxt's default (float) dtype
    x_train = np.genfromtxt(
        path_x_train,
        delimiter=",",
        skip_header=1
    )

    # Only the first column of the targets file is read.
    # NOTE(review): confirm that column 0 holds the label and not a row identifier.
    y_train = np.genfromtxt(
        path_y_train,
        delimiter=",",
        skip_header=1,
        usecols=0
    )

    # sub-sample: same stride on both arrays so that rows stay aligned
    if sub_sample:
        x_train = x_train[::sub_sample_step]
        y_train = y_train[::sub_sample_step]

    return x_train, y_train, features_names

In [5]:
# Load the full training set (sub_sample defaults to False): the feature matrix, the values
# read from y_train.csv and the column names taken from the header row of x_train.csv
x, y, features = load_train_data()

In [6]:
# Map every column name to its position in the raw feature matrix
feature_indexes = {name: position for position, name in enumerate(features)}

# Features flagged as useless; they are removed in the cleaning step below
useless_features = [
    "FMONTH", "IDATE", "IMONTH", "IDAY", "IYEAR", "SEQNO", "_PSU", "CTELENUM", "COLGHOUS", "STATERES",
    "_STATE", "PVTRESD1", "LANDLINE", "HHADULT", "CELLFON3", "NUMADULT", "NUMMEN", "NUMWOMEN",
    "CTELNUM1", "CELLFON2", "PVTRESD2", "CCLGHOUS", "CSTATE", "RENTHOM1", "NUMHHOL2", "NUMPHON2",
    "CPDEMO1", "VETERAN3", "BLIND", "STOPSMK2", "LASTSMK2", "USENOW3", "FLSHTMY2", "IMFVPLAC",
    "HIVTSTD3", "WHRTST10", "PDIABTST", "CRGVLNG1", "CRGVPERS", "CRGVHOUS", "CRGVMST2", "VINOCRE2",
    "VIEYEXM2", "LONGWTCH", "SXORIENT", "RCSGENDR", "RCSRLTN2", "CASTHNO2", "EMTSUPRT", "LSATISFY",
    "ADPLEASR", "ADENERGY", "ADFAIL", "ADTHINK", "ADMOVE", "_STSTR", "_STRWT", "_CHISPNC", "_CRACE1",
    "_CPRACE", "_CLLCPWT", "_DUALUSE", "_DUALCOR", "_LLCPWT", "_MRACE1", "_HISPANC", "_RACEG21",
    "_RACEGR3", "_RACE_G1", "_AGEG5YR", "_AGE65YR", "_AGE_G", "_EDUCAG", "_INCOMG", "FC60_", "PAMIN11_",
    "PAMIN21_", "_PAREC1", "_PASTAE1", "_RFSEAT2", "_RFSEAT3", "_FRTRESP", "_VEGRESP", "_FRUITEX", "_VEGETEX",
    "PAMISS1_",
]

# Features flagged as repetitive; they are removed in the same cleaning step
repetitive_features = [
    "LADULT", "CADULT", "HLTHPLN1", "BPHIGH4", "BPMEDS", "BLOODCHO", "CHOLCHK", "TOLDHI2",
    "ASTHMA3", "ASTHNOW", "HAVARTH3", "EDUCA", "CHILDREN", "INCOME2", "INTERNET", "WEIGHT2",
    "HEIGHT3", "SMOKE100", "SMOKDAY2", "ALCDAY5", "AVEDRNK2", "DRNK3GE5", "MAXDRNKS", "FRUITJU1",
    "FRUIT1", "FVBEANS", "FVGREEN", "FVORANG", "VEGETAB1", "EXERANY2", "EXRACT11", "EXEROFT1",
    "EXERHMM1", "EXRACT21", "EXEROFT2", "EXERHMM2", "STRENGTH", "FLUSHOT6", "PNEUVAC3", "HIVTST6",
    "_RFHLTH",
]

In [7]:
# Display the dimensions of the raw training matrix returned by load_train_data
x.shape

(328135, 321)

In [11]:
# Remove useless and repetitive features.
# dict.fromkeys de-duplicates the names while keeping their declared order, so the list handed
# to drop_feature (and the "Removed ..." message it prints) is the same on every run. Iterating
# a set of strings instead gives an order that can change between kernel restarts.
clean_x, clean_features, clean_feature_indexes = drop_feature(
    x,
    list(dict.fromkeys(useless_features + repetitive_features)),
    features,
    feature_indexes,
)

# Remove features with more than 90% of NaN values
clean_x, clean_features, clean_feature_indexes = drop_feature_threshold(
    clean_x, clean_features, clean_feature_indexes, threshold=0.9
)

Removed 127 features: ['PAMISS1_', 'INCOME2', 'CADULT', 'STATERES', 'PDIABTST', 'CASTHNO2', '_STSTR', 'VEGETAB1', '_STATE', '_CPRACE', 'RCSGENDR', 'TOLDHI2', '_INCOMG', 'CCLGHOUS', 'CRGVPERS', 'CPDEMO1', 'LASTSMK2', 'PVTRESD1', 'NUMPHON2', 'CHOLCHK', 'CRGVHOUS', 'FRUITJU1', 'HAVARTH3', 'SMOKE100', 'EDUCA', 'NUMWOMEN', 'CRGVMST2', 'PAMIN11_', 'VIEYEXM2', '_DUALUSE', '_PASTAE1', 'ADMOVE', 'HHADULT', 'RCSRLTN2', '_FRUITEX', 'FRUIT1', 'VINOCRE2', '_AGE_G', '_RFSEAT2', 'IMONTH', 'FVBEANS', 'RENTHOM1', 'BLOODCHO', 'FVGREEN', 'IMFVPLAC', 'FC60_', '_STRWT', 'VETERAN3', 'ADTHINK', 'ASTHMA3', '_RFHLTH', 'INTERNET', '_CRACE1', 'CELLFON3', '_PSU', 'AVEDRNK2', 'HLTHPLN1', '_RACEGR3', 'EXERHMM2', 'FMONTH', '_RACE_G1', 'SEQNO', 'SXORIENT', '_LLCPWT', '_RACEG21', 'IDATE', 'WEIGHT2', 'COLGHOUS', '_CHISPNC', 'IDAY', 'ADPLEASR', 'EXERHMM1', 'LONGWTCH', 'NUMADULT', '_PAREC1', 'PNEUVAC3', 'BPMEDS', '_CLLCPWT', 'CTELENUM', 'LANDLINE', 'NUMHHOL2', '_RFSEAT3', '_AGE65YR', '_DUALCOR', 'EXEROFT1', 'FLSHTMY2', '