In [1]:
import numpy as np
import matplotlib.pyplot as plt
from helpers import *
import seaborn as sns
from preprocess import *
from helpers_given import *

In [2]:
# Load the train/test matrices, the labels and the sample ids from the 'dataset' folder.
x_train, x_test, y_train, train_ids, test_ids = load_csv_data('dataset', sub_sample=False)

# Read the feature names from the header row of the training CSV.
with open('dataset/x_train.csv', 'r') as f:
    # readline() keeps the line terminator; strip it so that the last feature
    # name does not end with '\n' (or '\r\n').
    features_string = f.readline().rstrip('\r\n')
    features = features_string.split(',')
# Drop the first header entry (presumably the sample-id column, which is not
# part of the feature matrix -- TODO confirm against load_csv_data).
features = features[1:]

y_train loaded
x_train loaded
x_test loaded


In [3]:
# Alias for the training matrix (same array object, no copy is made).
data = x_train

# Count the NaNs with the nb_of_nans helper
# (presumably one count per feature column -- TODO confirm against the helper).
NaNs = nb_of_nans(data)
# convert y from {-1,1} to {0,1} to avoid problems with logistic
y_01 = convert_minus1_to_0(y_train)

# Get reduced data: removing_features returns the filtered matrix, the matching
# feature names and the removed column indices (the latter are reused below to
# drop the same columns from x_test). The removal criterion lives in the helper.
reduced_data, reduced_features, Removed_features = removing_features(NaNs,features,data)

In [4]:
# Drop from the test matrix the columns that were removed from the training matrix
reduced_x_test = np.delete(x_test, Removed_features, 1)

# Build the feature correlation dictionary
# (0.6 is presumably the correlation threshold -- TODO confirm against the helper)
feature_correlation_dict = create_dictionary_from_correlation(reduced_data, reduced_features, 0.6)

# For every feature, count the entries stored under it in the correlation
# dictionary, then order the features by that count, largest first.
max_corrr_feature_dict = dict(
    sorted(
        ((name, len(partners)) for name, partners in feature_correlation_dict.items()),
        key=lambda pair: pair[1],
        reverse=True,
    )
)



 Finished for feature: _STATE
 Finished for feature: FMONTH
 Finished for feature: IDATE
 Finished for feature: IMONTH
 Finished for feature: IDAY
 Finished for feature: IYEAR
 Finished for feature: DISPCODE
 Finished for feature: SEQNO
 Finished for feature: _PSU
 Finished for feature: GENHLTH
 Finished for feature: PHYSHLTH
 Finished for feature: MENTHLTH
 Finished for feature: HLTHPLN1
 Finished for feature: PERSDOC2
 Finished for feature: MEDCOST
 Finished for feature: CHECKUP1
 Finished for feature: BPHIGH4
 Finished for feature: BLOODCHO
 Finished for feature: CVDSTRK3
 Finished for feature: ASTHMA3
 Finished for feature: CHCSCNCR
 Finished for feature: CHCOCNCR
 Finished for feature: CHCCOPD1
 Finished for feature: HAVARTH3
 Finished for feature: ADDEPEV2
 Finished for feature: CHCKIDNY
 Finished for feature: DIABETE3
 Finished for feature: SEX
 Finished for feature: MARITAL
 Finished for feature: EDUCA
 Finished for feature: RENTHOM1
 Finished for feature: VETERAN3
 Finished fo

In [5]:
# Mark for removal every feature that has at least one entry in the correlation dictionary
features_to_drop = [name for name, count in max_corrr_feature_dict.items() if count > 0]

In [6]:
# Remove redundant features
redundant_features = [ 'FMONTH','IDATE','IMONTH','IDAY','IYEAR', 'SEQNO', '_STATE', '_PSU', ]
for feature in redundant_features:
        features_to_drop.append(feature)

In [7]:
# Keep every feature that has not been marked for removal
features_to_keep = [name for name in reduced_features if name not in features_to_drop]

# Raw feature -> calculated counterpart that is used in its place
origin_calculated_features = {
    'WEIGHT2': 'WTKG3',
    'HEIGHT3': 'HTM4',
    'ALCDAY5': '_DRNKWEK',
    'FRUITJU1': 'FTJUDA1_',
    'FRUIT1': 'FRUTDA1_',
    'FVBEANS': 'BEANDAY_',
    'FVGREEN': 'GRENDAY_',
    'FVORANG': 'ORNGDAY_',
    'VEGETAB1': 'VEGEDA1_',
    'STRENGTH': 'STRFREQ_',
}

# Swap each raw feature found in features_to_keep for its calculated counterpart
# (keys and values of the mapping are disjoint, so one pass is enough)
features_to_keep = [origin_calculated_features.get(name, name) for name in features_to_keep]

# Drop duplicates
features_to_keep = list(set(features_to_keep))

# Column positions of the selected features inside reduced_features, ascending
selected_features_indices = sorted(reduced_features.index(name) for name in features_to_keep)

In [8]:
# Restrict both matrices to the selected columns.
# NOTE: this overwrites reduced_data / reduced_x_test, so the cell is not
# idempotent -- run it exactly once after selected_features_indices is built.
reduced_data = reduced_data[:, selected_features_indices]
reduced_x_test = reduced_x_test[:, selected_features_indices]

# Names of the surviving columns, in the order they appear in reduced_features
reduced_features_2 = [name for name in reduced_features if name in features_to_keep]

In [9]:
def removing_features_threshold(number_NaN,list_,data,thresh):
    """
    Drop the columns whose NaN count exceeds a fraction of the number of rows.

    Parameters:
    number_NaN (sequence of numbers): NaN count of each column of `data`.
    list_ (list): Name of each column of `data`, in column order.
    data (numpy.ndarray): 2-D data matrix (rows = samples, columns = features).
    thresh (float): Column i is removed when number_NaN[i] > len(data) * thresh.

    Returns:
    tuple: (reduced_data, reduced_list, Removed_features) where
        reduced_data is `data` without the removed columns,
        reduced_list is `list_` without the names at the removed positions,
        Removed_features is the list of removed column indices (ascending).
    """
    max_nans = len(data) * thresh
    Removed_features = [i for i, count in enumerate(number_NaN) if count > max_nans]
    reduced_data = np.delete(data, Removed_features, 1)
    # Filter names by position rather than by list_.index(name): index() returns the
    # first occurrence, which mis-handles duplicated names and is quadratic.
    removed_positions = set(Removed_features)
    reduced_list = [name for i, name in enumerate(list_) if i not in removed_positions]
    return reduced_data, reduced_list, Removed_features

In [10]:
#Calculate the number of nans for the reduced data
NaNs_reduced = nb_of_nans(reduced_data)

#Get the reduced data
reduced_data_2, reduced_features_3, Removed_features_2 = removing_features_threshold(NaNs_reduced,reduced_features_2,reduced_data,0.05)

In [11]:
# Drop from the test matrix the same column indices that the NaN-threshold filter removed from the training matrix
reduced_x_test_2 = np.delete(reduced_x_test, Removed_features_2, 1)

In [12]:
# Replace nine values with NaNs
replace_nine_with_nan(reduced_data_2)
replace_nine_with_nan(reduced_x_test_2)

# Replace 99 values with NaNs
replace_99_with_nan(reduced_data_2)
replace_99_with_nan(reduced_x_test_2)

reduced_data_2 = clean_outliers_modified(reduced_data_2)
# should we do this for x_test too ?

# For the _DRNKWEK feature, replace 9990 with NaN
# TODO: maybe we could vectorize this so we don't do a loop over the data
for i in range(reduced_data_2.shape[0]): 
    if reduced_data_2[i, reduced_features_3.index('_DRNKWEK')] == 9990:
        reduced_data_2[i, reduced_features_3.index('_DRNKWEK')] = np.nan
for i in range(reduced_x_test.shape[0]): 
    if reduced_x_test_2[i, reduced_features_3.index('_DRNKWEK')] == 9990:
        reduced_x_test_2[i, reduced_features_3.index('_DRNKWEK')] = np.nan

In [13]:
def one_hot_encode_low_cardinality_features(data):
    """
    Expand every column holding at most 4 distinct values into indicator columns.

    A column with k <= 4 distinct values (as reported by np.unique) is replaced
    by k - 1 indicator columns: the largest distinct value gets no indicator,
    and the remaining indicators are laid out from the largest remaining value
    down to the smallest. A column with a single distinct value is therefore
    removed. Columns with more than 4 distinct values are kept unchanged.

    Parameters:
    data (numpy.ndarray): The 2-D input data matrix.

    Returns:
    numpy.ndarray: The encoded data matrix, with the dtype of `data`.
    """
    num_rows, num_cols = data.shape

    # Start with a zero-width block so the final hstack also works when every
    # column ends up being removed.
    blocks = [np.empty((num_rows, 0), dtype=data.dtype)]

    for col in range(num_cols):
        column = data[:, col]
        unique_values = np.unique(column)

        if len(unique_values) > 4:
            # High-cardinality column: carried over untouched
            blocks.append(column.reshape(num_rows, 1))
            continue

        # One 0/1 indicator per distinct value, one value per output column
        indicators = np.stack([column == val for val in unique_values], axis=1).astype(int)
        # Drop the indicator of the last distinct value, then reverse the rest
        indicators = indicators[:, :-1][:, ::-1]
        blocks.append(indicators.astype(data.dtype))

    return np.hstack(blocks)

In [14]:
# One-hot encode the low-cardinality columns of the train and test matrices
one_hot_reduced_2 = one_hot_encode_low_cardinality_features(reduced_data_2)
one_hot_reduced_x_test_2 = one_hot_encode_low_cardinality_features(reduced_x_test_2)

# The encoder decides per matrix which columns to expand, based on the distinct
# values it finds there, so train and test can end up with different layouts.
# Fail here with a clear message rather than later in the matrix product.
# NOTE(review): equal widths do not prove that the columns line up one-to-one.
if one_hot_reduced_2.shape[1] != one_hot_reduced_x_test_2.shape[1]:
    raise ValueError(
        "One-hot encoding produced a different number of columns for train "
        f"({one_hot_reduced_2.shape[1]}) and test ({one_hot_reduced_x_test_2.shape[1]})"
    )

In [15]:
# Replace NaNs with medians
reduced_median = replace_NaN(one_hot_reduced_2, method='median')
reduced_median_test = replace_NaN(one_hot_reduced_x_test_2, method='median')

# Standardize the data
std_x_med = standardize_data(reduced_median)
std_test_med = standardize_data(reduced_median_test)

#### 4-fold CV fine tuning

In [17]:
from implementations import *
# Dimensions of the training design matrix. NB_ROWS was previously read from the
# test matrix, which did not match NB_COL or the 'N' it is documented as.
NB_COL = std_x_med.shape[1]  # corresponds to 'D' = number of features
NB_ROWS = std_x_med.shape[0]  # corresponds to 'N' = number of observations/respondents

In [18]:
from cross_validation import *

In [19]:
# k-fold CV over two hyperparameters of regularized logistic regression:
# the degree of the polynomial feature expansion and the penalty lambda.
seed = 12
degrees = np.arange(0,5,1)
k_fold = 4
lambdas = np.arange(0.0,0.5,0.1)
gamma = 0.5
max_iters = 50
initial_w = np.zeros(NB_COL)
k_indices = build_k_indices(y_01, k_fold, seed)
# for each degree, we record the best lambda and the associated mean test F1
best_lambdas = []
best_F1s = []
# vary degree
for degree in degrees:
    # cross validation: mean test F1 over the folds for every lambda
    F1_te = []
    for lambda_ in lambdas:
        F1_te_tmp = []
        for k in range(k_fold):
            _, F1_test = cross_validation(y_01, std_x_med,initial_w, max_iters, gamma, k_indices, k, lambda_, degree)
            F1_te_tmp.append(F1_test)
        F1_te.append(np.mean(F1_te_tmp))
    print(f" Finished for degree: {degree}")
    # np.argmax returns the position of a NaN when one is present, which would
    # hide every valid lambda of this degree. Ignore NaNs instead; when all
    # scores are NaN keep index 0 so the degree is recorded with a NaN score
    # and skipped by the final np.nanargmax.
    if np.all(np.isnan(F1_te)):
        ind_lambda_opt = 0
    else:
        ind_lambda_opt = np.nanargmax(F1_te)
    best_lambdas.append(lambdas[ind_lambda_opt])
    best_F1s.append(F1_te[ind_lambda_opt])
# Best (degree, lambda) pair overall, ignoring degrees whose score is NaN
ind_best =  np.nanargmax(best_F1s)
best_degree = degrees[ind_best]
best_lambda = best_lambdas[ind_best]
best_F1 = best_F1s[ind_best]

 Finished for degree: 0
 Finished for degree: 1


  return 1.0 / (1 + np.exp(-z))
  loss = np.log(1 + np.exp(tx @ w)) - y * (tx @ w)


 Finished for degree: 2
 Finished for degree: 3
 Finished for degree: 4


In [20]:
# Show the selected grid position, degree, lambda and F1 score, one per line
print(ind_best, best_degree, best_lambda, best_F1, sep="\n")

1
1
0.4
0.381872358963031


### Run regularized logistic regression with the best parameters

In [21]:
# Build the training design matrix with build_poly, using the degree selected by cross-validation
tx = build_poly(std_x_med, best_degree)

In [22]:
# Number of columns of the design matrix; used below as the length of the initial weight vector
size_w = tx.shape[1]

In [23]:
# Fit on the full training set with the selected lambda, reusing the step size
# and iteration budget from the cross-validation cell (gamma = 0.5, max_iters = 50).
w_reg, loss_reg = reg_logistic_regression(
    y_01,
    tx,
    initial_w=np.zeros(size_w),
    max_iters=max_iters,
    gamma=gamma,
    lambda_=best_lambda,
)
print(loss_reg)
# Predict on the training set and map the labels back from {0,1} to {-1,1}
y_reg = convert_0_to_minus1(convert_predict(tx @ w_reg))
print(y_reg)
# Training-set score from compute_accuracy.
# NOTE(review): the earlier comment called this the "percentage of false
# predictions", but the helper's name and the printed value suggest the share
# of correct predictions -- confirm in compute_accuracy.
p_reg = compute_accuracy(y_train, y_reg)
print(p_reg)

0.23926742026026468
[-1. -1. -1. ... -1. -1. -1.]
0.9139622411507459


In [24]:
# Expand the test matrix with the same polynomial degree as the training matrix
tx_test = build_poly(std_test_med, best_degree)
# Score the test rows with the fitted weights and map the predictions from {0,1} to {-1,1}
y_pred = convert_0_to_minus1(convert_predict(tx_test @ w_reg))

In [25]:
# Write the test ids and their predicted labels to the submission output named "new_result"
create_csv_submission(test_ids, y_pred, "new_result")