## select variables to read

In [75]:
import pandas as pd
import numpy as np
import math

from sklearn.impute import KNNImputer
from sklearn.decomposition import PCA

In [21]:
# Names of the selected variables, grouped by category.
file_path = 'subsets/'


def _load_variable_names(filename):
    """Return the first column of a headerless fixed-width file in ``file_path`` as a list."""
    listing = pd.read_fwf(file_path + filename, header=None)
    return list(listing[0])


health_pca_var_names = _load_variable_names('health-PCA.txt')
credit_pca_var_names = _load_variable_names('credit-PCA.txt')
med_pay_pca_var_names = _load_variable_names('medical-payment-PCA.txt')
var_names = _load_variable_names('other-variables.txt')

# Every selected variable: the three PCA groups followed by the remaining ones.
all_vars = (
    health_pca_var_names
    + credit_pca_var_names
    + med_pay_pca_var_names
    + var_names
)

----------------

## read train and holdout files

In [None]:
# Load only the selected columns of the training file. low_memory=False makes
# pandas parse the file in one pass instead of in internal chunks.
df_all = pd.read_csv(
    'data/2020_Competition_Training.csv',
    usecols=all_vars,
    low_memory=False,
)

In [61]:
# sanity check: display (number of rows, number of columns) of the training
# frame; the column count should match the number of variables in all_vars
df_all.shape

(21000, 297)

In [76]:
# Load the same selected columns from the holdout file.
df_all_holdout = pd.read_csv(
    'data/2020_Competition_Holdout.csv',
    usecols=all_vars,
    low_memory=False,
)

In [77]:
# sanity check: display (number of rows, number of columns) of the holdout frame
df_all_holdout.shape

(17681, 297)

------------

## data cleaning

In [62]:
def encoding_none_missing_values(col):
    '''
    Label-encode the non-missing values of a column and keep missing values missing.

    The distinct non-missing values are sorted and numbered 0, 1, 2, ... in that
    order; every entry that is null in ``col`` stays NaN in the result. The
    encoding is done in place (position by position), so the row order and the
    index of ``col`` are preserved even when index labels repeat.

    Parameters
    ----------
    col : pandas.Series
        Column to encode.

    Returns
    -------
    pandas.Series
        float64 series with the same index as ``col`` holding the integer
        codes (as floats) and NaN where ``col`` was missing.
    '''
    # factorize(sort=True) numbers the sorted distinct values from 0 and marks
    # missing entries with the sentinel -1.
    codes, _ = pd.factorize(col, sort=True)
    encoded = pd.Series(codes, index=col.index, dtype="float64")
    # Turn the -1 sentinel back into NaN so the imputer still sees the gap.
    return encoded.where(codes >= 0)

In [63]:
def data_prepration(df, k, chunk_size=10000):
    """
    Encode all "object" columns and impute missing values with KNN.

    Steps:
      1. In "lang_spoken_cd" (when present) the value "E" is replaced by "ENG".
      2. Every object-dtype column is label-encoded with
         ``encoding_none_missing_values``; missing entries stay missing.
      3. Missing values are filled with ``KNNImputer(n_neighbors=k)``, fitted
         and applied separately on consecutive blocks of ``chunk_size`` rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to prepare. It is not modified; the work is done on a copy.
    k : int
        Number of neighbours used by the KNN imputer.
    chunk_size : int, default 10000
        Maximum number of rows imputed at once (limits memory/compute use).

    Returns
    -------
    pandas.DataFrame
        Imputed data with the same columns and index as ``df``.

    Raises
    ------
    ValueError
        If ``chunk_size`` is smaller than 1, or if the imputer returns a
        different number of columns than it was given.

    Notes
    -----
    The label codes are derived from the values present in ``df`` only, and
    each chunk is imputed from its own rows only, so results of separate calls
    (or separate chunks) are computed independently of each other.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer")

    # Work on a copy so the caller's dataframe keeps its raw values.
    df = df.copy()

    if "lang_spoken_cd" in df.columns:
        # Encode "E" as "ENG"; assigning the result back avoids the chained
        # inplace replace, which does not reliably write through to df.
        df["lang_spoken_cd"] = df["lang_spoken_cd"].replace("E", "ENG")

    # Encode non-missing values of the categorical variables.
    for column_name in df.select_dtypes(include="object").columns:
        df[column_name] = encoding_none_missing_values(df[column_name])

    # Nothing to impute (and nothing to concatenate) for an empty frame.
    if df.empty:
        return df

    # KNN imputation, at most chunk_size rows at a time.
    imputer = KNNImputer(n_neighbors=k)
    imputed_chunks = []
    for start in range(0, len(df), chunk_size):
        chunk = df.iloc[start:start + chunk_size]
        # Passing the chunk's own index/columns keeps the row labels of the
        # input and raises a ValueError if the imputer returns a different
        # number of columns, instead of silently mislabelling them.
        imputed_chunks.append(
            pd.DataFrame(
                imputer.fit_transform(chunk),
                index=chunk.index,
                columns=chunk.columns,
            )
        )

    # Return the imputed dataframe.
    return pd.concat(imputed_chunks)
    


In [64]:
# Run the same preparation (encoding + KNN imputation with 2 neighbours) on
# the training frame first and then on the holdout frame.
train, holdout = (
    data_prepration(frame, 2) for frame in (df_all, df_all_holdout)
)

----------

## principal component analysis

In [None]:
def _fit_pca_group(n_components, columns):
    """Fit a PCA on ``train[columns]`` and project both ``train`` and ``holdout``.

    Returns the fitted model, the training scores and the holdout scores.
    """
    model = PCA(n_components=n_components)
    train_scores = model.fit_transform(train[columns])
    holdout_scores = model.transform(holdout[columns])
    return model, train_scores, holdout_scores


# Health variables -> 80 components.
pca_health, df_pca_health_train, df_pca_health_holdout = _fit_pca_group(
    80, health_pca_var_names
)

# Credit variables -> 5 components.
pca_credit, df_pca_credit_train, df_pca_credit_holdout = _fit_pca_group(
    5, credit_pca_var_names
)

# Medical-payment variables -> 4 components.
pca_med_pay, df_pca_med_pay_train, df_pca_med_pay_holdout = _fit_pca_group(
    4, med_pay_pca_var_names
)

In [None]:
def _combine_columns(score_blocks, passthrough):
    """Join PCA score blocks and a dataframe of untransformed columns side by side.

    ``pd.concat`` needs a list of pandas objects and ``axis=1`` to join
    column-wise, so each score block is wrapped in a DataFrame first. Every
    piece gets a fresh 0..n-1 index, which makes the rows line up by position
    regardless of the index labels carried by ``passthrough``.
    """
    frames = [pd.DataFrame(np.asarray(block)) for block in score_blocks]
    frames.append(passthrough.reset_index(drop=True))
    return pd.concat(frames, axis=1)


# Column order: health components, credit components, medical-payment
# components, then the variables that were not put through PCA.
pca_train = _combine_columns(
    [df_pca_health_train, df_pca_credit_train, df_pca_med_pay_train],
    train[var_names],
)
pca_holdout = _combine_columns(
    [df_pca_health_holdout, df_pca_credit_holdout, df_pca_med_pay_holdout],
    holdout[var_names],
)

In [82]:
# Column labels for the principal components, numbered from 1 within each group.
health_pca_name = [f"health_pc{number}" for number in range(1, 81)]
credit_pca_name = [f"credit_pc{number}" for number in range(1, 6)]
medpat_pca_name = [f"medpay_pc{number}" for number in range(1, 5)]

# Final column order: health, credit and medical-payment components, then the
# variables kept as they are.
names_pca = health_pca_name + credit_pca_name + medpat_pca_name + var_names

In [83]:
# Label the combined train and holdout frames with the same column names.
pca_train.columns = pca_holdout.columns = names_pca

129

In [None]:
# pca_train.to_csv("pca_train.csv")
# pca_holdout.to_csv("pca_holdout.csv")