## Updated: Sun, 10/04

Data preparation:

+ Impute missing values using KNN
+ Reduce dimensions using PCA (ncol = 149)

In [1]:
import pandas as pd
import numpy as np
import boto3
import os
import sagemaker

from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from sklearn.decomposition import TruncatedSVD
from sklearn.impute import KNNImputer

## Import variable files

In [320]:
# Directory containing the text files that list the variable names of each subset.
file_path = '../subsets/'

In [322]:
# Load the variable names of each subset: every file is parsed as fixed-width
# text without a header row, and its first column is taken as the name list.
health_pca_var_names = list(pd.read_fwf(file_path + 'health-PCA.txt', header=None)[0])
credit_pca_var_names = list(pd.read_fwf(file_path + 'credit-PCA.txt', header=None)[0])
med_pay_pca_var_names = list(pd.read_fwf(file_path + 'medical-payment-PCA.txt', header=None)[0])
var_names = list(pd.read_fwf(file_path + 'other-variables.txt', header=None)[0])

In [34]:
# Every variable used downstream: the three PCA groups followed by the
# variables that are kept without PCA.
all_vars = [
    *health_pca_var_names,
    *credit_pca_var_names,
    *med_pay_pca_var_names,
    *var_names,
]

## Read datasets

Divided the original dataset into 4 smaller subsets.

+ `health_df`: dataset for health condition-related variables (no missing values, so PCA can be applied directly)
+ `credit_df`: dataset for financial information (missing values need to be imputed)
+ `med_pay_df`: dataset for medical payment-related variables (missing values need to be imputed)
+ `df`: dataset for variables that do not need PCA (missing values need to be imputed)

In [10]:
# All four subsets come from the same training CSV; each read keeps only the
# columns listed for that subset.
training_csv = '../data/2020_Competition_Training.csv'

health_df = pd.read_csv(training_csv, usecols=health_pca_var_names, low_memory=False)
credit_df = pd.read_csv(training_csv, usecols=credit_pca_var_names, low_memory=False)
med_pay_df = pd.read_csv(training_csv, usecols=med_pay_pca_var_names, low_memory=False)
df = pd.read_csv(training_csv, usecols=var_names, low_memory=False)



In [132]:
# Put the four subsets side by side (column-wise) in one frame.
# Alternative kept for reference: read every column in a single pass.
# df_all = pd.read_csv('../data/2020_Competition_Training.csv', usecols=all_vars,low_memory = False)
subset_frames = [health_df, credit_df, med_pay_df, df]
df_all = pd.concat(subset_frames, axis="columns")

In [29]:
# Encode "E" as "ENG".
# The result is assigned back to the column instead of calling
# `replace(..., inplace=True)` on `df_all["lang_spoken_cd"]`: that chained
# in-place call acts on an intermediate Series, which recent pandas versions
# warn about and which leaves `df_all` unchanged under Copy-on-Write.
df_all["lang_spoken_cd"] = df_all["lang_spoken_cd"].replace("E", "ENG")

array(['ENG', 'SPA'], dtype=object)

------

In [182]:
def encoding_none_missing_values(col):
    """Label-encode the non-missing values of a column, leaving missing ones as NaN.

    Distinct non-missing values are mapped to the integer codes
    0..n_classes-1 in sorted order of the values. Missing entries
    (None / NaN) stay NaN so that they can still be imputed afterwards.

    Parameters
    ----------
    col : pandas.Series
        Column to encode; may contain missing values.

    Returns
    -------
    pandas.Series
        Float64 Series with the same index and row order as ``col``,
        holding the codes for non-missing entries and NaN elsewhere.
    """
    # factorize marks missing entries with the sentinel -1; sort=True makes
    # the codes follow the sorted order of the distinct values.
    codes, _ = pd.factorize(col, sort=True)

    # Swap the sentinel for NaN; this also promotes the codes to float64 so
    # the result is numeric rather than object dtype.
    encoded = np.where(codes == -1, np.nan, codes)
    return pd.Series(encoded, index=col.index)

In [268]:
# Label-encode every object-dtype column of df_all on a separate copy, keeping
# missing values in place so they can be imputed later.
encode_df = df_all.select_dtypes(include="object").copy()

# Iterate over the column labels rather than `iteritems()`: that method was
# removed in pandas 2.0, and looping over labels also avoids writing to the
# frame while its item iterator is being consumed.
for column_name in encode_df.columns:
    encoded_col = encoding_none_missing_values(encode_df[column_name])
    encode_df.loc[:, column_name] = encoded_col

In [281]:
# Write the encoded columns back into df_all; only the columns that also exist
# in encode_df are overwritten.
is_encoded_col = df_all.columns.isin(encode_df.columns)
df_all.loc[:, is_encoded_col] = encode_df

In [316]:
#  df_all.to_csv("df_all.csv")

In [None]:
k = 2

imputer = KNNImputer(n_neighbors=k)

# Impute in row chunks of 10,000 (the last chunk takes every remaining row).
# The imputer is refit on each chunk, so neighbours are searched only among
# the rows of that same chunk.
chunk_edges = [0, 10000, 20000, 30000, 40000, 50000, 60000, None]
imputed_chunks = [
    pd.DataFrame(imputer.fit_transform(df_all[start:stop]))
    for start, stop in zip(chunk_edges[:-1], chunk_edges[1:])
]

(
    df_impute1,
    df_impute2,
    df_impute3,
    df_impute4,
    df_impute5,
    df_impute6,
    df_impute7,
) = imputed_chunks

In [None]:
# Stack the imputed chunks back into one frame. Each chunk was built from a
# bare array, so the original column names have to be restored here.
imputed_parts = [
    df_impute1,
    df_impute2,
    df_impute3,
    df_impute4,
    df_impute5,
    df_impute6,
    df_impute7,
]
df_full = pd.concat(imputed_parts)
df_full.columns = df_all.columns

In [None]:
# Replace the index with a fresh 0..n-1 range, discarding the old labels.
df_full = df_full.reset_index(drop=True)
# df_full.to_csv("df_full.csv")

## Health PCA

In [None]:
# Project the health variables onto 80 principal components.
pca_health = PCA(n_components=80)
health_components = pca_health.fit_transform(df_full[health_pca_var_names])
X_health = pd.DataFrame(health_components)

# Share of the variance retained by the kept components (shown as cell output).
sum(pca_health.explained_variance_ratio_)

## Credit PCA

In [None]:
# Project the credit variables onto 5 principal components.
pca_credit = PCA(n_components=5)
credit_components = pca_credit.fit_transform(df_full[credit_pca_var_names])
X_credit = pd.DataFrame(credit_components)

# Share of the variance retained by the kept components (shown as cell output).
sum(pca_credit.explained_variance_ratio_)

## Medical Payment PCA

In [None]:
# Project the medical-payment variables onto 4 principal components.
pca_med_pay = PCA(n_components=4)
med_pay_components = pca_med_pay.fit_transform(df_full[med_pay_pca_var_names])
X_med_pay = pd.DataFrame(med_pay_components)

# Share of the variance retained by the kept components (shown as cell output).
sum(pca_med_pay.explained_variance_ratio_)

## Concatenate all PCA dataframes

In [None]:
# Imputed columns that are carried over as-is, without PCA.
other_df = df_full[var_names]

In [None]:
# Combine the three groups of principal components with the untouched
# variables, column-wise.
# X_health, X_credit and X_med_pay were each built from a bare array, so all
# three carry the default integer column labels 0, 1, 2, ...; concatenating
# them unchanged would give PCA_df duplicate column labels. Each group gets its
# own prefix so every column label is unique.
PCA_df = pd.concat(
    [
        X_health.add_prefix("health_pc"),
        X_credit.add_prefix("credit_pc"),
        X_med_pay.add_prefix("med_pay_pc"),
        other_df,
    ],
    axis=1,
)
# PCA_df.to_csv("pca_df.csv")