## Goals for the next meeting (Fri, 10/2)

+ impute missing values using KNN
+ complete PCA for credit and medical payment
+ potentially fit the baseline model!!


Updated: Wed, 9/30

In [72]:
import pandas as pd
import numpy as np

from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from sklearn.decomposition import TruncatedSVD

## Import variable files

In [32]:
def _read_var_names(path):
    """Return the variable names listed in `path`, one per line.

    Lines are stripped of surrounding whitespace and blank lines are
    skipped, so a trailing newline in the file does not produce an empty
    name. The file is closed as soon as it has been read.
    """
    with open(path, 'r') as f:
        return [line.strip() for line in f if line.strip()]


# One list of column names per subset of the training data.
health_pca_var_names = _read_var_names('subsets/health-PCA.txt')
credit_pca_var_names = _read_var_names('subsets/credit-PCA.txt')
med_pay_pca_var_names = _read_var_names('subsets/medical-payment-PCA.txt')
var_names = _read_var_names('subsets/other-variables.txt')

In [45]:
# Parse each variable list as a header-less fixed-width table and keep its
# first column (label 0) as a plain Python list.
health_pca_var_names = list(pd.read_fwf('subsets/health-PCA.txt', header=None)[0])
credit_pca_var_names = list(pd.read_fwf('subsets/credit-PCA.txt', header=None)[0])
med_pay_pca_var_names = list(pd.read_fwf('subsets/medical-payment-PCA.txt', header=None)[0])
var_names = list(pd.read_fwf('subsets/other-variables.txt', header=None)[0])

## Read datasets

Divided the original dataset into 4 smaller subsets.

+ `health_df`: dataset for health condition-related variables (no missing values, so PCA can be applied directly)
+ `credit_df`: dataset for financial information (need to impute missing values)
+ `med_pay_df`: dataset for medical payment related variables (need to impute missing values)
+ `df`: dataset for variables that do not need PCA (need to impute missing values)

In [50]:
def _load_columns(columns):
    """Read only `columns` from the competition training CSV into a DataFrame."""
    return pd.read_csv(
        'data/2020_Competition_Training.csv',
        usecols=columns,
        low_memory=False,
    )


# Four column subsets of the same training file.
health_df = _load_columns(health_pca_var_names)
credit_df = _load_columns(credit_pca_var_names)
med_pay_df = _load_columns(med_pay_pca_var_names)
df = _load_columns(var_names)

## PCA for `health_df`

In [141]:
# Project the health-condition columns onto 80 principal components.
# random_state is pinned so the result is reproducible: with
# svd_solver='auto', scikit-learn may choose the randomized solver for an
# input of this size, and that solver is stochastic unless seeded.
pca_health = PCA(n_components=80, random_state=0)
# X holds the transformed data (one row per input row, 80 columns).
X = pca_health.fit_transform(health_df)

In [148]:
# (rows, columns) of the health subset that was fed to PCA
health_df.shape

(69572, 224)

In [142]:
# Total fraction of variance captured by the retained components
sum(pca_health.explained_variance_ratio_)

0.89609844282777

Reduced dimensions from 224 to 80 and retained ~90% of the variance.

## The rest of the datasets

+ The remaining datasets contain NaN values, so PCA and other algorithms cannot be run on them yet.
+ Need to impute missing values.
+ Current idea of how to impute: KNN

See [this article](https://towardsdatascience.com/xgboost-is-not-black-magic-56ca013144b4) and the [scikit-learn `KNNImputer` documentation](https://scikit-learn.org/stable/modules/generated/sklearn.impute.KNNImputer.html) for implementation details.

In [75]:
# (rows, columns) of the credit subset
credit_df.shape

(69572, 20)

In [135]:
# Rows of credit_df in which at least one column is NaN
credit_df.loc[credit_df.isna().any(axis='columns')]

Unnamed: 0,credit_bal_autobank,credit_bal_autofinance,credit_bal_bankcard_severederog,credit_bal_consumerfinance,credit_bal_heloc_60dpd,credit_bal_heloc_severederog,credit_bal_mtg_90to119dpd,credit_bal_mtg_bankruptcy,credit_bal_mtg_severederog,credit_bal_mtgcredit_new,credit_bal_studentloan_60dpd,credit_bal_totalallcredit_60dpd,credit_bal_totalallcredit_60to89dpd,credit_bal_totalallcredit_90to119dpd,credit_bal_totalallcredit_new,credit_highcrd_consumerfinance_new,credit_minmob_1stmtgcredit,credit_minmob_agencyfirstmtg,credit_minmob_mtgcredit,credit_minmob_nonagn
22,,,,,,,,,,,,,,,,,,,,
546,,,,,,,,,,,,,,,,,,,,
554,,,,,,,,,,,,,,,,,,,,
622,,,,,,,,,,,,,,,,,,,,
625,,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
67788,,,,,,,,,,,,,,,,,,,,
67932,,,,,,,,,,,,,,,,,,,,
68214,,,,,,,,,,,,,,,,,,,,
68783,,,,,,,,,,,,,,,,,,,,


In [137]:
# Rows of med_pay_df in which at least one column is NaN
med_pay_df.loc[med_pay_df.isna().any(axis='columns')]

Unnamed: 0,cms_ma_risk_score_nbr,cms_partd_ra_factor_amt,cms_risk_adj_payment_rate_a_amt,cms_risk_adj_payment_rate_b_amt,cms_risk_adjustment_factor_a_amt,cms_rx_risk_score_nbr,cms_tot_ma_payment_amt,cms_tot_partd_payment_amt,rev_cms_clinic_ind,rev_cms_er_ind,rev_cms_icu_ind,rev_cms_lab_ind,rev_cms_phar_ind
0,,,,,,,,,0,0,0,0,0
66,,,,,,,,,0,0,0,0,0
121,,,,,,,,,0,0,0,0,0
123,,,,,,,,,0,0,0,0,0
129,,,,,,,,,0,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
69480,,,,,,,,,0,1,0,1,1
69520,,,,,,,,,0,0,0,0,0
69542,,,,,,,,,0,1,0,1,1
69543,,,,,,,,,0,0,0,1,0


In [None]:
# NaN in df

In [107]:
# (number of rows containing at least one NaN, number of columns) for df
df.loc[df.isna().any(axis='columns')].shape

(21582, 40)

In [149]:
# Rows of df in which at least one column is NaN
df.loc[df.isna().any(axis='columns')]

Unnamed: 0,sex_cd,est_age,smoker_current_ind,smoker_former_ind,lang_spoken_cd,mabh_seg,cci_score,dcsi_score,fci_score,hcc_weighted_sum,...,phy_em_pe_ind,phy_em_pi_ind,phy_em_px_ind,prov_fb_ind,prov_pcp_ind,prov_sp_ind,rucc_category,rx_overall_pmpm_ct,total_outpatient_visit_ct_pmpm,total_physician_office_visit_ct_pmpm
0,F,62,1,0,ENG,UNK,3.0,1,2,0,...,0,0,1,1,1,1,1-Metro,0.084756,0.000000,0.250900
3,M,75,0,0,ENG,H6,3.0,0,2,0,...,0,0,1,0,1,1,7-Nonmetro,2.333333,0.083333,0.250000
8,F,56,0,0,ENG,H3,6.0,3,1,0,...,1,0,1,1,1,1,1-Metro,3.433116,0.250324,0.610793
11,F,67,0,0,SPA,UNK,6.0,1,4,0,...,0,0,1,1,1,1,1-Metro,12.909578,0.831499,2.838363
15,M,60,0,1,ENG,H2,4.0,2,2,3,...,0,0,1,0,1,1,2-Metro,2.299222,0.125749,0.581078
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
69564,F,70,0,0,ENG,UNK,4.0,3,5,4,...,0,0,1,1,1,1,2-Metro,6.876191,0.200000,0.912677
69565,F,18,0,0,ENG,UNK,0.0,0,0,0,...,0,0,0,0,0,0,6-Nonmetro,0.384005,0.000000,0.000000
69567,F,72,1,0,ENG,H7,5.0,2,2,2,...,0,0,1,1,1,1,1-Metro,3.916667,0.650000,1.333333
69568,M,75,0,0,ENG,C4,9.0,3,2,14,...,0,0,1,1,1,1,1-Metro,4.872273,1.106683,2.157150


In [130]:
# Names of the columns of df that contain at least one NaN.
# The per-column null flag is computed once and used directly as a boolean
# mask on the column index (no `== True` comparison, no second scan).
df.columns[df.isnull().any()]

Index(['cms_ra_factor_type_cd', 'cons_cmys', 'cons_hcaccprf_h',
       'cons_hcaccprf_p', 'cons_hhcomp', 'cons_homstat', 'cons_n2029_y',
       'cons_n2mob', 'cons_n2pbl', 'cons_n2pmv', 'cons_n65p_y',
       'cons_retail_buyer', 'hedis_dia_hba1c_ge9', 'hedis_dia_hba1c_test',
       'hedis_dia_ma_nephr'],
      dtype='object')

In [132]:
# Restrict df to the rows AND the columns that contain at least one NaN.
# The null mask is computed once, and a single .loc selection replaces the
# chained df[rows][cols] lookup.
df_null_mask = df.isnull()
df.loc[df_null_mask.any(axis=1), df_null_mask.any(axis=0)]

Unnamed: 0,cms_ra_factor_type_cd,cons_cmys,cons_hcaccprf_h,cons_hcaccprf_p,cons_hhcomp,cons_homstat,cons_n2029_y,cons_n2mob,cons_n2pbl,cons_n2pmv,cons_n65p_y,cons_retail_buyer,hedis_dia_hba1c_ge9,hedis_dia_hba1c_test,hedis_dia_ma_nephr
0,,,,,,,,,,,,,N,N,Y
3,CN,,,,,,,,,,,,N,Y,Y
8,E,,,,,,,,,,,,N,Y,Y
11,CP,,,,,,,,,,,,N,Y,Y
15,CP,,,,,,,,,,,,N,N,Y
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
69564,CN,,,,,,,,,,,,N,Y,N
69565,,,,,,,,,,,,,,,
69567,CN,,,,,,,,,,,,N,Y,Y
69568,CF,,,,,,,,,,,,N,Y,Y
