# First model: logistic regression on the full preprocessed feature set

In [91]:
import numpy as np
import pandas as pd
import pickle
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import random
import os

In [2]:
# Several objects are stored back-to-back in one file; each pickle.load call reads the next
# one, so the order of the calls below must match the order in which they were dumped.
# The first object is not used in this notebook and is discarded into `_`.
# NOTE(review): pickle.load can execute arbitrary code - only open files from a trusted source.
# NOTE(review): the path ends in `.zip` but is read as a raw pickle stream - confirm the file
# is not actually a compressed archive.
with open('../Data/preprocessed.pkl.zip', 'rb') as f:
    _ = pickle.load(f)
    col_names = pickle.load(f)
    X_train = pickle.load(f)
    X_test = pickle.load(f)
    y_train = pickle.load(f)
    y_test = pickle.load(f)

In [3]:
# Display the shapes of the train/test feature matrices and label vectors as a sanity check.
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((350011, 1579), (350011,), (87503, 1579), (87503,))

In [4]:
# Reuse the fitted model when a pickled copy already exists on disk;
# otherwise fit it on the training split and cache it for the next run.
path = '../Data/first_log_model.pkl'
if os.path.exists(path):
    with open(path, 'rb') as f:
        logistic_regressor = pickle.load(f)
else:
    logistic_regressor = LogisticRegression(max_iter=10000)
    logistic_regressor.fit(X_train, y_train)
    with open(path, 'wb') as f:
        pickle.dump(logistic_regressor, f)

In [5]:
# Score the held-out split and print per-class precision / recall / F1.
y_pred = logistic_regressor.predict(X_test)
test_report = classification_report(y_test, y_pred)
print(test_report)

              precision    recall  f1-score   support

         0.0       0.92      0.99      0.96     79776
         1.0       0.59      0.17      0.26      7727

    accuracy                           0.92     87503
   macro avg       0.76      0.58      0.61     87503
weighted avg       0.90      0.92      0.89     87503



In [6]:
# Pair every coefficient of the fitted model with its column position.
ind_coef = [(position, weight) for position, weight in enumerate(logistic_regressor.coef_[0])]
ind_coef

[(0, 0.10463743786722643),
 (1, -0.23256081717957236),
 (2, 0.05475900592212079),
 (3, 0.20544578214872325),
 (4, -0.048280296269734374),
 (5, -0.17668458144040314),
 (6, 0.08446897287586795),
 (7, 0.05135564827690409),
 (8, -0.14279659598677738),
 (9, -0.06929852639053487),
 (10, 0.25672836223331796),
 (11, -0.3291594332669882),
 (12, 0.021512050903825068),
 (13, 0.11138799996734912),
 (14, 0.26953076959399863),
 (15, -0.05174182365189021),
 (16, -0.11140291969034674),
 (17, 0.21687300919810648),
 (18, 0.14915034639642136),
 (19, -0.02938500579166651),
 (20, -0.05717426928803108),
 (21, -0.07916699486907952),
 (22, 0.08865216917951356),
 (23, 0.11801909790887265),
 (24, -0.12357363705917432),
 (25, -0.1363545455369033),
 (26, -0.037052899879833),
 (27, -0.01941989517951865),
 (28, 0.11351994425659026),
 (29, 0.06649346225158963),
 (30, 0.02544189045057314),
 (31, -0.08336340465023151),
 (32, -0.09605370830523961),
 (33, 0.04916080390749313),
 (34, 0.05330246694870628),
 (35, 0.0076174

In [7]:
# Rank the (position, coefficient) pairs by absolute coefficient, largest first,
# and keep the 20 strongest.
imp_coef_tup = list(ind_coef)
imp_coef_tup.sort(key=lambda pair: abs(pair[1]), reverse=True)
imp_coef_tup = imp_coef_tup[:20]
imp_coef_tup

[(518, 1.2915624033598432),
 (1555, 1.2689299969679242),
 (811, 1.1488566078833533),
 (504, 1.0725594937774734),
 (213, -1.0452708236689472),
 (882, 1.028943252587807),
 (840, -1.0058664968816808),
 (1251, 0.985923572823749),
 (979, -0.9447158484505378),
 (753, 0.9447099029184959),
 (800, 0.9390738098206141),
 (746, -0.9319155869353413),
 (443, -0.9290842654979299),
 (783, -0.9264039976841362),
 (720, -0.899731597071418),
 (1072, 0.8883041869866927),
 (431, -0.842787950934689),
 (996, -0.8419832999851163),
 (539, 0.8262915670723553),
 (461, -0.8055078888767153)]

In [8]:
# Split the (position, coefficient) pairs into two parallel lists.
indices = [position for position, _weight in imp_coef_tup]
imp_coef = [weight for _position, weight in imp_coef_tup]
# Look up the column name stored at each selected position.
imp_col = col_names[indices]

# Recombine into (position, column name, coefficient) triples for display.
imp = list(zip(indices, imp_col, imp_coef))
imp

[(518, 'EXRACT21_41.0', 1.2915624033598432),
 (1555, '_MISVEGN', 1.2689299969679242),
 (811, 'VINOCRE2_77.0', 1.1488566078833533),
 (504, 'EXRACT21_27.0', 1.0725594937774734),
 (213, 'ASTHMA3_9.0', -1.0452708236689472),
 (882, 'CDDISCUS_9.0', 1.028943252587807),
 (840, 'VIMACDG2_7.0', -1.0058664968816808),
 (1251, 'QSTLANG_999.0', 0.985923572823749),
 (979, 'ARTHEDU_999.0', -0.9447158484505378),
 (753, 'CRGVPRB1_999.0', 0.9447099029184959),
 (800, 'VIPRFVS2_7.0', 0.9390738098206141),
 (746, 'CRGVPRB1_9.0', -0.9319155869353413),
 (443, 'EXRACT11_44.0', -0.9290842654979299),
 (783, 'VIDFCLT2_6.0', -0.9264039976841362),
 (720, 'CRGVREL1_77.0', -0.899731597071418),
 (1072, 'LSTBLDS3_9.0', 0.8883041869866927),
 (431, 'EXRACT11_32.0', -0.842787950934689),
 (996, 'HPVADSHT_77.0', -0.8419832999851163),
 (539, 'EXRACT21_62.0', 0.8262915670723553),
 (461, 'EXRACT11_62.0', -0.8055078888767153)]

# Second model: sample-weighted logistic regression on 25 selected features

In [337]:
import numpy as np
import pandas as pd
import pickle
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import random
import os
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

In [338]:
# Load the pickled dataframe that the rest of this notebook works on.
# NOTE(review): read_pickle unpickles the file, which can execute arbitrary code - only load
# files from a trusted source.
path = '../Data/df_clean_null.pkl'
df = pd.read_pickle(path)

In [339]:
# Column names to keep from the loaded dataframe: model features, target, and sample weights.
feature_names = [
    '_RFHLTH', '_HCVU651', '_RFHYPE5', '_RFCHOL', '_ASTHMS1',
    '_DRDXAR1', '_RACEGR3', '_AGE_G', '_BMI5CAT', '_EDUCAG',
    '_INCOMG', '_SMOKER3', '_RFDRHV5', '_PACAT1', '_RFSEAT2',
    '_FLSHOT6', '_PNEUMO2', '_AIDTST3', 'CHCCOPD1', 'ADDEPEV2',
    'CHCKIDNY', 'DIABETE3', 'SEX', 'MARITAL', 'DRADVISE',
]
target_name = ['_MICHD']
weights_name = ['_LLCPWT']

In [340]:
# Keep only the feature, target and weight columns.
# The explicit .copy() makes `df` an independent frame, so the column assignments in the
# following cells do not raise SettingWithCopyWarning (they would when writing into a slice).
df = df[feature_names + target_name + weights_name].copy()
df

Unnamed: 0,_RFHLTH,_HCVU651,_RFHYPE5,_RFCHOL,_ASTHMS1,_DRDXAR1,_RACEGR3,_AGE_G,_BMI5CAT,_EDUCAG,...,_AIDTST3,CHCCOPD1,ADDEPEV2,CHCKIDNY,DIABETE3,SEX,MARITAL,DRADVISE,_MICHD,_LLCPWT
0,2.0,1.0,2.0,2.0,1.0,1.0,1.0,5.0,4.0,2.0,...,1.0,1.0,1.0,2.0,3.0,2.0,1.0,,2.0,341.384853
1,1.0,2.0,1.0,1.0,3.0,2.0,1.0,4.0,3.0,4.0,...,2.0,2.0,2.0,2.0,3.0,2.0,2.0,2.0,2.0,108.060903
2,2.0,9.0,1.0,2.0,3.0,1.0,1.0,6.0,2.0,2.0,...,,2.0,2.0,2.0,3.0,2.0,2.0,,,255.264797
3,2.0,1.0,2.0,2.0,3.0,1.0,1.0,5.0,3.0,2.0,...,9.0,2.0,1.0,2.0,3.0,2.0,1.0,2.0,2.0,341.384853
4,2.0,1.0,1.0,1.0,3.0,1.0,1.0,5.0,2.0,3.0,...,1.0,2.0,2.0,2.0,3.0,2.0,1.0,2.0,2.0,258.682223
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
441451,2.0,9.0,2.0,2.0,3.0,1.0,5.0,6.0,1.0,1.0,...,2.0,2.0,2.0,2.0,1.0,2.0,3.0,1.0,2.0,531.980410
441452,1.0,1.0,1.0,1.0,3.0,2.0,5.0,2.0,3.0,3.0,...,1.0,2.0,2.0,2.0,3.0,2.0,1.0,2.0,2.0,746.416599
441453,2.0,9.0,2.0,2.0,3.0,2.0,5.0,6.0,4.0,2.0,...,,2.0,2.0,2.0,3.0,2.0,1.0,,2.0,207.663634
441454,1.0,1.0,2.0,1.0,3.0,2.0,5.0,4.0,2.0,3.0,...,2.0,2.0,2.0,2.0,3.0,1.0,5.0,1.0,2.0,515.758894


In [341]:
def _swap_one_and_two(value):
    """Return 2.0 for 1.0 and 1.0 for 2.0; any other value is returned unchanged."""
    if value == 2.0:
        return 1.0
    if value == 1.0:
        return 2.0
    return value


# Exchange the 1.0 and 2.0 codes in these three columns.
# Running this cell a second time swaps the codes back.
for column in ['_RFHYPE5', '_RFCHOL', '_RFDRHV5']:
    df[column] = df[column].apply(_swap_one_and_two)

In [342]:
# Register 65.0 as an extra category in both columns and use it for every missing entry.
for column in ['_FLSHOT6', '_PNEUMO2']:
    df[column] = df[column].cat.add_categories([65.0]).fillna(65.0)

In [343]:
# Collapse codes 1/2 into 1.0 and 3/4 into 2.0; 7.0 and 9.0 are kept as they are.
diabetes_recode = {1.0: 1.0, 2.0: 1.0, 3.0: 2.0, 4.0: 2.0, 7.0: 7.0, 9.0: 9.0}
df['DIABETE3'] = df['DIABETE3'].map(diabetes_recode).astype('category')

In [344]:
# Recode the target value 2 as 0.0; every other value is left untouched.
df['_MICHD'] = df['_MICHD'].apply(lambda label: label if label != 2 else 0.0)

In [345]:
def imputer(dataframe, category_value_tofill=None, columns_drop=None, columns_mode=None, columns_median=None):
    '''Imputes missing values in the input dataframe and returns the result.

       The steps run in a fixed order: categorical fill, column drop, mode
       imputation, median imputation. Each step is skipped when its argument
       is None.

       The categorical fill assigns columns on the dataframe that was passed
       in, so the caller's object is modified by that step. The remaining
       steps each build a new dataframe. Always use the returned dataframe.

       Parameters
       ----------
       dataframe: Pandas dataframe
           dataframe with which to impute missing values.

       category_value_tofill: int, float, or string
           Value used to fill missing values in every column of dtype
           'category' that contains at least one null. The value is added to
           the column's categories when it is not already one of them.

       columns_drop: list-like
           List of columns to drop.

       columns_mode: list-like
           List of columns to impute with the mode. When several values tie,
           the first row returned by DataFrame.mode() is used.

       columns_median: list-like
           List of numeric columns to impute with the median.

       Returns
       -------
       Pandas dataframe
           The dataframe after all requested steps have been applied.
    '''
    # Fill null values in categorical features with category_value_tofill.
    # `is not None` (rather than `!= None`) keeps 0 / '' usable as fill values.
    if category_value_tofill is not None:
        for column in dataframe.select_dtypes(include='category').columns:
            if dataframe[column].isnull().any():
                filled = dataframe[column]
                # add_categories raises ValueError for a category that already exists,
                # so only register the fill value when it is new to this column.
                if category_value_tofill not in filled.cat.categories:
                    filled = filled.cat.add_categories([category_value_tofill])
                dataframe[column] = filled.fillna(value=category_value_tofill)

    # Dropping columns, imputing with mode, and imputing with median.
    # Identity checks are required here: `array != None` is evaluated element-wise for
    # numpy arrays / pandas Index objects and cannot be used as an `if` condition.
    if columns_drop is not None:
        dataframe = dataframe.drop(columns=columns_drop)
    if columns_mode is not None:
        mode_rows = dataframe[columns_mode].mode()
        # mode() has no rows when there is nothing to compute a mode from; nothing to fill then.
        if len(mode_rows) > 0:
            dataframe = dataframe.fillna(mode_rows.iloc[0, :])
    if columns_median is not None:
        dataframe = dataframe.fillna(dataframe[columns_median].median())

    return dataframe

In [346]:
# Fill the missing entries of every categorical column with the sentinel category 999.0.
df = imputer(df, 999.0)

In [347]:
# Readable replacement names for the dataframe columns, one per kept column.
trans_list = [
    'Good_Health', 'Health_Insurance', 'Hypertension', 'High_Cholesterol',
    'Asthma_Status', 'Arthritis', 'Race', 'Age_Cat', 'BMI_Cat',
    'Education_Level', 'Income_Level', 'Smoker_Status', 'Heavy_Drinker',
    'Physical_Activity', 'Seatbelt', 'Flu_Shot', 'Pneumonia_Vaccine', 'HIV',
    'Bronchitis', 'Depression', 'Kidney_Disease', 'Diabetes', 'SEX',
    'Marital Status', 'Sodium', 'Heart_Disease', 'Sample_Weights',
]

In [348]:
# Map each original column name to its readable replacement, pairing the two lists by position.
original_names = feature_names + target_name + weights_name
trans_dict = {raw: readable for raw, readable in zip(original_names, trans_list)}
trans_dict

{'_RFHLTH': 'Good_Health',
 '_HCVU651': 'Health_Insurance',
 '_RFHYPE5': 'Hypertension',
 '_RFCHOL': 'High_Cholesterol',
 '_ASTHMS1': 'Asthma_Status',
 '_DRDXAR1': 'Arthritis',
 '_RACEGR3': 'Race',
 '_AGE_G': 'Age_Cat',
 '_BMI5CAT': 'BMI_Cat',
 '_EDUCAG': 'Education_Level',
 '_INCOMG': 'Income_Level',
 '_SMOKER3': 'Smoker_Status',
 '_RFDRHV5': 'Heavy_Drinker',
 '_PACAT1': 'Physical_Activity',
 '_RFSEAT2': 'Seatbelt',
 '_FLSHOT6': 'Flu_Shot',
 '_PNEUMO2': 'Pneumonia_Vaccine',
 '_AIDTST3': 'HIV',
 'CHCCOPD1': 'Bronchitis',
 'ADDEPEV2': 'Depression',
 'CHCKIDNY': 'Kidney_Disease',
 'DIABETE3': 'Diabetes',
 'SEX': 'SEX',
 'MARITAL': 'Marital Status',
 'DRADVISE': 'Sodium',
 '_MICHD': 'Heart_Disease',
 '_LLCPWT': 'Sample_Weights'}

In [349]:
# Replace the original column names with their readable counterparts.
df = df.rename(trans_dict, axis='columns')
df

Unnamed: 0,Good_Health,Health_Insurance,Hypertension,High_Cholesterol,Asthma_Status,Arthritis,Race,Age_Cat,BMI_Cat,Education_Level,...,HIV,Bronchitis,Depression,Kidney_Disease,Diabetes,SEX,Marital Status,Sodium,Heart_Disease,Sample_Weights
0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,5.0,4.0,2.0,...,1.0,1.0,1.0,2.0,2.0,2.0,1.0,999.0,0.0,341.384853
1,1.0,2.0,2.0,2.0,3.0,2.0,1.0,4.0,3.0,4.0,...,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,0.0,108.060903
2,2.0,9.0,2.0,1.0,3.0,1.0,1.0,6.0,2.0,2.0,...,999.0,2.0,2.0,2.0,2.0,2.0,2.0,999.0,999.0,255.264797
3,2.0,1.0,1.0,1.0,3.0,1.0,1.0,5.0,3.0,2.0,...,9.0,2.0,1.0,2.0,2.0,2.0,1.0,2.0,0.0,341.384853
4,2.0,1.0,2.0,2.0,3.0,1.0,1.0,5.0,2.0,3.0,...,1.0,2.0,2.0,2.0,2.0,2.0,1.0,2.0,0.0,258.682223
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
441451,2.0,9.0,1.0,1.0,3.0,1.0,5.0,6.0,1.0,1.0,...,2.0,2.0,2.0,2.0,1.0,2.0,3.0,1.0,0.0,531.980410
441452,1.0,1.0,2.0,2.0,3.0,2.0,5.0,2.0,3.0,3.0,...,1.0,2.0,2.0,2.0,2.0,2.0,1.0,2.0,0.0,746.416599
441453,2.0,9.0,1.0,1.0,3.0,2.0,5.0,6.0,4.0,2.0,...,999.0,2.0,2.0,2.0,2.0,2.0,1.0,999.0,0.0,207.663634
441454,1.0,1.0,1.0,2.0,3.0,2.0,5.0,4.0,2.0,3.0,...,2.0,2.0,2.0,2.0,2.0,1.0,5.0,1.0,0.0,515.758894


In [350]:
# Remove every row whose target holds the 999.0 placeholder.
heart_col = trans_dict[target_name[0]]
rows_without_target = df.index[(df[heart_col] == 999.0).to_numpy()]
df = df.drop(rows_without_target)

In [351]:
# One-hot encode every categorical column except the target.
cols_to_one_hot = [column for column in df.select_dtypes(include='category').columns if column not in ['Heart_Disease']]

# scikit-learn 1.2 renamed OneHotEncoder's `sparse` argument to `sparse_output` and 1.4 removed
# the old name. Try the current keyword first and fall back for older installations, so the
# cell runs on both.
try:
    one_hot_encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    one_hot_encoder = OneHotEncoder(sparse=False)

# Columns not listed in cols_to_one_hot are passed through unchanged, after the encoded ones.
column_trans = ColumnTransformer([('categorical', one_hot_encoder, cols_to_one_hot)], remainder='passthrough')
column_trans.fit(df)
# Output column labels: the encoder's generated names followed by the target and weight names.
# NOTE(review): this assumes the passed-through columns are exactly the target and the weights,
# in that order - confirm if columns are added to df.
column_names_trans = np.concatenate([column_trans.named_transformers_['categorical'].get_feature_names_out(), np.array([trans_dict[target_name[0]], trans_dict[weights_name[0]]])])

In [352]:
# Rebuild a dataframe from the transformed array, labelled with column_names_trans.
# The new frame gets a fresh default index.
df = pd.DataFrame(column_trans.transform(df), columns=column_names_trans)
df

Unnamed: 0,Good_Health_1.0,Good_Health_2.0,Good_Health_9.0,Health_Insurance_1.0,Health_Insurance_2.0,Health_Insurance_9.0,Hypertension_1.0,Hypertension_2.0,Hypertension_9.0,High_Cholesterol_1.0,...,Marital Status_5.0,Marital Status_6.0,Marital Status_9.0,Sodium_1.0,Sodium_2.0,Sodium_7.0,Sodium_9.0,Sodium_999.0,Heart_Disease,Sample_Weights
0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,341.384853
1,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,108.060903
2,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,341.384853
3,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,258.682223
4,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,256.518591
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
437509,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,531.980410
437510,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,746.416599
437511,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,207.663634
437512,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,515.758894


## Training

In [353]:
# Separate the target and the sample weights from the feature matrix.
target_col = trans_dict[target_name[0]]
weight_col = trans_dict[weights_name[0]]
y, w = df[target_col], df[weight_col]
X = df.drop(columns=[target_col, weight_col])

In [354]:
# Unweighted share of rows whose target equals 1.0.
len(y[y == 1.0]) / len(y)

0.08830117436242041

In [355]:
# Stratify on the target so both splits keep the same class balance, and fix the seed so the
# split - and therefore every metric reported below - is reproducible across runs.
SPLIT_SEED = 42
X_train, X_test, y_train, y_test, w_train, w_test = train_test_split(X, y, w, stratify=y, random_state=SPLIT_SEED)

In [360]:
# Fit the model with one weight per training row taken from w_train.
# max_iter is set to 10000 - presumably so the solver has room to converge; confirm.
log_regressor = LogisticRegression(max_iter=10000)
log_regressor.fit(X_train, y_train, sample_weight=w_train)

In [368]:
# Predictions on the training split, used for the training-set report.
y_pred_train = log_regressor.predict(X_train)

In [361]:
# Predictions on the held-out split, used for the test-set report.
y_pred = log_regressor.predict(X_test)

In [370]:
# Weighted report on the training split. Because sample weights are passed, the "support"
# column is a sum of weights rather than a row count.
print(classification_report(y_train, y_pred_train, sample_weight=w_train))

              precision    recall  f1-score   support

         0.0       0.94      0.99      0.97 174923511.369091
         1.0       0.55      0.11      0.18 12078149.891089914

    accuracy                           0.94 187001661.26018092
   macro avg       0.75      0.55      0.58 187001661.26018092
weighted avg       0.92      0.94      0.92 187001661.26018092



In [362]:
# Weighted report on the held-out split. Because sample weights are passed, the "support"
# column is a sum of weights rather than a row count.
print(classification_report(y_test, y_pred, sample_weight=w_test))

              precision    recall  f1-score   support

         0.0       0.94      0.99      0.97 58456388.11512658
         1.0       0.53      0.10      0.17 3930751.495180341

    accuracy                           0.94 62387139.61030692
   macro avg       0.74      0.55      0.57 62387139.61030692
weighted avg       0.92      0.94      0.92 62387139.61030692



In [364]:
# Pair each feature column name with its fitted coefficient.
feature_coef = [(name, weight) for name, weight in zip(X.columns, log_regressor.coef_[0])]
feature_coef

[('Good_Health_1.0', -0.5413470323669557),
 ('Good_Health_2.0', 0.26461461594776226),
 ('Good_Health_9.0', -0.014114519424592995),
 ('Health_Insurance_1.0', -0.15118988495764496),
 ('Health_Insurance_2.0', -0.1976646764523985),
 ('Health_Insurance_9.0', 0.05800762556647519),
 ('Hypertension_1.0', 0.2549015970282028),
 ('Hypertension_2.0', -0.42111735424542956),
 ('Hypertension_9.0', -0.12463117862647433),
 ('High_Cholesterol_1.0', 0.3637792119252075),
 ('High_Cholesterol_2.0', -0.2393448569977608),
 ('High_Cholesterol_9.0', -0.12369823316227982),
 ('High_Cholesterol_999.0', -0.29158305760847736),
 ('Asthma_Status_1.0', 0.008016994607702065),
 ('Asthma_Status_2.0', 0.039086719466425716),
 ('Asthma_Status_3.0', -0.14060430306593602),
 ('Asthma_Status_9.0', -0.19734634685216962),
 ('Arthritis_1.0', -0.010101557640086273),
 ('Arthritis_2.0', -0.2593302586100651),
 ('Arthritis_999.0', -0.021415119593363962),
 ('Race_1.0', 0.043234546903933274),
 ('Race_2.0', -0.2345770977356101),
 ('Race_3.

In [367]:
# Order the (feature, coefficient) pairs by absolute coefficient, largest first.
feature_coef_sorted = list(feature_coef)
feature_coef_sorted.sort(key=lambda pair: abs(pair[1]), reverse=True)
feature_coef_sorted

[('Diabetes_9.0', -1.7267393147325047),
 ('Age_Cat_1.0', -1.0754460751611243),
 ('Diabetes_1.0', 0.6977462219882181),
 ('Age_Cat_6.0', 0.6661398147569413),
 ('Age_Cat_5.0', 0.6584331756888703),
 ('Age_Cat_2.0', -0.654214521590607),
 ('Diabetes_7.0', 0.597151759151019),
 ('Sodium_9.0', 0.5892475578656404),
 ('Sodium_2.0', -0.5613012561571468),
 ('Good_Health_1.0', -0.5413470323669557),
 ('SEX_2.0', -0.5164647123083319),
 ('Kidney_Disease_2.0', -0.44793752717243274),
 ('Hypertension_2.0', -0.42111735424542956),
 ('Bronchitis_2.0', -0.4018237532110845),
 ('High_Cholesterol_1.0', 0.3637792119252075),
 ('Diabetes_2.0', 0.3453091801709787),
 ('Depression_2.0', -0.3371714830397174),
 ('Age_Cat_4.0', 0.31577188857121263),
 ('Sodium_999.0', -0.29671384869253586),
 ('High_Cholesterol_999.0', -0.29158305760847736),
 ('Good_Health_2.0', 0.26461461594776226),
 ('Bronchitis_1.0', 0.2607841593130337),
 ('Arthritis_2.0', -0.2593302586100651),
 ('Income_Level_5.0', -0.2569843123930432),
 ('Marital Stat