In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# Any results you write to the current directory are saved as output.

In [2]:
import category_encoders as ce

import pandas as pd
import numpy as np
import datetime

import matplotlib.pyplot as plt

import seaborn as sns
sns.set_style(style='white')


from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
# Need to go find this one: from xgboost import XGBClassifier


from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, KFold, train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve



from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.compose import ColumnTransformer

from xgboost import XGBClassifier


from scipy.stats import linregress
from scipy import stats

from sklearn.tree import export_graphviz
import pydot

import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

import lightgbm as lgb

In [3]:
def ensure_data_types(df, num_list, binary_list, string_list):
    # Ensure datatypes are correct
    for i in df.columns:
        if i in num_list:
            df[i] = df[i].astype('Float64')
        elif i in binary_list:
            df[i] = df[i].astype('Int64')
        elif i in string_list:
            df[i] = df[i].astype('category')
    return df

In [4]:
def ensure_data_types_V2(df, num_list, binary_list, string_list):
    for i in df.columns:
        if i in num_list:
            df[i] = df[i].astype('Float64')
        elif i in binary_list:
            df[i] = df[i].astype('Float64')
        elif i in string_list:
            df[i] = df[i].astype('str')
    return df

In [5]:
# Read in all the necessary files

data_dictionary = pd.read_csv('WiDS Datathon 2020 Dictionary.csv')
data = pd.read_csv('training_v2.csv')
predict_data = pd.read_csv('unlabeled.csv')
sample_submission = pd.read_csv('samplesubmission.csv')
submission_template = pd.read_csv('solution_template.csv')

In [6]:
# Make copies for safe keeping
data_copy = data.copy()
predict_data_copy = predict_data.copy()
data_dictionary_copy = data_dictionary.copy()

In [7]:
# Rename Columns for easier selection in Data Dictionary
data_dictionary.columns = ['Category','VariableName','UnitofMeasure','DataType','Descrption','Example']


# Preoperly relabel the meta data; Meta data list will be used to pull features and process by type
data_dictionary.loc[data_dictionary.VariableName == 'encounter_id', 'DataType'] = 'string'
data_dictionary.loc[data_dictionary.VariableName == 'hospital_id', 'DataType'] = 'string'
data_dictionary.loc[data_dictionary.VariableName == 'patient_id', 'DataType'] = 'string'
data_dictionary.loc[data_dictionary.VariableName == 'hospital_death', 'DataType'] = 'Target'
data_dictionary.loc[data_dictionary.VariableName == 'bmi', 'DataType'] = 'numeric'
data_dictionary.loc[data_dictionary.VariableName == 'icu_id', 'DataType'] = 'string'
data_dictionary.loc[data_dictionary.VariableName == 'apache_2_diagnosis', 'DataType'] = 'string'
data_dictionary.loc[data_dictionary.VariableName == 'apache_3j_diagnosis', 'DataType'] = 'string'


# data_dictionary.head()
# This checks if any DataTypes are null
# print(data_dictionary[data_dictionary.DataType.isnull()])

In [8]:
# Concatenate the train & test data for cleaning & feature engineering

#data = pd.concat([train_data, predict_data])
#data_copy = data.copy()

In [9]:
# Get feature names by type

num_feats_list = []
binary_feats_list = []
string_feats_list =[]

variable_names = list(set(data_dictionary.VariableName))
data_dictionary = data_dictionary.set_index('VariableName')
#print(variable_names)

for i in variable_names:
    #print(data_dictionary.loc[i, 'DataType'])
    if (i == 'VariableName') | (i == 'pred') | (i == 'icu_admit_type'):
        pass
    else :
        if data_dictionary.loc[i, 'DataType'] == 'string':
            # print("Is String:" + i)
            string_feats_list.append(i)    
        elif data_dictionary.loc[i, 'DataType'] == 'binary':
            # print("Is Binary:" + i)
            binary_feats_list.append(i)
        elif data_dictionary.loc[i, 'DataType'] == 'integer':
            #print("Is Numeric: " + i)
             num_feats_list.append(i)
        elif data_dictionary.loc[i, 'DataType'] == 'numeric':
            # print("Is Numeric: " + i)
             num_feats_list.append(i)

In [10]:
# Ensure datatypes are correct
dfs = [data, predict_data]
for i in dfs:
    i = ensure_data_types(i,num_feats_list, binary_feats_list, string_feats_list)


#for i in data.columns:
#    if i in num_feats_list:
#        data[i] = data[i].astype('Float64')
#    elif i in binary_feats_list:
#        data[i] = data[i].astype('Int64')
#    elif i in string_feats_list:
#        data[i] = data[i].astype('category')

In [11]:
# Get train & predicts back in order to perform upsampling and feature selection
not_missing = []
missing_columns = []
for i in data.columns:
    if i in num_feats_list:
        not_missing.append(i)
    elif i in binary_feats_list:
        not_missing.append(i)
    elif i in string_feats_list:
        not_missing.append(i)
    else:
        missing_columns.append(i)



#train_eng = data[data.encounter_id.isin(train_data.encounter_id)]



#predict_eng = data[data.encounter_id.isin(predict_data.encounter_id)]

# Upsampling to address imbalanced data set


In [12]:
dfs = [data, predict_data]
for i in dfs:
    i = ensure_data_types_V2(i,num_feats_list, binary_feats_list, string_feats_list)

In [13]:
# Ensure datatypes are correct
#for i in train_eng.columns:
#    if i in num_feats_list:
#        train_eng[i] = train_eng[i].astype('Float64')
#    elif i in binary_feats_list:
#        train_eng[i] = train_eng[i].astype('Float64')
#    elif i in string_feats_list:
#        train_eng[i] = train_eng[i].astype('str')

## Dropping Colinear Columns

Collinear features are features that are highly correlated with one another. In machine learning, these lead to decreased generalization performance on the test set due to high variance and less model interpretability.

In [14]:
# Threshold for removing correlated variables
threshold = 0.9

# Absolute value correlation matrix
corr_matrix = data.corr().abs()
corr_matrix.head()

Unnamed: 0,hospital_death,age,bmi,elective_surgery,height,pre_icu_los_days,readmission_status,weight,albumin_apache,apache_post_operative,...,apache_4a_hospital_death_prob,apache_4a_icu_death_prob,aids,cirrhosis,diabetes_mellitus,hepatic_failure,immunosuppression,leukemia,lymphoma,solid_tumor_with_metastasis
hospital_death,1.0,0.111017,0.031247,0.093574,0.019526,0.063316,,0.038362,0.193809,0.083674,...,0.311043,0.283913,0.004403,0.039453,0.015784,0.038864,0.043973,0.029788,0.018722,0.051105
age,0.111017,1.0,0.087077,0.06732,0.109937,0.049872,,0.127252,0.116633,0.059246,...,0.143167,0.076275,0.029477,0.028065,0.077908,0.020061,0.025007,0.03031,0.023335,0.025924
bmi,0.031247,0.087077,1.0,0.015921,0.056316,0.001531,,0.877339,0.052009,0.01542,...,0.033546,0.013796,0.020434,0.002377,0.172943,0.001855,0.031144,0.013375,0.010017,0.04338
elective_surgery,0.093574,0.06732,0.015921,1.0,0.02362,0.133704,,0.0269,0.024966,0.908247,...,0.098412,0.06125,0.006229,0.031512,0.001645,0.0347,0.014695,0.017587,0.008215,0.015369
height,0.019526,0.109937,0.056316,0.02362,1.0,0.008075,,0.391967,0.061671,0.025276,...,0.029071,0.011588,0.00929,0.012043,0.00098,0.010481,0.00053,0.001718,0.00837,0.004921


In [15]:
# Upper triangle of correlations
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))
upper.head()
# Select columns with correlations above threshold
to_drop = [column for column in upper.columns if any(upper[column] > threshold)]
print('There are %d columns to remove.' % (len(to_drop)))
#Drop the columns with high correlations
data = data.drop(columns = to_drop)

There are 59 columns to remove.


In [16]:
# Need to remove the collinear columns from the dataset

for i in to_drop:
    if i in num_feats_list:
        num_feats_list.remove(i)
    elif i in binary_feats_list:
        binary_feats_list.remove(i)
    elif i in string_feats_list:
        string_feats_list.remove(i)

## Split back to Train & Predict Data Sets

In [17]:
X = data.drop(columns = ['hospital_death'])


y = data[['hospital_death']]

y['hospital_death'] = y['hospital_death'].astype('Float64')


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


# Pipelines (Data Cleaning, Feature Engineering)

In [None]:
classifiers = [
    #RandomForestClassifier(),
    #AdaBoostClassifier(),
    #GradientBoostingClassifier(),
    #XGBClassifier(),
    lgb.LGBMClassifier()
    ]


encoder_list = [ce.backward_difference.BackwardDifferenceEncoder, 
#               ce.basen.BaseNEncoder,
#               ce.binary.BinaryEncoder,
#                ce.cat_boost.CatBoostEncoder,
#                ce.hashing.HashingEncoder,
                ce.helmert.HelmertEncoder,
#                ce.james_stein.JamesSteinEncoder,
                ce.one_hot.OneHotEncoder,
#               ce.leave_one_out.LeaveOneOutEncoder,
#                ce.m_estimate.MEstimateEncoder,
#                ce.ordinal.OrdinalEncoder,
                ce.polynomial.PolynomialEncoder,
                ce.sum_coding.SumEncoder,
                ce.target_encoder.TargetEncoder,
                ce.woe.WOEEncoder
                ]



# Numeric_transformation list

# Ordinal transformation list (i.e. using label encoder)



X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)



for classifier in classifiers:
    binary_features = binary_feats_list
    binary_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value=-1))])

    numeric_features = num_feats_list
    numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean'))])


    for encoder in encoder_list:
        print("Encoder: ")
        print(encoder)
        categorical_features = string_feats_list
        categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('woe', encoder())])

        preprocessor = ColumnTransformer(
            transformers=[
                ('num', numeric_transformer, numeric_features),
                ('cat', categorical_transformer, categorical_features),
                ('binary', binary_transformer, binary_features)])

        pipe = Pipeline(steps=[('preprocessor', preprocessor),
                              ('clf', classifier)])
        pipe.fit(X_train, y_train.values.ravel())  
        print ("Classifier: ")
        print(classifier)
        print("model score: %.3f" % pipe.score(X_test, y_test))
        print(cross_val_score(estimator=pipe, X=X_train, y=y_train.values.ravel(), cv = 3, scoring = 'roc_auc'))
        print("<=================================================================================================>")

In [21]:
winning_classifier = [lgb.LGBMClassifier()]


winning_encoder = [ce.helmert.HelmertEncoder]



X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)



for classifier in winning_classifier:
    binary_features = binary_feats_list
    binary_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value=-1))])

    numeric_features = num_feats_list
    numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean'))])


    for encoder in winning_encoder:
        print("Encoder: ")
        print(encoder)
        categorical_features = string_feats_list
        categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('woe', encoder())])

        preprocessor = ColumnTransformer(
            transformers=[
                ('num', numeric_transformer, numeric_features),
                ('cat', categorical_transformer, categorical_features),
                ('binary', binary_transformer, binary_features)])

        pipe = Pipeline(steps=[('preprocessor', preprocessor),
                              ('clf', classifier)])
        pipe.fit(X_train, y_train.values.ravel())  
        print ("Classifier: ")
        print(classifier)
        print("model score: %.3f" % pipe.score(X_test, y_test))
        print(cross_val_score(estimator=pipe, X=X_train, y=y_train.values.ravel(), cv = 3, scoring = 'roc_auc'))
        print("<=================================================================================================>")
        
        
        # Make predictions on the predict_data        
        features = num_feats_list + binary_feats_list + string_feats_list
        
        X_predict = predict_data.drop(columns=['hospital_death'])
        
        predict_data['hospital_death'] = pipe.predict_proba(X_predict[features])[:,1]

        # Prepare submissions
        submission = predict_data[['encounter_id','hospital_death']]

        # Save submission file as .csv
        submission.to_csv('MMBAZEL_WIDS2020_solution.csv')
        
  

Encoder: 
<class 'category_encoders.helmert.HelmertEncoder'>
Classifier: 
LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
               importance_type='split', learning_rate=0.1, max_depth=-1,
               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
               random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
               subsample=1.0, subsample_for_bin=200000, subsample_freq=0)
model score: 0.930
[0.89900409 0.89793564 0.89069815]


## Predict & Prepare File For Submission

# Make predictions on the test data
X_predict = predict_eng.drop(columns=['hospital_death'])
predict_eng['hospital_deaths'] = win_clf.predict(X_predict[features])

# Prepare submissions
submission = predict_eng[['encounter_id','hospital_death']]

# Save submission file as .csv
submission.to_csv('solution_template.csv')