In [1]:
import category_encoders as ce

import pandas as pd
import numpy as np
import datetime

import matplotlib.pyplot as plt

import seaborn as sns
sns.set_style(style='white')


from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
# Need to go find this one: from xgboost import XGBClassifier


from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, KFold, train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve



from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.compose import ColumnTransformer



from scipy.stats import linregress
from scipy import stats

from sklearn.tree import export_graphviz
import pydot

import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

import lightgbm as lgb

In [2]:
def ensure_data_types(df, num_list, binary_list, string_list):
    # Ensure datatypes are correct
    for i in df.columns:
        if i in num_list:
            df[i] = df[i].astype('Float64')
        elif i in binary_list:
            df[i] = df[i].astype('Int64')
        elif i in string_list:
            df[i] = df[i].astype('category')
    return df

In [3]:
def ensure_data_types_V2(df, num_list, binary_list, string_list):
    for i in df.columns:
        if i in num_list:
            df[i] = df[i].astype('Float64')
        elif i in binary_list:
            df[i] = df[i].astype('Float64')
        elif i in string_list:
            df[i] = df[i].astype('str')
    return df

In [4]:
# Read in all the necessary files

data_dictionary = pd.read_csv('WiDS Datathon 2020 Dictionary.csv')
data = pd.read_csv('training_v2.csv')
predict_data = pd.read_csv('unlabeled.csv')
sample_submission = pd.read_csv('samplesubmission.csv')
submission_template = pd.read_csv('solution_template.csv')

In [5]:
# Make copies for safe keeping
data_copy = data.copy()
predict_data_copy = predict_data.copy()
data_dictionary_copy = data_dictionary.copy()

In [6]:
# Rename Columns for easier selection in Data Dictionary
data_dictionary.columns = ['Category','VariableName','UnitofMeasure','DataType','Descrption','Example']


# Preoperly relabel the meta data; Meta data list will be used to pull features and process by type
data_dictionary.loc[data_dictionary.VariableName == 'encounter_id', 'DataType'] = 'string'
data_dictionary.loc[data_dictionary.VariableName == 'hospital_id', 'DataType'] = 'string'
data_dictionary.loc[data_dictionary.VariableName == 'patient_id', 'DataType'] = 'string'
data_dictionary.loc[data_dictionary.VariableName == 'hospital_death', 'DataType'] = 'Target'
data_dictionary.loc[data_dictionary.VariableName == 'bmi', 'DataType'] = 'numeric'
data_dictionary.loc[data_dictionary.VariableName == 'icu_id', 'DataType'] = 'string'
data_dictionary.loc[data_dictionary.VariableName == 'apache_2_diagnosis', 'DataType'] = 'string'
data_dictionary.loc[data_dictionary.VariableName == 'apache_3j_diagnosis', 'DataType'] = 'string'

In [7]:
# Concatenate the train & test data for cleaning & feature engineering

all_data = pd.concat([data, predict_data])
all_data_copy = all_data.copy()

In [8]:
# Get feature names by type

num_feats_list = []
binary_feats_list = []
string_feats_list =[]

variable_names = list(set(data_dictionary.VariableName))
data_dictionary = data_dictionary.set_index('VariableName')


for i in variable_names:
    #print(data_dictionary.loc[i, 'DataType'])
    if (i == 'VariableName') | (i == 'pred') | (i == 'icu_admit_type'):
        pass
    else :
        if data_dictionary.loc[i, 'DataType'] == 'string':
            string_feats_list.append(i)    
        elif data_dictionary.loc[i, 'DataType'] == 'binary':
            binary_feats_list.append(i)
        elif data_dictionary.loc[i, 'DataType'] == 'integer':
             num_feats_list.append(i)
        elif data_dictionary.loc[i, 'DataType'] == 'numeric':
             num_feats_list.append(i)

In [9]:
string_feats_list.remove('hospital_id')
string_feats_list.remove('encounter_id')
string_feats_list.remove('patient_id')
string_feats_list.remove('icu_id')

In [10]:
dfs = [data, predict_data]
for i in dfs:
    i = ensure_data_types_V2(i,num_feats_list, binary_feats_list, string_feats_list)

In [11]:
train_eng = all_data[all_data.encounter_id.isin(data.encounter_id)]
predict_eng = all_data[all_data.encounter_id.isin(predict_data.encounter_id)]

## Split back to Train & Predict Data Sets

In [12]:
X = train_eng.drop(columns = ['hospital_death'])
y = train_eng[['hospital_death']]

y['hospital_death'] = y['hospital_death'].astype('Float64')

X.drop(columns=['hospital_id','encounter_id','patient_id','icu_id'],inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


# Pipelines (Data Cleaning, Feature Engineering)

In [17]:
search_space = [{"clf__n_neighbors": [1,2,5,10,15,200,500]}]

winning_encoder = [ce.target_encoder.TargetEncoder]

winning_classifier = [KNeighborsClassifier(n_neighbors=5, n_jobs=-1)]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)


for classifier in winning_classifier:
    binary_features = binary_feats_list
    binary_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value=-1))])

    numeric_features = num_feats_list
    numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler',StandardScaler())])


    for encoder in winning_encoder:
        print("Encoder: ")
        print(encoder)
        categorical_features = string_feats_list
        categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('woe', encoder())])

        preprocessor = ColumnTransformer(
            transformers=[
                ('num', numeric_transformer, numeric_features),
                ('cat', categorical_transformer, categorical_features),
                ('binary', binary_transformer, binary_features)])

        pipe = Pipeline(steps=[('preprocessor', preprocessor),
                              ('clf', classifier)])
        print("Classifier: ")
        print(classifier)
        print("<=================================================================================================>")
        grid_classifier = GridSearchCV(pipe, search_space, cv=3, verbose=0).fit(X,y.values.ravel())        
        

Encoder: 
<class 'category_encoders.target_encoder.TargetEncoder'>
Classifier: 
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=-1, n_neighbors=5, p=2,
                     weights='uniform')


KeyError: 'knn__n_neighbors'

In [19]:
k_best = grid_classifier.best_estimator_.get_params()["clf__n_neighbors"]
print("k_best: ")
print(k_best)

k_best: 
5
