In [None]:
from random import sample
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.metrics import make_scorer
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import Imputer
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, recall_score
from sklearn.model_selection import cross_val_score
#from lightgbm import LGBMClassifier
from sklearn.ensemble import GradientBoostingClassifier
#from imblearn.over_sampling import SMOTE
pd.set_option('display.max_columns', 200)
%matplotlib inline

In [None]:
def load_file(file):
    """
    Reads the CSV located at `file` with pandas and hands back the
    resulting DataFrame. `file` is passed to `pd.read_csv` unchanged.
    """
    frame = pd.read_csv(file)
    return frame

def downsample_data(df, target_col='hospital_death', seed=42):
    """
    Downsamples the majority class to counter class imbalance.

    Rows where `target_col` equals 0 are treated as the majority class and
    rows where it equals 1 as the minority class. A random subset of the
    majority rows, as large as the minority class, is kept and combined with
    all minority rows; the result is shuffled and re-indexed from 0.

    The class sizes are measured from `df` instead of being hard-coded. With
    83798 majority and 7915 minority rows (the sizes this function used to
    assume) and the default seed, the same majority subset is drawn as before.

    Parameters
    ----------
    df : DataFrame containing `target_col`.
    target_col : name of the binary target column.
    seed : seed for both the subset draw and the final shuffle, so repeated
        calls on the same frame return the same result.

    Returns
    -------
    A new, shuffled DataFrame; `df` itself is not modified.
    """
    majority_class_data = df[df[target_col] == 0].reset_index(drop=True)
    minority_class_data = df[df[target_col] == 1]

    # Never ask for more majority rows than exist.
    n_keep = min(len(majority_class_data), len(minority_class_data))

    # The global `random` generator is seeded here, as it always was.
    random.seed(seed)
    subset = sample(range(len(majority_class_data)), n_keep)
    majority_class_data = majority_class_data.loc[subset].reset_index(drop=True)

    # pd.concat replaces DataFrame.append, which newer pandas no longer has.
    downsampled_data = pd.concat([majority_class_data, minority_class_data])

    # Shuffling the dataset so the distribution of 0 and 1 is random throughout the dataset
    downsampled_data = downsampled_data.sample(frac=1, random_state=seed).reset_index(drop=True)
    return downsampled_data

def feature_target_split(df, target_variable):
    """
    Separates `df` into its feature columns and its target column.

    Returns a tuple `(feature_df, target)`: `feature_df` is `df` without
    `target_variable`, and `target` is `df[target_variable]`.
    """
    target = df[target_variable]
    feature_df = df.drop(columns=target_variable)
    return feature_df, target
    
def drop_columns(df, var_list):
    """
    Returns `df` without the column(s) named in `var_list`.

    `var_list` may be a single column name or a list of names; the input
    dataframe is left untouched.
    """
    return df.drop(columns=var_list)

def drop_rows(df, idx):
    """
    Removes the rows found at the integer positions `idx` and renumbers the
    remaining rows from 0.

    The positions are first translated to index labels (`df.index[idx]`) and
    those labels are dropped, so this works for a DataFrame as well as a
    Series.
    """
    labels_to_remove = df.index[idx]
    remaining = df.drop(labels_to_remove)
    return remaining.reset_index(drop=True)

def BMI_to_cat(x):
    """
    Transforms BMI into a categorical variable.

    Categories (half-open intervals, so no value falls between two bins):
        [0, 18.5)   -> 'Underweight'
        [18.5, 25)  -> 'Normal Weight'
        [25, 30)    -> 'Overweight'
        [30, inf]   -> 'Obese'

    Anything that is not a usable BMI -- a value that cannot be converted to
    float, `None`, NaN, or a negative number -- is mapped to 'Other' rather
    than silently being counted as 'Obese'.
    """
    try:
        x = float(x)
    except (TypeError, ValueError):
        # float(None) raises TypeError, float('abc') raises ValueError.
        return 'Other'

    if 0 <= x < 18.5:
        return 'Underweight'
    if 18.5 <= x < 25:
        return 'Normal Weight'
    if 25 <= x < 30:
        return 'Overweight'
    if x >= 30:
        return 'Obese'
    # Only NaN (every comparison is False) and negative values reach here.
    return 'Other'
    
def PILD_to_cat(x):
    """
    Transforms pre_icu_los_days into a categorical variable.

    Each bin is closed on its upper edge:
        x <= 0        -> 'Range[- to 0]'
        0  < x <= 5   -> 'Range[1 to 5]'
        5  < x <= 10  -> 'Range[5 to 10]'
        10 < x <= 15  -> 'Range[11 to 15]'
        15 < x <= 20  -> 'Range[16 to 20]'
        20 < x <= 25  -> 'Range[21 to 25]'
        25 < x <= 30  -> 'Range[25 to 30]'
        x > 30        -> 'Range[> 30]'

    The labels are kept exactly as before because they become one-hot
    column names downstream. Values that cannot be converted to float,
    `None`, and NaN are mapped to 'Other' instead of 'Range[> 30]'.
    """
    try:
        x = float(x)
    except (TypeError, ValueError):
        # float(None) raises TypeError, float('abc') raises ValueError.
        return 'Other'

    upper_bounds = (
        (0, 'Range[- to 0]'),
        (5, 'Range[1 to 5]'),
        (10, 'Range[5 to 10]'),
        (15, 'Range[11 to 15]'),
        (20, 'Range[16 to 20]'),
        (25, 'Range[21 to 25]'),
        (30, 'Range[25 to 30]'),
    )
    # Bounds are ascending, so the first bound that x does not exceed wins.
    for upper, label in upper_bounds:
        if x <= upper:
            return label

    if x > 30:
        return 'Range[> 30]'
    # Only NaN reaches this point (all comparisons above were False).
    return 'Other'
    
def num_cat_col(df, file):
    """
    Splits the columns of the dataframe into categorical and numerical columns
    and returns them as 2 separate lists.
    Columns having binary values (0 or 1) are added to the list of categorical columns.

    Parameters
    ----------
    df : DataFrame whose columns are to be classified.
    file : path of the data-dictionary CSV; it must contain the columns
        'Data Type' and 'Variable Name'. Variables whose 'Data Type' is
        'binary' are treated as categorical.

    Returns
    -------
    (cat_col, num_col) : two lists of column names, each in `df.columns` order.
    """
    dictionary = load_file(file)
    is_binary = dictionary['Data Type'] == 'binary'
    # Built once as a set: the previous version rebuilt a list for every
    # column and relied on Series.iteritems(), which newer pandas removed.
    binary_names = set(dictionary.loc[is_binary, 'Variable Name'])

    cat_col = []
    num_col = []
    for column in df.columns:
        if df[column].dtypes == 'object' or column in binary_names:
            cat_col.append(column)
        else:
            num_col.append(column)
    return cat_col, num_col

def impute_cat_col(df):
    """
    Imputes null values in categorical variables.

    * ethnicity            -> 'Other/Unknown'
    * gender               -> random draw following the observed gender
                              frequencies, then encoded as F=0 / M=1
    * icu_admit_source     -> 'Unknown'
    * apache_3j_bodysystem -> 'Unknown'

    The work is done on a copy, so the caller's frame is not modified; use
    the returned frame. `np.random.seed(0)` is still applied so the gender
    draws are reproducible.
    """
    df = df.copy()

    # Imputing values for ethnicity
    df['ethnicity'] = df['ethnicity'].fillna('Other/Unknown')

    # Imputing values for gender and then encoding it as a numerical variable
    np.random.seed(0)
    gender_freq = df['gender'].value_counts(normalize=True)
    if not gender_freq.empty:
        # Take labels and probabilities from the same value_counts result so
        # each probability is paired with its own label. value_counts sorts
        # by frequency, so a fixed ['M', 'F'] list is only right when 'M'
        # happens to be the more common value.
        draws = np.random.choice(gender_freq.index.tolist(),
                                 p=gender_freq.tolist(),
                                 size=len(df))
        # fillna aligns on index labels, so the filler must carry df's index.
        df['gender'] = df['gender'].fillna(pd.Series(draws, index=df.index))
    df['gender'] = df['gender'].replace({'F': 0, 'M': 1})

    # Imputing missing values for icu_admit_source
    df['icu_admit_source'] = df['icu_admit_source'].fillna('Unknown')

    # Imputing missing values for apache_3j_bodysystem
    df['apache_3j_bodysystem'] = df['apache_3j_bodysystem'].fillna('Unknown')

    return df

def impute_num_col(df):
    """
    Imputes null values in numerical variables with the mean.

    Each numeric column's NaNs are replaced by that column's mean. Means are
    computed for numeric columns only (`numeric_only=True`), so a stray
    non-numeric column is left as it is instead of making `mean()` raise on
    newer pandas. Returns a new DataFrame.
    """
    column_means = df.mean(numeric_only=True)
    return df.fillna(column_means)

def one_hot_encode(df):
    """
    Returns `pd.get_dummies(df)`: the frame with its categorical columns
    expanded into indicator (dummy) columns.
    """
    encoded = pd.get_dummies(df)
    return encoded

def concat_num_cat(df1, df2):
    """
    Places `df1` and `df2` side by side (column-wise concatenation, rows
    matched on index labels) and returns the combined frame.
    """
    pieces = [df1, df2]
    return pd.concat(pieces, axis=1)

def ordinal_temp(x):
    """
    Buckets a temperature reading into an ordinal code:
    1 when x < 36, 3 when x > 38, otherwise 2 (this includes NaN, for which
    both comparisons are False).
    """
    if x < 36:
        return 1
    if x > 38:
        return 3
    return 2
    
def ordinal_heartrate(x):
    """
    Buckets a heart-rate reading into an ordinal code:
        x >= 100       -> 5
        90 < x < 100   -> 4
        70 <= x <= 90  -> 3
        60 <= x < 70   -> 2
        anything else  -> 1 (below 60, or NaN)
    """
    # Thresholds are tested from highest to lowest, so each check only needs
    # its lower bound.
    if x >= 100:
        return 5
    if x > 90:
        return 4
    if x >= 70:
        return 3
    if x >= 60:
        return 2
    return 1

def get_group_stats(df, col, target):
    """
    Groups `df` by `col` (one column name or a list of names) and summarises
    `target` within each group.

    Returns a DataFrame with the grouping column(s) followed by
    'group_mean', 'group_max', 'group_min' and 'group_median', one row per
    group.
    """
    per_group = df.groupby(col)[target]
    summary = pd.DataFrame({
        'group_mean': per_group.mean(),
        'group_max': per_group.max(),
        'group_min': per_group.min(),
        'group_median': per_group.median(),
    })
    # Turn the group keys from the index back into ordinary columns.
    return summary.reset_index()


def imputeMissingVal(X_train=None, X_test=None,  train_or_test='train'):
    """
    Mean-imputes missing values using statistics learned from `X_train`.

    Parameters
    ----------
    X_train : data the imputer is fitted on (always required).
    X_test : data to transform when `train_or_test` is not 'train'.
    train_or_test : 'train' returns the imputed `X_train`; any other value
        returns `X_test` imputed with the means learned from `X_train`.

    Returns
    -------
    The imputer's transformed output for the selected data set.
    """
    # `sklearn.preprocessing.Imputer` no longer exists in current
    # scikit-learn; `sklearn.impute.SimpleImputer` is its replacement.
    # Prefer the new class and fall back to the legacy one imported at the
    # top of the file when running on an old installation.
    try:
        from sklearn.impute import SimpleImputer as MeanImputer
    except ImportError:
        MeanImputer = Imputer

    imp = MeanImputer(strategy='mean')
    imp.fit(X_train)
    to_transform = X_train if train_or_test == 'train' else X_test
    return imp.transform(to_transform)
    
def scaleData(X_train=None, X_test=None, train_or_test="train"):
    """
    Standardises data with a `StandardScaler` fitted on `X_train`.

    With `train_or_test == 'train'` the scaled `X_train` is returned; for
    any other value `X_test` is scaled using the statistics learned from
    `X_train` and returned.
    """
    scaler = StandardScaler()
    if train_or_test != 'train':
        # Fit on the training data only, then apply to the other set.
        return scaler.fit(X_train).transform(X_test)
    return scaler.fit_transform(X_train)
    
def model_train_cv(model_def, X, y, cv=5, scoring='roc_auc'):
    """
    Fits `model_def` on all of (X, y) and also evaluates it with
    cross-validation.

    Returns `(model, score)`: `model` is whatever `model_def.fit(X, y)`
    returns, and `score` is the mean of the `cross_val_score` results for
    the given `cv` and `scoring`.
    """
    model = model_def.fit(X, y)
    fold_scores = cross_val_score(model, X, y, cv=cv, scoring=scoring)
    mean_score = fold_scores.mean()
    return model, mean_score

def model_predictProba(model, X_ToPredict):
    """
    Returns column 1 of `model.predict_proba(X_ToPredict)`, one value per row.
    """
    class_probabilities = model.predict_proba(X_ToPredict)
    return class_probabilities[:, 1]

    
# Columns that together form the grouping key for the group statistics.
cat_cols_group_stats = [
    'bmi',
    'ethnicity',
    'icu_admit_source',
    'icu_stay_type',
    'pre_icu_los_days',
    'icu_type',
    'apache_3j_bodysystem',
]

# Temperature columns and the ordinal columns derived from them
# (same order: temp_cols[i] feeds new_temp_cols[i]).
temp_cols = [
    'temp_apache',
    'd1_temp_max',
    'd1_temp_min',
    'h1_temp_max',
    'h1_temp_min',
]
new_temp_cols = [
    'temp_cat',
    'd1_temp_max_cat',
    'd1_temp_min_cat',
    'h1_temp_max_cat',
    'h1_temp_min_cat',
]

# Heart-rate columns and the ordinal columns derived from them; each derived
# name is the source name with a '_cat' suffix.
heart_cols = [
    'h1_heartrate_max',
    'h1_heartrate_min',
    'd1_heartrate_max',
    'd1_heartrate_min',
]
new_heart_cols = [source + '_cat' for source in heart_cols]

In [None]:
# Training pipeline: load -> downsample -> clean -> engineer features ->
# encode -> impute/scale -> fit a gradient-boosting model.
#
# Features (X), their categorical/numerical halves and the target (y) are
# carried in separate objects and kept row-aligned only by applying the same
# positional row drops (idx1, idx2) to each of them. Do not reorder these
# steps without re-checking that alignment.

# Define required files
data_file = 'training_v2.csv'
dictionary_file = 'WiDS Datathon 2020 Dictionary.csv'
test_file = 'unlabeled.csv'

print('*************************Training Data*************************')

print('\nLoading data...')
data = load_file(data_file)
print('Training data shape: {}'.format(data.shape))

print('\nDownsampling training data...')
new_data = downsample_data(data)
print('Downsampled data shape: {}'.format(new_data.shape))

print('\nSplitting training data into feature variables and target variable...')
train_X, train_y = feature_target_split(new_data, 'hospital_death')
print('Training feature data shape: {}'.format(train_X.shape))
print('Training target data shape: {}'.format(train_y.shape))

print('\nDealing with missing values - COLUMN WISE...')
# Columns to be dropped based on manual evaluation
feature_list = ['encounter_id', 'patient_id', 'hospital_id', 'icu_id', 'height', 'weight', 'readmission_status']
# Columns to be dropped that have greater than 75% of their data missing
columns = train_X.columns[train_X.isnull().mean() > 0.75].tolist()
for column in columns:
    feature_list.append(column)
# feature_list is reused later to drop the same columns from the test set.
print('Dropping {} columns'.format(len(feature_list)))
column_majority_train_X = drop_columns(train_X, feature_list)
print('Training feature data shape: {}'.format(column_majority_train_X.shape))

print('\nDealing with missing values - ROW WISE...')
# Rows to be dropped that have greater than 50% of their data missing
# idx1 holds index labels; drop_rows treats its argument as positions, which
# is equivalent here because downsample_data returns a 0..n-1 index.
idx1 = column_majority_train_X.index[column_majority_train_X.isnull().mean(axis=1) > 0.5].tolist()
print('Dropping {} rows'.format(len(idx1)))
clean_train_X = drop_rows(column_majority_train_X, idx1)
print('Training feature data shape: {}'.format(clean_train_X.shape))
# Same rows removed from the target so X and y stay aligned.
clean_train_y = drop_rows(train_y, idx1)
print('Training target data shape: {}'.format(clean_train_y.shape))

print('\nTransforming features...')
# bmi and pre_icu_los_days become string categories from here on.
clean_train_X['bmi'] = clean_train_X['bmi'].apply(BMI_to_cat)
clean_train_X['pre_icu_los_days'] = clean_train_X['pre_icu_los_days'].apply(PILD_to_cat)

print('\nSplitting categorical variables and numerical variables...')
cat_col, num_col = num_cat_col(clean_train_X, dictionary_file)
print('Categorical data shape: {}'.format(clean_train_X[cat_col].shape))
print('Numerical data shape: {}'.format(clean_train_X[num_col].shape))

print('\nCleaning up categorical variables...')
# Dropping columns hospital_admit_source, apache_2_bodysystem, gcs_unable_apache
# cat_list is reused later for the test set.
cat_list = ['hospital_admit_source', 'apache_2_bodysystem', 'gcs_unable_apache']
print('Dropping {} columns'.format(len(cat_list)))
X_cat = drop_columns(clean_train_X[cat_col], cat_list)
# Imputing missing values based on evaluation of data
X_cat_imputed = impute_cat_col(X_cat)
# Obtaining the index values of rows where arf_apache is null and then dropping them
# idx2 is applied below to the categorical frame, the numerical frame and the
# target in turn, so all three lose the same rows.
idx2 = X_cat_imputed.index[X_cat_imputed['arf_apache'].isnull()].tolist()
print('Dropping {} rows'.format(len(idx2)))
X_cat_final = drop_rows(X_cat_imputed, idx2)
print('Categorical data shape: {}'.format(X_cat_final.shape))

print('\nCleaning up numerical variables...')
# Dropping rows where arf_apache is null (idx2, computed in the categorical clean-up above)
print('Dropping {} rows'.format(len(idx2)))
X_num = drop_rows(clean_train_X[num_col], idx2)
# Numerical columns with more than 50% missing values are dropped;
# num_column_list is reused later for the test set.
num_column_list = X_num.columns[X_num.isnull().mean() > 0.50].tolist()
print('Dropping {} columns'.format(len(num_column_list)))
X_num = drop_columns(X_num, num_column_list)
X_num = impute_num_col(X_num)
print('Numerical data shape: {}'.format(X_num.shape))

print('\nCleaning up the target variable...')
# Dropping rows where arf_apache is null (idx2, computed in the categorical clean-up above)
print('Dropping {} rows'.format(len(idx2)))
final_train_y = drop_rows(clean_train_y, idx2)
print('Training target shape: {}'.format(final_train_y.shape))

print('\nConcatenating the numerical and categorical variables...')
new_dataset = concat_num_cat(X_num, X_cat_final)
print('New training feature dataset shape: {}'.format(new_dataset.shape))

print('\nTransforming categorical features into ordinal features')
print('\nTemperature features first')
# temp_cols[i] is bucketed into the new column new_temp_cols[i].
for i in range(len(temp_cols)):
    new_dataset[new_temp_cols[i]] = new_dataset[temp_cols[i]].apply(ordinal_temp)

print('\nHeart rate features')
# heart_cols[i] is bucketed into the new column new_heart_cols[i].
for i in range(len(heart_cols)):
    new_dataset[new_heart_cols[i]] = new_dataset[heart_cols[i]].apply(ordinal_heartrate)

# NOTE(review): the message below repeats the earlier one, but this step
# actually puts the target column in front of the feature frame so that the
# group statistics can be computed.
print('\nConcatenating the numerical and categorical variables...')
new_dataset = concat_num_cat(final_train_y, new_dataset)

print('\nCreating a dataframe with group description statisitics...')
# group_stats is reused later for the test set.
group_stats = get_group_stats(new_dataset, cat_cols_group_stats, 'hospital_death')

print('\nMerging the group statistics with the training set...')
# NOTE(review): the statistics are derived from hospital_death and merged back
# onto the very rows they were computed from, so each training row sees a
# summary that includes its own label. This looks like target leakage and may
# inflate the cross-validation score below -- confirm whether it is intended.
tmp_df = pd.merge(new_dataset, group_stats, on=cat_cols_group_stats, how='left')

print('\nRemoving the target column from the training set...')
tmp = drop_columns(tmp_df, 'hospital_death')

print('\nOne Hot Encoding the categorical variables...')
X_final = one_hot_encode(tmp)
# NOTE(review): X_final is the complete encoded feature matrix, not only the
# categorical part, although the message below says so (and misspells it).
print('Categorcal data shape: {}'.format(X_final.shape))

print('\nImputing missing value for the training set...')
X_final_imp = imputeMissingVal(X_final, None, 'train')

print('\nStandardizing the training set...')
X_final_imp_sc = scaleData(X_final_imp, None, 'train')

print('\nTraining the model with the training set and generating a metric...')
# NOTE(review): max_features='auto' is rejected by recent scikit-learn
# releases -- check against the installed version.
gbm_model, score = model_train_cv(GradientBoostingClassifier(random_state=42, n_estimators=200,
                                   max_features='auto' , max_depth=5, subsample=0.8),
                 X_final_imp_sc, final_train_y)

print("Cross validation Score:", score)
# These are predictions on the same rows the model was fitted on.
y_pred_train = model_predictProba(gbm_model, X_final_imp_sc)
print(y_pred_train)

# Test-set pipeline: repeats the training transformations on the unlabeled
# file, reusing everything learned from the training data (feature_list,
# cat_list, num_column_list, group_stats, X_final, X_final_imp, gbm_model).
# No rows are dropped here, so every test row receives a prediction.
print('\n\n*************************Test Data*************************')

print('\nLoading data...')
test_data = load_file(test_file)
print('Test data shape: {}'.format(test_data.shape))

print('\nSplitting test data into feature variables and target variable...')
test_X, test_y = feature_target_split(test_data, 'hospital_death')
print('Test feature data shape: {}'.format(test_X.shape))
print('Test target data shape: {}'.format(test_y.shape))

print('\nDealing with missing values - COLUMN WISE...')
# Drop exactly the columns that were dropped from the training set.
print('Dropping {} columns'.format(len(feature_list)))
clean_test_X = drop_columns(test_X, feature_list)
print('Test feature data shape: {}'.format(clean_test_X.shape))

# Transforming features (same binning as for the training set)
clean_test_X['bmi'] = clean_test_X['bmi'].apply(BMI_to_cat)
clean_test_X['pre_icu_los_days'] = clean_test_X['pre_icu_los_days'].apply(PILD_to_cat)

print('\nSplitting categorical variables and numerical variables...')
test_cat_col, test_num_col = num_cat_col(clean_test_X, dictionary_file)
print('Categorical data shape: {}'.format(clean_test_X[test_cat_col].shape))
print('Numerical data shape: {}'.format(clean_test_X[test_num_col].shape))

print('\nCleaning up categorical variables...')
# Dropping columns hospital_admit_source, apache_2_bodysystem, gcs_unable_apache
print('Dropping {} columns'.format(len(cat_list)))
X_test_cat = drop_columns(clean_test_X[test_cat_col], cat_list)
# Imputing missing values based on evaluation of data
X_test_cat_imputed = impute_cat_col(X_test_cat)

# Imputing values instead of dropping rows: every test row needs a prediction,
# so missing binary flags are filled with random draws that follow the
# observed frequency of each value.
np.random.seed(0)
my_list = ['arf_apache', 'intubated_apache', 'ventilated_apache', 'aids', 'cirrhosis',
            'diabetes_mellitus', 'hepatic_failure', 'immunosuppression', 'leukemia', 'lymphoma',
            'solid_tumor_with_metastasis']
for my_column in my_list:
    # One value_counts call gives labels and probabilities in the same order.
    value_freq = X_test_cat_imputed[my_column].value_counts(normalize=True)
    keys_list = value_freq.index.tolist()
    prob = value_freq.tolist()
    # fillna aligns on index labels, so the filler carries the frame's index.
    fill_values = pd.Series(
        np.random.choice(keys_list, p=prob, size=len(X_test_cat_imputed)),
        index=X_test_cat_imputed.index)
    X_test_cat_imputed.loc[:, my_column] = X_test_cat_imputed[my_column].fillna(fill_values)

print('Categorical data shape: {}'.format(X_test_cat_imputed.shape))

print('\nCleaning up numerical variables...')
# Drop the numerical columns that were dropped from the training set.
print('Dropping {} columns'.format(len(num_column_list)))
X_test_num = drop_columns(clean_test_X[test_num_col], num_column_list)
X_test_num = impute_num_col(X_test_num)
print('Numerical data shape: {}'.format(X_test_num.shape))


print('\nConcatenating the numerical and categorical variables...')
test_dataset = concat_num_cat(X_test_num, X_test_cat_imputed)
print('New test feature dataset shape: {}'.format(test_dataset.shape))

print('\nTransforming categorical features into ordinal features')
print('\nTemperature features first')
for source_col, ordinal_col in zip(temp_cols, new_temp_cols):
    test_dataset[ordinal_col] = test_dataset[source_col].apply(ordinal_temp)

print('\nHeart rate features')
for source_col, ordinal_col in zip(heart_cols, new_heart_cols):
    test_dataset[ordinal_col] = test_dataset[source_col].apply(ordinal_heartrate)

print('\nMerging the group statistics from the training set with the testing set...')
# Left merge keeps every test row; groups unseen in training get NaN
# statistics, which are mean-imputed further down.
test_df = pd.merge(test_dataset, group_stats, on=cat_cols_group_stats, how='left')

print('\nOne Hot Encoding the categorical variables...')
X_test_final = one_hot_encode(test_df)
# The imputer, scaler and model were all fitted on X_final, so the test matrix
# must have exactly the same columns in the same order. One-hot encoding the
# test set on its own can yield a different set of dummy columns (a category
# present in only one of the two files). Align to the training columns:
# dummies missing from the test set are added as all-zero columns, and
# columns the model never saw are discarded.
X_test_final = X_test_final.reindex(columns=X_final.columns, fill_value=0)
print('Test data shape: {}'.format(X_test_final.shape))

print('\nImputing missing value from the training set ...')
X_test_final = imputeMissingVal(X_final, X_test_final, 'test')

print('\nStandardizing from the training set...')
X_test_final = scaleData(X_final_imp, X_test_final, 'test')

print('\nGenerating the probability predictions for the test set...')
y_pred = model_predictProba(gbm_model, X_test_final)
print('y_pred:', y_pred)