In [1]:
import pandas as pd   
from matplotlib import pyplot
import matplotlib.pyplot as plt
from numpy import mean

from sklearn.impute import SimpleImputer, KNNImputer
import numpy as np

from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import LabelEncoder

from sklearn.model_selection import learning_curve, train_test_split, validation_curve
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.preprocessing import MinMaxScaler

from sklearn.pipeline import Pipeline


import warnings 
warnings.filterwarnings('ignore')

In [2]:
# Read the raw training table from disk into df1.
df1 = pd.read_csv(filepath_or_buffer="dataset/train.csv")

In [3]:
# Reload the training table so this cell can be re-run on its own.
df1 = pd.read_csv("dataset/train.csv")

# Columns the notebook treats as irrelevant; removed in a single call.
unused_columns = [
    'RecordID',
    'hospital_id',
    'icu_id',
    'icu_stay_type',
    'ethnicity',
    'gender',
    'apache_3j_bodysystem',
    'apache_2_bodysystem',
    'icu_type',
    'icu_admit_source',
]
df1 = df1.drop(columns=unused_columns)

# label_encoder = LabelEncoder()

# df1['apache_3j_bodysystem'] = label_encoder.fit_transform(df1['apache_3j_bodysystem'])
# df1['apache_2_bodysystem'] = label_encoder.fit_transform(df1['apache_2_bodysystem'])
# df1['icu_type'] = label_encoder.fit_transform(df1['icu_type'])
# df1['icu_admit_source'] = label_encoder.fit_transform(df1['icu_admit_source'])

In [4]:
# Expand any remaining non-numeric columns into indicator columns.
df_onehot = pd.get_dummies(data=df1)

In [5]:
# Simple imputer: every NaN is replaced by the mean of its column.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
imputed_data = imputer.fit_transform(df_onehot.values)

# The last column is used as the target; all preceding columns are features.
X, y = imputed_data[:, :-1], imputed_data[:, -1]

In [None]:
# KNN imputer: fill each NaN from the 2 nearest rows, equally weighted.
# Running this cell overwrites the X / y produced by the mean-imputation cell.
knn_imputer = KNNImputer(n_neighbors=2, weights='uniform')
imputed_data = knn_imputer.fit_transform(df_onehot.values)

# The last column is used as the target; all preceding columns are features.
X, y = imputed_data[:, :-1], imputed_data[:, -1]

In [None]:
# Show how many rows fall into each class of the target.
# y is a float array after imputation, so cast to int before counting.
target_variable_counts = np.bincount(y.astype(int))
print("Value counts of the target variable 'hospital_death':")
for value in range(len(target_variable_counts)):
    count = target_variable_counts[value]
    print(f"Value: {value}, Count: {count}")

In [6]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split repeatable.
trainX, testX, trainy, testy = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=2,
)

LEARNING CURVE FOR TRAINING AND VALIDATION ACCURACY

In [None]:
# Learning curve of a depth-3 entropy decision tree, scored with 10-fold CV
# on ten training-set sizes from 10% to 100% of trainX.
pipe_lr = Pipeline([
    ('classifier', DecisionTreeClassifier(criterion='entropy', max_depth=3, random_state=0)),
])

train_sizes, train_scores, test_scores = learning_curve(
    estimator=pipe_lr,
    X=trainX,
    y=trainy,
    train_sizes=np.linspace(0.1, 1.0, 10),
    cv=10,
    n_jobs=1,
)

# Mean and spread across the CV folds for each training-set size.
train_mean, train_std = np.mean(train_scores, axis=1), np.std(train_scores, axis=1)
test_mean, test_std = np.mean(test_scores, axis=1), np.std(test_scores, axis=1)

# One entry per curve: (fold mean, fold std, line styling).
curve_specs = [
    (train_mean, train_std,
     dict(color='blue', marker='o', markersize=5, label='Training accuracy')),
    (test_mean, test_std,
     dict(color='green', linestyle='--', marker='s', markersize=5,
          label='Validation accuracy')),
]
for curve_mean, curve_std, line_kwargs in curve_specs:
    plt.plot(train_sizes, curve_mean, **line_kwargs)
    # Shaded band of +/- one standard deviation around the mean.
    plt.fill_between(train_sizes,
                     curve_mean + curve_std,
                     curve_mean - curve_std,
                     alpha=0.15, color=line_kwargs['color'])

plt.grid()
plt.xlabel('Number of training examples')
plt.ylabel('Accuracy')
plt.legend(loc='upper right')
plt.ylim([0.8, 1.03])
plt.tight_layout()

plt.show()

Bagging and Boosting

In [7]:
import time
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, BaggingClassifier
from catboost import CatBoostClassifier
import xgboost as xgb
import lightgbm as lgb

In [8]:
def fit_model(model, model_name):
    """Fit ``model`` on the training split and report its test ROC AUC.

    Uses the notebook-level ``trainX`` / ``trainy`` / ``testX`` / ``testy``
    arrays, so it reflects whichever train/test split was executed last.

    Parameters
    ----------
    model : estimator exposing ``fit`` and ``predict_proba``
        Fitted in place.
    model_name : str
        Label printed next to the score.

    Returns
    -------
    float
        ROC AUC of the second ``predict_proba`` column on the test split.
        It is returned as well as printed so callers can collect and compare
        scores.
    """
    model.fit(trainX, trainy)
    # Column 1 of predict_proba is the score for the second class label.
    md_probs = model.predict_proba(testX)[:, 1]
    md_auc = roc_auc_score(testy, md_probs)
    print(model_name, " : ", md_auc)
    return md_auc

In [9]:
# Shared ensemble settings: number of estimators, and the tree depth
# passed as max_depth to the boosting models.
num_of_models, depth_level = 300, 3

In [None]:
# Random forest: time construction, fitting and scoring together.
start_time = time.time()
# Seeded so the reported AUC is reproducible across kernel restarts.
rf = RandomForestClassifier(max_depth=10, n_estimators=num_of_models, random_state=0)
fit_model(rf, "Random Forest")
end_time = time.time()
total_time = end_time - start_time
print("Total time RF: ", total_time)

# Gradient boosting: same measurement, using the shared depth setting.
start_time = time.time()
gb = GradientBoostingClassifier(max_depth=depth_level, n_estimators=num_of_models, random_state=0)
fit_model(gb, "Gradient Boosting")
end_time = time.time()
total_time = end_time - start_time
print("Total time GB: ", total_time)

Random Forest with n_jobs=-1

In [None]:
# Same forest as the single-process run, but trained on all available cores
# (n_jobs=-1) so the two wall-clock times can be compared.
start_time = time.time()
# Seeded so the reported AUC is reproducible across kernel restarts.
rf = RandomForestClassifier(max_depth=10, n_estimators=num_of_models, n_jobs=-1, random_state=0)
fit_model(rf, "Random Forest")
end_time = time.time()
total_time = end_time - start_time
print("Total time RF: ", total_time)

ADABOOST CLASSIFIER

In [None]:
# AdaBoost with the shared number of estimators; time fit + scoring together.
start_time = time.time()
# Seeded so the reported AUC is reproducible across kernel restarts.
ab = AdaBoostClassifier(n_estimators=num_of_models, random_state=0)
fit_model(ab, "Adaptive Boosting")
end_time = time.time()
total_time = end_time - start_time
print("Total time AB: ", total_time)

XGBOOST

In [None]:
# XGBoost with the shared depth / estimator settings.
xgb_model = xgb.XGBClassifier(
    max_depth=depth_level,
    n_estimators=num_of_models,
    learning_rate=0.1,
)

# The timer starts after construction, so it covers fitting and scoring only.
start_time = time.time()
xgb_model.fit(trainX, trainy)
# Keep only the second predict_proba column as the score for ROC AUC.
md_probs = xgb_model.predict_proba(testX)[:, 1]
md_auc = roc_auc_score(testy, md_probs)
print("XG Boost", " : ", md_auc)
end_time = time.time()

total_time = end_time - start_time
print("Total time XGB: ", total_time)

LIGHTGBM

In [None]:
# LightGBM with the shared depth / estimator settings.
lgb_model = lgb.LGBMClassifier(
    max_depth=depth_level,
    n_estimators=num_of_models,
    learning_rate=0.1,
    force_col_wise='true',
)

# The timer starts after construction, so it covers fitting and scoring only.
start_time = time.time()
lgb_model.fit(trainX, trainy)
# Keep only the second predict_proba column as the score for ROC AUC.
md_probs = lgb_model.predict_proba(testX)[:, 1]
md_auc = roc_auc_score(testy, md_probs)
print("LG Boost", " : ", md_auc)
end_time = time.time()

total_time = end_time - start_time
print("Total time LGB: ", total_time)

TEST

In [None]:
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from lightgbm import LGBMClassifier

# Hyperparameter grid for LightGBM.
# The grid is exhaustive: 11,664 combinations x 5 folds = 58,320 fits.
param_grid = {
    'learning_rate': [0.1, 0.01],
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0],
    'reg_alpha': [0, 0.1, 1.0],
    'reg_lambda': [0, 0.1, 1.0],
    'boosting_type': ['gbdt', 'dart'],  # Specific to LightGBM
    'num_leaves': [31, 63, 127],  # Specific to LightGBM
    'min_child_samples': [1, 5, 10]  # Specific to LightGBM
}

# Base LightGBM classifier whose parameters are searched.
# NOTE(review): LightGBM documents the GPU selector as `gpu_device_id`;
# confirm that `device_id` is actually honoured.
lgbm_model = LGBMClassifier(device='gpu', device_id=0)

# Grid search with 5-fold cross-validation.
# Candidates are ranked by ROC AUC, the metric used to evaluate every other
# model in this notebook, instead of accuracy.
# NOTE(review): n_jobs=-1 runs many fits at once against one GPU; confirm
# this does not exhaust device memory.
grid_search = GridSearchCV(
  estimator=lgbm_model,
  param_grid=param_grid,
  scoring='roc_auc',
  cv=5,
  n_jobs=-1
)

# Fit the grid search to the training data
grid_search.fit(trainX, trainy)

# Report the best hyperparameters
best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)

# With the default refit=True, GridSearchCV has already retrained the best
# configuration on all of trainX. Reusing it keeps the same device settings
# as the search and avoids a second, redundant fit.
final_model = grid_search.best_estimator_

In [None]:
# Score the test file with the LightGBM model, using mean imputation.

df_test = pd.read_csv("dataset/test.csv")

# Take the identifiers from the file itself (before the column is dropped)
# rather than assuming they are exactly 50001..80000.
record_ids = df_test['RecordID'].to_numpy()

df_test.drop(['RecordID', 'hospital_id', 'icu_id', 'icu_stay_type', 'ethnicity', 'gender', 'apache_3j_bodysystem', 'apache_2_bodysystem', 'icu_type', 'icu_admit_source'], axis=1, inplace=True)

# Training features are every df_onehot column except the last one (the
# target). Align the test columns to that exact set and order; an indicator
# column absent from the test file becomes all zeros.
feature_columns = df_onehot.columns[:-1]
df_test_onehot = pd.get_dummies(df_test).reindex(columns=feature_columns, fill_value=0)

# Fill test NaNs with the TRAINING column means, so test rows receive the
# same fill values the model saw during training. Re-fitting the imputer on
# the test file would use different means.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
imputer.fit(df_onehot.iloc[:, :-1])
X_test = imputer.transform(df_test_onehot)

# Second predict_proba column is written out as the predicted probability.
predictions = lgb_model.predict_proba(X_test)[:, 1]

output_df = pd.DataFrame({'RecordID': record_ids, 'hospital_death': predictions})
output_df.to_csv('predictions.csv', index=False)

OLD STUFF

In [None]:
# Depth-5 tree with at least 10 samples per leaf, fitted on the current split.
dt_5_2 = DecisionTreeClassifier(max_depth=5, min_samples_leaf=10)
dt_5_2.fit(trainX, trainy)

# Second predict_proba column for each test row; displayed as the cell output.
test_probabilities = dt_5_2.predict_proba(testX)
y_probs = test_probabilities[:, 1]
y_probs

In [None]:
# ROC AUC of the probabilities computed in the previous cell; shown as output.
md_auc = roc_auc_score(y_true=testy, y_score=y_probs)
md_auc

In [None]:
def fit_model(model, model_name):
    """Fit ``model``, print its test ROC AUC and add its ROC curve to the plot.

    Reads the notebook-level ``trainX`` / ``trainy`` / ``testX`` / ``testy``
    arrays. The curve is drawn on the current pyplot figure with
    ``model_name`` as its legend label, so several calls overlay their
    curves until the figure is shown. Returns nothing.
    """
    model.fit(trainX, trainy)

    # Second predict_proba column is the score used for AUC and the curve.
    class_probabilities = model.predict_proba(testX)
    positive_scores = class_probabilities[:, 1]

    auc_value = roc_auc_score(testy, positive_scores)
    print(model_name, " : ", auc_value)

    false_positive_rate, true_positive_rate, _thresholds = roc_curve(testy, positive_scores)
    pyplot.plot(false_positive_rate, true_positive_rate, marker='.', label=model_name)

DT, GNB AND KNN (no longer used)

In [None]:
# Re-split with seed 42. This replaces the notebook-level trainX / testX /
# trainy / testy created earlier with seed 2.
trainX, testX, trainy, testy = train_test_split(X, y, test_size=0.3, random_state=42)

# Decision trees over a small grid of depth / min_samples_split settings.
dt_5_2 = DecisionTreeClassifier(max_depth=5)
dt_7_2 = DecisionTreeClassifier(max_depth=7)
dt_5_5 = DecisionTreeClassifier(max_depth=5, min_samples_split=5)
dt_7_5 = DecisionTreeClassifier(max_depth=7, min_samples_split=5)
dt_3_2 = DecisionTreeClassifier(max_depth=3, min_samples_split=2)
dt_3_5 = DecisionTreeClassifier(max_depth=3, min_samples_split=5)
dt_5_4 = DecisionTreeClassifier(max_depth=5, min_samples_split=4)

# Baselines: Gaussian naive Bayes, raw k-NN, and k-NN on min-max scaled inputs.
gnb = GaussianNB()
kn = KNeighborsClassifier(n_neighbors=10)
pipe_kn = Pipeline([("scaler", MinMaxScaler()), ("knr", KNeighborsClassifier(n_neighbors=10))])

# Fit and plot each model in a fixed order; the label is used in the printed
# score and in the plot legend.
labelled_models = [
    (dt_5_2, "Depth 5 Split 2"),
    (dt_7_2, "Depth 7 Split 2"),
    (dt_5_5, "Depth 5 Split 5"),
    (dt_7_5, "Depth 7 Split 5"),
    (dt_3_2, "Depth 3 Split 2"),
    (dt_3_5, "Depth 3 Split 5"),
    (dt_5_4, "Depth 5 Split 4"),
    (gnb, "Naive Bayes"),
    (kn, "k-NN"),
    (pipe_kn, "Scaled k-NN"),
]
for candidate, label in labelled_models:
    fit_model(candidate, label)

# Finish the shared ROC figure: axis titles, legend, then render.
pyplot.xlabel('False Positive Rate')
pyplot.ylabel('True Positive Rate')
pyplot.legend()
pyplot.show()

In [None]:
# Score the test file with the dt_5_2 decision tree, using mean imputation.

df_test = pd.read_csv("dataset/test.csv")

# Take the identifiers from the file itself (before the column is dropped)
# rather than assuming they are exactly 50001..80000.
record_ids = df_test['RecordID'].to_numpy()

df_test.drop(['RecordID', 'hospital_id', 'icu_id', 'icu_stay_type', 'ethnicity', 'gender', 'apache_3j_bodysystem', 'apache_2_bodysystem', 'icu_type', 'icu_admit_source'], axis=1, inplace=True)

# Training features are every df_onehot column except the last one (the
# target). Align the test columns to that exact set and order; an indicator
# column absent from the test file becomes all zeros.
feature_columns = df_onehot.columns[:-1]
df_test_onehot = pd.get_dummies(df_test).reindex(columns=feature_columns, fill_value=0)

# Fill test NaNs with the TRAINING column means, so test rows receive the
# same fill values the model saw during training. Re-fitting the imputer on
# the test file would use different means.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
imputer.fit(df_onehot.iloc[:, :-1])
X_test = imputer.transform(df_test_onehot)

# Second predict_proba column is written out as the predicted probability.
predictions = dt_5_2.predict_proba(X_test)[:, 1]

output_df = pd.DataFrame({'RecordID': record_ids, 'hospital_death': predictions})
output_df.to_csv('predictions.csv', index=False)

In [None]:
# Score the test file with the dt_7_5 decision tree, using KNN imputation.

df_test = pd.read_csv("dataset/test.csv")

# Take the identifiers from the file itself (before the column is dropped)
# rather than assuming they are exactly 50001..80000.
record_ids = df_test['RecordID'].to_numpy()

# Drop the same ten columns the training cell drops. Label-encoding the four
# categorical columns here would give the model features it was never
# trained on, so the input width would not match.
df_test.drop(['RecordID', 'hospital_id', 'icu_id', 'icu_stay_type', 'ethnicity', 'gender', 'apache_3j_bodysystem', 'apache_2_bodysystem', 'icu_type', 'icu_admit_source'], axis=1, inplace=True)

# Training features are every df_onehot column except the last one (the
# target). Align the test columns to that exact set and order; an indicator
# column absent from the test file becomes all zeros.
feature_columns = df_onehot.columns[:-1]
df_test_onehot = pd.get_dummies(df_test).reindex(columns=feature_columns, fill_value=0)

# Fit the neighbour search on the TRAINING features and only transform the
# test rows, so missing test values are filled from training neighbours.
# NOTE(review): the training-side KNN cell imputes with the target column
# included, so its neighbours can differ slightly from the ones found here.
imputer = KNNImputer(n_neighbors=2, weights='uniform')
imputer.fit(df_onehot.iloc[:, :-1])
X_test = imputer.transform(df_test_onehot)

# Second predict_proba column is written out as the predicted probability.
predictions = dt_7_5.predict_proba(X_test)[:, 1]

output_df = pd.DataFrame({'RecordID': record_ids, 'hospital_death': predictions})
output_df.to_csv('predictions.csv', index=False)

In [None]:
# Load the test dataset
df_test = pd.read_csv("dataset/test.csv")

# Keep the real identifiers, indexed like df_test, so they can be matched to
# whichever rows survive the NaN filter below.
record_id_column = df_test['RecordID']

# Drop the same ten columns that the training cell removes
columns_to_drop = ['RecordID', 'hospital_id', 'icu_id', 'icu_stay_type', 'ethnicity', 'gender',
                   'apache_3j_bodysystem', 'apache_2_bodysystem', 'icu_type', 'icu_admit_source']
df_test_cleaned = df_test.drop(columns_to_drop, axis=1)

# One-hot encode, then force the training feature set and order.
# X is a NumPy array and has no .columns; the feature names are every
# df_onehot column except the last one (the target).
feature_columns = df_onehot.columns[:-1]
df_test_onehot = pd.get_dummies(df_test_cleaned).reindex(columns=feature_columns, fill_value=0)

# Remove rows that still have a missing feature.
# These rows receive no prediction in the output file.
df_test_naremoved = df_test_onehot.dropna()

# IDs of the surviving rows only. Numbering sequentially from 50,001 would
# attach predictions to the wrong records once any row has been dropped.
record_ids = record_id_column.loc[df_test_naremoved.index].to_numpy()

# Feature matrix as a plain float array, matching how the model was trained
X_test = df_test_naremoved.to_numpy(dtype=float)

# Use the trained model to make predictions on the test dataset
predictions = dt_5_2.predict_proba(X_test)[:, 1]

# Create a DataFrame with 'RecordID' and 'hospital_death' columns
output_df = pd.DataFrame({'RecordID': record_ids, 'hospital_death': predictions})

# Save the predictions to a CSV file
output_df.to_csv('predictions.csv', index=False)