# Package

In [3]:
# PyTorch package and submodules
import torch
import torch.nn as nn
from torch.optim import SGD #gradient descent optimizer

# NumPy for math operations, and Pandas for processing tabular data.
import numpy as np
import pandas as pd

# Plotly plotting package
import plotly.graph_objects as go
import plotly.express as px

# Import matplotlib.pyplot to visualize tree models
import matplotlib.pyplot as plt

# Use imbalanced learn package
from imblearn.over_sampling import SMOTE

from genetic_selection import GeneticSelectionCV

from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_absolute_error, make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn import metrics
from sklearn.metrics import roc_curve, auc, precision_recall_curve
from sklearn.metrics import f1_score, roc_auc_score, average_precision_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, f1_score, matthews_corrcoef, precision_score, recall_score

from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.model_selection import StratifiedKFold, train_test_split as tts, GridSearchCV
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
import lightgbm as lgb
import seaborn as sns
from sklearn.feature_selection import SequentialFeatureSelector as SFS

In [4]:
def predict_evaluation(y_true, y_pred):
    r2 = r2_score(y_true, y_pred)
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    mse = mean_squared_error(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)
    result={'r2':r2,
            'rmse':rmse,
            'mse':mse,
            'mae':mae}
    return result


# Import Data

In [5]:
df = pd.read_csv("/Users/Siqi/Desktop/SPH6004/Group/3Step_1Line.csv")

In [6]:
df.head()

Unnamed: 0.1,Unnamed: 0,vent_duration,stay_id,calcium_48,creatinine_48,glucose_48,sodium_48,chloride_48,hemoglobin_48,wbc_48,...,glucose+24,sodium+24,chloride+24,hemoglobin+24,wbc+24,alt+24,ast+24,alp+24,bilirubin_total+24,pt+24
0,0,2.0,37607624,9.1,0.6,145.0,145.0,103.0,7.6,1.7,...,79.5,145.0,104.0,7.15,1.5,21.0,21.0,72.0,0.5,12.2
1,1,4.0,30515259,8.5,2.6,109.0,134.0,94.0,7.1,2.2,...,198.0,140.0,104.0,7.9,7.8,168.0,330.5,39.0,3.65,18.85
2,2,16.0,33572547,8.2,1.4,100.0,139.0,111.0,8.5,0.1,...,142.0,134.0,106.0,9.0,0.1,13.0,25.0,77.0,1.2,14.3
3,3,15.0,34776632,9.5,0.7,143.0,131.0,88.0,7.7,9.6,...,117.5,148.0,106.0,8.75,12.95,75.0,275.5,171.0,33.2,37.6
4,4,104.133333,36606626,8.7,0.7,107.0,143.0,113.0,8.1,2.8,...,85.0,141.0,109.0,9.4,3.7,17.0,20.0,78.0,1.4,12.4


In [7]:

df = df.drop(['Unnamed: 0','stay_id'], axis=1)

In [8]:
df.head()

Unnamed: 0,vent_duration,calcium_48,creatinine_48,glucose_48,sodium_48,chloride_48,hemoglobin_48,wbc_48,alt_48,ast_48,...,glucose+24,sodium+24,chloride+24,hemoglobin+24,wbc+24,alt+24,ast+24,alp+24,bilirubin_total+24,pt+24
0,2.0,9.1,0.6,145.0,145.0,103.0,7.6,1.7,19.0,25.0,...,79.5,145.0,104.0,7.15,1.5,21.0,21.0,72.0,0.5,12.2
1,4.0,8.5,2.6,109.0,134.0,94.0,7.1,2.2,34.0,39.0,...,198.0,140.0,104.0,7.9,7.8,168.0,330.5,39.0,3.65,18.85
2,16.0,8.2,1.4,100.0,139.0,111.0,8.5,0.1,13.0,18.0,...,142.0,134.0,106.0,9.0,0.1,13.0,25.0,77.0,1.2,14.3
3,15.0,9.5,0.7,143.0,131.0,88.0,7.7,9.6,69.0,263.0,...,117.5,148.0,106.0,8.75,12.95,75.0,275.5,171.0,33.2,37.6
4,104.133333,8.7,0.7,107.0,143.0,113.0,8.1,2.8,17.0,20.0,...,85.0,141.0,109.0,9.4,3.7,17.0,20.0,78.0,1.4,12.4


In [9]:
feature_columns = df.columns[~df.columns.isin(['vent_duration'])]
target_column = df['vent_duration']

for i in feature_columns:
  df[i] = pd.to_numeric(df[i])

#df.replace({'outcome': {'False': 0, 'True': 1}})
#df['gender'].replace(to_replace = 'M', value = 1, inplace = True)

df.head()
#df.info()
#df.shape

Unnamed: 0,vent_duration,calcium_48,creatinine_48,glucose_48,sodium_48,chloride_48,hemoglobin_48,wbc_48,alt_48,ast_48,...,glucose+24,sodium+24,chloride+24,hemoglobin+24,wbc+24,alt+24,ast+24,alp+24,bilirubin_total+24,pt+24
0,2.0,9.1,0.6,145.0,145.0,103.0,7.6,1.7,19.0,25.0,...,79.5,145.0,104.0,7.15,1.5,21.0,21.0,72.0,0.5,12.2
1,4.0,8.5,2.6,109.0,134.0,94.0,7.1,2.2,34.0,39.0,...,198.0,140.0,104.0,7.9,7.8,168.0,330.5,39.0,3.65,18.85
2,16.0,8.2,1.4,100.0,139.0,111.0,8.5,0.1,13.0,18.0,...,142.0,134.0,106.0,9.0,0.1,13.0,25.0,77.0,1.2,14.3
3,15.0,9.5,0.7,143.0,131.0,88.0,7.7,9.6,69.0,263.0,...,117.5,148.0,106.0,8.75,12.95,75.0,275.5,171.0,33.2,37.6
4,104.133333,8.7,0.7,107.0,143.0,113.0,8.1,2.8,17.0,20.0,...,85.0,141.0,109.0,9.4,3.7,17.0,20.0,78.0,1.4,12.4


# Feature Selection & TTS

In [10]:
# corr_matrix=df_norm.corr().abs()
# upper_triangle=corr_matrix.where(np.triu(np.ones(corr_matrix.shape),k=1).astype(np.bool))
# to_drop=[column for column in upper_triangle.columns if any(upper_triangle[column]>0.80)]
# data_select1=data_norm.drop(to_drop, axis=1)


# selector=VarianceThreshold(0)
# selector.fit(data_select1)
# selected_features=selector.get_support(indices=True)
# selected_column_names=data_select1.columns[selected_features].tolist()
# data_select2=data_select1[selected_column_names]

In [11]:

X_raw = df.loc[:, ~df.columns.isin(['vent_duration', 'stay_id'])]
Y_raw = df['vent_duration']  
##df['vent_hours'].map(target_map)


# from sklearn.preprocessing import LabelEncoder
# le = LabelEncoder()
# Y_raw = le.fit_transform(Y_raw)

# # Check data balance 
# print("0-12 hours:", len(Y_raw[Y_raw=='0-12 hour']))
# print("12-24 hrs:", len(Y_raw[Y_raw == '12-24 hrs']))
# print(">24 hrs:", len(Y_raw[Y_raw == '>24 hrs']))


vent_duration_counts = df['vent_duration'].value_counts()
print(vent_duration_counts)


13.000000    45
7.000000     40
5.000000     36
10.000000    34
6.000000     33
             ..
7.816667      1
5.116667      1
9.133333      1
8.366667      1
18.483333     1
Name: vent_duration, Length: 859, dtype: int64


In [12]:
Xtrain, Xtest, Ytrain, Ytest = tts(X_raw, Y_raw , test_size=0.2, random_state=1)

# XGboost

In [32]:
dtrain = xgb.DMatrix(Xtrain, label=Ytrain)
dtest = xgb.DMatrix(Xtest, label=Ytest)

In [33]:
## Baseline
xgb_model = xgb.XGBRegressor(objective ='reg:squarederror', n_estimators=100)
xgb_model.fit(Xtrain, Ytrain)

XGBRegressor(base_score=0.5, booster='gbtree', callbacks=None,
             colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
             early_stopping_rounds=None, enable_categorical=False,
             eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
             importance_type=None, interaction_constraints='',
             learning_rate=0.300000012, max_bin=256, max_cat_to_onehot=4,
             max_delta_step=0, max_depth=6, max_leaves=0, min_child_weight=1,
             missing=nan, monotone_constraints='()', n_estimators=100, n_jobs=0,
             num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=0,
             reg_lambda=1, ...)

In [34]:
predy = xgb_model.predict(Xtest)

In [35]:
rmse = np.sqrt(mean_squared_error(Ytest, predy))
print("RMSE: %f" % (rmse))

RMSE: 63.787171


In [36]:
result=predict_evaluation(Ytest, predy)
result

{'r2': -0.20850581399412715,
 'rmse': 63.78717141446835,
 'mse': 4068.803237058768,
 'mae': 39.78427433616214}

In [69]:
# GridSearch

xgb_model = xgb.XGBRegressor()

# Define hyperparameter grid
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'objective': ['reg:squarederror', 'reg:logistic', 'reg:gamma']
}

# Create grid search object
grid_search = GridSearchCV(xgb_model, param_grid, cv=5)
grid_search.fit(Xtrain, Ytrain)

print(f'Best hyperparameters: {grid_search.best_params_}')

Best hyperparameters: {'learning_rate': 0.05, 'max_depth': 3, 'n_estimators': 200, 'objective': 'reg:gamma'}


180 fits failed out of a total of 540.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/Siqi/Library/Python/3.7/lib/python/site-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/Siqi/Library/Python/3.7/lib/python/site-packages/xgboost/core.py", line 575, in inner_f
    return f(**kwargs)
  File "/Users/Siqi/Library/Python/3.7/lib/python/site-packages/xgboost/sklearn.py", line 972, in fit
    callbacks=callbacks,
  File "/Users/Siqi/Library/Python/3.7/lib/python/site-packages/xgboost/core.py", line 575, in inner_f
    return f(**kwargs)
  File "/Users/Siqi/Lib

In [70]:
print(f'Best hyperparameters: {grid_search.best_params_}')

Best hyperparameters: {'learning_rate': 0.05, 'max_depth': 3, 'n_estimators': 200, 'objective': 'reg:gamma'}


In [77]:

xgb_model_2 = xgb.XGBRegressor(objective ='reg:squarederror', n_estimators=200, max_depth=3, learning_rate=0.01)
xgb_model_2.fit(Xtrain, Ytrain)
pred_y = xgb_model_2.predict(Xtest)

In [78]:
result=predict_evaluation(Ytest, pred_y)
result

{'r2': -0.009561643078361381,
 'rmse': 58.30091798020457,
 'mse': 3398.99703733454,
 'mae': 33.21179032831609}

# Lightgbm

In [1]:
import lightgbm as lgb

In [18]:
#Baseline
model = lgb.LGBMRegressor()
model.fit(Xtrain, Ytrain)

LGBMRegressor()

In [19]:
y1 = model.predict(Xtest)
result=predict_evaluation(Ytest, y1)
result


{'r2': -0.08035981928746239,
 'rmse': 60.3105354017002,
 'mse': 3637.360680439733,
 'mae': 38.076518221573906}

In [17]:
lgb_1 = lgb.LGBMRegressor()
param_grid = {
    'learning_rate': [0.01, 0.1,0.5, 1],
    'n_estimators': [20, 40,100],
    'num_leaves': [10, 20, 50, 100]
}

gridsearch = GridSearchCV(lgb_1, param_grid)
gridsearch.fit(Xtrain, Ytrain)
print(gridsearch.best_params_)

{'learning_rate': 0.01, 'n_estimators': 40, 'num_leaves': 10}


In [20]:

# create the LGBMClassifier object
best_estimator = lgb.LGBMRegressor(learning_rate=0.01, n_estimators=40, num_leaves = 10)

best_estimator.fit(Xtrain, Ytrain)
y2 = best_estimator.predict(Xtest)
result=predict_evaluation(Ytest, y2)
result


{'r2': 0.0070110197359329884,
 'rmse': 57.82041262198684,
 'mse': 3343.2001157768154,
 'mae': 35.61809528374974}

# RandomForest

In [21]:
rf_model = RandomForestRegressor()

param_grid = {
    'n_estimators': [50, 100, 150, 200],
    'max_features': ['auto', 'sqrt'],
    'max_depth': [3, 5, 10, 15, None],
    'min_samples_split': [2,4,6,8,10],
    'max_features': ['auto', 'sqrt'],
    'min_samples_leaf': [1, 4, 8, 10],
}

# Perform grid search using 5-fold cross validation
rf_grid_search = GridSearchCV(rf_model, param_grid=param_grid, cv=5, verbose=3, n_jobs=-1)
rf_grid_search.fit(Xtrain, Ytrain)

print("Best parameters:", rf_grid_search.best_params_)
print("Best score:", rf_grid_search.best_score_)


Fitting 5 folds for each of 800 candidates, totalling 4000 fits
[CV 2/5] END max_depth=3, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=50;, score=0.006 total time=   0.3s
[CV 4/5] END max_depth=3, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=50;, score=-0.117 total time=   0.3s
[CV 3/5] END max_depth=3, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=50;, score=-0.004 total time=   0.3s
[CV 1/5] END max_depth=3, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=50;, score=-0.007 total time=   0.3s
[CV 5/5] END max_depth=3, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=50;, score=0.003 total time=   0.4s
[CV 2/5] END max_depth=3, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=100;, score=0.014 total time=   0.6s
[CV 1/5] END max_depth=3, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=100;, score=0.004 total

In [22]:
print("Best parameters:", rf_grid_search.best_params_)

Best parameters: {'max_depth': 3, 'max_features': 'sqrt', 'min_samples_leaf': 8, 'min_samples_split': 8, 'n_estimators': 100}


In [None]:
# gridsearch = GridSearchCV(rf_model, param_grid)

# # fit the grid search
# gridsearch.fit(Xtrain, Ytrain)

# # print the best parameters
# print(gridsearch.best_params_)

In [23]:
best_rf_model = RandomForestRegressor(max_depth = 3, max_features = 'sqrt', min_samples_leaf = 8, min_samples_split = 8, n_estimators = 100)
best_rf_model.fit(Xtrain, Ytrain)


best_rf_model.fit(Xtrain, Ytrain)
y3 = best_rf_model.predict(Xtest)
result=predict_evaluation(Ytest, y3)
result

{'r2': 0.017112428108645883,
 'rmse': 57.525565143380256,
 'mse': 3309.1906450652855,
 'mae': 35.197976222903165}

# Logistics Regression

In [30]:
lg = LogisticRegression()
param_grid = {
    'C': [0.1, 1, 10, 50, 100, 1000],
    'penalty': [ 'l2', 'none']
}

lg_model = GridSearchCV(lg, param_grid=param_grid , cv = 5)
lg_model.fit(Xtrain, Ytrain)

print("Best parameters:", rf_grid_search.best_params_)
print("Best score:", rf_grid_search.best_score_)


  "Setting penalty='none' will ignore the C and l1_ratio parameters"
  "Setting penalty='none' will ignore the C and l1_ratio parameters"
  "Setting penalty='none' will ignore the C and l1_ratio parameters"
  "Setting penalty='none' will ignore the C and l1_ratio parameters"
  "Setting penalty='none' will ignore the C and l1_ratio parameters"
  "Setting penalty='none' will ignore the C and l1_ratio parameters"
  "Setting penalty='none' will ignore the C and l1_ratio parameters"
  "Setting penalty='none' will ignore the C and l1_ratio parameters"
  "Setting penalty='none' will ignore the C and l1_ratio parameters"
  "Setting penalty='none' will ignore the C and l1_ratio parameters"
  "Setting penalty='none' will ignore the C and l1_ratio parameters"
  "Setting penalty='none' will ignore the C and l1_ratio parameters"
  "Setting penalty='none' will ignore the C and l1_ratio parameters"
  "Setting penalty='none' will ignore the C and l1_ratio parameters"
  "Setting penalty='none' will ign

ValueError: Unknown label type: 'continuous'

In [None]:
# fit the grid search
lg_model.fit(Xtrain, Ytrain)