In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, cross_val_score, KFold

from sklearn.linear_model import LinearRegression, Ridge, Lasso

from sklearn.tree import DecisionTreeRegressor

from sklearn.ensemble import RandomForestRegressor,BaggingRegressor

from sklearn.preprocessing import LabelEncoder

# For tuning the model

from sklearn.model_selection import GridSearchCV

# To check the model performance

from sklearn.metrics import make_scorer,mean_squared_error, r2_score, mean_absolute_error

In [99]:
#loading the dataset to a dataframe
df= pd.read_csv('/content/healthcare_data (1).csv')
#data exploration
df.info()

df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500000 entries, 0 to 499999
Data columns (total 15 columns):
 #   Column                             Non-Null Count   Dtype  
---  ------                             --------------   -----  
 0   Available Extra Rooms in Hospital  500000 non-null  int64  
 1   Department                         500000 non-null  object 
 2   Ward_Facility_Code                 500000 non-null  object 
 3   doctor_name                        500000 non-null  object 
 4   staff_available                    500000 non-null  int64  
 5   patientid                          500000 non-null  int64  
 6   Age                                500000 non-null  object 
 7   gender                             500000 non-null  object 
 8   Type of Admission                  500000 non-null  object 
 9   Severity of Illness                500000 non-null  object 
 10  health_conditions                  500000 non-null  object 
 11  Visitors with Patient              5000

Unnamed: 0,Available Extra Rooms in Hospital,Department,Ward_Facility_Code,doctor_name,staff_available,patientid,Age,gender,Type of Admission,Severity of Illness,health_conditions,Visitors with Patient,Insurance,Admission_Deposit,Stay (in days)
0,4,gynecology,D,Dr Sophia,0,33070,41-50,Female,Trauma,Extreme,Diabetes,4,Yes,2966.408696,8
1,4,gynecology,B,Dr Sophia,2,34808,31-40,Female,Trauma,Minor,Heart disease,2,No,3554.835677,9
2,2,gynecology,B,Dr Sophia,8,44577,21-30,Female,Trauma,Extreme,Diabetes,2,Yes,5624.733654,7
3,4,gynecology,D,Dr Olivia,7,3695,31-40,Female,Urgent,Moderate,,4,No,4814.149231,8
4,2,anesthesia,E,Dr Mark,10,108956,71-80,Male,Trauma,Moderate,Diabetes,2,No,5169.269637,34


In [100]:
#checking for duplicate values
df.duplicated().sum()

0

In [101]:
column_mapping = ({'Available Extra Rooms in Hospital':"Available_Extra_Rooms_in_Hospital",
                   "Visitors with Patient":"Visitors_with_Patient",
                   "Stay (in days)":'Stay_in_Days',
                   "Type of Admission":"Type_of_Admission",
                   "Severity of Illness":"Severity_of_Illness"})
df= df.rename(columns=column_mapping)

In [102]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500000 entries, 0 to 499999
Data columns (total 15 columns):
 #   Column                             Non-Null Count   Dtype  
---  ------                             --------------   -----  
 0   Available_Extra_Rooms_in_Hospital  500000 non-null  int64  
 1   Department                         500000 non-null  object 
 2   Ward_Facility_Code                 500000 non-null  object 
 3   doctor_name                        500000 non-null  object 
 4   staff_available                    500000 non-null  int64  
 5   patientid                          500000 non-null  int64  
 6   Age                                500000 non-null  object 
 7   gender                             500000 non-null  object 
 8   Type_of_Admission                  500000 non-null  object 
 9   Severity_of_Illness                500000 non-null  object 
 10  health_conditions                  500000 non-null  object 
 11  Visitors_with_Patient              5000

# Data Preparation for Model Building

In [103]:
df = pd.get_dummies(
    df,
    columns = df.select_dtypes(include = ["object", "category"]).columns.tolist(),
    drop_first = True,
)

In [104]:
df.head()

Unnamed: 0,Available_Extra_Rooms_in_Hospital,staff_available,patientid,Visitors_with_Patient,Admission_Deposit,Stay_in_Days,Department_anesthesia,Department_gynecology,Department_radiotherapy,Department_surgery,...,Type_of_Admission_Trauma,Type_of_Admission_Urgent,Severity_of_Illness_Minor,Severity_of_Illness_Moderate,health_conditions_Diabetes,health_conditions_Heart disease,health_conditions_High Blood Pressure,health_conditions_None,health_conditions_Other,Insurance_Yes
0,4,0,33070,4,2966.408696,8,0,1,0,0,...,1,0,0,0,1,0,0,0,0,1
1,4,2,34808,2,3554.835677,9,0,1,0,0,...,1,0,1,0,0,1,0,0,0,0
2,2,8,44577,2,5624.733654,7,0,1,0,0,...,1,0,0,0,1,0,0,0,0,1
3,4,7,3695,4,4814.149231,8,0,1,0,0,...,0,1,0,1,0,0,0,1,0,0
4,2,10,108956,2,5169.269637,34,1,0,0,0,...,1,0,0,1,1,0,0,0,0,0


In [105]:
#dropping less correlated features
df = df.drop(['patientid'],axis = 1)

In [106]:
#seperating the feature variables and target variables
X = df.drop('Stay_in_Days',axis = 1)

y = df['Stay_in_Days']

In [107]:
X.columns


Index(['Available_Extra_Rooms_in_Hospital', 'staff_available',
       'Visitors_with_Patient', 'Admission_Deposit', 'Department_anesthesia',
       'Department_gynecology', 'Department_radiotherapy',
       'Department_surgery', 'Ward_Facility_Code_B', 'Ward_Facility_Code_C',
       'Ward_Facility_Code_D', 'Ward_Facility_Code_E', 'Ward_Facility_Code_F',
       'doctor_name_Dr John', 'doctor_name_Dr Mark', 'doctor_name_Dr Nathan',
       'doctor_name_Dr Olivia', 'doctor_name_Dr Sam', 'doctor_name_Dr Sarah',
       'doctor_name_Dr Simon', 'doctor_name_Dr Sophia', 'Age_11-20',
       'Age_21-30', 'Age_31-40', 'Age_41-50', 'Age_51-60', 'Age_61-70',
       'Age_71-80', 'Age_81-90', 'Age_91-100', 'gender_Male', 'gender_Other',
       'Type_of_Admission_Trauma', 'Type_of_Admission_Urgent',
       'Severity_of_Illness_Minor', 'Severity_of_Illness_Moderate',
       'health_conditions_Diabetes', 'health_conditions_Heart disease',
       'health_conditions_High Blood Pressure', 'health_conditions

In [None]:
#splitting the training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, shuffle = True, random_state = 1)

#checking the shape of the train and test set
print('Train Shape:',X_train.shape)
print('Test Shape:',X_test.shape)

Train Shape: (400000, 42)
Test Shape: (100000, 42)


In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500000 entries, 0 to 499999
Data columns (total 43 columns):
 #   Column                                 Non-Null Count   Dtype  
---  ------                                 --------------   -----  
 0   Available_Extra_Rooms_in_Hospital      500000 non-null  int64  
 1   staff_available                        500000 non-null  int64  
 2   Visitors_with_Patient                  500000 non-null  int64  
 3   Admission_Deposit                      500000 non-null  float64
 4   Stay_in_Days                           500000 non-null  int64  
 5   Department_anesthesia                  500000 non-null  uint8  
 6   Department_gynecology                  500000 non-null  uint8  
 7   Department_radiotherapy                500000 non-null  uint8  
 8   Department_surgery                     500000 non-null  uint8  
 9   Ward_Facility_Code_B                   500000 non-null  uint8  
 10  Ward_Facility_Code_C                   500000 non-null  

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500000 entries, 0 to 499999
Data columns (total 43 columns):
 #   Column                                 Non-Null Count   Dtype  
---  ------                                 --------------   -----  
 0   Available_Extra_Rooms_in_Hospital      500000 non-null  int64  
 1   staff_available                        500000 non-null  int64  
 2   Visitors_with_Patient                  500000 non-null  int64  
 3   Admission_Deposit                      500000 non-null  float64
 4   Stay_in_Days                           500000 non-null  int64  
 5   Department_anesthesia                  500000 non-null  uint8  
 6   Department_gynecology                  500000 non-null  uint8  
 7   Department_radiotherapy                500000 non-null  uint8  
 8   Department_surgery                     500000 non-null  uint8  
 9   Ward_Facility_Code_B                   500000 non-null  uint8  
 10  Ward_Facility_Code_C                   500000 non-null  

# Model Building

In [None]:
from sklearn.model_selection import cross_val_score

# Build the regression model using sklearn linear regression
linreg = LinearRegression()
linreg.fit(X_train, y_train)
y_pred = linreg.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error (MSE):", mse)
print("Mean Absolute Error (MAE):", mae)
print("R-squared (R2) Score:", r2)

Mean Squared Error (MSE): 9.884965445902619
Mean Absolute Error (MAE): 2.155759034562716
R-squared (R2) Score: 0.8430298956456241


# Random Forest Regressor

In [None]:
# Random Forest Regressor
regressor = RandomForestRegressor(n_estimators = 100,random_state = 1)

# Fitting the model
regressor.fit(X_train, y_train)

y_pred = regressor.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error (MSE):", mse)
print("Mean Absolute Error (MAE):", mae)
print("R-squared (R2) Score:", r2)

Mean Squared Error (MSE): 1.6995923960000001
Mean Absolute Error (MAE): 0.8650502000000002
R-squared (R2) Score: 0.9730110138249792


In [None]:
#fitting the gradient boost regressor
from sklearn.ensemble import GradientBoostingRegressor

GBreg = GradientBoostingRegressor(n_estimators = 100, random_state = 1, learning_rate=0.1, max_depth=3)

GBreg.fit(X_train, y_train)

y_pred = GBreg.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error (MSE):", mse)
print("Mean Absolute Error (MAE):", mae)
print("R-squared (R2) Score:", r2)


Mean Squared Error (MSE): 3.213847966912557
Mean Absolute Error (MAE): 1.2127491554423462
R-squared (R2) Score: 0.9489651174294724


In [None]:
#fitting XGBoost
import xgboost as xgb

xgb_model = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=100, learning_rate=0.1, max_depth=3, random_state=1)
#fitting the model
xgb_model.fit(X_train, y_train)

y_pred = xgb_model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error (MSE):", mse)
print("Mean Absolute Error (MAE):", mae)
print("R-squared (R2) Score:", r2)


Mean Squared Error (MSE): 3.2179812213485643
Mean Absolute Error (MAE): 1.2135936201810837
R-squared (R2) Score: 0.9488994826648702


In [None]:
#fitting lightGBM
import lightgbm as lgb

# Define the dataset format for LightGBM
train_data = lgb.Dataset(X_train, label=y_train)

# Set the hyperparameters for the model
params = {
    'objective': 'regression',
    'boosting_type': 'gbdt',
    'metric': 'mae',
    'num_leaves': 31,
    'learning_rate': 0.1,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'random_state': 1
}
#training the model
lgb_model = lgb.train(params, train_data, num_boost_round=100)


y_pred = lgb_model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error (MSE):", mse)
print("Mean Absolute Error (MAE):", mae)
print("R-squared (R2) Score:", r2)


You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 388
[LightGBM] [Info] Number of data points in the train set: 400000, number of used features: 42
[LightGBM] [Info] Start training from score 12.375650
Mean Squared Error (MSE): 2.505736080127015
Mean Absolute Error (MAE): 1.0928246579135998
R-squared (R2) Score: 0.9602097087607829


In [None]:
#hyperparameter tuning using grid search
rf_tuned = RandomForestRegressor(n_estimators = 100,random_state = 1)

# Define the hyperparameters to be tuned
param_grid = {

    "max_depth": [5, 7],
    "max_features": [0.8, 1]
}


# Type of scoring used to compare parameter combinations
scorer = make_scorer(r2_score)

# Run the grid search
grid_obj = GridSearchCV(rf_tuned, param_grid, cv = 5)

grid_obj = grid_obj.fit(X_train, y_train)

# Set the rf_tuned_regressor to the best combination of parameters
rf_tuned_regressor = grid_obj.best_estimator_

rf_tuned_regressor.fit(X_train, y_train)

y_pred = rf_tuned_regressor.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error (MSE):", mse)
print("Mean Absolute Error (MAE):", mae)
print("R-squared (R2) Score:", r2)

Mean Squared Error (MSE): 2.9687492649150133
Mean Absolute Error (MAE): 1.1564083439719384
R-squared (R2) Score: 0.9528572067888362


# Neural Network

In [None]:
from keras.callbacks import ModelCheckpoint
from keras.models import Sequential
from keras.layers import Dense, Activation, Flatten

#define the model
NN_model = Sequential()

# The Input Layer :
NN_model.add(Dense(128, kernel_initializer='normal',input_dim = X_train.shape[1], activation='relu'))

# The Hidden Layers :
NN_model.add(Dense(256, kernel_initializer='normal',activation='relu'))
NN_model.add(Dense(256, kernel_initializer='normal',activation='relu'))
NN_model.add(Dense(256, kernel_initializer='normal',activation='relu'))

# The Output Layer :
NN_model.add(Dense(1, kernel_initializer='normal',activation='linear'))

# Compile the network :
NN_model.compile(loss='mean_absolute_error', optimizer='adam', metrics=['mean_absolute_error'])
NN_model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 128)               5504      
                                                                 
 dense_1 (Dense)             (None, 256)               33024     
                                                                 
 dense_2 (Dense)             (None, 256)               65792     
                                                                 
 dense_3 (Dense)             (None, 256)               65792     
                                                                 
 dense_4 (Dense)             (None, 1)                 257       
                                                                 
Total params: 170,369
Trainable params: 170,369
Non-trainable params: 0
_________________________________________________________________


In [None]:
checkpoint_name = 'NN_model.hdf5'
checkpoint = ModelCheckpoint(checkpoint_name, monitor='val_loss', verbose = 1, save_best_only = True, mode ='auto')
callbacks_list = [checkpoint]

In [None]:
NN_model.fit(X_train, y_train, epochs=30, batch_size=32, validation_split = 0.2, callbacks=callbacks_list)

In [None]:

y_pred = NN_model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error (MSE):", mse)
print("Mean Absolute Error (MAE):", mae)
print("R-squared (R2) Score:", r2)

Mean Squared Error (MSE): 4.125034535961448
Mean Absolute Error (MAE): 1.414161374759674
R-squared (R2) Score: 0.9344957647936272


The RandomForestRegressor still has the best performance. with the lowest MSE and highest Rsquared

In [115]:
import pickle
from google.colab import files

with open('LOSModel7.pkl','wb') as f:
     pickle.dump(rf_tuned_regressor,f)
files.download('LOSModel7.pkl')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [97]:
rf_tuned_regressor

In [114]:
def preprocess_data(data):
    column_mapping = {
        'Available Extra Rooms in Hospital': 'Available_Extra_Rooms_in_Hospital',
        'Visitors with Patient': 'Visitors_with_Patient',
        'Stay (in days)': 'Stay_in_Days',
        'Type of Admission': 'Type_of_Admission',
        'Severity of Illness': 'Severity_of_Illness'
    }
    data = data.rename(columns=column_mapping)
    categorical_columns = ['Age', 'gender', 'Type_of_Admission', 'Severity_of_Illness', 'health_conditions', 'Insurance',
                           'Ward_Facility_Code', 'doctor_name', 'Department']
    data = pd.get_dummies(data, columns=categorical_columns, drop_first=True)

    expected_columns = ['Available_Extra_Rooms_in_Hospital', 'staff_available',
       'Visitors_with_Patient', 'Admission_Deposit', 'Department_anesthesia',
       'Department_gynecology', 'Department_radiotherapy',
       'Department_surgery', 'Ward_Facility_Code_B', 'Ward_Facility_Code_C',
       'Ward_Facility_Code_D', 'Ward_Facility_Code_E', 'Ward_Facility_Code_F',
       'doctor_name_Dr John', 'doctor_name_Dr Mark', 'doctor_name_Dr Nathan',
       'doctor_name_Dr Olivia', 'doctor_name_Dr Sam', 'doctor_name_Dr Sarah',
       'doctor_name_Dr Simon', 'doctor_name_Dr Sophia', 'Age_11-20',
       'Age_21-30', 'Age_31-40', 'Age_41-50', 'Age_51-60', 'Age_61-70',
       'Age_71-80', 'Age_81-90', 'Age_91-100', 'gender_Male', 'gender_Other',
       'Type_of_Admission_Trauma', 'Type_of_Admission_Urgent',
       'Severity_of_Illness_Minor', 'Severity_of_Illness_Moderate',
       'health_conditions_Diabetes', 'health_conditions_Heart disease',
       'health_conditions_High Blood Pressure', 'health_conditions_None',
       'health_conditions_Other', 'Insurance_Yes'
                        ]
    for column in expected_columns:
        if column not in data.columns:
            data[column] = 0

    data = data[expected_columns]

    return data





def make_prediction(model, data):
    preprocessed_data = preprocess_data(data)

    predictions = model.predict(preprocessed_data)
    predictions = [round(pred) for pred in predictions]

    data['predicted_los'] = predictions


    return data

data = pd.read_excel('/content/Untitled spreadsheet (1).xlsx')

predicted_data = make_prediction(rf_tuned_regressor,data)
predicted_data

Unnamed: 0,Available Extra Rooms in Hospital,Department,Ward_Facility_Code,doctor_name,staff_available,Age,gender,Type of Admission,Severity of Illness,health_conditions,Visitors with Patient,Insurance,Admission_Deposit,predicted_los
0,4,gynecology,D,Dr Sophia,0,41-50,Female,Trauma,Extreme,Diabetes,4,Yes,2966.409,9
1,4,gynecology,B,Dr Sophia,2,31-40,Female,Trauma,Minor,Heart disease,2,No,3554.836,9
2,2,gynecology,B,Dr Sophia,8,21-30,Female,Trauma,Extreme,Diabetes,2,Yes,5624.734,9
3,4,gynecology,D,Dr Olivia,7,31-40,Female,Urgent,Moderate,Diabetes,4,No,4814.149,9
4,2,anesthesia,E,Dr Mark,10,71-80,Male,Trauma,Moderate,Diabetes,2,No,5169.27,33
5,2,gynecology,F,Dr Olivia,2,21-30,Female,Trauma,Moderate,Other,2,Yes,4539.268,9
6,7,gynecology,D,Dr Olivia,5,31-40,Female,Emergency,Moderate,Other,3,Yes,4669.962,8
7,3,gynecology,B,Dr Sophia,8,21-30,Female,Emergency,Moderate,Heart disease,3,Yes,5101.675,9
8,2,TB & Chest disease,A,Dr John,7,31-40,Other,Trauma,Moderate,Other,2,No,4899.768,11
9,4,gynecology,D,Dr Sarah,7,41-50,Female,Trauma,Extreme,Asthama,4,No,3350.367,9
