In [21]:
import pandas as pd
import matplotlib.pyplot as plt
import joblib
import warnings

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score, roc_curve

warnings.filterwarnings("ignore", category=UserWarning, module="xgboost")


In [2]:
try:
    # Try loading the file from your laptop path
    df = pd.read_csv('C:/Users/Eugene/Documents/GitHub/TSDN-BoyWithLuv/Source/Prototype/patient_stay_cost/patient_cost_and_stay_data.csv')
except FileNotFoundError:
    # If the file is not found, try loading from the PC path
    df = pd.read_csv('C:/Users/user/OneDrive/Documents/GitHub/TSDN-BoyWithLuv/Source/Prototype/patient_stay_cost/patient_cost_and_stay_data.csv')

In [3]:
df

Unnamed: 0.1,Unnamed: 0,patient_id,gender,age,insurance_type,smoking_status,e_cigarette_usage,alcohol_consumption_rate,previous_admission_count,surgery_name,surgery_type,surgery_duration,room_type,medical_equipment_count,stay_duration,ward_cost,surgery_cost,medication_cost,total_cost
0,0,1,Male,52,Private,Never,False,Occasional,2,Gallbladder Removal,Minor,74,Regular Ward,1,3,300.0,6455.90,105.00,7460.90
1,1,2,Female,18,Social Security Agency,Never,False,Occasional,1,Breast Cancer Surgery,Major,194,Regular Ward,3,8,300.0,21266.34,1609.47,25275.81
2,2,3,Female,46,Private,Never,False,Occasional,0,Gallbladder Removal,Minor,101,Regular Ward,0,4,300.0,5315.61,377.22,6892.83
3,3,4,Female,85,Social Security Agency,Never,False,,2,Appendectomy,Minor,98,ICU,0,5,800.0,6098.06,613.54,10711.60
4,4,5,Male,21,Private,Never,False,Occasional,1,Hip Replacement,Major,165,VIP Ward,3,5,1000.0,12361.15,2384.62,19745.77
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
499995,499995,499996,Female,67,Social Security Agency,Current,False,,0,Hernia Repair,Minor,51,ICU,1,5,800.0,3000.00,132.85,7132.85
499996,499996,499997,Female,65,Social Security Agency,Never,False,,1,Liver Transplant,Major,197,Regular Ward,2,12,300.0,12420.77,1519.81,17540.58
499997,499997,499998,Female,36,Private,Never,False,Occasional,1,Breast Cancer Surgery,Major,221,Private Ward,4,13,500.0,12020.26,1466.69,19986.95
499998,499998,499999,Female,56,Social Security Agency,Former,False,,2,Knee Replacement,Major,176,Regular Ward,1,11,300.0,11255.24,1774.89,16330.13


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500000 entries, 0 to 499999
Data columns (total 19 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   Unnamed: 0                500000 non-null  int64  
 1   patient_id                500000 non-null  int64  
 2   gender                    500000 non-null  object 
 3   age                       500000 non-null  int64  
 4   insurance_type            500000 non-null  object 
 5   smoking_status            500000 non-null  object 
 6   e_cigarette_usage         500000 non-null  bool   
 7   alcohol_consumption_rate  312032 non-null  object 
 8   previous_admission_count  500000 non-null  int64  
 9   surgery_name              500000 non-null  object 
 10  surgery_type              500000 non-null  object 
 11  surgery_duration          500000 non-null  int64  
 12  room_type                 500000 non-null  object 
 13  medical_equipment_count   500000 non-null  i

In [5]:
df = df.drop(columns=['patient_id', 'Unnamed: 0'])

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500000 entries, 0 to 499999
Data columns (total 17 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   gender                    500000 non-null  object 
 1   age                       500000 non-null  int64  
 2   insurance_type            500000 non-null  object 
 3   smoking_status            500000 non-null  object 
 4   e_cigarette_usage         500000 non-null  bool   
 5   alcohol_consumption_rate  312032 non-null  object 
 6   previous_admission_count  500000 non-null  int64  
 7   surgery_name              500000 non-null  object 
 8   surgery_type              500000 non-null  object 
 9   surgery_duration          500000 non-null  int64  
 10  room_type                 500000 non-null  object 
 11  medical_equipment_count   500000 non-null  int64  
 12  stay_duration             500000 non-null  int64  
 13  ward_cost                 500000 non-null  f

In [7]:
df.head()

Unnamed: 0,gender,age,insurance_type,smoking_status,e_cigarette_usage,alcohol_consumption_rate,previous_admission_count,surgery_name,surgery_type,surgery_duration,room_type,medical_equipment_count,stay_duration,ward_cost,surgery_cost,medication_cost,total_cost
0,Male,52,Private,Never,False,Occasional,2,Gallbladder Removal,Minor,74,Regular Ward,1,3,300.0,6455.9,105.0,7460.9
1,Female,18,Social Security Agency,Never,False,Occasional,1,Breast Cancer Surgery,Major,194,Regular Ward,3,8,300.0,21266.34,1609.47,25275.81
2,Female,46,Private,Never,False,Occasional,0,Gallbladder Removal,Minor,101,Regular Ward,0,4,300.0,5315.61,377.22,6892.83
3,Female,85,Social Security Agency,Never,False,,2,Appendectomy,Minor,98,ICU,0,5,800.0,6098.06,613.54,10711.6
4,Male,21,Private,Never,False,Occasional,1,Hip Replacement,Major,165,VIP Ward,3,5,1000.0,12361.15,2384.62,19745.77


In [10]:
df['total_cost'].describe()

count    500000.000000
mean      17506.124281
std        9271.173399
min        3400.000000
25%        7347.287500
50%       20028.635000
75%       25194.812500
max       51033.560000
Name: total_cost, dtype: float64

In [11]:
cost_class = df['total_cost'].quantile([0, 0.2, 0.4, 0.6, 0.8, 1]).tolist()

In [12]:
cost_label = ['Very Low', 'Low', 'Medium', 'High', 'Very High']

df['total_cost_class'] = pd.cut(df['total_cost'], bins=cost_class, labels=cost_label, include_lowest=True)

In [17]:
# Print the ranges for each class
for label, lower, upper in zip(cost_label, cost_class[:-1], cost_class[1:]):
    print(f"Class '{label}': {lower:.2f} - {upper:.2f}")

Class 'V': 3400.00 - 6879.53
Class 'e': 6879.53 - 13643.17
Class 'r': 13643.17 - 22268.01
Class 'y': 22268.01 - 26257.94
Class ' ': 26257.94 - 51033.56


In [16]:
df['total_cost_class'].value_counts()

total_cost_class
Very Low     100001
Medium       100000
High         100000
Very High    100000
Low           99999
Name: count, dtype: int64

In [18]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500000 entries, 0 to 499999
Data columns (total 18 columns):
 #   Column                    Non-Null Count   Dtype   
---  ------                    --------------   -----   
 0   gender                    500000 non-null  object  
 1   age                       500000 non-null  int64   
 2   insurance_type            500000 non-null  object  
 3   smoking_status            500000 non-null  object  
 4   e_cigarette_usage         500000 non-null  bool    
 5   alcohol_consumption_rate  312032 non-null  object  
 6   previous_admission_count  500000 non-null  int64   
 7   surgery_name              500000 non-null  object  
 8   surgery_type              500000 non-null  object  
 9   surgery_duration          500000 non-null  int64   
 10  room_type                 500000 non-null  object  
 11  medical_equipment_count   500000 non-null  int64   
 12  stay_duration             500000 non-null  int64   
 13  ward_cost                 500

In [27]:
df.head()

Unnamed: 0,gender,age,insurance_type,smoking_status,e_cigarette_usage,alcohol_consumption_rate,previous_admission_count,surgery_name,surgery_type,surgery_duration,room_type,medical_equipment_count,stay_duration,ward_cost,surgery_cost,medication_cost,total_cost,total_cost_class
0,Male,52,Private,Never,False,Occasional,2,Gallbladder Removal,Minor,74,Regular Ward,1,3,300.0,6455.9,105.0,7460.9,Low
1,Female,18,Social Security Agency,Never,False,Occasional,1,Breast Cancer Surgery,Major,194,Regular Ward,3,8,300.0,21266.34,1609.47,25275.81,High
2,Female,46,Private,Never,False,Occasional,0,Gallbladder Removal,Minor,101,Regular Ward,0,4,300.0,5315.61,377.22,6892.83,Low
3,Female,85,Social Security Agency,Never,False,,2,Appendectomy,Minor,98,ICU,0,5,800.0,6098.06,613.54,10711.6,Low
4,Male,21,Private,Never,False,Occasional,1,Hip Replacement,Major,165,VIP Ward,3,5,1000.0,12361.15,2384.62,19745.77,Medium


In [28]:
df['alcohol_consumption_rate'].fillna('None', inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['alcohol_consumption_rate'].fillna('None', inplace=True)


In [29]:
df.head()

Unnamed: 0,gender,age,insurance_type,smoking_status,e_cigarette_usage,alcohol_consumption_rate,previous_admission_count,surgery_name,surgery_type,surgery_duration,room_type,medical_equipment_count,stay_duration,ward_cost,surgery_cost,medication_cost,total_cost,total_cost_class
0,Male,52,Private,Never,False,Occasional,2,Gallbladder Removal,Minor,74,Regular Ward,1,3,300.0,6455.9,105.0,7460.9,Low
1,Female,18,Social Security Agency,Never,False,Occasional,1,Breast Cancer Surgery,Major,194,Regular Ward,3,8,300.0,21266.34,1609.47,25275.81,High
2,Female,46,Private,Never,False,Occasional,0,Gallbladder Removal,Minor,101,Regular Ward,0,4,300.0,5315.61,377.22,6892.83,Low
3,Female,85,Social Security Agency,Never,False,,2,Appendectomy,Minor,98,ICU,0,5,800.0,6098.06,613.54,10711.6,Low
4,Male,21,Private,Never,False,Occasional,1,Hip Replacement,Major,165,VIP Ward,3,5,1000.0,12361.15,2384.62,19745.77,Medium


In [30]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500000 entries, 0 to 499999
Data columns (total 18 columns):
 #   Column                    Non-Null Count   Dtype   
---  ------                    --------------   -----   
 0   gender                    500000 non-null  object  
 1   age                       500000 non-null  int64   
 2   insurance_type            500000 non-null  object  
 3   smoking_status            500000 non-null  object  
 4   e_cigarette_usage         500000 non-null  bool    
 5   alcohol_consumption_rate  500000 non-null  object  
 6   previous_admission_count  500000 non-null  int64   
 7   surgery_name              500000 non-null  object  
 8   surgery_type              500000 non-null  object  
 9   surgery_duration          500000 non-null  int64   
 10  room_type                 500000 non-null  object  
 11  medical_equipment_count   500000 non-null  int64   
 12  stay_duration             500000 non-null  int64   
 13  ward_cost                 500

In [31]:
df['gender'].unique()

array(['Male', 'Female'], dtype=object)

In [32]:
df['insurance_type'].unique()

array(['Private', 'Social Security Agency', 'Self-Pay'], dtype=object)

In [33]:
df['smoking_status'].unique()

array(['Never', 'Current', 'Former'], dtype=object)

In [34]:
df['e_cigarette_usage'].unique()

array([False,  True])

In [35]:
df['alcohol_consumption_rate'].unique()

array(['Occasional', 'None', 'Heavy', 'Moderate'], dtype=object)

In [36]:
df['surgery_name'].unique()

array(['Gallbladder Removal', 'Breast Cancer Surgery', 'Appendectomy',
       'Hip Replacement', 'Cataract Surgery', 'Hernia Repair',
       'Knee Replacement', 'Liver Transplant', 'Heart Bypass',
       'Spinal Fusion'], dtype=object)

In [37]:
df['surgery_type'].unique()

array(['Minor', 'Major'], dtype=object)

In [38]:
df['room_type'].unique()

array(['Regular Ward', 'ICU', 'VIP Ward', 'Private Ward'], dtype=object)

In [39]:
df['total_cost_class'].unique()

['Low', 'High', 'Medium', 'Very Low', 'Very High']
Categories (5, object): ['Very Low' < 'Low' < 'Medium' < 'High' < 'Very High']

In [41]:
gender_mapping = {
    'Male': 0, 
    'Female': 1

}

insurance_type_mapping= {
    'Private': 0, 
    'Social Security Agency': 1, 
    'Self-Pay': 2
}

smoking_status_mapping = {
    'Never': 0, 
    'Current': 1, 
    'Former': 2
}

e_cigarette_usage_mapping = {
    False: 0,
    True: 1
}

alcohol_consumption_rate_mapping = {
    'Occasional': 0, 
    'None': 1, 
    'Heavy': 2, 
    'Moderate': 3
}

surgery_name_mapping = {
    'Gallbladder Removal': 0, 
    'Breast Cancer Surgery': 1, 
    'Appendectomy' : 2,
    'Hip Replacement': 3, 
    'Cataract Surgery' : 4, 
    'Hernia Repair' : 5,
    'Knee Replacement' : 6, 
    'Liver Transplant': 7, 
    'Heart Bypass' : 8,
    'Spinal Fusion': 9
}

surgery_type_mapping = {
    'Minor': 0, 
    'Major' : 1
}

room_type_mapping = {
    'Regular Ward' : 0, 
    'ICU': 1, 
    'VIP Ward': 2, 
    'Private Ward': 3
}

total_cost_class_mapping = {
    'Very Low': 0, 
    'Low': 1, 
    'Medium' : 2, 
    'High': 3, 
    'Very High': 4
}

In [42]:
df['gender'] = df['gender'].map(gender_mapping)
df['insurance_type'] = df['insurance_type'].map(insurance_type_mapping)
df['smoking_status'] = df['smoking_status'].map(smoking_status_mapping)
df['e_cigarette_usage'] = df['e_cigarette_usage'].map(e_cigarette_usage_mapping)
df['alcohol_consumption_rate'] = df['alcohol_consumption_rate'].map(alcohol_consumption_rate_mapping)
df['surgery_name'] = df['surgery_name'].map(surgery_name_mapping)
df['surgery_type'] = df['surgery_type'].map(surgery_type_mapping)
df['room_type'] = df['room_type'].map(room_type_mapping)
df['total_cost_class'] = df['total_cost_class'].map(total_cost_class_mapping)

In [22]:
# Define parameter grids for both models
rf_param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10]
}

xgb_param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 6, 10],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.7, 0.8, 1.0]
}

## Cost Prediction

In [43]:
cost_df = df[[
    'total_cost_class',
    'gender',
    'age',
    'insurance_type',
    'smoking_status',
    'e_cigarette_usage',
    'alcohol_consumption_rate',
    'previous_admission_count',
    'surgery_name',
    'room_type',
    'stay_duration',
    'medical_equipment_count'
]]

cost_df

Unnamed: 0,total_cost_class,gender,age,insurance_type,smoking_status,e_cigarette_usage,alcohol_consumption_rate,previous_admission_count,surgery_name,room_type,stay_duration,medical_equipment_count
0,1,0,52,0,0,0,0,2,0,0,3,1
1,3,1,18,1,0,0,0,1,1,0,8,3
2,1,1,46,0,0,0,0,0,0,0,4,0
3,1,1,85,1,0,0,1,2,2,1,5,0
4,2,0,21,0,0,0,0,1,3,2,5,3
...,...,...,...,...,...,...,...,...,...,...,...,...
499995,1,1,67,1,1,0,1,0,5,1,5,1
499996,2,1,65,1,0,0,1,1,7,0,12,2
499997,2,1,36,0,0,0,0,1,1,3,13,4
499998,2,1,56,1,2,0,1,2,6,0,11,1


In [50]:
cost_df_resample= cost_df.sample(n=100000, random_state=42)

cost_df_resample

Unnamed: 0,total_cost_class,gender,age,insurance_type,smoking_status,e_cigarette_usage,alcohol_consumption_rate,previous_admission_count,surgery_name,room_type,stay_duration,medical_equipment_count
104241,3,1,51,0,2,0,0,2,8,1,13,2
199676,4,1,46,0,1,0,1,1,7,3,10,4
140199,4,0,32,0,0,0,0,2,7,2,12,1
132814,4,0,47,1,0,0,0,1,3,1,11,1
408697,2,0,64,1,2,0,3,1,9,0,11,2
...,...,...,...,...,...,...,...,...,...,...,...,...
66361,4,0,64,0,2,0,0,2,7,1,9,3
497228,0,0,59,0,0,0,3,2,4,0,3,1
152728,4,1,38,0,2,0,1,1,1,2,9,5
50155,0,0,48,0,0,0,3,1,2,0,1,1


In [51]:
cost_df_resample['total_cost_class'].value_counts()

total_cost_class
1    20302
2    20062
3    19935
4    19922
0    19779
Name: count, dtype: int64

In [52]:
x = cost_df_resample.drop('total_cost_class', axis = 1)
y = cost_df_resample['total_cost_class']

In [53]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)

#### Random Forest (Heart Attack)

In [54]:
# Initialize the RF models
rf_model = RandomForestClassifier(random_state=42)

In [55]:
# Set up GridSearchCV for RF model
rf_grid_search = GridSearchCV(estimator=rf_model, param_grid=rf_param_grid, cv=3, scoring='accuracy', n_jobs=-1, verbose=2)

In [56]:
# Fit the grid search models
rf_grid_search.fit(x_train, y_train)

Fitting 3 folds for each of 36 candidates, totalling 108 fits


In [57]:
# Get the best models
best_rf_model = rf_grid_search.best_estimator_

In [58]:
# Evaluate on test data
rf_preds = best_rf_model.predict(x_test)

In [59]:
# Evaluate the best Random Forest model
print("Best Random Forest Model Accuracy:", accuracy_score(y_test, rf_preds))
print("Best Random Forest Model Classification Report:")
print(classification_report(y_test, rf_preds))

Best Random Forest Model Accuracy: 0.6622333333333333
Best Random Forest Model Classification Report:
              precision    recall  f1-score   support

           0       0.71      0.80      0.75      6014
           1       0.77      0.67      0.71      6109
           2       0.65      0.73      0.69      6039
           3       0.47      0.39      0.42      5938
           4       0.69      0.72      0.71      5900

    accuracy                           0.66     30000
   macro avg       0.66      0.66      0.66     30000
weighted avg       0.66      0.66      0.66     30000



In [60]:
print("Best parameters for Random Forest:", rf_grid_search.best_params_)

Best parameters for Random Forest: {'max_depth': 10, 'min_samples_split': 5, 'n_estimators': 100}


#### XGBoost (Heart Attack)

In [61]:
# Initialize the XGB models
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)

In [62]:
# Set up GridSearchCV for XGB model
xgb_grid_search = GridSearchCV(estimator=xgb_model, param_grid=xgb_param_grid, cv=3, scoring='accuracy', n_jobs=-1, verbose=2)

In [63]:
# Fit the grid search models
xgb_grid_search.fit(x_train, y_train)

Fitting 3 folds for each of 81 candidates, totalling 243 fits




BrokenProcessPool: A task has failed to un-serialize. Please ensure that the arguments of the function are all picklable.

In [None]:
# Get the best models
best_xgb_model = xgb_grid_search.best_estimator_

In [None]:
# Evaluate on test data
xgb_preds = best_xgb_model.predict(x_test)

In [None]:
# Evaluate the best XGBoost model
print("Best XGBoost Model Accuracy:", accuracy_score(y_test, xgb_preds))
print("Best XGBoost Model Classification Report:")
print(classification_report(y_test, xgb_preds))

Best XGBoost Model Accuracy: 0.7966294262450293
Best XGBoost Model Classification Report:
              precision    recall  f1-score   support

           0       0.78      0.82      0.80      2587
           1       0.81      0.78      0.80      2694

    accuracy                           0.80      5281
   macro avg       0.80      0.80      0.80      5281
weighted avg       0.80      0.80      0.80      5281



In [None]:
print("Best parameters for XGBoost:", xgb_grid_search.best_params_)

Best parameters for XGBoost: {'learning_rate': 0.01, 'max_depth': 6, 'n_estimators': 300, 'subsample': 0.7}
