In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [8]:
path = "C:\\Users\\Republic Of Gamers\\OneDrive\\Documents\\GitHub\\TSDN-BoyWithLuv\\Source\\Data\\sdm_preprocessed.csv"
df = pd.read_csv(path)
df.head()

Unnamed: 0,Name,Age,Gender,Blood Type,Medical Condition,Date of Admission,Doctor,Hospital,Insurance Provider,Billing Amount,Room Number,Admission Type,Discharge Date,Medication,Test Results,day_stayed
0,Bobby Jackson,30,Male,B-,Cancer,2024-01-31,Matthew Smith,Sons and Miller,Blue Cross,18856.281306,328,Urgent,2024-02-02,Paracetamol,Normal,2
1,Leslie Terry,62,Male,A+,Obesity,2019-08-20,Samantha Davies,Kim Inc,Medicare,33643.327287,265,Emergency,2019-08-26,Ibuprofen,Inconclusive,6
2,Danny Smith,76,Female,A-,Obesity,2022-09-22,Tiffany Mitchell,Cook PLC,Aetna,27955.096079,205,Emergency,2022-10-07,Aspirin,Normal,15
3,Andrew Watts,28,Female,O+,Diabetes,2020-11-18,Kevin Wells,"Hernandez Rogers and Vang,",Medicare,37909.78241,450,Elective,2020-12-18,Ibuprofen,Abnormal,30
4,Adrienne Bell,43,Female,AB+,Cancer,2022-09-19,Kathleen Hanna,White-White,Aetna,14238.317814,458,Urgent,2022-10-09,Penicillin,Abnormal,20


In [9]:
def tsFeature(df):
    df = df.copy()
    df['month'] = df['Date of Admission'].dt.month
    df['day'] = df['Date of Admission'].dt.day
    df['year'] = df['Date of Admission'].dt.year
    df['quarter'] = df['Date of Admission'].dt.quarter
    df['dayofweek'] = df['Date of Admission'].dt.dayofweek
    df['dayofyear'] = df['Date of Admission'].dt.dayofyear
    return df

df['Date of Admission'] = pd.to_datetime(df['Date of Admission'])
ts_df = tsFeature(df)

In [10]:
ts_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55392 entries, 0 to 55391
Data columns (total 22 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   Name                55392 non-null  object        
 1   Age                 55392 non-null  int64         
 2   Gender              55392 non-null  object        
 3   Blood Type          55392 non-null  object        
 4   Medical Condition   55392 non-null  object        
 5   Date of Admission   55392 non-null  datetime64[ns]
 6   Doctor              55392 non-null  object        
 7   Hospital            55392 non-null  object        
 8   Insurance Provider  55392 non-null  object        
 9   Billing Amount      55392 non-null  float64       
 10  Room Number         55392 non-null  int64         
 11  Admission Type      55392 non-null  object        
 12  Discharge Date      55392 non-null  object        
 13  Medication          55392 non-null  object    

In [11]:
ts_df['Date of Admission'] = pd.to_datetime(ts_df['Date of Admission'])
ts_df.set_index('Date of Admission', inplace=True)
daily_df = ts_df.groupby('Admission Type').resample('D').size().unstack(fill_value=0)
daily_df = daily_df.T
daily_df.index.name = 'Date of Admission'

In [12]:
daily_df.head()

Admission Type,Elective,Emergency,Urgent
Date of Admission,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2019-05-08,11,14,2
2019-05-09,14,9,15
2019-05-10,7,1,9
2019-05-11,11,5,9
2019-05-12,14,10,12


In [13]:
daily_df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 1827 entries, 2019-05-08 to 2024-05-07
Freq: D
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype
---  ------     --------------  -----
 0   Elective   1827 non-null   int64
 1   Emergency  1827 non-null   int64
 2   Urgent     1827 non-null   int64
dtypes: int64(3)
memory usage: 121.6 KB


In [14]:
'''# Create a new DataFrame to hold blood type counts against admission types
blood_type_counts = ts_df.groupby(['Admission Type', 'RhFactor']).resample('D').size().unstack(fill_value=0)

# Transpose the new DataFrame
blood_type_counts = blood_type_counts.T

# Rename the index to 'Date of Admission'
blood_type_counts.index.name = 'Date of Admission'

# Concatenate the blood type counts to the existing monthly_df
# This assumes monthly_df is already defined
daily_df = pd.concat([daily_df, blood_type_counts], axis=1)

# Display the updated DataFrame
daily_df.head()'''

"# Create a new DataFrame to hold blood type counts against admission types\nblood_type_counts = ts_df.groupby(['Admission Type', 'RhFactor']).resample('D').size().unstack(fill_value=0)\n\n# Transpose the new DataFrame\nblood_type_counts = blood_type_counts.T\n\n# Rename the index to 'Date of Admission'\nblood_type_counts.index.name = 'Date of Admission'\n\n# Concatenate the blood type counts to the existing monthly_df\n# This assumes monthly_df is already defined\ndaily_df = pd.concat([daily_df, blood_type_counts], axis=1)\n\n# Display the updated DataFrame\ndaily_df.head()"

In [15]:
# Create a new DataFrame to hold blood type counts against admission types
gender_counts = ts_df.groupby(['Admission Type', 'Gender']).resample('D').size().unstack(fill_value=0)

# Transpose the new DataFrame
gender_counts = gender_counts.T

# Rename the index to 'Date of Admission'
gender_counts.index.name = 'Date of Admission'

# Concatenate the blood type counts to the existing monthly_df
# This assumes monthly_df is already defined
daily_df = pd.concat([daily_df, gender_counts], axis=1)

# Display the updated DataFrame
daily_df.head()

Unnamed: 0_level_0,Elective,Emergency,Urgent,"(Elective, Female)","(Elective, Male)","(Emergency, Female)","(Emergency, Male)","(Urgent, Female)","(Urgent, Male)"
Date of Admission,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2019-05-08,11,14,2,6,5,6,8,0,2
2019-05-09,14,9,15,6,8,3,6,5,10
2019-05-10,7,1,9,4,3,1,0,4,5
2019-05-11,11,5,9,5,6,3,2,6,3
2019-05-12,14,10,12,8,6,7,3,7,5


In [16]:
target_columns = ['Elective', 'Urgent', 'Emergency']

# Loop through only the specified columns
for admission_type in target_columns:
    if admission_type in daily_df.columns:
        for lag in range(1, 4):
            daily_df[f'{admission_type}_Lag_{lag}'] = daily_df[admission_type].shift(lag)

In [17]:
daily_df.head()

Unnamed: 0_level_0,Elective,Emergency,Urgent,"(Elective, Female)","(Elective, Male)","(Emergency, Female)","(Emergency, Male)","(Urgent, Female)","(Urgent, Male)",Elective_Lag_1,Elective_Lag_2,Elective_Lag_3,Urgent_Lag_1,Urgent_Lag_2,Urgent_Lag_3,Emergency_Lag_1,Emergency_Lag_2,Emergency_Lag_3
Date of Admission,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2019-05-08,11,14,2,6,5,6,8,0,2,,,,,,,,,
2019-05-09,14,9,15,6,8,3,6,5,10,11.0,,,2.0,,,14.0,,
2019-05-10,7,1,9,4,3,1,0,4,5,14.0,11.0,,15.0,2.0,,9.0,14.0,
2019-05-11,11,5,9,5,6,3,2,6,3,7.0,14.0,11.0,9.0,15.0,2.0,1.0,9.0,14.0
2019-05-12,14,10,12,8,6,7,3,7,5,11.0,7.0,14.0,9.0,9.0,15.0,5.0,1.0,9.0


In [18]:
daily_df_cleaned = daily_df.dropna()
daily_df_cleaned.head()

Unnamed: 0_level_0,Elective,Emergency,Urgent,"(Elective, Female)","(Elective, Male)","(Emergency, Female)","(Emergency, Male)","(Urgent, Female)","(Urgent, Male)",Elective_Lag_1,Elective_Lag_2,Elective_Lag_3,Urgent_Lag_1,Urgent_Lag_2,Urgent_Lag_3,Emergency_Lag_1,Emergency_Lag_2,Emergency_Lag_3
Date of Admission,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2019-05-11,11,5,9,5,6,3,2,6,3,7.0,14.0,11.0,9.0,15.0,2.0,1.0,9.0,14.0
2019-05-12,14,10,12,8,6,7,3,7,5,11.0,7.0,14.0,9.0,9.0,15.0,5.0,1.0,9.0
2019-05-13,6,9,8,5,1,5,4,5,3,14.0,11.0,7.0,12.0,9.0,9.0,10.0,5.0,1.0
2019-05-14,12,13,8,6,6,6,7,3,5,6.0,14.0,11.0,8.0,12.0,9.0,9.0,10.0,5.0
2019-05-15,12,15,12,4,8,8,7,6,6,12.0,6.0,14.0,8.0,8.0,12.0,13.0,9.0,10.0


In [19]:
daily_df_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 1824 entries, 2019-05-11 to 2024-05-07
Freq: D
Data columns (total 18 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Elective             1824 non-null   int64  
 1   Emergency            1824 non-null   int64  
 2   Urgent               1824 non-null   int64  
 3   (Elective, Female)   1824 non-null   int64  
 4   (Elective, Male)     1824 non-null   int64  
 5   (Emergency, Female)  1824 non-null   int64  
 6   (Emergency, Male)    1824 non-null   int64  
 7   (Urgent, Female)     1824 non-null   int64  
 8   (Urgent, Male)       1824 non-null   int64  
 9   Elective_Lag_1       1824 non-null   float64
 10  Elective_Lag_2       1824 non-null   float64
 11  Elective_Lag_3       1824 non-null   float64
 12  Urgent_Lag_1         1824 non-null   float64
 13  Urgent_Lag_2         1824 non-null   float64
 14  Urgent_Lag_3         1824 non-null   float64
 15  Emergency_La

In [20]:
daily_df_cleaned = daily_df_cleaned.astype(int)

In [21]:
elective_columns = [col for col in daily_df_cleaned.columns if 'Elective' in col]
urgent_columns = [col for col in daily_df_cleaned.columns if 'Urgent' in col]
emergency_columns = [col for col in daily_df_cleaned.columns if 'Emergency' in col]

# Create a new DataFrame with the column names only
elective_df = daily_df_cleaned[elective_columns].copy()
urgent_df = daily_df_cleaned[urgent_columns].copy()
emergency_df = daily_df_cleaned[emergency_columns].copy()

In [22]:
emergency_df.head()

Unnamed: 0_level_0,Emergency,"(Emergency, Female)","(Emergency, Male)",Emergency_Lag_1,Emergency_Lag_2,Emergency_Lag_3
Date of Admission,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2019-05-11,5,3,2,1,9,14
2019-05-12,10,7,3,5,1,9
2019-05-13,9,5,4,10,5,1
2019-05-14,13,6,7,9,10,5
2019-05-15,15,8,7,13,9,10


In [23]:
emergency_df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 1824 entries, 2019-05-11 to 2024-05-07
Freq: D
Data columns (total 6 columns):
 #   Column               Non-Null Count  Dtype
---  ------               --------------  -----
 0   Emergency            1824 non-null   int32
 1   (Emergency, Female)  1824 non-null   int32
 2   (Emergency, Male)    1824 non-null   int32
 3   Emergency_Lag_1      1824 non-null   int32
 4   Emergency_Lag_2      1824 non-null   int32
 5   Emergency_Lag_3      1824 non-null   int32
dtypes: int32(6)
memory usage: 57.0 KB


In [24]:
elective_df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 1824 entries, 2019-05-11 to 2024-05-07
Freq: D
Data columns (total 6 columns):
 #   Column              Non-Null Count  Dtype
---  ------              --------------  -----
 0   Elective            1824 non-null   int32
 1   (Elective, Female)  1824 non-null   int32
 2   (Elective, Male)    1824 non-null   int32
 3   Elective_Lag_1      1824 non-null   int32
 4   Elective_Lag_2      1824 non-null   int32
 5   Elective_Lag_3      1824 non-null   int32
dtypes: int32(6)
memory usage: 57.0 KB


In [25]:
urgent_df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 1824 entries, 2019-05-11 to 2024-05-07
Freq: D
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype
---  ------            --------------  -----
 0   Urgent            1824 non-null   int32
 1   (Urgent, Female)  1824 non-null   int32
 2   (Urgent, Male)    1824 non-null   int32
 3   Urgent_Lag_1      1824 non-null   int32
 4   Urgent_Lag_2      1824 non-null   int32
 5   Urgent_Lag_3      1824 non-null   int32
dtypes: int32(6)
memory usage: 57.0 KB


In [26]:
def tsFeatureType(df):
    df = df.copy()
    df['month'] = df.index.month
    df['day'] = df.index.day
    df['year'] = df.index.year
    df['quarter'] = df.index.quarter
    df['dayofweek'] = df.index.dayofweek
    df['dayofyear'] = df.index.dayofyear
    return df

emergency_df = tsFeatureType(emergency_df)
elective_df = tsFeatureType(elective_df)
urgent_df = tsFeatureType(urgent_df)

In [27]:
emergency_df.to_csv('sdm_ts_emergency_daily.csv', index = True)
urgent_df.to_csv('sdm_ts_urgent_daily.csv', index = True)
elective_df.to_csv('sdm_ts_elective_daily.csv', index = True)

In [None]:
trainEmergencySize = int(len(emergency_df) * 0.7)
trainEmergency, testEmergency= emergency_df[:trainEmergencySize], emergency_df[trainEmergencySize:]

In [None]:
X_train = trainEmergency.drop(columns=['Emergency'])
y_train = trainEmergency['Emergency']
X_test = testEmergency.drop(columns=['Emergency'])
y_test = testEmergency['Emergency']
X_train.columns = X_train.columns.astype(str)
X_test.columns = X_test.columns.astype(str)

In [None]:
# Define the parameter grid
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10]
}

# Initialize the model
rf = RandomForestRegressor()

# Set up GridSearchCV
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=3, scoring='neg_mean_squared_error', n_jobs=-1, verbose=2)

# Fit to the data
grid_search.fit(X_train, y_train)

# Best parameters
print("Best parameters found: ", grid_search.best_params_)

Fitting 3 folds for each of 36 candidates, totalling 108 fits
Best parameters found:  {'max_depth': 10, 'min_samples_split': 2, 'n_estimators': 100}


In [None]:
# Extract the best parameters from the grid search
best_params = grid_search.best_params_

# Initialize the RandomForestRegressor with the best parameters
best_rf = RandomForestRegressor(**best_params)

# Fit the model to the full training data
best_rf.fit(X_train, y_train)

# Make predictions on the test set or future data
y_pred = best_rf.predict(X_test)

# Display predictions
print(y_pred)

[ 9.          8.         12.99        3.          7.         14.
 16.01        8.          6.         17.01       12.99        3.
 10.          9.          9.         12.         13.          6.02
  4.         13.          8.         11.          7.         12.
  6.         15.99363636  9.          6.         11.         11.
 10.          9.          6.         11.         12.          8.
 14.          9.          9.         10.98        9.          8.
  6.          3.         16.          8.          3.          9.
 17.08833333 10.         10.          3.04        7.         10.
 18.91        7.          8.         10.         16.08       10.
 11.         12.          6.          3.08       11.         15.99
  6.         11.         12.99       11.         10.03        9.
  5.         14.         11.         11.         10.          6.
  6.         10.         11.          7.         13.         10.
 10.         16.97       12.         11.         13.         14.01
  8.         13.   

In [None]:
# Calculate RMSE
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

# Calculate MAE
mae = mean_absolute_error(y_test, y_pred)

# Calculate R-squared
r2 = r2_score(y_test, y_pred)

# Display the metrics
print("Root Mean Squared Error (RMSE):", rmse)
print("Mean Absolute Error (MAE):", mae)
print("R-squared (R²):", r2)

Root Mean Squared Error (RMSE): 0.1725845737211181
Mean Absolute Error (MAE): 0.024384969917756783
R-squared (R²): 0.9973915155972495


In [None]:
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10]
}
xgb_model = XGBRegressor()

# Set up GridSearchCV
grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid, cv=3, scoring='neg_mean_squared_error', n_jobs=-1, verbose=2)

# Fit to the data
grid_search.fit(X_train, y_train)

# Best parameters
print("Best parameters found: ", grid_search.best_params_)

Fitting 3 folds for each of 36 candidates, totalling 108 fits
Best parameters found:  {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 200}


Parameters: { "min_samples_split" } are not used.



In [None]:
# Extract the best parameters from the grid search
best_params = grid_search.best_params_

# Initialize the XGBRegressor with the best parameters
best_xgb = XGBRegressor(**best_params)

# Fit the model to the full training data
best_xgb.fit(X_train, y_train)

# Make predictions on the test set or future data
y_pred = best_xgb.predict(X_test)

# Display predictions
print("Predictions:", y_pred)

Predictions: [ 9.000086    8.001299   13.010904    3.000685    6.997571   13.997227
 15.996727    8.0019      6.0006824  17.000725   13.010904    3.0004241
 10.000198    9.000248    8.998876   11.998702   13.002624    6.000657
  4.000276   13.002624    8.001299   10.996132    6.9982285  11.999142
  6.000871   15.998155    9.001222    6.0027156  11.001888   11.000063
 10.00295     9.000413    6.000546   11.004042   12.001374    8.000673
 13.996867    9.000248    9.000248   10.999775    9.001222    8.00088
  6.000871    3.000685   15.997424    8.00088     3.000685    8.998876
 16.992899    9.9971075   9.999021    3.0005875   6.9982285  10.00295
 18.997532    6.9982285   8.0019      9.997975   16.88007     9.997975
 11.001888   12.001374    6.000871    3.000012   11.001053   16.00027
  6.000871   11.004042   13.010904   11.001053    9.99811     8.998876
  5.000036   13.997227   11.004042   11.001053   10.000198    6.000871
  6.000546   10.00295    10.996132    6.998283   13.00085    10.00

Parameters: { "min_samples_split" } are not used.



In [None]:
# Calculate RMSE
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

# Calculate MAE
mae = mean_absolute_error(y_test, y_pred)

# Calculate R-squared
r2 = r2_score(y_test, y_pred)

# Display the metrics
print("Root Mean Squared Error (RMSE):", rmse)
print("Mean Absolute Error (MAE):", mae)
print("R-squared (R²):", r2)

Root Mean Squared Error (RMSE): 0.07028031399745832
Mean Absolute Error (MAE): 0.009126368007590428
R-squared (R²): 0.999567449092865
