In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import joblib

In [6]:
path = "C:\\Users\\Republic Of Gamers\\OneDrive\\Documents\\GitHub\\TSDN-BoyWithLuv\\Source\\Data\\sdm_ts_elective_daily.csv"
df = pd.read_csv(path)
df.head()

Unnamed: 0,Date of Admission,Elective,"('Elective', 'Female')","('Elective', 'Male')",Elective_Lag_1,Elective_Lag_2,Elective_Lag_3,month,day,year,quarter,dayofweek,dayofyear
0,2019-05-11,11,5,6,7,14,11,5,11,2019,2,5,131
1,2019-05-12,14,8,6,11,7,14,5,12,2019,2,6,132
2,2019-05-13,6,5,1,14,11,7,5,13,2019,2,0,133
3,2019-05-14,12,6,6,6,14,11,5,14,2019,2,1,134
4,2019-05-15,12,4,8,12,6,14,5,15,2019,2,2,135


In [7]:
df.set_index('Date of Admission', inplace=True)

In [17]:
df.head()

Unnamed: 0_level_0,Elective,"('Elective', 'Female')","('Elective', 'Male')",Elective_Lag_1,Elective_Lag_2,Elective_Lag_3,month,day,year,quarter,dayofweek,dayofyear
Date of Admission,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2019-05-11,11,5,6,7,14,11,5,11,2019,2,5,131
2019-05-12,14,8,6,11,7,14,5,12,2019,2,6,132
2019-05-13,6,5,1,14,11,7,5,13,2019,2,0,133
2019-05-14,12,6,6,6,14,11,5,14,2019,2,1,134
2019-05-15,12,4,8,12,6,14,5,15,2019,2,2,135


In [18]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1824 entries, 2019-05-11 to 2024-05-07
Data columns (total 12 columns):
 #   Column                  Non-Null Count  Dtype
---  ------                  --------------  -----
 0   Elective                1824 non-null   int64
 1   ('Elective', 'Female')  1824 non-null   int64
 2   ('Elective', 'Male')    1824 non-null   int64
 3   Elective_Lag_1          1824 non-null   int64
 4   Elective_Lag_2          1824 non-null   int64
 5   Elective_Lag_3          1824 non-null   int64
 6   month                   1824 non-null   int64
 7   day                     1824 non-null   int64
 8   year                    1824 non-null   int64
 9   quarter                 1824 non-null   int64
 10  dayofweek               1824 non-null   int64
 11  dayofyear               1824 non-null   int64
dtypes: int64(12)
memory usage: 185.2+ KB


In [8]:
trainElectiveSize = int(len(df) * 0.7)
trainElective, testElective= df[:trainElectiveSize], df[trainElectiveSize:]

In [9]:
X_train = trainElective.drop(columns=['Elective'])
y_train = trainElective['Elective']
X_test = testElective.drop(columns=['Elective'])
y_test = testElective['Elective']
X_train.columns = X_train.columns.astype(str)
X_test.columns = X_test.columns.astype(str)

In [10]:
# Define the parameter grid
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10]
}

# Initialize the model
rf = RandomForestRegressor()

# Set up GridSearchCV
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=3, scoring='neg_mean_squared_error', n_jobs=-1, verbose=2)

# Fit to the data
grid_search.fit(X_train, y_train)

# Best parameters
print("Best parameters found: ", grid_search.best_params_)

Fitting 3 folds for each of 36 candidates, totalling 108 fits
Best parameters found:  {'max_depth': 20, 'min_samples_split': 2, 'n_estimators': 100}


In [11]:
# Extract the best parameters from the grid search
best_params = grid_search.best_params_

# Initialize the RandomForestRegressor with the best parameters
best_rf = RandomForestRegressor(**best_params)

# Fit the model to the full training data
best_rf.fit(X_train, y_train)

# Make predictions on the test set or future data
y_pred = best_rf.predict(X_test)

# Display predictions
print(y_pred)

[10.    8.   13.91  7.    9.   10.   11.   10.   18.71 13.06 13.    7.
 13.06 12.   19.43  4.07  9.   13.01 14.07  7.88 11.    7.    5.89  9.
 11.   10.    4.6  11.96 17.15  8.99  7.    7.96  8.95 11.31  8.    7.14
 18.12  8.   10.   13.09  9.    7.   15.   14.96  9.49 11.17 12.    6.
 10.   13.09 13.08 12.    7.    9.   11.77 10.    7.    9.    5.03  7.
 15.05 13.01  7.    7.   17.99  4.25 13.02  5.97 12.49  9.    6.   17.35
 16.12 11.   12.   14.01  9.    7.   15.41  5.63  9.99 10.   11.   11.
 10.   12.    8.   11.   13.   11.   10.   11.    9.89  6.   11.    6.96
 11.   11.   17.03 10.   16.27  5.94 11.   11.    5.03  4.22  5.98 14.01
 11.37 11.    8.   11.    3.83  8.01 12.    9.   12.    8.   12.66  6.
  9.    2.82 14.4  11.36 11.   11.    5.64 15.05  5.   10.    3.98 12.
  5.01  9.01 14.99  9.   11.   10.    6.   16.07 11.   15.03  7.   11.
 12.    7.   13.02  9.   10.    6.   16.27 18.32  8.    6.    8.    9.
  4.37  8.    9.   13.    9.   12.01 15.47  4.85  9.   11.   12.    7

In [12]:
# Calculate RMSE
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

# Calculate MAE
mae = mean_absolute_error(y_test, y_pred)

# Calculate R-squared
r2 = r2_score(y_test, y_pred)

# Display the metrics
print("Root Mean Squared Error (RMSE):", rmse)
print("Mean Absolute Error (MAE):", mae)
print("R-squared (R²):", r2)

Root Mean Squared Error (RMSE): 0.2561182009282342
Mean Absolute Error (MAE): 0.08874087591240876
R-squared (R²): 0.9947032295431079


In [13]:
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10]
}
xgb_model = XGBRegressor()

# Set up GridSearchCV
grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid, cv=3, scoring='neg_mean_squared_error', n_jobs=-1, verbose=2)

# Fit to the data
grid_search.fit(X_train, y_train)

# Best parameters
print("Best parameters found: ", grid_search.best_params_)

Fitting 3 folds for each of 36 candidates, totalling 108 fits
Best parameters found:  {'max_depth': 10, 'min_samples_split': 2, 'n_estimators': 100}


Parameters: { "min_samples_split" } are not used.



In [14]:
# Extract the best parameters from the grid search
best_params = grid_search.best_params_

# Initialize the XGBRegressor with the best parameters
best_xgb = XGBRegressor(**best_params)

# Fit the model to the full training data
best_xgb.fit(X_train, y_train)

# Make predictions on the test set or future data
y_pred = best_xgb.predict(X_test)

# Display predictions
print("Predictions:", y_pred)

Predictions: [ 9.999271   7.9980965 14.460104   6.999839   9.000277   9.999367
 11.00002    9.999723  18.42316   13.004138  12.99968    6.9976716
 13.0020275 12.001421  20.137669   4.037081   9.000373  12.817682
 13.993757   7.995817  10.999655   6.999813   5.996152   9.000625
 11.001556   9.999866   5.0088096 11.989297  18.20721    9.000244
  6.99878    7.998104   9.000995  11.207013   7.999685   7.014558
 17.94303    7.999822   9.999372  12.999696   9.00026    6.9993577
 14.998166  15.50209    8.954444  11.375457  11.999762   5.998878
  9.997047  13.020005  13.019117  12.001089   6.998993   9.002423
 11.766731   9.99901    7.231327   9.0006     5.0064273  6.9990206
 15.0247345 12.99568    7.0003805  7.000056  18.044708   4.0289907
 12.999085   5.9998236 11.7122965  8.999993   5.998799  16.985361
 15.997911  10.998269  11.997846  14.002548   9.000859   6.9991326
 16.210926   6.0074124  9.999362  10.000562  11.000206  10.999304
  9.9992485 12.000072   7.9989047 11.000153  12.999945  11

Parameters: { "min_samples_split" } are not used.



In [15]:
# Calculate RMSE
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

# Calculate MAE
mae = mean_absolute_error(y_test, y_pred)

# Calculate R-squared
r2 = r2_score(y_test, y_pred)

# Display the metrics
print("Root Mean Squared Error (RMSE):", rmse)
print("Mean Absolute Error (MAE):", mae)
print("R-squared (R²):", r2)

Root Mean Squared Error (RMSE): 0.15559111323826694
Mean Absolute Error (MAE): 0.0440390518982045
R-squared (R²): 0.9980452060699463


In [16]:
joblib.dump(best_xgb, 'xgb_elective.pkl')

['xgb_elective.pkl']