In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import joblib

In [2]:
path = "C:\\Users\\Republic Of Gamers\\OneDrive\\Documents\\GitHub\\TSDN-BoyWithLuv\\Source\\Data\\sdm_ts_urgent_daily.csv"
df = pd.read_csv(path)
df.head()

Unnamed: 0,Date of Admission,Urgent,"('Urgent', 'Female')","('Urgent', 'Male')",Urgent_Lag_1,Urgent_Lag_2,Urgent_Lag_3,month,day,year,quarter,dayofweek,dayofyear
0,2019-05-11,9,6,3,9,15,2,5,11,2019,2,5,131
1,2019-05-12,12,7,5,9,9,15,5,12,2019,2,6,132
2,2019-05-13,8,5,3,12,9,9,5,13,2019,2,0,133
3,2019-05-14,8,3,5,8,12,9,5,14,2019,2,1,134
4,2019-05-15,12,6,6,8,8,12,5,15,2019,2,2,135


In [3]:
df.set_index('Date of Admission', inplace=True)

In [4]:
trainUrgentSize = int(len(df) * 0.7)
trainUrgent, testUrgent= df[:trainUrgentSize], df[trainUrgentSize:]

In [5]:
X_train = trainUrgent.drop(columns=['Urgent'])
y_train = trainUrgent['Urgent']
X_test = testUrgent.drop(columns=['Urgent'])
y_test = testUrgent['Urgent']
X_train.columns = X_train.columns.astype(str)
X_test.columns = X_test.columns.astype(str)

In [6]:
# Define the parameter grid
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10]
}

# Initialize the model
rf = RandomForestRegressor()

# Set up GridSearchCV
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=3, scoring='neg_mean_squared_error', n_jobs=-1, verbose=2)

# Fit to the data
grid_search.fit(X_train, y_train)

# Best parameters
print("Best parameters found: ", grid_search.best_params_)

Fitting 3 folds for each of 36 candidates, totalling 108 fits
Best parameters found:  {'max_depth': 30, 'min_samples_split': 2, 'n_estimators': 300}


In [7]:
# Extract the best parameters from the grid search
best_params = grid_search.best_params_

# Initialize the RandomForestRegressor with the best parameters
best_rf = RandomForestRegressor(**best_params)

# Fit the model to the full training data
best_rf.fit(X_train, y_train)

# Make predictions on the test set or future data
y_pred = best_rf.predict(X_test)

# Display predictions
print(y_pred)

[ 8.         12.          7.99666667  7.98666667  6.         12.99
  6.67333333 10.          7.          9.99333333  8.         11.
 11.97333333 13.          1.87333333  7.98666667  7.          8.
 12.02333333  6.         16.46666667  8.         10.         16.42333333
 11.02333333 10.99666667 12.96        6.68666667  9.         15.06666667
 10.          9.99666667  9.         13.26666667 12.         12.
  7.          8.03666667  6.         13.         13.12666667 10.
 15.81333333  5.89333333 12.          7.         15.03666667 12.96333333
 15.97666667 10.          9.          7.         11.          6.99666667
 14.98       14.         13.         10.          1.69333333  9.86
 14.89        8.          6.         12.99666667  4.99        7.99333333
  9.          8.         14.00333333  7.         10.99666667  9.09
 11.99        7.96666667  5.01        8.         10.          8.
 10.          9.         15.04333333  9.          5.          3.50333333
 15.64666667 11.         14.53333333

In [8]:
# Calculate RMSE
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

# Calculate MAE
mae = mean_absolute_error(y_test, y_pred)

# Calculate R-squared
r2 = r2_score(y_test, y_pred)

# Display the metrics
print("Root Mean Squared Error (RMSE):", rmse)
print("Mean Absolute Error (MAE):", mae)
print("R-squared (R²):", r2)

Root Mean Squared Error (RMSE): 0.2912379887633921
Mean Absolute Error (MAE): 0.09465328467153286
R-squared (R²): 0.9928658560044674


In [9]:
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10]
}
xgb_model = XGBRegressor()

# Set up GridSearchCV
grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid, cv=3, scoring='neg_mean_squared_error', n_jobs=-1, verbose=2)

# Fit to the data
grid_search.fit(X_train, y_train)

# Best parameters
print("Best parameters found: ", grid_search.best_params_)

Fitting 3 folds for each of 36 candidates, totalling 108 fits
Best parameters found:  {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 100}


Parameters: { "min_samples_split" } are not used.



In [10]:
# Extract the best parameters from the grid search
best_params = grid_search.best_params_

# Initialize the XGBRegressor with the best parameters
best_xgb = XGBRegressor(**best_params)

# Fit the model to the full training data
best_xgb.fit(X_train, y_train)

# Make predictions on the test set or future data
y_pred = best_xgb.predict(X_test)

# Display predictions
print("Predictions:", y_pred)

Parameters: { "min_samples_split" } are not used.



Predictions: [ 8.001598   12.0230055   8.002477    8.02176     5.9789233  12.980762
  6.9679356   9.987465    6.966935    9.952893    8.000276   10.998242
 11.990795   12.945383    1.8673964   8.01638     6.980525    7.9486
 11.970782    5.987874   17.472473    7.9540873   9.974253   16.227406
 11.026211   10.946767   12.956558    6.985918    9.011996   14.964556
  9.9769945   9.966504    9.001406   13.34433    11.976382   11.968582
  6.9908895   7.9458175   6.027978   12.992252   12.666922    9.9808
 15.9016      6.0008206  11.975997    6.991905   14.89358    12.95516
 15.901685    9.969878    9.004966    6.9931636  11.010959    6.9914804
 15.007497   14.090322   13.005444   10.02631     2.085723   10.018078
 14.970526    8.027578    6.038591   13.036308    4.981764    8.068464
  8.995652    8.009533   14.0246315   6.9765124  10.959566    9.014357
 12.000068    8.015311    5.008975    8.00067     9.983415    8.003458
  9.984698    9.00282    14.911491    9.010171    5.0002494   2.9774

In [11]:
# Calculate RMSE
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

# Calculate MAE
mae = mean_absolute_error(y_test, y_pred)

# Calculate R-squared
r2 = r2_score(y_test, y_pred)

# Display the metrics
print("Root Mean Squared Error (RMSE):", rmse)
print("Mean Absolute Error (MAE):", mae)
print("R-squared (R²):", r2)

Root Mean Squared Error (RMSE): 0.27854613112538207
Mean Absolute Error (MAE): 0.07478642561574922
R-squared (R²): 0.9934741258621216


In [12]:
joblib.dump(best_xgb, 'xgb_urgent.pkl')

['xgb_urgent.pkl']