### Random forest regressor

In [48]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.metrics import mean_squared_error
import plotly.express as px
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_regression
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from scipy.stats import randint

### Importing the scaled dataset

In [52]:
df = pd.read_csv("/Users/asheshlalshrestha/Desktop/UB/Research/CarePathways/Models/Datasets/preprocessed_dataset.csv")

In [53]:
df = df.drop(["Unnamed: 0","id"],axis=1)
df

Unnamed: 0,hospital_length_of_stay,age,sex,height,weight,smoking_history,previous_er_visit_within_14_days,admission_disposition,Hypertension,Chronic cardiac disease (not hypertension),...,Respiractin,Immunity Advance,Vesicare,Zaxine,Quinine Sulfate,Desvenlafaxine,Glucosamine,Turmeric,Cogentin Tab,Elavil
0,21,0.485152,1,0.003233,0.001623,0.0,0,1,1,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,5,-0.308119,0,-1.497964,-0.362476,0.0,0,1,1,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,7,-0.491182,0,0.003233,0.001623,0.0,0,1,1,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,9,1.705571,1,1.989114,-0.766671,0.0,0,1,1,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,9,1.522508,1,0.003233,0.001623,1.0,0,1,1,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,2,-0.613224,0,-0.235617,-0.417097,0.0,0,1,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
502,13,0.485152,0,0.003233,-1.378426,0.0,1,1,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
503,19,0.912299,1,-1.110511,-0.832216,0.0,1,1,1,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
504,9,0.485152,1,0.003233,0.079954,0.0,1,1,1,0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [54]:
y = df.hospital_length_of_stay
X = df.drop('hospital_length_of_stay',axis=1)

In [55]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [56]:
rf = RandomForestRegressor()
rf.fit(X_train, y_train)

In [57]:
y_pred = rf.predict(X_test)

In [58]:
mse_y_train = mean_squared_error(y_test, y_pred)
print("Mean Squared Error train:", mse_y_train)

Mean Squared Error train: 138.79976764705884


### Hperparameter tuning for train dataset (Grid Search)

In [59]:
param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [None, 5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Perform Grid Search Cross-Validation
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
grid_search.fit(X_train, y_train)

# Get the best parameters and best score from the Grid Search
best_params = grid_search.best_params_
best_score = -grid_search.best_score_

print(f"Best Parameters: {best_params}")
print(f"Best Mean Squared Error: {best_score}")

Best Parameters: {'max_depth': 5, 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 50}
Best Mean Squared Error: 131.16608622664302


### Hyperparameter tuning for test dataset (Grid search)

In [60]:
param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [None, 5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Perform Grid Search Cross-Validation
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
grid_search.fit(X_test, y_test)

# Get the best parameters and best score from the Grid Search
best_params = grid_search.best_params_
best_score = -grid_search.best_score_

print(f"Best Parameters: {best_params}")
print(f"Best Mean Squared Error: {best_score}")

Best Parameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 150}
Best Mean Squared Error: 134.02962227278545


### Hyperparameter tuning for train dataset (Randomized search)

In [46]:
param_dist = {
    'n_estimators': randint(50, 200),
    'max_depth': [None, 5, 10, 15, 20],
    'min_samples_split': randint(2, 11),
    'min_samples_leaf': randint(1, 5)
}

# Perform Randomized Search Cross-Validation
random_search = RandomizedSearchCV(estimator=rf, param_distributions=param_dist, n_iter=10, cv=5, scoring='neg_mean_squared_error', n_jobs=-1, random_state=42)
random_search.fit(X_train, y_train)

# Get the best parameters and best score from the Randomized Search
best_params = random_search.best_params_
best_score = -random_search.best_score_

print(f"Best Parameters: {best_params}")
print(f"Best Mean Squared Error: {best_score}")

Best Parameters: {'max_depth': 5, 'min_samples_leaf': 3, 'min_samples_split': 8, 'n_estimators': 124}
Best Mean Squared Error: 133.0891263880671


###  Hyperparameter tuning for test dataset (Randomized search)

In [61]:
param_dist = {
    'n_estimators': randint(50, 200),
    'max_depth': [None, 5, 10, 15, 20],
    'min_samples_split': randint(2, 11),
    'min_samples_leaf': randint(1, 5)
}

# Perform Randomized Search Cross-Validation
random_search = RandomizedSearchCV(estimator=rf, param_distributions=param_dist, n_iter=10, cv=5, scoring='neg_mean_squared_error', n_jobs=-1, random_state=42)
random_search.fit(X_test, y_test)

# Get the best parameters and best score from the Randomized Search
best_params = random_search.best_params_
best_score = -random_search.best_score_

print(f"Best Parameters: {best_params}")
print(f"Best Mean Squared Error: {best_score}")

Best Parameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 9, 'n_estimators': 70}
Best Mean Squared Error: 135.487168096041
