In [1]:
# Initial imports.
import pandas as pd
from pathlib import Path
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [2]:
# Load the combined dataset. pd.read_csv already returns a DataFrame, so the
# result is used directly rather than being re-wrapped in pd.DataFrame().
file_path = "./bethany_data_groupby_month_range/all_data.csv"
rforest_df = pd.read_csv(file_path)
rforest_df

Unnamed: 0,hospital_code,pickup_date,apot,impression,arrival_hour,pickup_date_num,apot_num
0,508,Wednesday,benchmark,T14.90,1,4,0
1,508,Sunday,benchmark,G89.1,13,1,0
2,197,Friday,benchmark,T14.90,14,6,0
3,475,Thursday,benchmark,R10.84,2,5,0
4,5,Monday,benchmark,I95.9,17,2,0
...,...,...,...,...,...,...,...
543824,280,Tuesday,extreme,J98.01,17,3,1
543825,288,Thursday,extreme,R10.84,4,5,1
543826,5,Monday,extreme,G89.1,19,2,1
543827,288,Friday,extreme,R10.84,12,6,1


In [3]:
# Lookup table: hospital code -> consecutive integer label (1 through 11),
# numbered in ascending order of the hospital code.
hospital_codes_num = {
    code: label
    for label, code in enumerate(
        (5, 196, 197, 280, 282, 286, 288, 475, 481, 508, 549),
        start=1,
    )
}

In [4]:
# Encode hospital codes with the lookup table.
# Series.map performs the dictionary lookup for the whole column at once
# instead of calling a Python lambda for every row. Because map() would quietly
# yield NaN for a code that is missing from the table, unknown codes are
# rejected first with the same KeyError the per-row lookup used to raise.
if not rforest_df["hospital_code"].isin(list(hospital_codes_num)).all():
    raise KeyError("hospital_code contains values missing from hospital_codes_num")
rforest_df["hospital_num"] = rforest_df["hospital_code"].map(hospital_codes_num)
rforest_df

Unnamed: 0,hospital_code,pickup_date,apot,impression,arrival_hour,pickup_date_num,apot_num,hospital_num
0,508,Wednesday,benchmark,T14.90,1,4,0,10
1,508,Sunday,benchmark,G89.1,13,1,0,10
2,197,Friday,benchmark,T14.90,14,6,0,3
3,475,Thursday,benchmark,R10.84,2,5,0,8
4,5,Monday,benchmark,I95.9,17,2,0,1
...,...,...,...,...,...,...,...,...
543824,280,Tuesday,extreme,J98.01,17,3,1,4
543825,288,Thursday,extreme,R10.84,4,5,1,7
543826,5,Monday,extreme,G89.1,19,2,1,1
543827,288,Friday,extreme,R10.84,12,6,1,7


In [5]:
# Remove the raw apot, pickup_date and hospital_code columns; the frame keeps
# their *_num counterparts.
rforest_df = rforest_df.drop(columns=["apot", "pickup_date", "hospital_code"])
rforest_df

Unnamed: 0,impression,arrival_hour,pickup_date_num,apot_num,hospital_num
0,T14.90,1,4,0,10
1,G89.1,13,1,0,10
2,T14.90,14,6,0,3
3,R10.84,2,5,0,8
4,I95.9,17,2,0,1
...,...,...,...,...,...
543824,J98.01,17,3,1,4
543825,R10.84,4,5,1,7
543826,G89.1,19,2,1,1
543827,R10.84,12,6,1,7


In [6]:
# Impression dictionary: impression code -> integer label (0 through 63).
# The labels are real integers (not digit strings) so the encoded column is
# numeric as soon as it is created, rather than an object column of strings
# that only works if a later step happens to coerce it to numbers.
impressions_num = {
    "A41.9": 0, "E13.64": 1, "E13.65": 2, "F10.92": 3, "F19": 4, "F99": 5,
    "G24.0": 6, "G40.901": 7, "G40.909": 8, "G89.1": 9, "H57.9": 10,
    "H93.90": 11, "I10": 12, "I20.9": 13, "I21.3": 14, "I46.8": 15,
    "I46.9": 16, "I49.9": 17, "I63.9": 18, "I95.9": 19, "J00": 20,
    "J68.9": 21, "J70.5": 22, "J80": 23, "J81.0": 24, "J96.9": 25,
    "J98.01": 26, "K59.1": 27, "K92.0": 28, "K92.1": 29, "M79.60": 30,
    "N39.9": 31, "N93.9": 32, "O60.0": 33, "O80": 34, "O99": 35,
    "R00.2": 36, "R04.0": 37, "R07.89": 38, "R10.84": 39, "R11.2": 40,
    "R41.0": 41, "R41.82": 42, "R42": 43, "R50.9": 44, "R51": 45,
    "R53.1": 46, "R55": 47, "R68.13": 48, "R99": 49, "T14.90": 50,
    "T30.0": 51, "T58": 52, "T63": 53, "T67.0": 54, "T68": 55,
    "T71.9": 56, "T75.1XXA": 57, "T75.4": 58, "T78.2": 59, "T78.40": 60,
    "Z00.00": 61, "Z38.2": 62, "Z77.9": 63,
}

In [7]:
# Impression names encoded using the dictionary values.
# Series.map looks the whole column up in one pass instead of calling a Python
# lambda for every row. map() would quietly yield NaN for an impression that is
# missing from the table, so unknown values are rejected first with the same
# KeyError the per-row lookup used to raise.
if not rforest_df["impression"].isin(list(impressions_num)).all():
    raise KeyError("impression contains values missing from impressions_num")
# The int64 cast guarantees a numeric feature column whether the lookup table
# stores its labels as integers or as digit strings.
rforest_df["impressions_num"] = (
    rforest_df["impression"].map(impressions_num).astype("int64")
)
rforest_df

Unnamed: 0,impression,arrival_hour,pickup_date_num,apot_num,hospital_num,impressions_num
0,T14.90,1,4,0,10,50
1,G89.1,13,1,0,10,9
2,T14.90,14,6,0,3,50
3,R10.84,2,5,0,8,39
4,I95.9,17,2,0,1,19
...,...,...,...,...,...,...
543824,J98.01,17,3,1,4,26
543825,R10.84,4,5,1,7,39
543826,G89.1,19,2,1,1,9
543827,R10.84,12,6,1,7,39


In [8]:
# The raw impression text is dropped; impressions_num carries its encoding.
rforest_df = rforest_df.drop(columns="impression")
rforest_df

Unnamed: 0,arrival_hour,pickup_date_num,apot_num,hospital_num,impressions_num
0,1,4,0,10,50
1,13,1,0,10,9
2,14,6,0,3,50
3,2,5,0,8,39
4,17,2,0,1,19
...,...,...,...,...,...
543824,17,3,1,4,26
543825,4,5,1,7,39
543826,19,2,1,1,9
543827,12,6,1,7,39


In [9]:
# Feature matrix: every column except the target (apot_num).
# drop() returns a new frame, so rforest_df itself is left unchanged.
X = rforest_df.drop(columns="apot_num")
X.head()

Unnamed: 0,arrival_hour,pickup_date_num,hospital_num,impressions_num
0,1,4,10,50
1,13,1,10,9
2,14,6,3,50
3,2,5,8,39
4,17,2,1,19


In [12]:
# Define the target set as a 1-D NumPy array.
# Series.ravel() is deprecated (pandas 2.2); to_numpy() is the supported way
# to obtain the underlying array.
y = rforest_df["apot_num"].to_numpy()
y[:5]

array([0, 0, 0, 0, 0])

In [13]:
# Partition features and target into train and test sets; the fixed
# random_state makes the shuffle repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=78,
)

In [14]:
# Standardize the features. The scaler is fitted on the training split only
# and the fitted scaler is then applied to both splits.
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

# Apply the fitted scaler to each split, train first and then test.
X_train_scaled, X_test_scaled = (
    X_scaler.transform(split) for split in (X_train, X_test)
)

In [15]:
# Random forest with 500 trees; the fixed random_state keeps training
# repeatable from run to run.
rf_model = RandomForestClassifier(
    n_estimators=500,
    random_state=78,
)

In [16]:
# Train the forest on the scaled training split. fit() trains the estimator
# in place; the trailing semicolon keeps its repr out of the cell output.
rf_model.fit(X_train_scaled, y_train);

In [17]:
# Making predictions using the testing data.
# X_test_scaled was produced by the scaler fitted on the training split, so the
# test features are on the same scale the model was trained with.
predictions = rf_model.predict(X_test_scaled)

In [18]:
# Show the predicted class labels for the test split.
predictions

array([1, 0, 0, ..., 1, 0, 1])

In [19]:
# Calculating the confusion matrix.
# The class order is pinned through `labels`, so the matrix is always 2x2 and
# its rows/columns are guaranteed to line up with the axis names built below,
# even if one class is absent from y_test or from the predictions.
class_labels = [0, 1]
cm = confusion_matrix(y_test, predictions, labels=class_labels)

# Create a DataFrame from the confusion matrix
# (rows are actual classes, columns are predicted classes).
cm_df = pd.DataFrame(
    cm,
    index=[f"Actual {label}" for label in class_labels],
    columns=[f"Predicted {label}" for label in class_labels],
)

cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,65761,18135
Actual 1,28213,23849


In [20]:
# Calculating the accuracy score.
# Compares the true test labels (y_test) with the model's predictions.
acc_score = accuracy_score(y_test, predictions)
acc_score

0.659100604598479

In [21]:
# NOTE(review): this only re-displays acc_score, which the cell that computes
# it already shows; the cell looks redundant and is a candidate for removal.
acc_score

0.659100604598479

In [22]:
# Evaluation summary: confusion matrix, overall accuracy, then the per-class
# precision/recall/F1 report.
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print(
    "Classification Report",
    classification_report(y_test, predictions),
    sep="\n",
)

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,65761,18135
Actual 1,28213,23849


Accuracy Score : 0.659100604598479
Classification Report
              precision    recall  f1-score   support

           0       0.70      0.78      0.74     83896
           1       0.57      0.46      0.51     52062

    accuracy                           0.66    135958
   macro avg       0.63      0.62      0.62    135958
weighted avg       0.65      0.66      0.65    135958



In [23]:
# Calculate feature importance in the Random Forest model.
# feature_importances_ is read from the fitted rf_model; the array holds one
# value per feature column.
importances = rf_model.feature_importances_
importances

array([0.22968547, 0.12602738, 0.27304995, 0.3712372 ])

In [24]:
# We can sort the features by their importance.
# Each (importance, column name) tuple compares on the importance first, and
# reverse=True lists the most important feature at the top.
sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)

[(0.37123719893827756, 'impressions_num'),
 (0.27304994660459636, 'hospital_num'),
 (0.22968547120802785, 'arrival_hour'),
 (0.1260273832490983, 'pickup_date_num')]