In [1]:
# Initial imports.
import datetime as dt
from pathlib import Path

import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [2]:
# Load the source CSV. read_csv already returns a DataFrame, so the result is
# used directly instead of being re-wrapped in pd.DataFrame().
file_path = "./bethany_data_groupby_month_range/all_data.csv"
boosttree_df = pd.read_csv(file_path)
boosttree_df

Unnamed: 0,hospital_code,pickup_date,apot,impression,arrival_hour,pickup_date_num,apot_num
0,508,Wednesday,benchmark,T14.90,1,4,0
1,508,Sunday,benchmark,G89.1,13,1,0
2,197,Friday,benchmark,T14.90,14,6,0
3,475,Thursday,benchmark,R10.84,2,5,0
4,5,Monday,benchmark,I95.9,17,2,0
...,...,...,...,...,...,...,...
543824,280,Tuesday,extreme,J98.01,17,3,1
543825,288,Thursday,extreme,R10.84,4,5,1
543826,5,Monday,extreme,G89.1,19,2,1
543827,288,Friday,extreme,R10.84,12,6,1


In [3]:
# Record count per 'apot' bucket (shows how balanced the two classes are).
apot_groups = boosttree_df.groupby("apot")
apot_groups.size()

apot
benchmark    336889
extreme      206940
dtype: int64

In [4]:
# Record count contributed by each hospital code.
hospital_groups = boosttree_df.groupby("hospital_code")
hospital_groups.size()

hospital_code
5      79977
196    31742
197    76345
280    38514
282    22945
286    86721
288    43421
475    68846
481    18166
508    68753
549     8399
dtype: int64

In [5]:
# Lookup table: hospital code -> 1-based integer id.
# Ids follow the ascending order of the hospital codes listed here.
hospital_codes_num = dict(
    zip(
        (5, 196, 197, 280, 282, 286, 288, 475, 481, 508, 549),
        range(1, 12),
    )
)

In [6]:
# Encode each hospital code as its integer id with a vectorised dictionary
# lookup (Series.map) instead of a Python-level lambda call per row.
hospital_num = boosttree_df["hospital_code"].map(hospital_codes_num)

# Series.map yields NaN for keys missing from the dictionary. Keep the
# fail-fast behaviour of direct dict indexing by raising KeyError rather than
# letting NaN flow into the feature matrix.
unmapped_hospital_codes = boosttree_df.loc[hospital_num.isna(), "hospital_code"].unique()
if len(unmapped_hospital_codes) > 0:
    raise KeyError(
        f"hospital codes missing from hospital_codes_num: {list(unmapped_hospital_codes)}"
    )

boosttree_df["hospital_num"] = hospital_num.astype("int64")
boosttree_df

Unnamed: 0,hospital_code,pickup_date,apot,impression,arrival_hour,pickup_date_num,apot_num,hospital_num
0,508,Wednesday,benchmark,T14.90,1,4,0,10
1,508,Sunday,benchmark,G89.1,13,1,0,10
2,197,Friday,benchmark,T14.90,14,6,0,3
3,475,Thursday,benchmark,R10.84,2,5,0,8
4,5,Monday,benchmark,I95.9,17,2,0,1
...,...,...,...,...,...,...,...,...
543824,280,Tuesday,extreme,J98.01,17,3,1,4
543825,288,Thursday,extreme,R10.84,4,5,1,7
543826,5,Monday,extreme,G89.1,19,2,1,1
543827,288,Friday,extreme,R10.84,12,6,1,7


In [7]:
# Remove the original columns that now have numeric counterparts
# (apot_num, pickup_date_num, hospital_num).
columns_to_remove = ["apot", "pickup_date", "hospital_code"]
boosttree_df = boosttree_df.drop(columns=columns_to_remove)
boosttree_df

Unnamed: 0,impression,arrival_hour,pickup_date_num,apot_num,hospital_num
0,T14.90,1,4,0,10
1,G89.1,13,1,0,10
2,T14.90,14,6,0,3
3,R10.84,2,5,0,8
4,I95.9,17,2,0,1
...,...,...,...,...,...
543824,J98.01,17,3,1,4
543825,R10.84,4,5,1,7
543826,G89.1,19,2,1,1
543827,R10.84,12,6,1,7


In [8]:
# Impression dictionary: impression code -> integer id.
# Ids are the positions in this list (0-based), so the order below is
# significant. Values are real integers (not numeric strings) so the encoded
# column is numeric as soon as it is created.
impression_codes = [
    # 0-9
    "A41.9", "E13.64", "E13.65", "F10.92", "F19", "F99", "G24.0", "G40.901", "G40.909", "G89.1",
    # 10-19
    "H57.9", "H93.90", "I10", "I20.9", "I21.3", "I46.8", "I46.9", "I49.9", "I63.9", "I95.9",
    # 20-29
    "J00", "J68.9", "J70.5", "J80", "J81.0", "J96.9", "J98.01", "K59.1", "K92.0", "K92.1",
    # 30-39
    "M79.60", "N39.9", "N93.9", "O60.0", "O80", "O99", "R00.2", "R04.0", "R07.89", "R10.84",
    # 40-49
    "R11.2", "R41.0", "R41.82", "R42", "R50.9", "R51", "R53.1", "R55", "R68.13", "R99",
    # 50-59
    "T14.90", "T30.0", "T58", "T63", "T67.0", "T68", "T71.9", "T75.1XXA", "T75.4", "T78.2",
    # 60-63
    "T78.40", "Z00.00", "Z38.2", "Z77.9",
]
impressions_num = {code: idx for idx, code in enumerate(impression_codes)}

In [9]:
# Impression names encoded using the dictionary values, via a vectorised
# dictionary lookup (Series.map) instead of a Python-level lambda per row.
impression_ids = boosttree_df["impression"].map(impressions_num)

# Series.map yields NaN for keys missing from the dictionary. Keep the
# fail-fast behaviour of direct dict indexing by raising KeyError rather than
# letting NaN flow into the feature matrix.
unmapped_impressions = boosttree_df.loc[impression_ids.isna(), "impression"].unique()
if len(unmapped_impressions) > 0:
    raise KeyError(
        f"impression codes missing from impressions_num: {list(unmapped_impressions)}"
    )

# The lookup values may be stored as numeric strings; cast so the feature
# column is integer-typed rather than object-typed.
boosttree_df["impressions_num"] = impression_ids.astype("int64")
boosttree_df

Unnamed: 0,impression,arrival_hour,pickup_date_num,apot_num,hospital_num,impressions_num
0,T14.90,1,4,0,10,50
1,G89.1,13,1,0,10,9
2,T14.90,14,6,0,3,50
3,R10.84,2,5,0,8,39
4,I95.9,17,2,0,1,19
...,...,...,...,...,...,...
543824,J98.01,17,3,1,4,26
543825,R10.84,4,5,1,7,39
543826,G89.1,19,2,1,1,9
543827,R10.84,12,6,1,7,39


In [10]:
# The text 'impression' column is replaced by impressions_num, so remove it.
boosttree_df = boosttree_df.drop(columns="impression")
boosttree_df

Unnamed: 0,arrival_hour,pickup_date_num,apot_num,hospital_num,impressions_num
0,1,4,0,10,50
1,13,1,0,10,9
2,14,6,0,3,50
3,2,5,0,8,39
4,17,2,0,1,19
...,...,...,...,...,...
543824,17,3,1,4,26
543825,4,5,1,7,39
543826,19,2,1,1,9
543827,12,6,1,7,39


In [11]:
# Split the frame into the target vector (y) and the feature matrix (X).
target_column = "apot_num"
y = boosttree_df[target_column].values
X = boosttree_df.drop(columns=target_column)

In [12]:
# Hold out a test split. train_test_split and StandardScaler are already
# imported in the notebook's first cell, so they are not re-imported here.
# No test_size is given, so scikit-learn's default hold-out fraction applies;
# random_state pins the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [13]:
# Standardise the features. The scaler's statistics are learned from the
# training rows only and then reused unchanged on the test rows.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_scaler = scaler  # fit() returns the estimator itself, so this is the fitted scaler
X_test_scaled = X_scaler.transform(X_test)

In [14]:
# Sweep the boosting learning rate with every other hyper-parameter fixed and
# report accuracy on the training split and on the held-out split.
#
# NOTE(review): the "validation" score below is computed on the test split,
# which is also used for the final report further down. Choosing the learning
# rate from it makes the final test accuracy an optimistic estimate; a separate
# validation split or cross-validation would avoid that.
learning_rates = [0.05, 0.1, 0.25, 0.5, 0.75, 1]

# Cap max_features at the number of available columns. The feature matrix has
# fewer than 5 columns, so a literal 5 cannot restrict the split search.
# NOTE(review): some scikit-learn releases reject max_features > n_features
# with a ValueError - confirm against the installed version.
max_features = min(5, X_train_scaled.shape[1])

for learning_rate in learning_rates:
    # Create a classifier object
    classifier = GradientBoostingClassifier(
        n_estimators=20,
        learning_rate=learning_rate,
        max_features=max_features,
        max_depth=3,
        random_state=0,
    )

    # Fit the model
    classifier.fit(X_train_scaled, y_train)
    print("Learning rate: ", learning_rate)

    # Score the model
    train_accuracy = classifier.score(X_train_scaled, y_train)
    holdout_accuracy = classifier.score(X_test_scaled, y_test)
    print("Accuracy score (training): {0:.3f}".format(train_accuracy))
    print("Accuracy score (validation): {0:.3f}".format(holdout_accuracy))
    print()

Learning rate:  0.05
Accuracy score (training): 0.659
Accuracy score (validation): 0.664

Learning rate:  0.1
Accuracy score (training): 0.662
Accuracy score (validation): 0.667

Learning rate:  0.25
Accuracy score (training): 0.676
Accuracy score (validation): 0.682

Learning rate:  0.5
Accuracy score (training): 0.681
Accuracy score (validation): 0.685

Learning rate:  0.75
Accuracy score (training): 0.683
Accuracy score (validation): 0.687

Learning rate:  1
Accuracy score (training): 0.683
Accuracy score (validation): 0.687



In [15]:
# Refit a single model at learning_rate=0.75 and predict the held-out rows.
# max_features is capped at the number of available columns: the feature
# matrix has fewer than 5 columns, so a literal 5 cannot restrict the split
# search. NOTE(review): some scikit-learn releases reject
# max_features > n_features with a ValueError - confirm against the installed
# version.
classifier = GradientBoostingClassifier(
    n_estimators=20,
    learning_rate=0.75,
    max_features=min(5, X_train_scaled.shape[1]),
    max_depth=3,
    random_state=0,
)

classifier.fit(X_train_scaled, y_train)
predictions = classifier.predict(X_test_scaled)

In [19]:
# Overall accuracy of the refit model on the held-out test split.
# confusion_matrix, accuracy_score and classification_report are already
# imported in the notebook's first cell, so they are not re-imported here.
acc_score = accuracy_score(y_test, predictions)
print(f"Accuracy Score : {acc_score}")

Accuracy Score : 0.687271069006605


In [17]:
# Confusion matrix of the test-split predictions: rows are the actual classes,
# columns are the predicted classes.
actual_labels = ["Actual 0 benchmark", "Actual 1 extreme"]
predicted_labels = ["Predicted 0 benchmark", "Predicted 1 extreme"]

cm = confusion_matrix(y_test, predictions)

# Wrap the raw counts in a labelled DataFrame for display.
cm_df = pd.DataFrame(cm, index=actual_labels, columns=predicted_labels)

cm_df

Unnamed: 0,Predicted 0 benchmark,Predicted 1 extreme
Actual 0 benchmark,71914,12590
Actual 1 extreme,29928,21526


###  Taking class 1 (extreme) as the positive class:
###  True Negatives: 71914
### False Positives: 12590
### False Negatives: 29928
### True Positives: 21526

In [20]:
# Summary of the evaluation: confusion matrix, accuracy, per-class report.
report_text = classification_report(y_test, predictions)

print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(report_text)

Confusion Matrix


Unnamed: 0,Predicted 0 benchmark,Predicted 1 extreme
Actual 0 benchmark,71914,12590
Actual 1 extreme,29928,21526


Accuracy Score : 0.687271069006605
Classification Report
              precision    recall  f1-score   support

           0       0.71      0.85      0.77     84504
           1       0.63      0.42      0.50     51454

    accuracy                           0.69    135958
   macro avg       0.67      0.63      0.64    135958
weighted avg       0.68      0.69      0.67    135958



In [21]:
# Feature importances of the fitted gradient-boosting model: one score per
# feature column it was trained on.
importances = classifier.feature_importances_
importances

array([0.22562623, 0.02353166, 0.61703476, 0.13380736])

In [22]:
# Pair every importance score with its feature name and list them from the
# most to the least important.
ranked_importances = list(zip(classifier.feature_importances_, X.columns))
ranked_importances.sort(reverse=True)
ranked_importances

[(0.6170347562967454, 'hospital_num'),
 (0.2256262281556584, 'arrival_hour'),
 (0.13380735773703833, 'impressions_num'),
 (0.02353165781055777, 'pickup_date_num')]

### Results: 

### Feature importance is concentrated in hospital_num (0.617) and arrival_hour (0.226).
### impressions_num (0.134) contributes modestly and pickup_date_num (0.024) contributes very little, so these two features add comparatively little to the model's ability to predict the apot class within this dataset.