In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter

In [3]:
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced

# Read the CSV and Perform Basic Data Cleaning

In [6]:
# Hospital_280_all.csv columns
columns = [
    "id", "record", "hospital_code", "pickup_date",
    "arrival_time", "apot", "impression", "postal_code",
    "agencynumber", "agency_unit", "latitude", "longitude", "status",
    
]

target = ["status"]

In [8]:
# Load the data
file_path = Path('Hospital_280_all.csv')
df = pd.read_csv(file_path, encoding='utf-8')

df = df.loc[:, columns].copy()

# Drop the null columns where all values are null
df = df.dropna(axis='columns', how='all')

# Drop the null rows
df = df.dropna()

df = df.drop(["apot"], axis=1)

df.head()



Unnamed: 0,id,record,hospital_code,pickup_date,arrival_time,impression,postal_code,agencynumber,agency_unit,latitude,longitude,status
0,12,1181,280,2/15/2018,12:50:00,J80,95819,92905,92905-M102,38.57026,-121.452408,0 (< 20 min)
1,28,1391,280,8/12/2018,12:35:00,M79.60,95820,92905,92905-M17,38.57026,-121.452408,0 (< 20 min)
2,53,1899,280,2/26/2019,17:46:00,G89.1,95843,92905,92905-M23,38.57026,-121.452408,0 (< 20 min)
3,92,62394,280,1/2/2018,12:16:00,A41.9,95816,605,605-S-751,38.57026,-121.452408,0 (< 20 min)
4,96,62769,280,1/4/2018,13:50:00,G89.1,95829,605,605-S-751,38.57026,-121.452408,1 (21 - 60 min)


In [9]:
# get data types
df.dtypes

id                 int64
record            object
hospital_code      int64
pickup_date       object
arrival_time      object
impression        object
postal_code        int64
agencynumber       int64
agency_unit       object
latitude         float64
longitude        float64
status            object
dtype: object

In [10]:
# Convert the target column values to low_risk and high_risk based on their values
x = {'0 (< 20 min)': 'benchmark'}   
df = df.replace(x)

x = dict.fromkeys(['1 (21 - 60 min)', '2 (61 - 120 min)', '3 (121 - 180 min)', '4 (181 + min)'], 'extreme')    
df = df.replace(x)

df.reset_index(inplace=True, drop=True)

df.head()

Unnamed: 0,id,record,hospital_code,pickup_date,arrival_time,impression,postal_code,agencynumber,agency_unit,latitude,longitude,status
0,12,1181,280,2/15/2018,12:50:00,J80,95819,92905,92905-M102,38.57026,-121.452408,benchmark
1,28,1391,280,8/12/2018,12:35:00,M79.60,95820,92905,92905-M17,38.57026,-121.452408,benchmark
2,53,1899,280,2/26/2019,17:46:00,G89.1,95843,92905,92905-M23,38.57026,-121.452408,benchmark
3,92,62394,280,1/2/2018,12:16:00,A41.9,95816,605,605-S-751,38.57026,-121.452408,benchmark
4,96,62769,280,1/4/2018,13:50:00,G89.1,95829,605,605-S-751,38.57026,-121.452408,extreme


# Split the Data into Training and Testing

In [11]:
# Create our features
X = df.drop(columns='status')
X = pd.get_dummies(X)

# Create our target
y = df[target]

In [12]:
X.describe()

Unnamed: 0,id,hospital_code,postal_code,agencynumber,latitude,longitude,record_000225abe0e440bb9856c10e993171a0,record_00085af2c55d481898fdbd89b85a4a3f,record_00098461e70d419095ef9430809055d3,record_0014ab2ab6224f2ba4f653d301d3789a,...,agency_unit_90305-R74,agency_unit_90305-TR74,agency_unit_92905-M102,agency_unit_92905-M17,agency_unit_92905-M23,agency_unit_99305-203-05,agency_unit_99305-204-09,agency_unit_99305-215-06,agency_unit_99305-217-07,agency_unit_99305-218-10
count,38514.0,38514.0,38514.0,38514.0,38514.0,38514.0,38514.0,38514.0,38514.0,38514.0,...,38514.0,38514.0,38514.0,38514.0,38514.0,38514.0,38514.0,38514.0,38514.0,38514.0
mean,200779.122735,280.0,95808.359947,15013.536636,38.57026,-121.4524,2.6e-05,2.6e-05,2.6e-05,2.6e-05,...,2.6e-05,5.2e-05,2.6e-05,2.6e-05,2.6e-05,2.6e-05,2.6e-05,2.6e-05,2.6e-05,2.6e-05
std,149390.300229,0.0,361.616661,25490.697625,3.093033e-11,7.875758e-11,0.005096,0.005096,0.005096,0.005096,...,0.005096,0.007206,0.005096,0.005096,0.005096,0.005096,0.005096,0.005096,0.005096,0.005096
min,12.0,280.0,25818.0,605.0,38.57026,-121.4524,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,85231.75,280.0,95815.0,1805.0,38.57026,-121.4524,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,157061.0,280.0,95821.0,1805.0,38.57026,-121.4524,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,309013.25,280.0,95827.0,11805.0,38.57026,-121.4524,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,543811.0,280.0,95866.0,99305.0,38.57026,-121.4524,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [13]:
# Check the balance of our target values
y['status'].value_counts()

benchmark    19396
extreme      19118
Name: status, dtype: int64

In [14]:
from sklearn.model_selection import train_test_split

# Splitting into Train and Test sets.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

X_train.shape

(28885, 42344)

### Balanced Random Forest Classifier

In [15]:
# Resample the training data with the BalancedRandomForestClassifier
from imblearn.ensemble import BalancedRandomForestClassifier
model = BalancedRandomForestClassifier(n_estimators=100, random_state=1)
model.fit(X_train, y_train)



BalancedRandomForestClassifier(random_state=1)

In [16]:
# Calculated the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score
y_pred = model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.6079359690628416

In [17]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_pred)


array([[3269, 1569],
       [2203, 2588]], dtype=int64)

In [18]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

  benchmark       0.60      0.68      0.54      0.63      0.60      0.37      4838
    extreme       0.62      0.54      0.68      0.58      0.60      0.36      4791

avg / total       0.61      0.61      0.61      0.61      0.60      0.37      9629



In [19]:
# List the features sorted in descending order by feature importance
importances = model.feature_importances_
sorted(zip(model.feature_importances_, X.columns), reverse=True)

[(0.04096851188505429, 'id'),
 (0.02298175152934374, 'postal_code'),
 (0.01417743582562561, 'agencynumber'),
 (0.00669822742828012, 'impression_I63.9'),
 (0.003902391419538061, 'impression_I21.3'),
 (0.0029828385640353434, 'agency_unit_71715-M250'),
 (0.002790959008275975, 'impression_T14.90'),
 (0.0027556115210207416, 'agency_unit_71715-M260'),
 (0.0026996350986363813, 'impression_R10.84'),
 (0.0025968227752352756, 'impression_R53.1'),
 (0.0022971551516062635, 'impression_J80'),
 (0.002254715064640556, 'impression_I20.9'),
 (0.001955990683044703, 'impression_G89.1'),
 (0.0019430550408681899, 'impression_R41.82'),
 (0.0019004821409835573, 'agency_unit_1805-M4'),
 (0.0018802278262337456, 'impression_F99'),
 (0.0018657693335473317, 'agency_unit_11805-M105'),
 (0.0018261764811417593, 'agency_unit_1805-M2'),
 (0.0017939793424149733, 'impression_M79.60'),
 (0.0016871592471377564, 'agency_unit_1805-M8'),
 (0.0016602511478454882, 'impression_R55'),
 (0.001628495950445688, 'agency_unit_1805-M1