In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter

# Read the CSV and Perform Basic Data Cleaning

In [3]:
columns = [
    "index", "ID", "Hospital_Code", "Date",
    "APOT", "Impression", "Postal_Code", "AgencyNumber",
    "Agency_Unit", "Lat", "Long", "Status",
    
]

target = ["Status"]

In [5]:
# Load the data
file_path = Path('final_ems_data_slice_test.csv')
df = pd.read_csv(file_path, encoding='utf-8')

df = df.loc[:, columns].copy()

# Drop the null columns where all values are null
df = df.dropna(axis='columns', how='all')

# Drop the null rows
df = df.dropna()

df = df.drop(["APOT"], axis=1)
df.head()



Unnamed: 0,index,ID,Hospital_Code,Date,Impression,Postal_Code,AgencyNumber,Agency_Unit,Lat,Long,Status
0,0,776,508,7/19/2017,T14.90,95820,92905,92905-M22,38.554938,-121.456751,0 (< 20 min)
1,1,824,508,8/20/2017,G89.1,95831,92905,92905-M17,38.554938,-121.456751,0 (< 20 min)
2,2,854,197,9/8/2017,T14.90,95608,92905,92905-M17,38.60186,-121.391908,0 (< 20 min)
3,3,867,475,9/14/2017,R53.1,95624,92905,92905-M23,38.57089,-121.469532,1 (21 - 60 min)
4,4,951,475,10/19/2017,R10.84,95816,92905,92905-M17,38.57089,-121.469532,0 (< 20 min)


In [6]:
# Convert the target column values to low_risk and high_risk based on their values
x = dict.fromkeys(['0 (< 20 min)', '1 (21 - 60 min)'], 'benchmark')    
df = df.replace(x)

x = dict.fromkeys(['2 (61 - 120 min)', '3 (121 - 180 min)', '4 (181 + min)'], 'extreme')    
df = df.replace(x)

df.reset_index(inplace=True, drop=True)

df.head()

Unnamed: 0,index,ID,Hospital_Code,Date,Impression,Postal_Code,AgencyNumber,Agency_Unit,Lat,Long,Status
0,0,776,508,7/19/2017,T14.90,95820,92905,92905-M22,38.554938,-121.456751,benchmark
1,1,824,508,8/20/2017,G89.1,95831,92905,92905-M17,38.554938,-121.456751,benchmark
2,2,854,197,9/8/2017,T14.90,95608,92905,92905-M17,38.60186,-121.391908,benchmark
3,3,867,475,9/14/2017,R53.1,95624,92905,92905-M23,38.57089,-121.469532,benchmark
4,4,951,475,10/19/2017,R10.84,95816,92905,92905-M17,38.57089,-121.469532,benchmark


# Split the Data into Training and Testing

In [7]:
# Create our features
X = pd.get_dummies(df)

# Create our target
y = df[target]

In [8]:
X.describe()

Unnamed: 0,index,ID,Hospital_Code,AgencyNumber,Lat,Long,Date_1/1/2018,Date_1/10/2018,Date_1/11/2018,Date_1/12/2018,...,Agency_Unit_605-S-755,Agency_Unit_92905-M102,Agency_Unit_92905-M17,Agency_Unit_92905-M2,Agency_Unit_92905-M21,Agency_Unit_92905-M22,Agency_Unit_92905-M23,Agency_Unit_92905-M3,Status_benchmark,Status_extreme
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,...,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,499.5,101355.36,361.748,7640.8,223.159364,-119.991811,0.001,0.005,0.005,0.002,...,0.05,0.009,0.026,0.002,0.013,0.002,0.021,0.003,0.902,0.098
std,288.819436,44261.754278,147.587345,24466.173628,1937.984056,15.118604,0.031623,0.070569,0.070569,0.044699,...,0.218054,0.094488,0.159215,0.044699,0.113331,0.044699,0.143456,0.054717,0.297463,0.297463
min,0.0,776.0,5.0,605.0,38.463174,-121.469532,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,249.75,71723.25,280.0,605.0,38.554938,-121.469532,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
50%,499.5,106003.5,475.0,605.0,38.57089,-121.456751,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
75%,749.25,143083.25,475.0,605.0,38.57089,-121.416364,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
max,999.0,152172.0,549.0,92905.0,20549.0,38.57306,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [9]:
# Check the balance of our target values
y['Status'].value_counts()

benchmark    902
extreme       98
Name: Status, dtype: int64

In [10]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)

X_train.shape

(750, 467)

# Oversampling

### Naive Random Oversampling

In [11]:
# Resample the training data with the RandomOversampler
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=1)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)

Counter(y_resampled)

Counter({'Status': 1})

In [12]:
# Train the Logistic Regression model using the resampled data
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [13]:
# Calculated the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score
y_pred = model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.5311111111111111

In [14]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix

confusion_matrix(y_test, y_pred)

array([[140,  85],
       [ 14,  11]], dtype=int64)

In [15]:
# Display the confusion matrix
from sklearn.metrics import balanced_accuracy_score

balanced_accuracy_score(y_test, y_pred)

0.5311111111111111

In [16]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

  benchmark       0.91      0.62      0.44      0.74      0.52      0.28       225
    extreme       0.11      0.44      0.62      0.18      0.52      0.27        25

avg / total       0.83      0.60      0.46      0.68      0.52      0.28       250



### SMOTE Oversampling

In [17]:
# Resample the training data with SMOTE
from imblearn.over_sampling import SMOTE
X_resampled, y_resampled = SMOTE(random_state=1, sampling_strategy='auto').fit_resample(
    X_train, y_train
)
Counter(y_resampled)

Counter({'Status': 1})

In [18]:
# Train the Logistic Regression model using the resampled data
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [19]:
# Calculated the balanced accuracy score
y_pred = model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.5222222222222221

In [20]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred)

array([[127,  98],
       [ 13,  12]], dtype=int64)

In [21]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

  benchmark       0.91      0.56      0.48      0.70      0.52      0.27       225
    extreme       0.11      0.48      0.56      0.18      0.52      0.27        25

avg / total       0.83      0.56      0.49      0.64      0.52      0.27       250



# Undersampling

In [22]:
# Resample the data using the ClusterCentroids resampler
from imblearn.under_sampling import ClusterCentroids
cc = ClusterCentroids(random_state=1)
X_resampled, y_resampled = cc.fit_resample(X_train, y_train)
Counter(y_resampled)

Counter({'Status': 1})

In [23]:
# Train the Logistic Regression model using the resampled data
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)


LogisticRegression(random_state=1)

In [24]:
# Calculated the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score
y_pred = model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.52

In [25]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix
y_pred = model.predict(X_test)
confusion_matrix(y_test, y_pred)

array([[117, 108],
       [ 12,  13]], dtype=int64)

In [26]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

  benchmark       0.91      0.52      0.52      0.66      0.52      0.27       225
    extreme       0.11      0.52      0.52      0.18      0.52      0.27        25

avg / total       0.83      0.52      0.52      0.61      0.52      0.27       250



# Combination (Over and Under) Sampling

In [27]:
# Resample the training data with SMOTEENN
from imblearn.combine import SMOTEENN

smote_enn = SMOTEENN(random_state=1)
X_resampled, y_resampled = smote_enn.fit_resample(X, y)

Counter(y_resampled)

Counter({'Status': 1})

In [28]:
# Train the Logistic Regression model using the resampled data
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [29]:
# Calculated the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score
y_pred = model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.5177777777777778

In [30]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_pred)

array([[116, 109],
       [ 12,  13]], dtype=int64)

In [31]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

  benchmark       0.91      0.52      0.52      0.66      0.52      0.27       225
    extreme       0.11      0.52      0.52      0.18      0.52      0.27        25

avg / total       0.83      0.52      0.52      0.61      0.52      0.27       250

