# Setup

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import datetime

# SKlearn 
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier

## Multi-drug use

In [2]:
# Load the aggregated incident records (path is relative to the notebook's directory).
incidents = pd.read_csv('../data/Aggregated/incidents.csv')

# Flag incidents whose 'All Drugs' value contains a literal ' | ', which this
# notebook treats as the separator between several drug names. The separator
# is matched as plain text (regex=False), not as a regex alternation.
# na=False makes a missing 'All Drugs' value count as "not multiple" instead of
# leaking NaN into the boolean mask, which would make the row selection raise.
multi_drug_mask = incidents['All Drugs'].str.contains(' | ', regex=False, na=False)
incidents['Multiple Drugs'] = multi_drug_mask.astype('int64')

# Sanity check: (number of multi-drug incidents, number of columns).
incidents[incidents['Multiple Drugs'] == 1].shape

(8050, 17)

In [3]:
# Show the column names currently in the frame (includes the derived 'Multiple Drugs' flag).
incidents.columns

Index(['Incident ID', 'Incident Date', 'Incident Time', 'Day',
       'Incident County Name', 'Incident State', 'Victim ID', 'Gender Desc',
       'Age Range', 'Race', 'Ethnicity Desc', 'Naloxone Administered',
       'Survive', 'Response Desc', 'All Drugs', 'Incident Date ym',
       'Multiple Drugs'],
      dtype='object')

### Year, Month and Age Range

In [4]:
# Parse 'Incident Date' (MM/DD/YYYY strings) once, vectorised, instead of
# running datetime.strptime over every row twice.
incident_dates = pd.to_datetime(incidents['Incident Date'], format='%m/%d/%Y')

# .dt.year / .dt.month can come back as int32 depending on the pandas version;
# cast explicitly so both columns stay plain int64.
incidents["year"] = incident_dates.dt.year.astype("int64")
incidents["month"] = incident_dates.dt.month.astype("int64")

# Fix value typo: the "10 - 14" age bracket shows up in the data as "14-Oct"
# (presumably turned into a date by a spreadsheet -- TODO confirm the cause).
incidents["Age Range"] = incidents["Age Range"].replace("14-Oct", "10 - 14")

### OneHotEncoder - Day

In [5]:
# One-hot encode the weekday using an explicit category order (Sunday first),
# so the position of each output column is fixed by this list rather than by
# whatever order the values happen to sort into.
weekday_order = [
    "Sunday", "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday",
]
ohe = OneHotEncoder(categories=[weekday_order])
transformed = ohe.fit_transform(incidents[["Day"]])

# Sanity Check: densify the encoded matrix for display.
transformed.toarray()

array([[0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 1., 0.]])

## Pipeline Definition

In [6]:
# Columns used as model inputs. The target ("Survive") is NOT part of this
# list; it is pulled out separately below.
final_features_ls = ["Day", "Age Range", "year", "month", "Multiple Drugs", "Naloxone Administered"]

# Convert these columns to categorical dtype. This must run BEFORE the feature
# frame is selected: `incidents[final_features_ls]` is a copy, so converting
# `incidents` afterwards would leave the data handed to the model unchanged.
categorical_ls = ["Day", "Age Range", "year", "month"]
for col in categorical_ls:
    incidents[col] = pd.Categorical(incidents[col])

features_df = incidents[final_features_ls]
target = incidents["Survive"]

# Split data: hold out 20% for evaluation; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(features_df, target, test_size=0.20, random_state=42)

In [12]:
# One-hot encode every feature column, then fit a random forest on the result.
#
# handle_unknown="ignore": a category that occurs only in the test split is
# encoded as all zeros instead of making predict() raise.
#
# The dense-output flag is spelled `sparse_output` from scikit-learn 1.2 on
# (the old `sparse` spelling was removed in 1.4). Try the current name first
# and fall back to the old one so the cell runs on either side of the rename.
try:
    encoder = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
except TypeError:
    encoder = OneHotEncoder(sparse=False, handle_unknown="ignore")

pipe = Pipeline([('ohe', encoder),
                 ('clf', RandomForestClassifier(max_depth=50, random_state=0, n_estimators=20))])

pipe.fit(x_train, y_train)
y_pred = pipe.predict(x_test)

In [13]:
# Per-class precision / recall / F1 on the held-out split.
# target_names are matched to the labels in sorted order, so this assumes the
# smaller "Survive" value means the victim died -- TODO confirm the encoding.
target_names = ['Died', 'Survived']
report = classification_report(y_test, y_pred, target_names=target_names)
print(report)

              precision    recall  f1-score   support

        Died       0.57      0.44      0.49      1057
    Survived       0.82      0.89      0.85      3063

    accuracy                           0.77      4120
   macro avg       0.69      0.66      0.67      4120
weighted avg       0.76      0.77      0.76      4120

