# Setup

In [33]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import datetime

# SKlearn 
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score,accuracy_score

from sklearn.preprocessing import OneHotEncoder,StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier

## Multi-drug use

In [96]:
incidents = pd.read_csv('../data/Aggregated/incidents.csv', )
incidents['Multiple Drugs'] = 0
incidents.loc[incidents['All Drugs'].str.contains(' | ', regex = False), 'Multiple Drugs'] = 1
incidents[incidents['Multiple Drugs'] == 1].shape

(8050, 17)

In [97]:
### fenytanl

incidents['Fentnyl'] = 0
incidents.loc[incidents['All Drugs'].str.contains('FENTANYL', regex = False), 'Fentnyl'] = 1
incidents[incidents['Fentnyl'] == 1].shape

(7255, 18)

In [98]:
incidents['Fentnyl'].value_counts()

0    13342
1     7255
Name: Fentnyl, dtype: int64

In [99]:
incidents.columns

Index(['Incident ID', 'Incident Date', 'Incident Time', 'Day',
       'Incident County Name', 'Incident State', 'Victim ID', 'Gender Desc',
       'Age Range', 'Race', 'Ethnicity Desc', 'Naloxone Administered',
       'Survive', 'Response Desc', 'All Drugs', 'Incident Date ym',
       'Multiple Drugs', 'Fentnyl'],
      dtype='object')

### Year, Month and Age Range

In [100]:
incidents["year"] = incidents['Incident Date'].apply(lambda x: datetime.strptime(x, '%m/%d/%Y').year)
incidents["month"] = incidents['Incident Date'].apply(lambda x: datetime.strptime(x, '%m/%d/%Y').month)

# Fix value typo
incidents["Age Range"] = incidents["Age Range"].apply(lambda x: "10 - 14" if x == "14-Oct" else x)

In [101]:
incidents.head(2)

Unnamed: 0,Incident ID,Incident Date,Incident Time,Day,Incident County Name,Incident State,Victim ID,Gender Desc,Age Range,Race,Ethnicity Desc,Naloxone Administered,Survive,Response Desc,All Drugs,Incident Date ym,Multiple Drugs,Fentnyl,year,month
0,1,1/4/2018,0:42:00,Thursday,Delaware,Pennsylvania,1,Male,50 - 59,White,Not Hispanic,1,0,NO RESPONSE TO NALOXONE,COCAINE/CRACK | HEROIN,2018,1,0,2018,1
1,26,1/26/2018,9:14:00,Friday,Chester,Pennsylvania,5,Male,25 - 29,White,Not Hispanic,0,0,,HEROIN,2018,0,0,2018,1


In [102]:
incidents['Age Range'].value_counts()

30 - 39    7653
25 - 29    4082
40 - 49    3434
20 - 24    2030
50 - 59    1871
60 - 69     634
0 - 9       530
15 - 19     276
70 - 79      70
80 - *       10
10 - 14       7
Name: Age Range, dtype: int64

In [110]:
def agerange_weekend(df):
    for i, row in df.iterrows():
        if row['Day'] == 'Friday' or row['Day'] == 'Saturday' or row['Day'] == 'Sunday':
            df.loc[i,'Weekend_OD'] = 1
        else:
            df.loc[i,'Weekend_OD'] = 0
        
        if row['month'] == 1 or row['month'] == 12 or row['month'] == 2:
            df.loc[i,'Season'] = 'Winter'
        elif row['month'] == 3 or row['month'] == 4 or row['month'] == 5:
            df.loc[i,'Season'] = 'Spring'
        elif row['month'] == 6 or row['month'] == 7 or row['month'] == 8:
            df.loc[i,'Season'] = 'Summer'
        elif row['month'] == 9 or row['month'] == 10 or row['month'] == 11:
            df.loc[i,'Season'] = 'Fall' 
            
        if row['Age Range'] == '0 - 9' or row['Age Range'] == '10 - 14' or row['Age Range'] == '15 - 19' or row['Age Range'] == '20 - 24' or row['Age Range'] == '30 - 39':
            df.loc[i,'Over 40'] = 0
        else:    
        
            df.loc[i,'Over 40'] = 1
            
    return(df)            
                
            
            
            

In [139]:
incidents2 = agerange_weekend(incidents.copy())

In [142]:
incidents2.head(2)

Unnamed: 0,Incident ID,Incident Date,Incident Time,Day,Incident County Name,Incident State,Victim ID,Gender Desc,Age Range,Race,...,Response Desc,All Drugs,Incident Date ym,Multiple Drugs,Fentnyl,year,month,Weekend_OD,Season,Over 40
0,1,1/4/2018,0:42:00,Thursday,Delaware,Pennsylvania,1,Male,50 - 59,White,...,NO RESPONSE TO NALOXONE,COCAINE/CRACK | HEROIN,2018,1,0,2018,1,0,Winter,1
1,26,1/26/2018,9:14:00,Friday,Chester,Pennsylvania,5,Male,25 - 29,White,...,,HEROIN,2018,0,0,2018,1,1,Winter,1


In [146]:
incidents2['Over 40'] = incidents2['Over 40'].astype(int)
incidents2['Weekend_OD'] = incidents2['Weekend_OD'].astype(int)

In [147]:
incidents2.dtypes

Incident ID               int64
Incident Date            object
Incident Time            object
Day                      object
Incident County Name     object
Incident State           object
Victim ID                 int64
Gender Desc              object
Age Range                object
Race                     object
Ethnicity Desc           object
Naloxone Administered     int64
Survive                   int64
Response Desc            object
All Drugs                object
Incident Date ym          int64
Multiple Drugs            int64
Fentnyl                   int64
year                      int64
month                     int64
Weekend_OD                int32
Season                   object
Over 40                   int32
dtype: object

In [148]:
#### NOT GOOD

#X = incidents2[['Over 40','Naloxone Administered','Weekend_OD','Fentnyl','Season']]
#y = incidents2[['Survive']]

In [165]:
#### Good

X = incidents2[['Over 40','Naloxone Administered','Fentnyl']]
y = incidents2[['Survive']]

In [166]:
y['Survive'].value_counts()

1    15492
0     5105
Name: Survive, dtype: int64

In [167]:
class_weight_m = {0:15492,1:5105}

## Pipeline Definition

In [168]:
##pipeline  NOT GOOD

#label_features =['Season']
#label_transformer = OneHotEncoder(sparse = False, handle_unknown = 'ignore')



#preprocessor = ColumnTransformer(
    #transformers=[
        #('label', label_transformer, label_features)
        #])   


#model =  Pipeline(steps=[('preprocessor', preprocessor),
                      ('randomforest', RandomForestClassifier(n_estimators=1000,
                                                              class_weight=class_weight_m))])

In [174]:
 
####Good

model =  Pipeline(steps=[
                      ('randomforest', RandomForestClassifier(n_estimators=1000,
                                                              class_weight=class_weight_m))])

In [175]:
# Split data with stratify
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=42)

In [176]:
model.fit(X_train,y_train)


  self._final_estimator.fit(Xt, y, **fit_params)


Pipeline(memory=None,
     steps=[('randomforest', RandomForestClassifier(bootstrap=True, class_weight={0: 15492, 1: 5105},
            criterion='gini', max_depth=None, max_features='auto',
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=1000, n_jobs=None, oob_score=False,
            random_state=None, verbose=0, warm_start=False))])

In [177]:
predictions_classweights = model.predict(X_test)
print("accuracy:",accuracy_score(y_test, predictions_classweights))

accuracy: 0.7821359223300971


In [184]:
from sklearn.model_selection import StratifiedKFold
import numpy
kf = StratifiedKFold(n_splits=5,random_state=0, shuffle=True)

 
acc_score = []
pred = []
y_test_all = []
predprob = []
 
for train_index , test_index in kf.split(X,y):
    X_train , X_test = X.iloc[train_index,:],X.iloc[test_index,:]
    y_train , y_test = y.iloc[train_index] , y.iloc[test_index]
     
    model.fit(X_train,y_train)
    print(model.score(X_test, y_test))
    num = model.score(X_test, y_test)
    num = num.astype(float)
    acc_score.append(num)
    predictions_classweights = model.predict(X_test)
    proba = model.predict_proba(X_test)
    predprob.append(proba)                           
    
    #predict_class = np.argmax(test_predictions, axis=1)
    #predict_class = predict_class.tolist()
    
    pred.append(predictions_classweights)
    y_test_all.append(y_test)


  self._final_estimator.fit(Xt, y, **fit_params)


0.7762135922330097


  self._final_estimator.fit(Xt, y, **fit_params)


0.7776699029126214


  self._final_estimator.fit(Xt, y, **fit_params)


0.7836853605243991


  self._final_estimator.fit(Xt, y, **fit_params)


0.7882981306142267


  self._final_estimator.fit(Xt, y, **fit_params)


0.7747025977178927


In [185]:
pred_master = np.hstack(([pred[0],pred[1],pred[2],pred[3],pred[4]]))
y_test_master = pd.concat(y_test_all)

In [190]:
pr0 = pd.DataFrame(predprob[0],columns=['zero','one'])
pr1 = pd.DataFrame(predprob[1],columns=['zero','one'])
pr2 = pd.DataFrame(predprob[2],columns=['zero','one'])
pr3 = pd.DataFrame(predprob[3],columns=['zero','one'])
pr4 = pd.DataFrame(predprob[4],columns=['zero','one'])

In [237]:
prdf1 = pd.concat([pr0,pr1,pr2,pr3,pr4])
prdf1 = prdf1[['one']]
prdf2 =prdf1.copy()

In [238]:
prdf2['one'] = np.select([prdf2['one']>.64],  ### setting threshold
                        [1],
                        default=0)

In [239]:
confusion = confusion_matrix(y_test_master,prdf2)

In [240]:
print(confusion)

[[ 3568  1537]
 [ 3345 12147]]


In [241]:
target_names = ['Died', 'Survived']
print(classification_report(y_test_master, prdf2, target_names=target_names))

              precision    recall  f1-score   support

        Died       0.52      0.70      0.59      5105
    Survived       0.89      0.78      0.83     15492

   micro avg       0.76      0.76      0.76     20597
   macro avg       0.70      0.74      0.71     20597
weighted avg       0.80      0.76      0.77     20597

