## Modules

In [88]:
# modules for data
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# modules for metrics
from sklearn.metrics import confusion_matrix, f1_score

# modules for building model
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier

import warnings
warnings.filterwarnings('ignore')

## 1.Load data and split into train and test set

In [3]:
# load data to pandas
dui = pd.read_csv('data/dui.csv')

# encode the gender
gender = dui.iloc[:,1]
le = LabelEncoder()
le.fit(gender)
encoded_column = le.transform(gender)
dui['Gender'] = encoded_column

# convert the fatality for binary classification
dui['Fatality'].where(dui['Fatality']<=0, 1, True)

# split the variables and labels
X = dui.iloc[:,:-1]
y = dui.iloc[:,-1]

# split the data in to train and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [48]:
dui[dui['Fatality']==-1]['Fatality']=0

### **The data would be split into 80% of train dataset and 20% of test dataset**

## 2.Model and Hyperparameter Selection

### Logistic Regression

**Since the dependent variable is not continuous, I think linear regression would previde higher accuracy, so I choose to use Logistic regressio instead.**

In [4]:
print(len(dui[dui['Fatality']==0]))
print(len(dui[dui['Fatality']==1]))

5628
2690


**The number of accident that people not die in dataset is 2 times bigger than the number of accident that people die, So for the logistic regression model, the weight of the class are set to 0.68 : 0.32**

In [5]:
# train the logistic regression model
lr = LogisticRegression(class_weight={0:0.68, 1:0.32})
lr.fit(X_train, y_train)

# predicting using test dataset
predictions = lr.predict(X_test)

In [6]:
# metrics
print('Accuracy: ', lr.score(X_test, y_test))
print('F1-Score: ', f1_score(y_test, predictions))
print('Confusion matrix:\n', pd.DataFrame(confusion_matrix(y_test, predictions)))

Accuracy:  0.8533653846153846
F1-Score:  0.7925170068027212
Confusion matrix:
      0    1
0  954  165
1   79  466


### Decision Tree

**Since need to prevent the overfitting problem, finding the best max depth of decision tree is necessary. So I would do a 10-folds grid search cross validation on parameter citerion, max_depth and class_weight**

In [99]:
# grid search cross validation on parameters "criterion" and "max_depth"
dt = DecisionTreeClassifier()
folds = 10
grid_params = {'criterion':('entropy', 'gini'), 'max_depth':[2,3,4,5,6], 'class_weight':({0:0.68, 1:0.32}, 'balanced', None)}
classifier = GridSearchCV(dt, grid_params, cv=folds)
classifier.fit(X_train, y_train)

GridSearchCV(cv=10, error_score='raise-deprecating',
       estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'criterion': ('entropy', 'gini'), 'max_depth': [2, 3, 4, 5, 6], 'class_weight': ({0: 0.68, 1: 0.32}, 'balanced', None)},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [100]:
# print out the best tuned model of Decision Tree and its accuracy
print('Best score: ', classifier.best_score_, '\nBest Tree: ', classifier.best_estimator_)

Best score:  0.8969196093163035 
Best Tree:  DecisionTreeClassifier(class_weight={0: 0.68, 1: 0.32}, criterion='entropy',
            max_depth=2, max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')


In [101]:
# Since the GridSearchCV would re-fit on best parameter, it is not neccessary to train my own model
dt = classifier.best_estimator_
predictions = dt.predict(X_test)

In [102]:
# metrics
print('Accuracy: ', dt.score(X_test, y_test))
print('F1-Score: ', f1_score(y_test, predictions))
print('Confusion matrix:\n', pd.DataFrame(confusion_matrix(y_test, predictions)))

Accuracy:  0.8804086538461539
F1-Score:  0.8334728033472805
Confusion matrix:
      0    1
0  967  152
1   47  498


### Random Forest

**In order to prevent the overfitting problem, doing a 10-folds grid search cross validation on parameter max_depth, also on the criterion and class_weight to find the best parameter**

In [103]:
# grid search cross validation on parameters "criterion" and "max_depth"
rf = RandomForestClassifier()
folds = 10
grid_params = {'criterion':('entropy', 'gini'), 'max_depth':[2,3,4,5,6], 'class_weight':({0:0.68, 1:0.32}, 'balanced', None)}
classifier = GridSearchCV(rf, grid_params, cv=folds)
classifier.fit(X_train, y_train)

GridSearchCV(cv=10, error_score='raise-deprecating',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'criterion': ('entropy', 'gini'), 'max_depth': [2, 3, 4, 5, 6], 'class_weight': ({0: 0.68, 1: 0.32}, 'balanced', None)},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [104]:
# print out the best tuned model of Random Forest and its accuracy
print('Best score: ', classifier.best_score_, '\nBest Tree: ', classifier.best_estimator_)

Best score:  0.8969196093163035 
Best Tree:  RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=4, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)


In [105]:
# Since the GridSearchCV would re-fit on best parameter, it is not neccessary to train my own model
rf = classifier.best_estimator_
predictions = rf.predict(X_test)

In [106]:
# metrics
print('Accuracy: ', rf.score(X_test, y_test))
print('F1-Score: ', f1_score(y_test, predictions))
print('Confusion matrix:\n', pd.DataFrame(confusion_matrix(y_test, predictions)))

Accuracy:  0.8804086538461539
F1-Score:  0.8334728033472805
Confusion matrix:
      0    1
0  967  152
1   47  498


### AdaBoost

**Doing a 10-folds grid search cross validation on parameter learning_rate**

In [15]:
# grid search cross validation on parameters learning_rate
ab = AdaBoostClassifier()
folds = 10
grid_params = {'learning_rate':[0.01, 0.001, 0.0001]}
classifier = GridSearchCV(ab, grid_params, cv=folds)
classifier.fit(X_train, y_train)

GridSearchCV(cv=10, error_score='raise-deprecating',
       estimator=AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=50, random_state=None),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'learning_rate': [0.01, 0.001, 0.0001]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [16]:
# print out the best tuned model of AdaBoost and its accuracy
print('Best score: ', classifier.best_score_, '\nBest Tree: ', classifier.best_estimator_)

Best score:  0.8898572501878287 
Best Tree:  AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=0.01, n_estimators=50, random_state=None)


In [17]:
# Since the GridSearchCV would re-fit on best parameter, it is not neccessary to train my own model
ab = classifier.best_estimator_
predictions = ab.predict(X_test)

In [18]:
# metrics
print('Accuracy: ', ab.score(X_test, y_test))
print('F1-Score: ', f1_score(y_test, predictions))
print('Confusion matrix:\n', pd.DataFrame(confusion_matrix(y_test, predictions)))

Accuracy:  0.8804086538461539
F1-Score:  0.8334728033472805
Confusion matrix:
      0    1
0  967  152
1   47  498


### GradientBoosting

**Doing a 10-folds grid search cross validation on parameter learning_rate, max_depth and criterion**

In [19]:
# grid search cross validation on parameters learning_rate and max_depth
gb = GradientBoostingClassifier()
folds = 10
grid_params = {'learning_rate':[0.1, 0.001, 0.0001], 'max_depth':[2,3,4,5,6], 'criterion':('friedman_mse','mse')}
classifier = GridSearchCV(gb, grid_params, cv=folds)
classifier.fit(X_train, y_train)

GridSearchCV(cv=10, error_score='raise-deprecating',
       estimator=GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_sampl...      subsample=1.0, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'learning_rate': [0.1, 0.001, 0.0001], 'max_depth': [2, 3, 4, 5, 6], 'criterion': ('friedman_mse', 'mse')},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [20]:
# print out the best tuned model of AdaBoost and its accuracy
print('Best score: ', classifier.best_score_, '\nBest Tree: ', classifier.best_estimator_)

Best score:  0.8969196093163035 
Best Tree:  GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=2,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              n_iter_no_change=None, presort='auto', random_state=None,
              subsample=1.0, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False)


In [21]:
# Since the GridSearchCV would re-fit on best parameter, it is not neccessary to train my own model
gb = classifier.best_estimator_
predictions = gb.predict(X_test)

In [22]:
# metrics
print('Accuracy: ', gb.score(X_test, y_test))
print('F1-Score: ', f1_score(y_test, predictions))
print('Confusion matrix:\n', pd.DataFrame(confusion_matrix(y_test, predictions)))

Accuracy:  0.8804086538461539
F1-Score:  0.8334728033472805
Confusion matrix:
      0    1
0  967  152
1   47  498


### KNN

**In order to find the best K for KNN classifier, doing 50 iterations to find out the smallest k with best accuracy.**

**Splitting the the train data to do the cross validation. The train data and validate data would be 0.7:0.3**

In [29]:
# spliting the train data into train data and validation data, so that we can use it to find the best k
X_train_knn, X_val_knn, y_train_knn, y_val_knn = train_test_split(X_train, y_train, test_size=0.30)

results = []
for k in range(1, 50):
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train_knn, y_train_knn)
    results.append([k, knn.score(X_val_knn, y_val_knn)])

In [30]:
# Print out the accuracy for different k
results

[[1, 0.8312468703054582],
 [2, 0.800701051577366],
 [3, 0.8517776664997496],
 [4, 0.8387581372058087],
 [5, 0.8422633950926389],
 [6, 0.8242363545317977],
 [7, 0.8367551326990486],
 [8, 0.8182273410115173],
 [9, 0.827741612418628],
 [10, 0.8207310966449675],
 [11, 0.827741612418628],
 [12, 0.8187280921382073],
 [13, 0.829243865798698],
 [14, 0.8182273410115173],
 [15, 0.8227341011517276],
 [16, 0.8172258387581373],
 [17, 0.8307461191787682],
 [18, 0.8182273410115173],
 [19, 0.8392588883324987],
 [20, 0.8192288432648973],
 [21, 0.8347521281922884],
 [22, 0.8172258387581373],
 [23, 0.8302453680520782],
 [24, 0.8167250876314471],
 [25, 0.8257386079118678],
 [26, 0.8177265898848273],
 [27, 0.827741612418628],
 [28, 0.8247371056584877],
 [29, 0.8317476214321482],
 [30, 0.8192288432648973],
 [31, 0.8242363545317977],
 [32, 0.8197295943915874],
 [33, 0.8267401101652478],
 [34, 0.8202303455182774],
 [35, 0.8197295943915874],
 [36, 0.8127190786179269],
 [37, 0.8192288432648973],
 [38, 0.8102153

**As the results above, the best accuracy cames with the best k 3, so re-trian the model with k 3**

In [31]:
# re-train the model and test with testset
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, y_train)
predictions = knn.predict(X_test)

In [32]:
# metrics
print('Accuracy: ', knn.score(X_test, y_test))
print('F1-Score: ', f1_score(y_test, predictions))
print('Confusion matrix:\n', pd.DataFrame(confusion_matrix(y_test, predictions)))

Accuracy:  0.8335336538461539
F1-Score:  0.7593397046046915
Confusion matrix:
      0    1
0  950  169
1  108  437


### XGBoost

**Doing a 10-folds grid search cross validation on parameter learning_rate and max_depth**

In [33]:
# grid search cross validation on parameters learning_rate and max_depth
xgb = XGBClassifier()
folds = 10
grid_params = {'learning_rate':[0.1, 0.001, 0.0001], 'max_depth':[1,2,3,4,5,6]}
classifier = GridSearchCV(xgb, grid_params, cv=folds)
classifier.fit(X_train, y_train)

GridSearchCV(cv=10, error_score='raise-deprecating',
       estimator=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'learning_rate': [0.1, 0.001, 0.0001], 'max_depth': [1, 2, 3, 4, 5, 6]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [34]:
# print out the best tuned model of AdaBoost and its accuracy
print('Best score: ', classifier.best_score_, '\nBest Tree: ', classifier.best_estimator_)

Best score:  0.8969196093163035 
Best Tree:  XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=1, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='multi:softprob', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)


In [35]:
# Since the GridSearchCV would re-fit on best parameter, it is not neccessary to train my own model
xgb = classifier.best_estimator_
predictions = xgb.predict(X_test)

In [36]:
# metrics
print('Accuracy: ', xgb.score(X_test, y_test))
print('F1-Score: ', f1_score(y_test, predictions))
print('Confusion matrix:\n', pd.DataFrame(confusion_matrix(y_test, predictions)))

Accuracy:  0.8804086538461539
F1-Score:  0.8334728033472805
Confusion matrix:
      0    1
0  967  152
1   47  498


### Neural Network

**Training 2 hidden layers neural network with 0.001 learning rate, adam optimizer and logistic sigmoid ativation function**

In [37]:
# train the neural network and test on testset
mc = MLPClassifier(hidden_layer_sizes=(200,100,), learning_rate_init=0.001, solver='adam', activation='logistic')
mc.fit(X_train, y_train)
predictions = mc.predict(X_test)

In [38]:
# metrics
print('Accuracy: ', mc.score(X_test, y_test))
print('F1-Score: ', f1_score(y_test, predictions))
print('Confusion matrix:\n', pd.DataFrame(confusion_matrix(y_test, predictions)))

Accuracy:  0.8323317307692307
F1-Score:  0.7267384916748285
Confusion matrix:
       0    1
0  1014  105
1   174  371


### Naive Bayes

In [89]:
# train the Gaussian naive bayes model and test on testset
nb = GaussianNB()
nb.fit(X_train, y_train)
predictions = nb.predict(X_test)

In [90]:
# metrics
print('Accuracy: ', nb.score(X_test, y_test))
print('F1-Score: ', f1_score(y_test, predictions))
print('Confusion matrix:\n', pd.DataFrame(confusion_matrix(y_test, predictions)))

Accuracy:  0.8137019230769231
F1-Score:  0.7753623188405797
Confusion matrix:
      0    1
0  819  300
1   10  535


### Conclusion

**After training the models above, XGBoost, GradientBoosting, AdaBoost, Random Forest and Decision Tree come out with the best accuracy and best f1 score. Among the models that have best accuray and f1 socre, I would choose the model using decision tree since its more simple and need less time to train than GradientBoosting, Adaboost and XGBoost, also the Simplest one.**
```
Chosen model: Decision Tree
Hyperparameter: criterion='entropy', max_depth=2
```

## 3.Feature selection

### Correlation

In [39]:
dui.corr()

Unnamed: 0,Age,Gender,BAC,speeding5MPH,speeding10MPH,Fatality
Age,1.0,0.003697,-0.001608,-0.019959,-0.024643,0.004651
Gender,0.003697,1.0,0.122318,-0.014784,-0.005923,0.679881
BAC,-0.001608,0.122318,1.0,-0.005684,0.012962,-0.136319
speeding5MPH,-0.019959,-0.014784,-0.005684,1.0,0.569499,0.028746
speeding10MPH,-0.024643,-0.005923,0.012962,0.569499,1.0,0.024826
Fatality,0.004651,0.679881,-0.136319,0.028746,0.024826,1.0


**As the correlation showing above, Gender and BAC have higher relationship with Fatality**

### Feature importances of Model

In [41]:
dt.feature_importances_

array([0.        , 0.79795034, 0.19929066, 0.        , 0.00275901])

**As the importances list above, the second and third features which is Gender and BAC have higher score of importance.**

### Conclusion

**Based on correlation and feature importances above, the top two feature contribute to fatality are Gender and BAC**

## 4.Generate Hypotheses

**Using my selected model Decision tree to generate hypotheses**

In [84]:
# load data an transform the gender colum
dui_test = pd.read_csv('data/dui-test.csv')
gender_test = dui_test.iloc[:,1]
encoded_column_test = le.transform(gender_test)
dui_test['Gender'] = encoded_column_test

X_hyps_gen = dui_test.iloc[:,:-1]

In [85]:
# predict on the independent variable
hyps = dt.predict(X_hyps_gen)

In [86]:
# add the hyps to test data
dui_test['Gender'] = gender_test
dui_test['Fatality'] = hyps

# write to dui-test-hypotheses.csv
dui_test.to_csv("dui-test-hypotheses.csv", sep='\t', index=False)