In [122]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
import warnings
warnings.filterwarnings('ignore')

In [123]:
dui = pd.read_csv('data/dui.csv')
dui['Fatality'].where(dui['Fatality']<=0, 1, True)
gender = dui.iloc[:,1]
le = LabelEncoder()
le.fit(gender)
encoded_column = le.transform(gender)
dui['Gender'] = encoded_column
X = dui.iloc[:,:-1]
y = dui.iloc[:,-1]

In [124]:
print(len(dui[dui['Fatality']==0]))
print(len(dui[dui['Fatality']==1]))

5628
2690


In [125]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [126]:
print(len(y_train[y_train==0]))
print(len(y_train[y_train==1]))
print(len(y_test[y_test==0]))
print(len(y_test[y_test==1]))

4501
2153
1127
537


In [127]:
# dui_test = pd.read_csv('data/dui-test.csv')
# dui_test['Fatality'].where(dui_test['Fatality']<=0, 1, True)
# gender_test = dui_test.iloc[:,1]
# le_test = LabelEncoder()
# le_test.fit(gender_test)
# encoded_column_test = le_test.transform(gender_test)
# dui_test['Gender'] = encoded_column_test
# X_test = dui_test.iloc[:,0:-1]
# y_test = dui_test.iloc[:,-1]

### Logistic Regression

In [128]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(class_weight={0:0.68, 1:0.32})
lr.fit(X_train, y_train)
predictions = lr.predict(X_test)

In [129]:
print('Accuracy: ', lr.score(X_test, y_test))
print('Confusion matrixe:\n', pd.DataFrame(confusion_matrix(y_test, predictions)))

Accuracy:  0.8671875
Confusion matrixe:
      0    1
0  995  132
1   89  448


### Decision Tree

In [130]:
from sklearn.tree import DecisionTreeClassifier

dt = DecisionTreeClassifier()
folds = 10
grid_params = {'criterion':('entropy', 'gini'), 'max_depth':[2,3,4,5,6]}
classifier = GridSearchCV(dt, grid_params, cv=folds)
classifier.fit(X_train, y_train)

GridSearchCV(cv=10, error_score='raise-deprecating',
       estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'criterion': ('entropy', 'gini'), 'max_depth': [2, 3, 4, 5, 6]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [131]:
print('Best score: ', classifier.best_score_, '\nBest Tree: ', classifier.best_estimator_)

Best score:  0.8918106686701728 
Best Tree:  DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=5,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')


In [132]:
dt = classifier.best_estimator_
predictions = dt.predict(X_test)

In [133]:
print('Accuracy: ', dt.score(X_test, y_test))
print('Confusion matrixe:\n', pd.DataFrame(confusion_matrix(y_test, predictions)))

Accuracy:  0.9014423076923077
Confusion matrixe:
       0    1
0  1005  122
1    42  495


### Random Forest

In [134]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier()
folds = 10
grid_params = {'criterion':('entropy', 'gini'), 'max_depth':[2,3,4,5,6]}
classifier = GridSearchCV(rf, grid_params, cv=folds)
classifier.fit(X_train, y_train)

GridSearchCV(cv=10, error_score='raise-deprecating',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'criterion': ('entropy', 'gini'), 'max_depth': [2, 3, 4, 5, 6]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [135]:
print('Best score: ', classifier.best_score_, '\nBest Tree: ', classifier.best_estimator_)

Best score:  0.8918106686701728 
Best Tree:  RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=5, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)


In [136]:
rf = classifier.best_estimator_
predictions = rf.predict(X_test)

In [137]:
print('Accuracy: ', rf.score(X_test, y_test))
print('Confusion matrixe:\n', pd.DataFrame(confusion_matrix(y_test, predictions)))

Accuracy:  0.9014423076923077
Confusion matrixe:
       0    1
0  1005  122
1    42  495


### AdaBoost

In [138]:
from sklearn.ensemble import AdaBoostClassifier

ab = AdaBoostClassifier()
folds = 10
grid_params = {'learning_rate':[0.01, 0.001, 0.0001]}
classifier = GridSearchCV(ab, grid_params, cv=folds)
classifier.fit(X_train, y_train)

GridSearchCV(cv=10, error_score='raise-deprecating',
       estimator=AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=50, random_state=None),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'learning_rate': [1, 0.01, 0.001, 0.0001]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [139]:
print('Best score: ', classifier.best_score_, '\nBest Tree: ', classifier.best_estimator_)

Best score:  0.8851990984222389 
Best Tree:  AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=0.01, n_estimators=50, random_state=None)


In [140]:
ab = classifier.best_estimator_
predictions = ab.predict(X_test)

In [141]:
print('Accuracy: ', ab.score(X_test, y_test))
print('Confusion matrixe:\n', pd.DataFrame(confusion_matrix(y_test, predictions)))

Accuracy:  0.9014423076923077
Confusion matrixe:
       0    1
0  1005  122
1    42  495


### GradientBoosting

In [187]:
from sklearn.ensemble import GradientBoostingClassifier

gb = GradientBoostingClassifier()
folds = 10
grid_params = {'learning_rate':[0.1, 0.001, 0.0001], 'max_depth':[2,3,4,5,6]}
classifier = GridSearchCV(gb, grid_params, cv=folds)
classifier.fit(X_train, y_train)

GridSearchCV(cv=10, error_score='raise-deprecating',
       estimator=GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_sampl...      subsample=1.0, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'learning_rate': [0.1, 0.001, 0.0001], 'max_depth': [2, 3, 4, 5, 6]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [188]:
print('Best score: ', classifier.best_score_, '\nBest Tree: ', classifier.best_estimator_)

Best score:  0.8916604057099925 
Best Tree:  GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=2,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              n_iter_no_change=None, presort='auto', random_state=None,
              subsample=1.0, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False)


In [189]:
gb = classifier.best_estimator_
predictions = gb.predict(X_test)

In [190]:
print('Accuracy: ', gb.score(X_test, y_test))
print('Confusion matrixe:\n', pd.DataFrame(confusion_matrix(y_test, predictions)))

Accuracy:  0.9014423076923077
Confusion matrixe:
       0    1
0  1005  122
1    42  495


### KNN

In [146]:
from sklearn.neighbors import KNeighborsClassifier

X_train_knn, X_val_knn, y_train_knn, y_val_knn = train_test_split(X_train, y_train, test_size=0.33)

results = []
for k in range(1, 50):
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train_knn, y_train_knn)
    results.append([k, knn.score(X_val_knn, y_val_knn)])

In [147]:
results

[[1, 0.8279472007282658],
 [2, 0.8033682294037323],
 [3, 0.837960855712335],
 [4, 0.8152025489303596],
 [5, 0.8229403732362313],
 [6, 0.8088302230314065],
 [7, 0.8142922166590806],
 [8, 0.8042785616750113],
 [9, 0.810195721438325],
 [10, 0.8015475648611743],
 [11, 0.8097405553026855],
 [12, 0.7928994082840237],
 [13, 0.8024578971324533],
 [14, 0.7947200728265817],
 [15, 0.8097405553026855],
 [16, 0.8015475648611743],
 [17, 0.8070095584888485],
 [18, 0.8056440600819299],
 [19, 0.8038233955393719],
 [20, 0.8042785616750113],
 [21, 0.812016385980883],
 [22, 0.8106508875739645],
 [23, 0.8161128812016386],
 [24, 0.812926718252162],
 [25, 0.8192990441511152],
 [26, 0.8161128812016386],
 [27, 0.8161128812016386],
 [28, 0.8142922166590806],
 [29, 0.8161128812016386],
 [30, 0.8088302230314065],
 [31, 0.8233955393718707],
 [32, 0.809285389167046],
 [33, 0.8170232134729176],
 [34, 0.811106053709604],
 [35, 0.8170232134729176],
 [36, 0.812926718252162],
 [37, 0.815657715065999],
 [38, 0.8092853891

In [148]:
knn = KNeighborsClassifier(n_neighbors=3)
predictions = knn.predict(X_test)

In [149]:
print('Accuracy: ', knn.score(X_test, y_test))
print('Confusion matrixe:\n', pd.DataFrame(confusion_matrix(y_test, predictions)))

Accuracy:  0.8407451923076923
Confusion matrixe:
      0    1
0  985  142
1  123  414


### XGBoost

In [183]:
from xgboost import XGBClassifier

xgb = XGBClassifier(learning_rate=0.001, max_depth=2)
folds = 5
grid_params = {'learning_rate':[0.1, 0.001, 0.0001], 'max_depth':[1,2,3,4,5,6]}
classifier = GridSearchCV(xgb, grid_params, cv=folds)
classifier.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.001, max_delta_step=0,
       max_depth=2, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'learning_rate': [0.1, 0.001, 0.0001], 'max_depth': [1, 2, 3, 4, 5, 6]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [184]:
print('Best score: ', classifier.best_score_, '\nBest Tree: ', classifier.best_estimator_)

Best score:  0.8922614575507137 
Best Tree:  XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.001, max_delta_step=0,
       max_depth=5, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='multi:softprob', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)


In [185]:
xgb = classifier.best_estimator_
predictions = xgb.predict(X_test)

In [186]:
print('Accuracy: ', xgb.score(X_test, y_test))
print('Confusion matrixe:\n', pd.DataFrame(confusion_matrix(y_test, predictions)))

Accuracy:  0.8960336538461539
Confusion matrixe:
       0    1
0  1001  126
1    47  490


### Neural Network

In [152]:
from sklearn.neural_network import MLPClassifier

mc = MLPClassifier(hidden_layer_sizes=(200,100,), learning_rate_init=0.001, solver='adam', activation='logistic')
mc.fit(X_train, y_train)
predictions = mc.predict(X_test)

In [153]:
print('Accuracy: ', mc.score(X_test, y_test))
print('Confusion matrixe:\n', pd.DataFrame(confusion_matrix(y_test, predictions)))

Accuracy:  0.875
Confusion matrixe:
       0    1
0  1026  101
1   107  430
