In [141]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV

In [142]:
dui = pd.read_csv('data/dui.csv')
dui['Fatality'].where(dui['Fatality']<=0, 1, True)
gender = dui.iloc[:,1]
le = LabelEncoder()
le.fit(gender)
encoded_column = le.transform(gender)
dui['Gender'] = encoded_column
X = dui.iloc[:,:-1]
y = dui.iloc[:,-1]

In [143]:
print(len(dui[dui['Fatality']==0]))
print(len(dui[dui['Fatality']==1]))

5628
2690


In [144]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)

In [145]:
print(len(y_train[y_train==0]))
print(len(y_train[y_train==1]))
print(len(y_test[y_test==0]))
print(len(y_test[y_test==1]))

3750
1822
1878
868


In [146]:
# dui_test = pd.read_csv('data/dui-test.csv')
# dui_test['Fatality'].where(dui_test['Fatality']<=0, 1, True)
# gender_test = dui_test.iloc[:,1]
# le_test = LabelEncoder()
# le_test.fit(gender_test)
# encoded_column_test = le_test.transform(gender_test)
# dui_test['Gender'] = encoded_column_test
# X_test = dui_test.iloc[:,0:-1]
# y_test = dui_test.iloc[:,-1]

### Logistic Regression

In [147]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(class_weight={0:0.68, 1:0.32})
lr.fit(X_train, y_train)
predictions = lr.predict(X_test)



In [148]:
print('Accuracy: ', lr.score(X_test, y_test))
print('Confusion matrixe:\n', pd.DataFrame(confusion_matrix(y_test, predictions)))

Accuracy:  0.8579752367079388
Confusion matrixe:
       0    1
0  1630  248
1   142  726


### Decision Tree

In [149]:
from sklearn.tree import DecisionTreeClassifier

dt = DecisionTreeClassifier()
folds = 10
grid_params = {'criterion':('entropy', 'gini'), 'max_depth':[2,3,4,5,6]}
classifier = GridSearchCV(dt, grid_params, cv=folds)
classifier.fit(X_train, y_train)



GridSearchCV(cv=10, error_score='raise-deprecating',
       estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'criterion': ('entropy', 'gini'), 'max_depth': [2, 3, 4, 5, 6]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [150]:
print('Best score: ', classifier.best_score_, '\nBest Tree: ', classifier.best_estimator_)

Best score:  0.8907231293737664 
Best Tree:  DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=2,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')


In [151]:
dt = classifier.best_estimator_
dt.fit(X_train, y_train)
predictions = dt.predict(X_test)

In [152]:
print('Accuracy: ', dt.score(X_test, y_test))
print('Confusion matrixe:\n', pd.DataFrame(confusion_matrix(y_test, predictions)))

Accuracy:  0.8994901675163874
Confusion matrixe:
       0    1
0  1670  208
1    68  800


### Random Forest

In [153]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier()
folds = 10
grid_params = {'criterion':('entropy', 'gini'), 'max_depth':[2,3,4,5,6]}
classifier = GridSearchCV(rf, grid_params, cv=folds)
classifier.fit(X_train, y_train)







GridSearchCV(cv=10, error_score='raise-deprecating',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'criterion': ('entropy', 'gini'), 'max_depth': [2, 3, 4, 5, 6]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [154]:
print('Best score: ', classifier.best_score_, '\nBest Tree: ', classifier.best_estimator_)

Best score:  0.8907231293737664 
Best Tree:  RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=3, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)


In [155]:
rf = classifier.best_estimator_
rf.fit(X_train, y_train)
predictions = rf.predict(X_test)

In [156]:
print('Accuracy: ', rf.score(X_test, y_test))
print('Confusion matrixe:\n', pd.DataFrame(confusion_matrix(y_test, predictions)))

Accuracy:  0.8994901675163874
Confusion matrixe:
       0    1
0  1670  208
1    68  800


### AdaBoost

In [157]:
from sklearn.ensemble import AdaBoostClassifier

ab = AdaBoostClassifier()
folds = 10
grid_params = {'learning_rate':[1, 0.01, 0.001, 0.0001]}
classifier = GridSearchCV(ab, grid_params, cv=folds)
classifier.fit(X_train, y_train)



GridSearchCV(cv=10, error_score='raise-deprecating',
       estimator=AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=50, random_state=None),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'learning_rate': [1, 0.01, 0.001, 0.0001]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [158]:
print('Best score: ', classifier.best_score_, '\nBest Tree: ', classifier.best_estimator_)

Best score:  0.8842634128835457 
Best Tree:  AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=0.01, n_estimators=50, random_state=None)


In [159]:
ab = classifier.best_estimator_
ab.fit(X_train, y_train)
predictions = ab.predict(X_test)

In [160]:
print('Accuracy: ', ab.score(X_test, y_test))
print('Confusion matrixe:\n', pd.DataFrame(confusion_matrix(y_test, predictions)))

Accuracy:  0.8994901675163874
Confusion matrixe:
       0    1
0  1670  208
1    68  800


### GradientBoosting

In [161]:
from sklearn.ensemble import GradientBoostingClassifier

gb = GradientBoostingClassifier()
folds = 10
grid_params = {'learning_rate':[1, 0.1, 0.001, 0.0001]}
classifier = GridSearchCV(gb, grid_params, cv=folds)
classifier.fit(X_train, y_train)



GridSearchCV(cv=10, error_score='raise-deprecating',
       estimator=GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_sampl...      subsample=1.0, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'learning_rate': [1, 0.1, 0.001, 0.0001]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [162]:
print('Best score: ', classifier.best_score_, '\nBest Tree: ', classifier.best_estimator_)

Best score:  0.8896465099587296 
Best Tree:  GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              n_iter_no_change=None, presort='auto', random_state=None,
              subsample=1.0, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False)


In [163]:
gb = classifier.best_estimator_
gb.fit(X_train, y_train)
predictions = gb.predict(X_test)

In [164]:
print('Accuracy: ', gb.score(X_test, y_test))
print('Confusion matrixe:\n', pd.DataFrame(confusion_matrix(y_test, predictions)))

Accuracy:  0.8980335032774945
Confusion matrixe:
       0    1
0  1666  212
1    68  800


### KNN

In [165]:
from sklearn.neighbors import KNeighborsClassifier

X_train_knn, X_val_knn, y_train_knn, y_val_knn = train_test_split(X_train, y_train, test_size=0.33)

results = []
for k in range(1, 50):
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train_knn, y_train_knn)
    results.append([k, knn.score(X_val_knn, y_val_knn)])

In [166]:
results

[[1, 0.8141304347826087],
 [2, 0.7945652173913044],
 [3, 0.8239130434782609],
 [4, 0.8119565217391305],
 [5, 0.8206521739130435],
 [6, 0.8125],
 [7, 0.8130434782608695],
 [8, 0.8059782608695653],
 [9, 0.8179347826086957],
 [10, 0.8054347826086956],
 [11, 0.8141304347826087],
 [12, 0.803804347826087],
 [13, 0.8184782608695652],
 [14, 0.8],
 [15, 0.8141304347826087],
 [16, 0.8010869565217391],
 [17, 0.8141304347826087],
 [18, 0.8059782608695653],
 [19, 0.8168478260869565],
 [20, 0.8103260869565218],
 [21, 0.8179347826086957],
 [22, 0.8059782608695653],
 [23, 0.8190217391304347],
 [24, 0.8114130434782608],
 [25, 0.8125],
 [26, 0.8027173913043478],
 [27, 0.808695652173913],
 [28, 0.8081521739130435],
 [29, 0.8114130434782608],
 [30, 0.8114130434782608],
 [31, 0.8097826086956522],
 [32, 0.8],
 [33, 0.8097826086956522],
 [34, 0.7945652173913044],
 [35, 0.8],
 [36, 0.7891304347826087],
 [37, 0.7918478260869565],
 [38, 0.7809782608695652],
 [39, 0.7864130434782609],
 [40, 0.7771739130434783],


In [167]:
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, y_train)
predictions = knn.predict(X_test)

In [168]:
print('Accuracy: ', knn.score(X_test, y_test))
print('Confusion matrixe:\n', pd.DataFrame(confusion_matrix(y_test, predictions)))

Accuracy:  0.8492352512745812
Confusion matrixe:
       0    1
0  1627  251
1   163  705


### Neural Network

In [169]:
from sklearn.neural_network import MLPClassifier

mc = MLPClassifier(hidden_layer_sizes=(200,), learning_rate_init=0.001, solver='adam', activation='logistic')
mc.fit(X_train, y_train)
predictions = mc.predict(X_test)

In [170]:
print('Accuracy: ', mc.score(X_test, y_test))
print('Confusion matrixe:\n', pd.DataFrame(confusion_matrix(y_test, predictions)))

Accuracy:  0.880189366351056
Confusion matrixe:
       0    1
0  1594  284
1    45  823
