In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix

In [2]:
dui = pd.read_csv('data/dui.csv')
dui['Fatality'].where(dui['Fatality']<=0, 1, True)
gender = dui.iloc[:,1]
le = LabelEncoder()
le.fit(gender)
encoded_column = le.transform(gender)
dui['Gender'] = encoded_column
X_train = dui.iloc[:,0:-1]
y_train = dui.iloc[:,-1]

In [168]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)

In [3]:
# dui_test = pd.read_csv('data/dui-test.csv')
# dui_test['Fatality'].where(dui_test['Fatality']<=0, 1, True)
# gender_test = dui_test.iloc[:,1]
# le_test = LabelEncoder()
# le_test.fit(gender_test)
# encoded_column_test = le_test.transform(gender_test)
# dui_test['Gender'] = encoded_column_test
# X_test = dui_test.iloc[:,0:-1]
# y_test = dui_test.iloc[:,-1]

TypeError: '<=' not supported between instances of 'str' and 'int'

### Logistic Regression

In [169]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression()
lr.fit(X_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [170]:
predictions = lr.predict(X_test)

In [171]:
print('Accuracy: ', lr.score(X_test, y_test))
print('Confusion matrixe:\n', pd.DataFrame(confusion_matrix(y_test, predictions)))

Accuracy:  0.8543335761107065
Confusion matrixe:
       0    1
0  1467  371
1    29  879


### Decision Tree

In [172]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

dt = DecisionTreeClassifier()
folds = 10
grid_params = {'criterion':('entropy', 'gini'), 'max_depth':[2,3,4,5,6]}
classifier = GridSearchCV(dt, grid_params, cv=folds)
classifier.fit(X_train, y_train)



GridSearchCV(cv=10, error_score='raise-deprecating',
       estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'criterion': ('entropy', 'gini'), 'max_depth': [2, 3, 4, 5, 6]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [173]:
print('Best score: ', classifier.best_score_, '\nBest Tree: ', classifier.best_estimator_)

Best score:  0.8919791853579759 
Best Tree:  DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=2,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')


In [174]:
dt = DecisionTreeClassifier(criterion='entropy', max_depth=2)
dt.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=2,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [175]:
predictions = dt.predict(X_test)

In [176]:
print('Accuracy: ', dt.score(X_test, y_test))
print('Confusion matrixe:\n', pd.DataFrame(confusion_matrix(y_test, predictions)))

Accuracy:  0.8969410050983249
Confusion matrixe:
       0    1
0  1626  212
1    71  837


### Random Forest

In [177]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier()
folds = 10
grid_params = {'criterion':('entropy', 'gini'), 'max_depth':[2,3,4,5,6]}
classifier = GridSearchCV(rf, grid_params, cv=folds)
classifier.fit(X_train, y_train)







GridSearchCV(cv=10, error_score='raise-deprecating',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'criterion': ('entropy', 'gini'), 'max_depth': [2, 3, 4, 5, 6]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [178]:
print('Best score: ', classifier.best_score_, '\nBest Tree: ', classifier.best_estimator_)

Best score:  0.8919791853579759 
Best Tree:  RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=3, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)


In [179]:
rf = RandomForestClassifier(criterion='entropy', max_depth=3)
rf.fit(X_train, y_train)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=3, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [180]:
predictions = rf.predict(X_test)

In [181]:
print('Accuracy: ', rf.score(X_test, y_test))
print('Confusion matrixe:\n', pd.DataFrame(confusion_matrix(y_test, predictions)))

Accuracy:  0.8969410050983249
Confusion matrixe:
       0    1
0  1626  212
1    71  837


### AdaBoost

In [182]:
from sklearn.ensemble import AdaBoostClassifier

ab = AdaBoostClassifier()
folds = 10
grid_params = {'learning_rate':[1, 0.01, 0.001, 0.0001]}
classifier = GridSearchCV(ab, grid_params, cv=folds)
classifier.fit(X_train, y_train)



GridSearchCV(cv=10, error_score='raise-deprecating',
       estimator=AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=50, random_state=None),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'learning_rate': [1, 0.01, 0.001, 0.0001]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [183]:
print('Best score: ', classifier.best_score_, '\nBest Tree: ', classifier.best_estimator_)

Best score:  0.8828279203301633 
Best Tree:  AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=0.01, n_estimators=50, random_state=None)


In [184]:
ab = AdaBoostClassifier(learning_rate=0.01)
ab.fit(X_train, y_train)

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=0.01, n_estimators=50, random_state=None)

In [185]:
predictions = ab.predict(X_test)

In [186]:
print('Accuracy: ', ab.score(X_test, y_test))
print('Confusion matrixe:\n', pd.DataFrame(confusion_matrix(y_test, predictions)))

Accuracy:  0.8969410050983249
Confusion matrixe:
       0    1
0  1626  212
1    71  837


### KNN

In [187]:
from sklearn.neighbors import KNeighborsClassifier

results = []
for k in range(1, 20):
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train)
    results.append([k, knn.score(X_test, y_test)])
results

[[1, 0.8383102694828842],
 [2, 0.7946103423160962],
 [3, 0.8455935906773488],
 [4, 0.8237436270939549],
 [5, 0.8419519300801165],
 [6, 0.8284777858703569],
 [7, 0.8419519300801165],
 [8, 0.8244719592134013],
 [9, 0.8361252731245448],
 [10, 0.8233794610342317],
 [11, 0.8328477785870357],
 [12, 0.8204661325564457],
 [13, 0.8295702840495266],
 [14, 0.8219227967953386],
 [15, 0.8310269482884195],
 [16, 0.8281136198106337],
 [17, 0.8284777858703569],
 [18, 0.817916970138383],
 [19, 0.8270211216314639]]

In [188]:
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform')

In [189]:
predictions = knn.predict(X_test)

In [190]:
print('Accuracy: ', knn.score(X_test, y_test))
print('Confusion matrixe:\n', pd.DataFrame(confusion_matrix(y_test, predictions)))

Accuracy:  0.8419519300801165
Confusion matrixe:
       0    1
0  1546  292
1   142  766
