In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import graphviz

from matplotlib import pyplot as plt

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn import tree
from sklearn import ensemble

In [2]:
# read in data set with categorical variables turned into dummy variablees
df = pd.read_csv('data/cleaned_data/md_dum.csv')

# create X and y values for modeling, this time using a mask to create 1 and 0 values for classification
car_c_y = df.car_acc_score.mask(df.car_acc_score > 0, 1)
car_X = df.drop(columns=['Unnamed: 0', 'car_acc_score', 'car_dens_score', 'bike_dens_score'])
bike_c_y = df.bike_acc_score.mask(df.bike_acc_score > 0, 1)
bike_X = df.drop(columns=['Unnamed: 0', 'bike_acc_score', 'car_dens_score', 'bike_dens_score'])

In [3]:
# train test split
X_car_c_train, X_car_c_test, y_car_c_train, y_car_c_test = train_test_split(car_X, car_c_y, test_size=0.3, 
                                                                            random_state=18,
                                                                            shuffle=True, stratify=car_c_y)
X_bike_c_train, X_bike_c_test, y_bike_c_train, y_bike_c_test = train_test_split(bike_X, bike_c_y, test_size=0.3, 
                                                                            random_state=18,
                                                                            shuffle=True, stratify=bike_c_y)

In [6]:
params = {'criterion' : ['gini', 'entropy'],
         'splitter' : ['best', 'random'],
         'max_depth' : [10, 20, 50, 100],
         'min_samples_split': [2, 5, 10]}
dtc = tree.DecisionTreeClassifier(random_state=18)
cv = GridSearchCV(dtc, param_grid=params)
cv.fit(X_car_c_train, y_car_c_train)

# Compute and print metrics
print("Accuracy: {}".format(cv.score(X_car_c_test, y_car_c_test)))
print("Tuned Model Parameters: {}".format(cv.best_params_))

Accuracy: 0.8881593740162792
Tuned Model Parameters: {'criterion': 'entropy', 'max_depth': 100, 'min_samples_split': 2, 'splitter': 'best'}


This is only slightly better than the original decision tree classifier. Trying again with some different parameters, but I may want to tune some other ensemble methods and see if they can be made better.

In [12]:
dtc = tree.DecisionTreeClassifier(criterion='entropy', splitter='best', min_samples_split=2, random_state=18)
params = {'max_depth': [60, 75, 100, 200, 300, 400, 500]}
cv = GridSearchCV(dtc, param_grid=params)
cv.fit(X_car_c_train, y_car_c_train)

# Compute and print metrics
print("Accuracy: {}".format(cv.score(X_car_c_test, y_car_c_test)))
print("Tuned Model Parameters: {}".format(cv.best_params_))

Accuracy: 0.8881593740162792
Tuned Model Parameters: {'max_depth': 75}


In [8]:
from sklearn.metrics import roc_curve, auc
y_pred = cv.predict(X_car_c_test)
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_car_c_test, y_pred)
roc_auc = auc(false_positive_rate, true_positive_rate)
roc_auc

0.8853117171894272

In [None]:
rfc = ensemble.RandomForestClassifier(random_state=18)
params = {'n_estimators': [10, 100, 500, 1000],
         'criterion': ['gini', 'entropy'],
         'max_depth' : [10, 20, 50, 75, 100, 200],
         'bootstrap': [True, False],
         'ccp_alpha': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000]}
cv = GridSearchCV(rfc, param_grid=params)
cv.fit(X_car_c_train, y_car_c_train)

# Compute and print metrics
print("Accuracy: {}".format(cv.score(X_car_c_test, y_car_c_test)))
print("Tuned Model Parameters: {}".format(cv.best_params_))

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
scaler = StandardScaler()
steps = [('scaler', scaler), ('ridge_regression', ri)]
params = {'ridge_regression__alpha': [0.001, 0.01, 0.1, 1, 10, 100, 1000]}
pipeline = Pipeline(steps)
cv = GridSearchCV(pipeline, param_grid=params)
cv.fit(X_car_train, y_car_train)
y_pred = cv.predict(X_car_test)

# Compute and print metrics
print("Accuracy: {}".format(cv.score(X_car_test, y_car_test)))
print("Tuned Model Parameters: {}".format(cv.best_params_))

In [None]:
from sklearn.linear_model import Lasso
la = Lasso()
steps = [('scaler', scaler), ('lasso', la)]
params = {'lasso__alpha': [0.001, 0.01, 0.1, 1, 10, 100, 1000]}
pipeline = Pipeline(steps)
cv = GridSearchCV(pipeline, param_grid=params)
cv.fit(X_car_train, y_car_train)
y_pred = cv.predict(X_car_test)

# Compute and print metrics
print("Accuracy: {}".format(cv.score(X_car_test, y_car_test)))
print("Tuned Model Parameters: {}".format(cv.best_params_))

In [None]:
from sklearn.linear_model import Lasso
la = Lasso(max_iter=10000)
steps = [('scaler', scaler), ('lasso', la)]
params = {'lasso__alpha': [0.001, 0.01, 0.1, 1, 10, 100, 1000]}
pipeline = Pipeline(steps)
cv = GridSearchCV(pipeline, param_grid=params)
cv.fit(X_car_train, y_car_train)
y_pred = cv.predict(X_car_test)

 