In [1]:
import warnings
warnings.simplefilter("ignore")

import pandas as pd
import matplotlib as plt
import numpy as np
import sklearn as sk
import time

from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPClassifier


In [2]:
#import data 
data = pd.read_csv("taxi_clean_med.csv")

#display data
print(data.shape)
      
data.head()

(8050, 19)


Unnamed: 0,trip_distance,fare_amount,winter,spring,summer,fall,PULongitude,PULatitude,DOLongitude,DOLatitude,pickup_datetime,dropoff_datetime,ride_duration,Early morning,Morning,Afternoon,Night,Holiday Proximity,label
0,1.1,7.0,0,1,0,0,-73.984176,40.759845,-73.972145,40.756816,2019-04-29 07:55:30,2019-04-29 08:03:39,0 days 00:08:09.000000000,0,1,0,0,0,B
1,2.6,12.0,0,0,1,0,-73.992455,40.748476,-74.008386,40.735248,2019-08-31 14:26:37,2019-08-31 14:42:27,0 days 00:15:50.000000000,0,0,1,0,0,D
2,1.2,5.5,0,0,1,0,-73.965174,40.756589,-73.951208,40.778496,2019-07-07 17:53:54,2019-07-07 17:57:55,0 days 00:04:01.000000000,0,0,1,0,0,B
3,1.9,10.5,0,1,0,0,-73.987973,40.77577,-73.978367,40.764425,2019-04-30 14:45:33,2019-04-30 14:58:43,0 days 00:13:10.000000000,0,0,1,0,0,D
4,1.0,5.5,0,0,0,1,-73.985214,40.727944,-73.976942,40.747654,2019-09-15 01:52:45,2019-09-15 01:57:18,0 days 00:04:33.000000000,1,0,0,0,0,B


In [3]:
data['pickup_datetime'] = pd.to_datetime(data['pickup_datetime'])
data['pickup_datetime'] = pd.to_numeric(data['pickup_datetime'])
data['dropoff_datetime'] = pd.to_datetime(data['dropoff_datetime'])
data['dropoff_datetime'] = pd.to_numeric(data['dropoff_datetime'])

data['ride_duration'] = data['dropoff_datetime'] - data['pickup_datetime']



In [4]:
#split our data 
#data_X = our attributes
#data_y = labels

data_X = data.drop(['label'], axis = 1)
data_y = data['label']


In [5]:
#start with naive bayes
#define our classifier
clf = GaussianNB()

#run a cross validation loop on our data and return the accuracy
scores = cross_val_score(clf, data_X, data_y, cv = 10)
print('Accuracy:',scores.mean())

Accuracy: 0.5298258996139544


In [6]:
#display results with a confusion matrix 
y_pred = cross_val_predict( clf, data_X, data_y, cv = 10)
y_pred

print('Confusion Matrix:\n', confusion_matrix(data_y , y_pred),'\n')
print("Classification Report:\n\n", classification_report(data_y, y_pred))

Confusion Matrix:
 [[   0  564    0    0    0    0    0    0    0    0]
 [   0 1970   28    0    0    0    0    0    0    0]
 [   0  642  974   33    0    0    0    0    0    0]
 [   0   56  652  423    2    0    0    0    0    0]
 [   0   10  123  406  187    8    1    0    0    0]
 [   0    1   43  124  195   73   19    0    0    3]
 [   0    0   25   65  104   63   26    0    0   48]
 [   0    0   10   28   55   33   12    0    0   67]
 [   0    0    7   24   26   26    6    0    0   73]
 [   0    2    3   28   76   65   29    0    0  612]] 

Classification Report:

               precision    recall  f1-score   support

           A       0.00      0.00      0.00       564
           B       0.61      0.99      0.75      1998
           C       0.52      0.59      0.55      1649
           D       0.37      0.37      0.37      1133
           E       0.29      0.25      0.27       735
           F       0.27      0.16      0.20       458
           G       0.28      0.08      0.12 

In [7]:
# now lets use a neural net

#create classifier object
mlp = MLPClassifier()

#define scaler
scaler = StandardScaler()
#create pipeline
pipe = Pipeline(steps = [('scaler',scaler),('MLP',mlp)])

#define parameters
param_grid = { 'MLP__hidden_layer_sizes' : [(10,), (15,) ] , 
              'MLP__activation' : ['logistic', 'tanh']}

#create grid search object
grid_search = GridSearchCV(pipe, param_grid, cv = 5)

#calculate score
score = cross_val_score(grid_search, data_X, data_y, cv = 5)

#print results
print('Average Accuracy:',score.mean())


Average Accuracy: 0.9535441943993883


In [8]:
#display results with a confusion matrix 
y_pred = cross_val_predict( mlp, data_X, data_y, cv = 10)
y_pred

print('Confusion Matrix:\n', confusion_matrix(data_y , y_pred),'\n')
print("Classification Report:\n\n", classification_report(data_y, y_pred))

Confusion Matrix:
 [[  57  339    0   56  112    0    0    0    0    0]
 [ 200 1199    0  200  399    0    0    0    0    0]
 [ 165  989    0  165  330    0    0    0    0    0]
 [ 113  681    0  113  226    0    0    0    0    0]
 [  74  441    0   73  147    0    0    0    0    0]
 [  46  275    0   46   91    0    0    0    0    0]
 [  33  199    0   33   66    0    0    0    0    0]
 [  21  123    0   20   41    0    0    0    0    0]
 [  16   98    0   16   32    0    0    0    0    0]
 [  82  482    7   81  163    0    0    0    0    0]] 

Classification Report:

               precision    recall  f1-score   support

           A       0.07      0.10      0.08       564
           B       0.25      0.60      0.35      1998
           C       0.00      0.00      0.00      1649
           D       0.14      0.10      0.12      1133
           E       0.09      0.20      0.13       735
           F       0.00      0.00      0.00       458
           G       0.00      0.00      0.00 

When running a cross validation loop, I got an accuracy of 9

## KNN Analysis

In [20]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline

featurevalues = data.drop(['label'], axis = 1)
classlabels = data['label']

# define a pipeline to search for best combination of PCA dimensions and n_neighbors
scaler = MinMaxScaler()
pca = PCA()
knn = KNeighborsClassifier()

# create a pipeline
pipe = Pipeline(steps=[('scaler', scaler), ('pca', pca), ('knn', knn)])

# set up parameters to tune for each step in pipeline
param_grid = {
    'pca__n_components': list(range(1, 19)), # find how many principal componenet to keep
    'knn__n_neighbors': list(range(1, 30)),  # find the best value of k
}

# pass pipeline into gridsearchcv
grid_pipe = GridSearchCV(pipe,param_grid,cv=5)

# call fit on grid_pipe and pass in unscaled data
grid_pipe = grid_pipe.fit(featurevalues,classlabels)

# print out the best_score_ and best_params_ from the GridSearchCV
print("best_score",grid_pipe.best_score_)
print("best_params",grid_pipe.best_params_)

best_score 0.5937888198757764
best_params {'knn__n_neighbors': 1, 'pca__n_components': 12}


In [21]:
# display accuracy on model
scores = cross_val_score(grid_pipe,featurevalues,classlabels,cv=5)
print("Accuracy:", scores.mean()*100)

Accuracy: 59.38079032843297


## SVM Analysis

In [25]:
from sklearn.svm import SVC

scaler = StandardScaler()
svc = SVC()

# set up pipeline
pipe = Pipeline(steps=[('scaler',scaler),('svc',svc)])
pipeline = cross_val_score(pipe,featurevalues,classlabels,cv=5)
print("Accuracy:", pipeline.mean()*100)

Accuracy: 79.62815514695144


In [30]:
# tune 'svm' part of the pipeline, 'kernel' hyperparameter
param_grid = {'svc__kernel': ['linear', 'rbf', 'poly', 'sigmoid']}

# find and print best parameter
gridsearch = GridSearchCV(pipe,param_grid,cv=5)
gridsearch = gridsearch.fit(featurevalues,classlabels)
print(gridsearch.best_params_)

# find best value of c and print accuracy
c = []
for x in range(50,110,5):
    c.append(x)
param_grid = {'svc__kernel':['linear','rbf','poly','sigmoid'],'svc__C':c}
grid_search = GridSearchCV(pipe,param_grid,cv=5)
grid_accuracy = cross_val_score(grid_search,featurevalues,classlabels,cv=5)
print("Accuracy:",grid_accuracy.mean()*100)

{'svc__kernel': 'linear'}
Accuracy: 99.96265558880566
