In [2]:
import warnings
warnings.simplefilter("ignore")

import pandas as pd
import matplotlib as plt
import numpy as np
import sklearn as sk
import time

# Read the cleaned taxi-trip sample into a DataFrame.
data = pd.read_csv(filepath_or_buffer="taxi_clean_med.csv")

# Report the (rows, columns) dimensions of the loaded frame.
print(data.shape)

# Preview the leading rows as the cell's rich output.
data.head()

(8050, 19)


Unnamed: 0,trip_distance,fare_amount,winter,spring,summer,fall,PULongitude,PULatitude,DOLongitude,DOLatitude,pickup_datetime,dropoff_datetime,ride_duration,Early morning,Morning,Afternoon,Night,Holiday Proximity,label
0,1.1,7.0,0,1,0,0,-73.984176,40.759845,-73.972145,40.756816,2019-04-29 07:55:30,2019-04-29 08:03:39,0 days 00:08:09.000000000,0,1,0,0,0,B
1,2.6,12.0,0,0,1,0,-73.992455,40.748476,-74.008386,40.735248,2019-08-31 14:26:37,2019-08-31 14:42:27,0 days 00:15:50.000000000,0,0,1,0,0,D
2,1.2,5.5,0,0,1,0,-73.965174,40.756589,-73.951208,40.778496,2019-07-07 17:53:54,2019-07-07 17:57:55,0 days 00:04:01.000000000,0,0,1,0,0,B
3,1.9,10.5,0,1,0,0,-73.987973,40.77577,-73.978367,40.764425,2019-04-30 14:45:33,2019-04-30 14:58:43,0 days 00:13:10.000000000,0,0,1,0,0,D
4,1.0,5.5,0,0,0,1,-73.985214,40.727944,-73.976942,40.747654,2019-09-15 01:52:45,2019-09-15 01:57:18,0 days 00:04:33.000000000,1,0,0,0,0,B


In [3]:
# Parse each timestamp column and immediately re-encode it as a plain number,
# so the columns can be fed to the numeric scalers/classifiers used later.
for column in ('pickup_datetime', 'dropoff_datetime'):
    data[column] = pd.to_numeric(pd.to_datetime(data[column]))

# Replace the textual duration loaded from the CSV with the numeric difference
# of the two converted timestamps (same unit as the converted columns).
data['ride_duration'] = data['dropoff_datetime'] - data['pickup_datetime']

## KNN Analysis

In [6]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score

# Split the frame into predictors and target.
# 'Unnamed: 0' appears to be the row index that was written into the CSV (it
# counts 0, 1, 2, ... in the preview), not a trip attribute, so it is excluded
# from the predictors. errors='ignore' keeps this cell working if the column is
# not present in the frame.
featurevalues = data.drop(columns=['label', 'Unnamed: 0'], errors='ignore')
classlabels = data['label']

# Pipeline stages: min-max scale the features, project them with PCA, then
# classify with k-nearest neighbours.
scaler = MinMaxScaler()
pca = PCA()
knn = KNeighborsClassifier()

pipe = Pipeline(steps=[('scaler', scaler), ('pca', pca), ('knn', knn)])

# PCA cannot keep more components than there are input features, so the upper
# bound of the search follows the actual feature count rather than a literal.
n_features = featurevalues.shape[1]

# Hyperparameters to tune, addressed as '<step name>__<parameter>'.
param_grid = {
    'pca__n_components': list(range(1, n_features + 1)),  # how many principal components to keep
    'knn__n_neighbors': list(range(1, 30)),               # candidate values of k: 1..29
}

# Exhaustive search over the grid with 5-fold cross-validation. The unscaled
# features are passed in; scaling happens inside the pipeline on each fold.
grid_pipe = GridSearchCV(pipe, param_grid, cv=5)
grid_pipe = grid_pipe.fit(featurevalues, classlabels)

# Report the best mean CV score and the parameter combination that achieved it.
print("best_score", grid_pipe.best_score_)
print("best_params", grid_pipe.best_params_)

best_score 0.5937888198757764
best_params {'knn__n_neighbors': 1, 'pca__n_components': 12}


In [7]:
# Estimate accuracy of the whole tuning procedure with an outer 5-fold CV:
# the GridSearchCV object itself is the estimator, so the inner search is
# re-run on the training portion of every outer fold.
scores = cross_val_score(estimator=grid_pipe, X=featurevalues, y=classlabels, cv=5)

# Mean of the per-fold accuracies, expressed as a percentage.
mean_accuracy_pct = scores.mean()*100
print("Accuracy:", mean_accuracy_pct)

Accuracy: 59.38079032843297


## SVM Analysis

In [9]:
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler

# Baseline SVM: standardise the features, then classify with an SVC that uses
# its default hyperparameters.
scaler = StandardScaler()
svc = SVC()

# Assemble the two stages into a single estimator.
svm_steps = [('scaler', scaler), ('svc', svc)]
pipe = Pipeline(steps=svm_steps)

# NOTE: despite its name, `pipeline` holds the array of per-fold scores
# returned by the 5-fold cross-validation, not a Pipeline object.
pipeline = cross_val_score(estimator=pipe, X=featurevalues, y=classlabels, cv=5)
print("Accuracy:", pipeline.mean()*100)

Accuracy: 79.62815514695144


In [10]:
# Stage 1: tune only the 'kernel' hyperparameter of the pipeline's 'svc' step
# and print the kernel that wins the 5-fold grid search.
param_grid = {'svc__kernel': ['linear', 'rbf', 'poly', 'sigmoid']}

gridsearch = GridSearchCV(pipe, param_grid, cv=5)
gridsearch = gridsearch.fit(featurevalues, classlabels)
print(gridsearch.best_params_)

# Stage 2: search kernel and C jointly. Candidate C values are 50, 55, ..., 105.
c = list(range(50, 110, 5))
param_grid = {'svc__kernel': ['linear', 'rbf', 'poly', 'sigmoid'], 'svc__C': c}
grid_search = GridSearchCV(pipe, param_grid, cv=5)

# grid_search is handed to cross_val_score unfitted: each of the 5 outer folds
# runs its own inner grid search, so the printed figure is the accuracy of the
# tuning procedure as a whole. The selected C is not reported by this cell;
# fit grid_search and read best_params_ if the chosen value is needed.
# NOTE(review): the recorded run shows ~99.96% here, which is high enough to
# suggest 'label' may be derivable from one of the features - confirm how the
# label was constructed before trusting this score.
grid_accuracy = cross_val_score(grid_search, featurevalues, classlabels, cv=5)
print("Accuracy:", grid_accuracy.mean()*100)

{'svc__kernel': 'linear'}
Accuracy: 99.96265558880566
