# **Taxi Dynamic Pricing System **   
Team Number: DC19022 - 1st March 2019
____________________________________________________________________________________________________________________________________________

## 1. Data Processing

### a. Load Data
First of all, we concatenate all 12 files into a total dataset file, solving wrong placement of delimiter. The total dataset file contain 12 months’ data, which is over 4 GB. In order to handle this huge dataset file, we upload it to Kaggle for future processing.

In [1]:
#Import required libraries
import numpy as np
from numpy import nan as NA
import pandas as pd
from pandas import DataFrame
import matplotlib.pyplot as plt
import os
import seaborn as sns
from sklearn.cluster import KMeans

In [2]:
# Load data
taxi_df = pd.read_csv('../input/taxi_final.csv', delimiter = ',', low_memory = True )

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
# Column names
taxi_df.columns

Index(['Type', 'PROVIDER NAME', 'StartDateTime', 'DateCreated', 'ID',
       'ExternalID', 'FareAmount', 'GratuityAmount', 'SurchargeAmount',
       'ExtraFareAmount', 'TollAmount', 'TotalAmount', 'PaymentType',
       'StartDateTime.1', 'EndDateTime', 'OriginStreetNumber',
       'OriginStreetName', 'OriginCity', 'OriginState', 'OriginZip',
       'OriginLatitude', 'OriginLongitude', 'DestinationStreetNumber',
       'DestinationStreetName', 'DestinationCity', 'DestinationState',
       'DestinationZip', 'DestinationLatitude', 'DestinationLongitude',
       'Milage', 'Duration', 'Unnamed: 31'],
      dtype='object')

In [4]:
# Drop useless columns
taxi_df.drop(['DateCreated', 'StartDateTime.1', 'Unnamed: 31'], axis=1, inplace=True)

In [5]:
# Remove nulls and wrong values
taxi_df = taxi_df[taxi_df['FareAmount'] > 0]
taxi_df = taxi_df[taxi_df['GratuityAmount'] >= 0]
taxi_df = taxi_df[taxi_df['SurchargeAmount'] >= 0]
taxi_df = taxi_df[taxi_df['ExtraFareAmount'] >= 0]
taxi_df = taxi_df[taxi_df['TotalAmount'] > 0]
taxi_df = taxi_df[taxi_df['Milage'] > 0]
taxi_df = taxi_df[taxi_df['Milage'] < 500]

In [54]:
taxi_df.head()

Unnamed: 0,Type,PROVIDER NAME,StartDateTime,ID,ExternalID,FareAmount,GratuityAmount,SurchargeAmount,ExtraFareAmount,TollAmount,TotalAmount,PaymentType,EndDateTime,OriginStreetNumber,OriginStreetName,OriginCity,OriginState,OriginZip,OriginLatitude,OriginLongitude,DestinationStreetNumber,DestinationStreetName,DestinationCity,DestinationState,DestinationZip,DestinationLatitude,DestinationLongitude,Milage,Duration,Date,Hour,Weekday,DayofWeek,IsWeekday,Label
0,1,Yellow Cab,2017-12-01 00:01:06,B467__82709__2017-12-01T00:09:54.403_36188498,36188498,7.57,1.83,0.25,0.25,0.0,11.0,1.0,2017-12-01 00:09:51,,"2660 WOODLEY RD NW, WASHINGTON, DC 20008, USA",,DC,20008,38.92492,-77.05333,,"1526 K ST NW, WASHINGTON, DC 20005, USA",,DC,20005,38.90232,-77.0356,2.11,8.0,2017-12-01,0,4,Friday,1,1
1,1,Yellow Cab,2017-12-01 00:05:27,B369__52337__2017-12-01T00:11:43.467_36188500,36188500,4.6,0.0,0.25,1.25,0.0,7.47,1.0,2017-12-01 00:11:40,,"400 7TH ST NW, WASHINGTON, DC 20004, USA",,DC,20004,38.89493,-77.02221,,"1335 F ST NW, WASHINGTON, DC 20004, USA",,DC,20004,38.89737,-77.03103,0.62,6.0,2017-12-01,0,4,Friday,1,1
2,1,Yellow Cab,2017-12-01 00:07:01,K781__91115__2017-12-01T00:13:01.313_36188501,36188501,5.14,1.0,0.25,0.25,0.0,8.01,1.0,2017-12-01 00:12:58,,"1345 VERMONT AVE NW, WASHINGTON, DC 20005, USA",,DC,20005,38.90776,-77.03051,,"1631-1651 18 ST NW, WASHINGTON, DC 20009, USA",,DC,20009,38.91212,-77.04165,0.96,5.0,2017-12-01,0,4,Friday,1,1
3,1,Yellow Cab,2017-12-01 00:04:23,B395__70930__2017-12-01T00:15:08.803_36188503,36188503,11.89,0.0,0.25,1.25,0.0,13.95,2.0,2017-12-01 00:15:07,,"715-731 7TH ST NW, WASHINGTON, DC 20001, USA",,DC,20001,38.89933,-77.02192,,"1405 N SCOTT ST, ARLINGTON, VA 22209, USA",,VA,22209,38.88981,-77.08011,4.05,10.0,2017-12-01,0,4,Friday,1,1
4,1,Yellow Cab,2017-12-01 00:01:41,B477__78157__2017-12-01T00:15:40.900_36188504,36188504,6.22,0.0,0.25,2.25,0.0,14.41,2.0,2017-12-01 00:15:39,,"331 PENNSYLVANIA AVENUE SOUTHEAST, WASHINGTON,...",,DC,20003,38.88637,-77.00117,,"1803 POTOMAC AVE SE, WASHINGTON, DC 20003, USA",,DC,20003,38.88304,-76.97803,1.37,13.0,2017-12-01,0,4,Friday,1,1


In [6]:
taxi_df = taxi_df.drop_duplicates()

## 1.2 Features Engineering

In [7]:
# Change data format
taxi_df['StartDateTime'] = pd.to_datetime(taxi_df['StartDateTime'], errors='coerce')
taxi_df['EndDateTime'] = pd.to_datetime(taxi_df['EndDateTime'])

# Generate detailed columns of datetime
taxi_df['Date'] = taxi_df['StartDateTime'].dt.date
taxi_df['Hour'] = taxi_df['StartDateTime'].dt.hour
taxi_df['Weekday'] = taxi_df['StartDateTime'].dt.weekday
taxi_df['DayofWeek'] = taxi_df['StartDateTime'].dt.weekday_name


In [None]:
# Plot the heat map
time_map = pd.pivot_table(taxi_df,index=['DayofWeek'],columns =['Hour'],aggfunc='size')
fig, ax = plt.subplots(figsize=(20,15)) 
ax=sns.heatmap(time_map,linewidths=0.1,square=True,cmap='YlGnBu')

In [8]:
# Classify weekdays and weekend
x = 0
taxi_df['IsWeekday'] = [x+1 if i<6 else x for i in taxi_df['Weekday']]

In [9]:
# Remove nulls and wrong values
taxi_df['StartDateTime'].dropna(inplace=True)
taxi_df = taxi_df[taxi_df['StartDateTime'] < taxi_df['EndDateTime']]

In [10]:
df1 = taxi_df[['IsWeekday','Hour','Weekday', 'Milage', 'Duration']]
df1 = df1.reset_index()

In [11]:
taxi_dfCleaned = taxi_df.dropna()
print(len(taxi_dfCleaned))

2137880


In [None]:
import seaborn as sns
sns.heatmap(taxi_dfCleaned.corr())

In [None]:
taxi_dfCleaned = pd.concat([taxi_dfCleaned,pd.get_dummies(taxi_dfCleaned['PROVIDER NAME'], prefix='PROVIDER NAME')],axis=1)
taxi_dfCleaned.drop(['PROVIDER NAME'],axis=1, inplace=True)

In [None]:
taxi_dfCleaned.head()

## 2. Clustering   

In [None]:
taxi_df_sampled = df1.sample(frac = .01,random_state=123)

In [None]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# Apply clustering algorithm K-Means
def KMeansClustering(number_Components):
     return KMeans(n_clusters=number_Components, n_init=10)

Components = [i for i in range(2,5)]

for component in Components:
    print("##### Number OF Clusters",component)
    
    KMclusterer = KMeansClustering(component)
    KMclusterer.fit(taxi_df_sampled)
    
    # Predict the cluster for each data point
    KMpreds = KMclusterer.predict(taxi_df_sampled)
    
    print("Predictions")
    # Calculate the mean silhouette coefficient for the number of clusters chosen
    KMscore = silhouette_score(taxi_df_sampled,KMpreds)
    
    print("silhouette score = ",KMscore)

In [None]:
from sklearn.mixture import GaussianMixture

# Apply clustering algorithm GMM
def GMMClustering(number_Components):
     
    cluster = GaussianMixture(n_components=number_Components, covariance_type='full',init_params='kmeans')
    return cluster

Components = [i for i in range(2,5)]
for component in Components:
    print("##### Number OF Clusters",component)
    clusterer = GMMClustering(component)
    clusterer.fit(taxi_df_sampled)
    # Predict the cluster for each data point
    preds = clusterer.predict(taxi_df_sampled)

    # Calculate the mean silhouette coefficient for the number of clusters chosen
    score = silhouette_score(taxi_df_sampled,preds)
    print("silhouette score = ",score)


## Apply Algorithm that return better scores on all data

In [79]:
df2 = taxi_df[['IsWeekday','Hour','Weekday', 'Milage', 'Duration']]
df2 = df2.reset_index()

In [80]:
num_clusters = 2

In [81]:
# Apply K-Means Algorithm in all the data
KM_Cluster = KMeans(n_clusters=num_clusters,random_state=0)
KM_Cluster.fit(df2)
taxi_df['Label'] = KM_Cluster.labels_

In [82]:
print(np.unique(KM_Cluster.labels_))

[0 1]


In [83]:
Cluster0 = taxi_df[taxi_df['Label'] == 0]
Cluster1 = taxi_df[taxi_df['Label'] == 1]

In [84]:
print(len(Cluster0),len(Cluster1))

5052040 5436926


In [85]:
Cluster0 = Cluster0.dropna()
Cluster1 = Cluster1.dropna()

In [92]:
x0 = Cluster0[['Milage','Duration','IsWeekday','SurchargeAmount','OriginLatitude','OriginLongitude','DestinationLatitude','DestinationLongitude',
              'GratuityAmount', 'ExtraFareAmount', 'TollAmount', 'FareAmount']]
y0 = Cluster0['TotalAmount']
x1 = Cluster1[['Milage','Duration','IsWeekday','SurchargeAmount','OriginLatitude','OriginLongitude','DestinationLatitude','DestinationLongitude',
              'GratuityAmount', 'ExtraFareAmount', 'TollAmount', 'FareAmount']]
y1 = Cluster1['TotalAmount']

In [93]:
from sklearn.model_selection import train_test_split
X_train0, X_test0, y_train0, y_test0 = train_test_split(x0, y0, test_size = 0.2, random_state=9)
X_train1, X_test1, y_train1, y_test1 = train_test_split(x1, y1, test_size = 0.2, random_state=9)

In [94]:
# import the regressor 
from sklearn.tree import DecisionTreeRegressor  
  
# create a regressor object 
regressor = DecisionTreeRegressor(random_state = 0)  
  
# fit the regressor with X and Y data 
regressor.fit(X_train0, y_train0) 

DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=0, splitter='best')

In [95]:
from sklearn.metrics import mean_squared_error, r2_score
prediction= regressor.predict(X_test0)

test_set_rmse = (np.sqrt(mean_squared_error(y_test0, prediction)))
test_set_r2 = r2_score(y_test0, prediction)

print(test_set_rmse)
print(test_set_r2)

0.26125063977576607
0.9994509748737035


In [96]:
# import the regressor 
from sklearn.tree import DecisionTreeRegressor  
  
# create a regressor object 
regressor = DecisionTreeRegressor(random_state = 0)  
  
# fit the regressor with X and Y data 
regressor.fit(X_train1, y_train1) 

DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=0, splitter='best')

In [97]:
from sklearn.metrics import mean_squared_error, r2_score
prediction= regressor.predict(X_test1)
test_set_rmse = (np.sqrt(mean_squared_error(y_test1, prediction)))
test_set_r2 = r2_score(y_test1, prediction)

print(test_set_rmse)
print(test_set_r2)

0.375372332303672
0.9987505582024485
