In [5]:
import numpy as np
import pandas as pd
from sklearn.utils import shuffle

# NYC taxi. March, 2016
data = pd.read_csv(filepath_or_buffer='yellow_tripdata_2016-03.csv')
# Initial data dimensions
print(data.shape)
data.head()

(12210952, 19)


Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,pickup_longitude,pickup_latitude,RatecodeID,store_and_fwd_flag,dropoff_longitude,dropoff_latitude,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount
0,1,2016-03-01 00:00:00,2016-03-01 00:07:55,1,2.5,-73.976746,40.765152,1,N,-74.004265,40.746128,1,9.0,0.5,0.5,2.05,0.0,0.3,12.35
1,1,2016-03-01 00:00:00,2016-03-01 00:11:06,1,2.9,-73.983482,40.767925,1,N,-74.005943,40.733166,1,11.0,0.5,0.5,3.05,0.0,0.3,15.35
2,2,2016-03-01 00:00:00,2016-03-01 00:31:06,2,19.98,-73.782021,40.64481,1,N,-73.974541,40.67577,1,54.5,0.5,0.5,8.0,0.0,0.3,63.8
3,2,2016-03-01 00:00:00,2016-03-01 00:00:00,3,10.78,-73.863419,40.769814,1,N,-73.96965,40.757767,1,31.5,0.0,0.5,3.78,5.54,0.3,41.62
4,2,2016-03-01 00:00:00,2016-03-01 00:00:00,5,30.43,-73.971741,40.792183,3,N,-74.17717,40.695053,1,98.0,0.0,0.0,0.0,15.5,0.3,113.8


In [6]:
# Choosed features:
data = data[['VendorID', 'tpep_pickup_datetime', 'tpep_dropoff_datetime',
             'pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude']]
print(data.shape)
data.head()

(12210952, 7)


Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude
0,1,2016-03-01 00:00:00,2016-03-01 00:07:55,-73.976746,40.765152,-74.004265,40.746128
1,1,2016-03-01 00:00:00,2016-03-01 00:11:06,-73.983482,40.767925,-74.005943,40.733166
2,2,2016-03-01 00:00:00,2016-03-01 00:31:06,-73.782021,40.64481,-73.974541,40.67577
3,2,2016-03-01 00:00:00,2016-03-01 00:00:00,-73.863419,40.769814,-73.96965,40.757767
4,2,2016-03-01 00:00:00,2016-03-01 00:00:00,-73.971741,40.792183,-74.17717,40.695053


In [7]:
# Clear invalid trips, i.e. dropoff earlier than pickup
data = data.loc[data.tpep_pickup_datetime < data.tpep_dropoff_datetime]
data.shape

(12198402, 7)

In [8]:
# Checking data distribution
data.describe()

Unnamed: 0,VendorID,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude
count,12198400.0,12198400.0,12198400.0,12198400.0,12198400.0
mean,1.531154,-72.87561,40.14585,-73.00812,40.21979
std,0.4990285,8.946978,4.928559,8.396655,4.625489
min,1.0,-161.6987,0.0,-161.6987,0.0
25%,1.0,-73.99174,40.73632,-73.99126,40.73471
50%,2.0,-73.98161,40.75325,-73.97963,40.75378
75%,2.0,-73.96678,40.76765,-73.96236,40.76912
max,2.0,0.0,66.85682,0.0,50.79786


In [9]:
# Difference between Min and Max of latitudes and longitudes are abnormally high
# but overall distribution is fine. Need to clean out outliers (Someone's traveling to Africa by taxi xD)
# We can suppose the distribution is normal and keep data in range of [mean-2*std,mean+2*std]

# Calculate means
# Pickup latitude
plat_mean = np.mean(data['pickup_latitude'])
# Pickup longitude
plong_mean = np.mean(data['pickup_longitude'])
# Dropoff latitude
dlat_mean = np.mean(data['dropoff_latitude'])
# Dropoff longitude
dlong_mean = np.mean(data['dropoff_longitude'])
plat_mean, plong_mean, dlat_mean, dlong_mean

(40.145849766934894, -72.87560904146224, 40.21979101957289, -73.00811566130828)

In [10]:
# Calculate stds
# Pickup latitude
plat_std = np.std(data['pickup_latitude'])
# Pickup longitude
plong_std = np.std(data['pickup_longitude'])
# Dropoff latitude
dlat_std = np.std(data['dropoff_latitude'])
# Dropoff longitude
dlong_std = np.std(data['dropoff_longitude'])
plat_std, plong_std, dlat_std, dlong_std

(4.928558813737272, 8.946977428814003, 4.62548910033178, 8.396654436315325)

In [11]:
# Pickup latitude constraints
data = data.loc[(data.pickup_latitude >= (plat_mean-2*plat_std)) &
              (data.pickup_latitude <= (plat_mean+2*plat_std))]
# Pickup longitude constraints
data = data.loc[(data.pickup_longitude >= (plong_mean-2*plong_std)) &
                (data.pickup_longitude <= (plong_mean+2*plong_std))]
# Dropoff latitude constraints
data = data.loc[(data.dropoff_latitude >= (dlat_mean-2*dlat_std)) &
                (data.dropoff_latitude <= (dlat_mean+2*dlat_std))]
# Dropoff longitude constraints
data = data.loc[(data.dropoff_longitude >= (dlong_mean-2*dlong_std)) &
                (data.dropoff_longitude <= (dlong_mean+2*dlong_std))]
data.shape

(12011540, 7)

In [12]:
# Convert string to datetime
data['tpep_pickup_datetime'] = pd.to_datetime(data['tpep_pickup_datetime'], format='%Y-%m-%d %H:%M:%S')
data['tpep_dropoff_datetime'] = pd.to_datetime(data['tpep_dropoff_datetime'], format='%Y-%m-%d %H:%M:%S')
# Disassemble pickup datetime
data.loc[data.tpep_pickup_datetime.dt.weekday < 5, 'is_weekday'] = 1
data.loc[data.tpep_pickup_datetime.dt.weekday < 5, 'is_weekend'] = 0
data.loc[data.tpep_pickup_datetime.dt.weekday >= 5, 'is_weekday'] = 0
data.loc[data.tpep_pickup_datetime.dt.weekday >= 5, 'is_weekend'] = 1
data['minute_of_the_day'] = data.tpep_pickup_datetime.dt.hour * 60 + data.tpep_pickup_datetime.dt.minute
# Calculate trip time
data['trip_time'] = (data.tpep_dropoff_datetime - data.tpep_pickup_datetime).dt.total_seconds()

# Delete datetime column
data = data.drop('tpep_pickup_datetime', axis=1)
data = data.drop('tpep_dropoff_datetime', axis=1)

data.head()

Unnamed: 0,VendorID,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,is_weekday,is_weekend,minute_of_the_day,trip_time
0,1,-73.976746,40.765152,-74.004265,40.746128,1.0,0.0,0,475.0
1,1,-73.983482,40.767925,-74.005943,40.733166,1.0,0.0,0,666.0
2,2,-73.782021,40.64481,-73.974541,40.67577,1.0,0.0,0,1866.0
7,1,-73.788773,40.647758,-73.829208,40.712345,1.0,0.0,0,963.0
8,1,-73.958221,40.764641,-73.967896,40.762901,1.0,0.0,0,299.0


In [13]:
# Getting X and y
y = data['trip_time'].values
data = data.drop('trip_time', axis=1)
X = data.loc[:,:].values

# Shuffle and split to training and testing sets
X, y = shuffle(X, y, random_state=19)
offset = int(X.shape[0] * 0.9)
X_train, y_train = X[:offset], y[:offset]
X_test, y_test = X[offset:], y[offset:]

X_train.shape, y_train.shape, X_test.shape, y_test.shape

((10810386, 8), (10810386,), (1201154, 8), (1201154,))

In [16]:
X_train

array([[ 2.00000000e+00, -7.39743500e+01,  4.07836952e+01, ...,
         1.00000000e+00,  0.00000000e+00,  1.38600000e+03],
       [ 1.00000000e+00, -7.39781265e+01,  4.07629471e+01, ...,
         1.00000000e+00,  0.00000000e+00,  8.07000000e+02],
       [ 1.00000000e+00, -7.38730698e+01,  4.07740974e+01, ...,
         1.00000000e+00,  0.00000000e+00,  9.66000000e+02],
       ...,
       [ 2.00000000e+00, -7.39619827e+01,  4.07706757e+01, ...,
         0.00000000e+00,  1.00000000e+00,  6.06000000e+02],
       [ 2.00000000e+00, -7.39696884e+01,  4.07531128e+01, ...,
         1.00000000e+00,  0.00000000e+00,  5.70000000e+02],
       [ 2.00000000e+00, -7.39689484e+01,  4.07583199e+01, ...,
         0.00000000e+00,  1.00000000e+00,  1.50000000e+01]])

In [18]:
regr = linear_model.LinearRegression()

In [19]:
# Train the model using the training sets
regr.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [20]:
# Make predictions using the testing set
y_pred = regr.predict(X_test)

In [21]:
# The coefficients
print('Coefficients: \n', regr.coef_)
# The mean squared error
print("Mean squared error: %.2f"
      % mean_squared_error(y_test, y_pred))
# Explained variance score: 1 is perfect prediction
print('Variance score: %.2f' % r2_score(y_test, y_pred))

Coefficients: 
 [ 2.40141970e+02  3.95644389e+03 -3.20067842e+03  2.24820088e+03
 -2.99871134e+03  1.74614066e+01 -1.74614066e+01  2.23757633e-02]
Mean squared error: 14395404.76
Variance score: 0.01


In [24]:
# Plot outputs
import matplotlib.pyplot as plt
plt.scatter(X_test, y_test,  color='black')
plt.plot(X_test, y_pred, color='blue', linewidth=3)

plt.xticks(())
plt.yticks(())

plt.show()

ValueError: x and y must be the same size