In [16]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import dask.dataframe as dd

In [17]:
# paths for the three dfs

df_jan_path = "/content/yellow_tripdata_2016-01.csv"
df_feb_path = "/content/yellow_tripdata_2016-02.csv"
df_mar_path = "/content/yellow_tripdata_2016-03.csv"

# load the dataframes

df_jan = dd.read_csv(df_jan_path, assume_missing=True, usecols= ['trip_distance', 'tpep_pickup_datetime', 'pickup_longitude',
       'pickup_latitude','dropoff_longitude', 'dropoff_latitude', 'fare_amount'], parse_dates=["tpep_pickup_datetime"])

df_feb = dd.read_csv(df_feb_path, assume_missing=True, usecols= ['trip_distance', 'tpep_pickup_datetime', 'pickup_longitude',
       'pickup_latitude','dropoff_longitude', 'dropoff_latitude', 'fare_amount'], parse_dates=["tpep_pickup_datetime"])


df_mar = dd.read_csv(df_mar_path, assume_missing=True, usecols= ['trip_distance', 'tpep_pickup_datetime', 'pickup_longitude',
       'pickup_latitude','dropoff_longitude', 'dropoff_latitude', 'fare_amount'], parse_dates=["tpep_pickup_datetime"])

In [18]:
df_jan

Unnamed: 0_level_0,tpep_pickup_datetime,trip_distance,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,fare_amount
npartitions=1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
,datetime64[ns],float64,float64,float64,float64,float64,float64
,...,...,...,...,...,...,...


In [19]:
# concat the three dataframes as one

df_final = dd.concat([df_jan, df_feb, df_mar], axis=0)

In [20]:
df_final

Unnamed: 0_level_0,tpep_pickup_datetime,trip_distance,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,fare_amount
npartitions=3,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
,datetime64[ns],float64,float64,float64,float64,float64,float64
,...,...,...,...,...,...,...
,...,...,...,...,...,...,...
,...,...,...,...,...,...,...


In [21]:
# set the values of coordinates

min_latitude = 40.60
max_latitude = 40.85

min_longitude = -74.05
max_longitude = -73.70

min_fare_amount_val = 0.50
max_fare_amount_val = 81.0

min_trip_distance_val = 0.25
max_trip_distance_val = 24.43

In [22]:
# select data points within the given ranges

df_final = df_final.loc[(df_final["pickup_latitude"].between(min_latitude, max_latitude, inclusive="both")) &
(df_final["pickup_longitude"].between(min_longitude, max_longitude, inclusive="both")) &
(df_final["dropoff_latitude"].between(min_latitude, max_latitude, inclusive="both")) &
(df_final["dropoff_longitude"].between(min_longitude, max_longitude, inclusive="both")), :]

In [23]:
# select data points within the given ranges

df_final = df_final.loc[(df_final["fare_amount"].between(min_fare_amount_val,max_fare_amount_val,inclusive="both")) &
(df_final["trip_distance"].between(min_trip_distance_val,max_trip_distance_val,inclusive="both"))]

In [24]:
# drop columns from the data

df_final = df_final.drop(columns=['trip_distance', 'dropoff_longitude', 'dropoff_latitude', 'fare_amount'])

In [25]:
df_final

Unnamed: 0_level_0,tpep_pickup_datetime,pickup_longitude,pickup_latitude
npartitions=3,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
,datetime64[ns],float64,float64
,...,...,...
,...,...,...
,...,...,...


In [26]:
df_final = df_final.compute()

In [27]:
# save the dataframe

df_final.to_csv("df_final.csv")

In [28]:
from sklearn.cluster import MiniBatchKMeans
from sklearn.preprocessing import StandardScaler

In [29]:
data_path = "/content/df_final.csv"

In [30]:
df_reader = pd.read_csv(data_path, chunksize=100000, usecols=["pickup_latitude","pickup_longitude"])

In [31]:
# train the standard scaler

scaler = StandardScaler()

for chunk in df_reader:
    # fit the scaler
    scaler.partial_fit(chunk)

In [32]:
scaler

In [33]:
df_reader = pd.read_csv(data_path, chunksize=100000, usecols=["pickup_latitude","pickup_longitude"])

In [34]:
# train the model

mini_batch = MiniBatchKMeans(n_clusters=30, n_init=10, random_state=42)

for chunk in df_reader:
    # scale the chunk
    scaled_chunk = scaler.transform(chunk)
    # train the model
    mini_batch.partial_fit(scaled_chunk)

In [35]:
mini_batch

In [36]:
# centroids of the model

mini_batch.cluster_centers_

array([[ 0.73045884,  2.04239054],
       [-0.1173906 , -0.11849253],
       [ 3.47954831, -0.89529678],
       [ 5.48527202, -3.69790641],
       [-0.48663087, -1.04017627],
       [ 0.62788202,  0.92046449],
       [-0.34914079,  0.51518862],
       [ 0.01473087, -2.32907509],
       [-0.81572385, -0.46187737],
       [ 0.58036026, -1.23243074],
       [ 0.31455281,  0.33690916],
       [ 0.22105344,  1.71983443],
       [ 1.57598103,  0.6153353 ],
       [ 1.01624471,  2.91013889],
       [-0.3848926 , -0.72141732],
       [-0.51584289,  0.08019297],
       [ 1.33254777, -1.84857295],
       [-0.99012149, -1.312862  ],
       [-0.52340502, -2.17818074],
       [ 1.37546799, -0.05675938],
       [ 0.3712285 ,  0.6753905 ],
       [ 0.02751467,  1.36163402],
       [-0.01557346,  0.26816605],
       [ 0.80808335, -2.99996012],
       [-0.1860004 ,  0.98411707],
       [ 0.65884912,  1.24555345],
       [-0.37695364, -0.33759611],
       [ 3.04968943,  0.78005214],
       [-0.47919175,

In [37]:
scaler.inverse_transform(mini_batch.cluster_centers_)

array([[-73.94920774,  40.80730104],
       [-73.9787375 ,  40.74671307],
       [-73.85345965,  40.72493263],
       [-73.78360226,  40.64635159],
       [-73.99159778,  40.72087042],
       [-73.95278039,  40.77584389],
       [-73.98680913,  40.76448056],
       [-73.97413584,  40.6847316 ],
       [-74.00305976,  40.73708507],
       [-73.95443553,  40.71547989],
       [-73.96369334,  40.75948186],
       [-73.96694983,  40.79825705],
       [-73.91975903,  40.76728852],
       [-73.9392541 ,  40.83163143],
       [-73.98805433,  40.72980795],
       [-73.9926152 ,  40.75228392],
       [-73.92823757,  40.69820417],
       [-74.00913386,  40.71322472],
       [-73.99287859,  40.68896246],
       [-73.9267427 ,  40.74844398],
       [-73.96171938,  40.76897238],
       [-73.97369059,  40.78821364],
       [-73.97519131,  40.75755441],
       [-73.94650415,  40.66592097],
       [-73.98112711,  40.77762862],
       [-73.95170184,  40.78495891],
       [-73.98777783,  40.74056973],
 

In [38]:
# perform predictions and assign clusters

location_subset = df_final[df_final.columns[1:]]

location_subset

Unnamed: 0,pickup_longitude,pickup_latitude
0,-73.990372,40.734695
1,-73.980782,40.729912
2,-73.984550,40.679565
3,-73.993469,40.718990
4,-73.960625,40.781330
...,...,...
106884,-73.980759,40.733898
106885,-73.971992,40.794201
106886,-73.989906,40.756729
106887,-74.000534,40.747726


In [39]:
# scale the input data

scaled_location_subset = scaler.transform(location_subset)

scaled_location_subset

array([[-0.45142819, -0.54710418],
       [-0.1760788 , -0.71771339],
       [-0.28429089, -2.51332769],
       ...,
       [-0.43806596,  0.23873217],
       [-0.74320654, -0.08235055],
       [ 0.4182115 ,  0.44417069]])

In [40]:
# get the cluster predictions

cluster_predictions = mini_batch.predict(scaled_location_subset)

cluster_predictions.shape

(307176,)

In [41]:
# save the cluster predictions in data

df_final['region'] = cluster_predictions

df_final

Unnamed: 0,tpep_pickup_datetime,pickup_longitude,pickup_latitude,region
0,2016-01-01 00:00:00,-73.990372,40.734695,14
1,2016-01-01 00:00:00,-73.980782,40.729912,14
2,2016-01-01 00:00:00,-73.984550,40.679565,7
3,2016-01-01 00:00:00,-73.993469,40.718990,4
4,2016-01-01 00:00:00,-73.960625,40.781330,25
...,...,...,...,...
106884,2016-03-01 06:53:20,-73.980759,40.733898,14
106885,2016-03-01 06:53:20,-73.971992,40.794201,11
106886,2016-03-01 06:53:20,-73.989906,40.756729,15
106887,2016-03-01 06:53:20,-74.000534,40.747726,15


In [42]:
# drop the latitude and logitude columns from data

time_series_data = df_final.drop(columns=["pickup_latitude","pickup_longitude"])

time_series_data

Unnamed: 0,tpep_pickup_datetime,region
0,2016-01-01 00:00:00,14
1,2016-01-01 00:00:00,14
2,2016-01-01 00:00:00,7
3,2016-01-01 00:00:00,4
4,2016-01-01 00:00:00,25
...,...,...
106884,2016-03-01 06:53:20,14
106885,2016-03-01 06:53:20,11
106886,2016-03-01 06:53:20,15
106887,2016-03-01 06:53:20,15


In [43]:
# save the time series data

time_series_data.to_csv("time_series.csv", index=False)

In [44]:
time_series_data.dtypes

Unnamed: 0,0
tpep_pickup_datetime,datetime64[ns]
region,int32


In [45]:
# set the time series as the index

time_series_data.set_index('tpep_pickup_datetime', inplace=True)

time_series_data

Unnamed: 0_level_0,region
tpep_pickup_datetime,Unnamed: 1_level_1
2016-01-01 00:00:00,14
2016-01-01 00:00:00,14
2016-01-01 00:00:00,7
2016-01-01 00:00:00,4
2016-01-01 00:00:00,25
...,...
2016-03-01 06:53:20,14
2016-03-01 06:53:20,11
2016-03-01 06:53:20,15
2016-03-01 06:53:20,15


In [47]:
# group the time series data based on regions

region_grp = time_series_data.groupby("region")

region_grp

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7d152e731fd0>

In [48]:
# check for missing values

time_series_data.isna().sum()

Unnamed: 0,0
region,0


In [49]:
# resample the time series in 15 minute intervals

resampled_data = (
    region_grp['region']
    .resample("15min")
    .count()
)

resampled_data

Unnamed: 0_level_0,Unnamed: 1_level_0,region
region,tpep_pickup_datetime,Unnamed: 2_level_1
0,2016-01-01 00:00:00,46
0,2016-01-01 00:15:00,110
0,2016-01-01 00:30:00,133
0,2016-01-01 00:45:00,137
0,2016-01-01 01:00:00,177
...,...,...
29,2016-03-10 13:15:00,0
29,2016-03-10 13:30:00,0
29,2016-03-10 13:45:00,1
29,2016-03-10 14:00:00,1


In [50]:
resampled_data.name = "total_pickups"

In [51]:
resampled_data = resampled_data.reset_index(level=0)

resampled_data

Unnamed: 0_level_0,region,total_pickups
tpep_pickup_datetime,Unnamed: 1_level_1,Unnamed: 2_level_1
2016-01-01 00:00:00,0,46
2016-01-01 00:15:00,0,110
2016-01-01 00:30:00,0,133
2016-01-01 00:45:00,0,137
2016-01-01 01:00:00,0,177
...,...,...
2016-03-10 13:15:00,29,0
2016-03-10 13:30:00,29,0
2016-03-10 13:45:00,29,1
2016-03-10 14:00:00,29,1


In [52]:
# zeros in the data

(resampled_data['total_pickups'] == 0).sum()

np.int64(196871)

In [53]:
epsilon_val = 10

resampled_data.replace({'total_pickups': {0 : epsilon_val}}, inplace=True)

In [54]:
(resampled_data['total_pickups'] == 0).sum()

np.int64(0)

# Smoothing

## Moving Average

In [55]:
from sklearn.metrics import mean_absolute_percentage_error

In [56]:
window_values = list(range(3,11,1))
window_values

[3, 4, 5, 6, 7, 8, 9, 10]

In [57]:
def calculate_best_window_value(windows):
    for window in windows:
        ind = window - 1
        y_pred = resampled_data['total_pickups'].rolling(window=window).mean().values[ind:]
        y = resampled_data['total_pickups'].values[ind:]
        error = mean_absolute_percentage_error(y, y_pred)
        print(f"For window value {window}, the MAPE is {error:.2f}")

In [58]:
calculate_best_window_value(window_values)

For window value 3, the MAPE is 0.02
For window value 4, the MAPE is 0.03
For window value 5, the MAPE is 0.03
For window value 6, the MAPE is 0.04
For window value 7, the MAPE is 0.04
For window value 8, the MAPE is 0.05
For window value 9, the MAPE is 0.06
For window value 10, the MAPE is 0.06


## EWMA

In [59]:
resampled_data['total_pickups'].ewm(alpha=0.9).mean()

Unnamed: 0_level_0,total_pickups
tpep_pickup_datetime,Unnamed: 1_level_1
2016-01-01 00:00:00,46.000000
2016-01-01 00:15:00,104.181818
2016-01-01 00:30:00,130.144144
2016-01-01 00:45:00,136.315032
2016-01-01 01:00:00,172.931869
...,...
2016-03-10 13:15:00,9.992161
2016-03-10 13:30:00,9.999216
2016-03-10 13:45:00,1.899922
2016-03-10 14:00:00,1.089992


In [60]:
smoothing_values = np.arange(0.2,1,0.1)
smoothing_values

array([0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9])

In [61]:
def calculate_best_smoothing_value(values):
    y = resampled_data['total_pickups'].values
    for value in values:
        y_pred = resampled_data['total_pickups'].ewm(alpha=value).mean()
        error = mean_absolute_percentage_error(y, y_pred)
        print(f"For smoothing value {value:.1f}, the MAPE is {error:.2f}")

In [62]:
calculate_best_smoothing_value(smoothing_values)

For smoothing value 0.2, the MAPE is 0.05
For smoothing value 0.3, the MAPE is 0.03
For smoothing value 0.4, the MAPE is 0.02
For smoothing value 0.5, the MAPE is 0.02
For smoothing value 0.6, the MAPE is 0.01
For smoothing value 0.7, the MAPE is 0.01
For smoothing value 0.8, the MAPE is 0.00
For smoothing value 0.9, the MAPE is 0.00


In [63]:
# dataset with pickup smoothing applied

resampled_data["avg_pickups"] = resampled_data['total_pickups'].ewm(alpha=0.4).mean().round()

resampled_data

Unnamed: 0_level_0,region,total_pickups,avg_pickups
tpep_pickup_datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2016-01-01 00:00:00,0,46,46.0
2016-01-01 00:15:00,0,110,86.0
2016-01-01 00:30:00,0,133,110.0
2016-01-01 00:45:00,0,137,122.0
2016-01-01 01:00:00,0,177,146.0
...,...,...,...
2016-03-10 13:15:00,29,10,9.0
2016-03-10 13:30:00,29,10,9.0
2016-03-10 13:45:00,29,1,6.0
2016-03-10 14:00:00,29,1,4.0


In [65]:
# save the resampled data

resampled_data.to_csv("final_data.csv", index=True)

In [66]:
# shape of the data

resampled_data.shape

(200452, 3)