In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
from torch.autograd import Variable

%matplotlib notebook

%run utils.py

# Read in Raw Unprocessed Data

In [2]:
# Load the raw trip CSV into the working DataFrame.
raw_csv_path = 'data/yellow_tripdata_2009-01.csv'
df = pd.read_csv(raw_csv_path)

# Process Data Types

In [3]:
# Convert both trip timestamp columns with pd.to_datetime so the .dt accessor
# can be used on them in the following cells.
for ts_col in ('Trip_Dropoff_DateTime', 'Trip_Pickup_DateTime'):
    df[ts_col] = pd.to_datetime(df[ts_col])

# Add Derived Columns (Trip Duration in Seconds, Pickup Weekday, Pickup Hour)

In [4]:
pickup = df['Trip_Pickup_DateTime']
dropoff = df['Trip_Dropoff_DateTime']

# Trip length in seconds (drop-off minus pick-up).
df['duration'] = (dropoff - pickup).dt.total_seconds()

# Calendar features are taken from the pick-up timestamp.
df['weekday'] = pickup.dt.weekday
df['hour'] = pickup.dt.hour

# Add Dummy Columns

In [5]:
# One-hot encode the pickup weekday and pickup hour.
#
# Two things are pinned explicitly so the saved CSVs do not depend on the data
# or on the installed pandas version:
#   * reindex(...) guarantees that all 7 weekday and all 24 hour columns exist
#     even if some value never occurs in the input; the column list used when
#     saving names every one of them and would otherwise raise a KeyError.
#   * astype('uint8') keeps the indicators as 0/1 integers; pandas >= 2.0
#     returns booleans from get_dummies by default, which would be written to
#     CSV as True/False.
w = pd.get_dummies(df['weekday']).reindex(columns=list(range(7)), fill_value=0).astype('uint8')
w.columns = ['week_{}'.format(c) for c in w.columns]

hour = pd.get_dummies(df['hour']).reindex(columns=list(range(24)), fill_value=0).astype('uint8')
hour.columns = ['hour_{}'.format(c) for c in hour.columns]

# Append the indicator columns to the trip table (aligned on the row index).
df = pd.concat([df, w, hour], axis=1)

# Filter Out Observations

In [6]:
# filter_coords / filter_durations are not defined in this notebook; they are
# presumably brought in by `%run utils.py` (as is tqdm) -- verify there.
# Each one receives the current frame and returns the frame to keep, and they
# are applied in list order.
filters = [filter_coords, filter_durations]

for apply_filter in tqdm(filters):
    df = apply_filter(df)




# Standardize the Data (Subtract the Mean, Divide by the Standard Deviation)

In [7]:
# Standardize the continuous features: subtract the column mean and divide by
# the column standard deviation. Each column's mean and std are printed before
# the column is overwritten with its standardized values.
# NOTE(review): the statistics are taken over the whole frame, i.e. before the
# train/test/validation split further down, so held-out rows influence the
# scaling -- confirm this is acceptable.
for c in ['Trip_Distance', 'Start_Lon', 'Start_Lat', 'End_Lon', 'End_Lat']:
    # Compute each statistic once; the frame has millions of rows, so
    # recomputing them for both the printout and the scaling is wasted work.
    col_mean = df[c].mean()
    col_std = df[c].std()

    print(c)
    print(col_mean)
    print(col_std)
    print()

    df[c] = (df[c] - col_mean) / col_std

Trip_Distance
2.300940237560106
2.1919287167166224

Start_Lon
-73.97826142142969
0.0239235715736219

Start_Lat
40.753207908605766
0.02262721225173857

End_Lon
-73.97635609080373
0.025989097948950303

End_Lat
40.753145629515345
0.025820804247699374



# Train-Test-Validation Split (80% / 10% / 10%)

In [8]:
# Number of rows in the processed frame; used as the size of the split-label array.
n = df.shape[0]

In [9]:
# Draw one label per row: 0 with probability 0.8, 1 and 2 with probability 0.1
# each. A dedicated, fixed-seed generator makes the assignment reproducible
# across kernel restarts; with the unseeded global generator the saved
# train/test/validation files would contain different rows on every run.
SPLIT_SEED = 0
splits = np.random.RandomState(SPLIT_SEED).choice([0, 1, 2], size=n, p=[.8, .1, .1])

In [10]:
# Label 0 -> train, 1 -> test, 2 -> validation; each boolean mask over
# `splits` selects the rows belonging to that subset.
df_train, df_test, df_validate = (df[splits == label] for label in (0, 1, 2))

# Save Cleaned Data

In [11]:
# Columns written to disk, in this order: duration, the five standardized
# trip features, then the weekday (0-6) and hour (0-23) indicator columns.
x = (
    ['duration', 'Trip_Distance', 'Start_Lon', 'Start_Lat', 'End_Lon', 'End_Lat']
    + ['week_{}'.format(day) for day in range(7)]
    + ['hour_{}'.format(hr) for hr in range(24)]
)

In [12]:
# Write the selected columns of the full processed frame, without the index.
cleaned_csv_path = 'data/yellow_tripdata_2009-01_cleaned.csv'
df.loc[:, x].to_csv(cleaned_csv_path, index=False)

In [13]:
# Write each subset to its own CSV (same column selection, no index column).
split_outputs = [
    (df_train, 'data/yellow_tripdata_2009-01_train.csv'),
    (df_test, 'data/yellow_tripdata_2009-01_test.csv'),
    (df_validate, 'data/yellow_tripdata_2009-01_val.csv'),
]
for subset, csv_path in split_outputs:
    subset[x].to_csv(csv_path, index=False)

In [14]:
# Total number of rows in the processed frame.
len(df.index)

13239938

In [15]:
# Number of rows assigned to the training subset.
len(df_train.index)

10590681

In [16]:
# Number of rows assigned to the test subset.
len(df_test.index)

1323607

In [17]:
# Number of rows assigned to the validation subset.
len(df_validate.index)

1325650