## Import Statements

In [1]:
import numpy as np
from tqdm import tqdm
import pandas as pd
from pandas.tseries.holiday import USFederalHolidayCalendar as calendar
from datetime import datetime

### TaxiData 2017

In [2]:
def taxi_resample(path):
    """Aggregate a gzip-compressed, trip-level taxi CSV into hourly totals.

    Parameters
    ----------
    path : str or path-like
        Location of a gzip-compressed CSV containing at least the columns
        ``tpep_pickup_datetime``, ``tpep_dropoff_datetime`` and
        ``passenger_count``.

    Returns
    -------
    pandas.DataFrame
        One row per hourly bin, indexed by ``tpep_pickup_datetime`` (the start
        of the hour), with two columns:

        * ``num_pickups``    -- count of non-null drop-off timestamps among the
          trips picked up in that hour;
        * ``num_passengers`` -- sum of ``passenger_count`` for those trips.
    """
    trips = pd.read_csv(path, compression='gzip')

    # Parse both timestamp columns, then index by pickup time so that
    # resample() bins the trips by the hour in which they started.
    trips = trips.assign(
        tpep_pickup_datetime=pd.DatetimeIndex(trips["tpep_pickup_datetime"]),
        tpep_dropoff_datetime=pd.DatetimeIndex(trips["tpep_dropoff_datetime"]),
    ).set_index("tpep_pickup_datetime")

    # Use an explicit Hour offset instead of the 'H' string alias: 'H' is
    # deprecated in recent pandas releases, while the offset object is
    # accepted by both old and new versions.
    hourly = trips.resample(pd.offsets.Hour()).agg(
        {"tpep_dropoff_datetime": "count", "passenger_count": "sum"}
    )

    return hourly.rename(
        columns={
            "tpep_dropoff_datetime": "num_pickups",
            "passenger_count": "num_passengers",
        }
    )

In [3]:
# Build the hourly pickup/passenger table from the "Jan17-Jun17" trip extract
# (the path is relative to the notebook's working directory).
taxi_hourly_df = taxi_resample('../clean_data/TaxiData_Jan17-Jun17.gz')

In [4]:
# Preview the first hourly rows. In the recorded output the 02:00 hour has
# 0 pickups and a missing (NaN) passenger total rather than 0.
taxi_hourly_df.head()

Unnamed: 0_level_0,num_pickups,num_passengers
tpep_pickup_datetime,Unnamed: 1_level_1,Unnamed: 2_level_1
2017-01-01 00:00:00,53,85.0
2017-01-01 01:00:00,3,3.0
2017-01-01 02:00:00,0,
2017-01-01 03:00:00,4,6.0
2017-01-01 04:00:00,4,8.0


In [5]:
# Row-count check: the recorded 4344 rows equal 181 days x 24 hours, which is
# consistent with one row per hour from 2017-01-01 through 2017-06-30.
taxi_hourly_df.shape

(4344, 2)

### TaxiData 2014-2016

In [6]:
def clean_query_data(path):
    """Load a pre-aggregated hourly taxi CSV and index it by pickup hour.

    Parameters
    ----------
    path : str or path-like
        CSV file whose columns include the date parts ``yr``, ``mnth``, ``d``
        and ``hr`` plus any number of value columns.

    Returns
    -------
    pandas.DataFrame
        The value columns, in their original order, indexed by a
        ``tpep_pickup_datetime`` DatetimeIndex assembled from the four
        date-part columns (which are dropped from the result).

    Raises
    ------
    ValueError
        If a row's date parts do not form a valid timestamp.
    """
    hourly = pd.read_csv(path)

    # Assemble the timestamp for the whole column at once. pd.to_datetime
    # expects the parts to be named year/month/day/hour, hence the rename.
    # A row-wise ``apply(..., axis=1)`` would upcast each row to float as soon
    # as any value column is float, and ``datetime()`` rejects float arguments.
    date_parts = hourly[['yr', 'mnth', 'd', 'hr']].rename(
        columns={'yr': 'year', 'mnth': 'month', 'd': 'day', 'hr': 'hour'}
    )
    hourly['tpep_pickup_datetime'] = pd.to_datetime(date_parts)

    return (
        hourly
        .drop(['yr', 'mnth', 'd', 'hr'], axis=1)
        .set_index('tpep_pickup_datetime')
    )

In [7]:
# Load the pre-aggregated hourly files (one per year), each indexed by pickup hour.
# NOTE(review): the recorded tail of taxi_2016 ends at 2016-06-30 23:00, so the
# 2016 file appears to cover only the first half of that year -- confirm.
taxi_2014 = clean_query_data('../raw_data/taxi_aggregated_2014.csv')
taxi_2015 = clean_query_data('../raw_data/taxi_aggregated_2015.csv')
taxi_2016 = clean_query_data('../raw_data/taxi_aggregated_2016.csv')

In [12]:
# Parse the index column as datetimes. Without parse_dates the timestamps load
# as plain strings, and concatenating this frame with the DatetimeIndex-ed
# tables would produce a mixed string/Timestamp object index.
taxi_2016_end = pd.read_csv(
    '../clean_data/2016Jul-Dec_clean.csv',
    index_col=0,
    parse_dates=True,
)

In [13]:
# Preview the start of the Jul-Dec 2016 table (recorded first row: 2016-07-01 00:00).
taxi_2016_end.head()

Unnamed: 0_level_0,num_pickups,num_passengers
tpep_pickup_datetime,Unnamed: 1_level_1,Unnamed: 2_level_1
2016-07-01 00:00:00,293,529.0
2016-07-01 01:00:00,8,13.0
2016-07-01 02:00:00,1,2.0
2016-07-01 03:00:00,1,1.0
2016-07-01 04:00:00,9,9.0


In [11]:
# Check where the aggregated 2016 file ends (recorded last row: 2016-06-30 23:00).
taxi_2016.tail()

Unnamed: 0_level_0,num_pickups,num_passengers
tpep_pickup_datetime,Unnamed: 1_level_1,Unnamed: 2_level_1
2016-06-30 19:00:00,658,1142
2016-06-30 20:00:00,672,1091
2016-06-30 21:00:00,641,1036
2016-06-30 22:00:00,781,1341
2016-06-30 23:00:00,444,715


### Concatenating 2014-2017 Aggregated Per Hour Taxi Data

In [14]:
# Stack the per-period hourly tables row-wise, in chronological order, into a
# single 2014-2017 table (row-wise stacking is pd.concat's default axis).
taxi_full = pd.concat(
    [
        taxi_2014,
        taxi_2015,
        taxi_2016,
        taxi_2016_end,
        taxi_hourly_df,
    ]
)

In [15]:
# NOTE(review): a gap-free hourly index from 2014-01-01 through 2017-06-30 would
# have 1277 days x 24 = 30,648 rows; the recorded output shows 30,425, which
# suggests some hours are absent from the source tables -- confirm whether
# those hours should be reindexed in as zero-pickup rows.
taxi_full.shape

(30425, 2)

In [16]:
# Number of distinct index labels (missing labels included); if this equals
# the row count, no timestamp appears twice after concatenation.
taxi_full.index.nunique(dropna=False)

30425

In [17]:
# Preview the start of the combined table (recorded first row: 2014-01-01 00:00).
taxi_full.head()

Unnamed: 0_level_0,num_pickups,num_passengers
tpep_pickup_datetime,Unnamed: 1_level_1,Unnamed: 2_level_1
2014-01-01 00:00:00,107,210.0
2014-01-01 01:00:00,34,45.0
2014-01-01 02:00:00,34,70.0
2014-01-01 03:00:00,12,18.0
2014-01-01 04:00:00,25,37.0


In [18]:
# Persist the combined hourly table; to_csv writes the index as the first column
# by default, so the pickup timestamps are kept in the file.
taxi_full.to_csv('../clean_data/Aggregated_TaxiData_14-17.csv')

---