In [1]:
import numpy as np
from tqdm import tqdm
import pandas as pd
from pandas.tseries.holiday import USFederalHolidayCalendar as calendar

In [7]:
from datetime import datetime

In [2]:
def taxi_resample(path, tz='America/New_York', ambiguous='raise', nonexistent='raise'):
    """Aggregate a gzipped trip-level taxi CSV into hourly pickup/passenger totals.

    Parameters
    ----------
    path : str or path-like
        Location of a gzip-compressed CSV that contains at least the columns
        ``tpep_pickup_datetime``, ``tpep_dropoff_datetime`` and
        ``passenger_count``. Any other columns are not read.
    tz : str, default 'America/New_York'
        Time zone the naive pickup timestamps are localized to.
    ambiguous, nonexistent : default 'raise'
        Forwarded unchanged to ``DatetimeIndex.tz_localize`` for the pickup
        timestamps. The defaults keep the original behaviour of raising on
        wall-clock times that are repeated or skipped by a DST transition.

    Returns
    -------
    pandas.DataFrame
        One row per hourly bin, indexed by the tz-aware
        ``tpep_pickup_datetime``, with columns ``num_pickups`` (number of
        rows in the hour with a non-null dropoff value) and
        ``num_passengers`` (sum of ``passenger_count`` in the hour).
    """
    needed_columns = ['tpep_pickup_datetime', 'tpep_dropoff_datetime', 'passenger_count']
    # Only three columns feed the aggregation, so skip parsing the rest.
    taxi_df = pd.read_csv(path, compression='gzip', usecols=needed_columns)

    # Only the pickup time defines the hourly bins. The dropoff column is
    # merely counted for non-null values, so it is deliberately left
    # unlocalized: localizing it made trips that just *end* in a DST
    # fall-back hour raise AmbiguousTimeError for no benefit.
    taxi_df['tpep_pickup_datetime'] = pd.DatetimeIndex(
        taxi_df['tpep_pickup_datetime']
    ).tz_localize(tz, ambiguous=ambiguous, nonexistent=nonexistent)

    taxi_hourly_df = (
        taxi_df.set_index('tpep_pickup_datetime')
        # Lowercase 'h' is the hourly alias; uppercase 'H' is deprecated in
        # recent pandas releases.
        .resample('h')
        .agg({'tpep_dropoff_datetime': 'count', 'passenger_count': 'sum'})
        .rename(columns={'tpep_dropoff_datetime': 'num_pickups',
                         'passenger_count': 'num_passengers'})
    )
    return taxi_hourly_df

In [3]:
# Build the hourly pickup/passenger table from the gzipped trip-level file
# (file name suggests January-June 2017).
taxi_hourly_df = taxi_resample('../clean_data/TaxiData_Jan17-Jun17.gz')

In [4]:
# Preview the first hourly rows to sanity-check the aggregation.
taxi_hourly_df.head()

Unnamed: 0_level_0,num_pickups,num_passengers
tpep_pickup_datetime,Unnamed: 1_level_1,Unnamed: 2_level_1
2017-01-01 00:00:00-05:00,53,85.0
2017-01-01 01:00:00-05:00,3,3.0
2017-01-01 02:00:00-05:00,0,
2017-01-01 03:00:00-05:00,4,6.0
2017-01-01 04:00:00-05:00,4,8.0


In [5]:
# Raw load of the combined 2014-2016 pickups file.
# NOTE(review): `taxi_old` is not one of the frames concatenated into
# `taxi_full`, and a plain read_csv does not set a datetime index - confirm
# whether this cell is still needed or is a leftover from an earlier approach.
taxi_old = pd.read_csv('../raw_data/taxi_pickups_2014-2016.csv')

In [26]:
def clean_query_data(path):
    """Load a pre-aggregated hourly taxi CSV and index it by pickup hour.

    The CSV must provide integer-like ``yr``, ``mnth``, ``d`` and ``hr``
    columns. They are combined into a single timezone-naive
    ``tpep_pickup_datetime`` and then dropped; every other column passes
    through untouched.

    Parameters
    ----------
    path : str or path-like
        Location of the CSV file to read.

    Returns
    -------
    pandas.DataFrame
        The remaining columns, indexed by ``tpep_pickup_datetime``.
    """
    taxi_agg = pd.read_csv(path)

    # Assemble the timestamp column-wise; pd.to_datetime expects the parts to
    # be named year/month/day/hour. A row-wise apply(datetime(...)) is slow
    # and fails with TypeError as soon as any other column is float, because
    # the whole row is then upcast to floats.
    date_parts = taxi_agg[['yr', 'mnth', 'd', 'hr']].rename(
        columns={'yr': 'year', 'mnth': 'month', 'd': 'day', 'hr': 'hour'}
    )
    taxi_agg['tpep_pickup_datetime'] = pd.to_datetime(date_parts)

    taxi_agg = taxi_agg.drop(columns=['yr', 'mnth', 'd', 'hr'])
    return taxi_agg.set_index('tpep_pickup_datetime')

In [34]:
# Load the pre-aggregated 2014 file and index it by pickup hour.
taxi_2014 = clean_query_data('../raw_data/taxi_aggregated_2014.csv')

In [35]:
# Load the pre-aggregated 2015 file and index it by pickup hour.
taxi_2015 = clean_query_data('../raw_data/taxi_aggregated_2015.csv')

In [28]:
# Load the pre-aggregated 2016 file and index it by pickup hour.
taxi_2016 = clean_query_data('../raw_data/taxi_aggregated_2016.csv')

In [36]:
# Preview the 2016 table; its index is timezone-naive.
taxi_2016.head()

Unnamed: 0_level_0,num_pickups,num_passengers
tpep_pickup_datetime,Unnamed: 1_level_1,Unnamed: 2_level_1
2016-01-01 00:00:00,55,100
2016-01-01 01:00:00,17,30
2016-01-01 02:00:00,13,13
2016-01-01 03:00:00,20,39
2016-01-01 04:00:00,12,23


In [30]:
# NOTE(review): the saved output shows a `tpep_pickup_datetime` index, which
# the visible `taxi_old = pd.read_csv(...)` cell does not create - this
# preview presumably reflects kernel state from an earlier cell version and
# may not reproduce under Restart & Run All. Confirm or remove.
taxi_old.head()

Unnamed: 0_level_0,num_pickups,num_passengers
tpep_pickup_datetime,Unnamed: 1_level_1,Unnamed: 2_level_1
2014-01-01 00:00:00,107,210
2014-01-01 01:00:00,34,45
2014-01-01 02:00:00,34,70
2014-01-01 03:00:00,12,18
2014-01-01 04:00:00,25,37


In [37]:
# Stack the yearly hourly tables in chronological order.
# NOTE(review): the 2014-2016 frames carry timezone-naive indexes while
# `taxi_hourly_df` is localized to America/New_York, so the combined index
# mixes naive and tz-aware timestamps. Decide on one representation before
# concatenating - TODO confirm what downstream consumers expect.
taxi_full = pd.concat([taxi_2014, taxi_2015, taxi_2016, taxi_hourly_df])

In [38]:
# Persist the combined hourly table (the datetime index is written as the first column).
taxi_full.to_csv('../clean_data/aggregate_taxis_all2.csv')

In [39]:
# Sanity check: (number of hourly rows, number of value columns).
taxi_full.shape

(26008, 2)