### This notebook does the following:
- processes raw ridership data
- performs feature selection and cleaning
- performs hourly aggregation
- saves results in one file

In [1]:
import pandas as pd
import glob
import os
from datetime import date, timedelta
import itertools

In [33]:
!pwd

/Users/hemingyi/Documents/UrbanTemporalNetworks/Data Collection Scripts


In [34]:
# raw data directory
dataDir = '/Users/hemingyi/Documents/UrbanTemporalNetworks/rawData/'

In [35]:
files = glob.glob(dataDir+'*csv')
len(files)

36

In [None]:
pd.read_csv(fil)

### identify zones of interest

In [36]:
zones = pd.read_csv('../Data/taxi_zones.csv')
zones.head(2)

Unnamed: 0,OBJECTID,Shape_Leng,the_geom,Shape_Area,zone,LocationID,borough
0,1,0.116357,MULTIPOLYGON (((-74.18445299999996 40.69499599...,0.000782,Newark Airport,1,EWR
1,2,0.43347,MULTIPOLYGON (((-73.82337597260663 40.63898704...,0.004866,Jamaica Bay,2,Queens


In [37]:
zones[zones.zone.apply(lambda x: 'Airport' in x)][['zone','LocationID']]

Unnamed: 0,zone,LocationID
0,Newark Airport,1
136,JFK Airport,132
145,LaGuardia Airport,138


In [38]:
zones[zones.zone.apply(lambda x: 'Station' in x)][['zone','LocationID']]

Unnamed: 0,zone,LocationID
185,Penn Station/Madison Sq West,186


In [39]:
zone_dict = {'Jfk':  132,
'Lga' : 138,
'Penn' : 186,
'EWR':1}
zone_dict

{'Jfk': 132, 'Lga': 138, 'Penn': 186, 'EWR': 1}

### Run following script for each hub

In [40]:
hub = 'Penn'
zone = zone_dict[hub]
zone

1

In [41]:
processedFileDir = "../processedData/"
processedFile = processedFileDir+hub+"VehicleByHour.csv"

In [42]:
validDestZones = list(set([z for z in zones.LocationID if z != zone]))
len(validDestZones)

259

In [20]:
for file in files:
    print("Processing "+str(file).split('/')[-1])
    
    vehicleType = str(file).split('/')[-1].split('_')[0]
    df = pd.read_csv(file)
    print("DataFrame Shape: "+str(df.shape))
    
    # rename columns for consistency
    # set passenger count to 1 for fhv
    if vehicleType == 'fhv':
        df.rename(columns={'Pickup_DateTime': 'tpep_pickup_datetime', \
                           'PUlocationID':'PULocationID', 'DOlocationID':'DOLocationID' },inplace=True)
        df['passenger_count'] = 1
        
    if vehicleType == 'green':
        df.rename(columns={'lpep_pickup_datetime': 'tpep_pickup_datetime'},inplace=True)


    # treat for na values
    df = df.dropna(subset=['tpep_pickup_datetime','PULocationID', 'DOLocationID'])
    df.fillna(value={'passenger_count':1}, inplace = True)
    
    # correct data types
    df['PULocationID'] = df['PULocationID'].astype('int')
    df['DOLocationID'] = df['DOLocationID'].astype('int')
    
    # filter to get outgoing traffic from selected hub
    df = df[(df['PULocationID'] == zone) & (df['DOLocationID'].apply(lambda x: x in validDestZones))]
    print("JFK out DataFrame Shape: "+str(df.shape))
    
    # treat datetime
    df['tpep_pickup_datetime'] = pd.to_datetime(df['tpep_pickup_datetime'])
    df['Date'] = df['tpep_pickup_datetime'].dt.date
    df['Hour'] = df['tpep_pickup_datetime'].dt.hour
    
    #df['vehicle_type'] = vehicleType
    
    # select rquired columns
    #df = df[['vehicle_type', 'Date', 'Hour', 'DOLocationID','passenger_count']]
    df = df[['Date', 'Hour', 'DOLocationID','passenger_count']]
    
    # Hourly aggregation
    #df_count = df.groupby(['vehicle_type', 'Date', 'Hour', 'DOLocationID']).count().reset_index()
    df_count = df.groupby(['Date', 'Hour', 'DOLocationID']).count().reset_index()
    df_count.rename(columns={'passenger_count': 'vehicle_count'},inplace=True)

    #df_sum = df.groupby(['vehicle_type', 'Date', 'Hour', 'DOLocationID']).sum().reset_index()

    #aggregatedDf = pd.merge(df_count,df_sum, on=['vehicle_type', 'Date', 'Hour', 'DOLocationID'], how='inner')
    aggregatedDf = df_count
    
    print("Aggregated DataFrame Shape: "+str(aggregatedDf.shape))
    print(aggregatedDf.head(3))
    # save file
    if os.path.exists(processedFile):
        print('append to results...')
        aggregatedDf.to_csv(processedFile,index=False, header=False, mode='a+')      
    else:
        print('create results file...')
        aggregatedDf.to_csv(processedFile,index=False)
    print('file saved..')
    print("------------------------------------------------")

Processing green_tripdata_2018-06.csv
DataFrame Shape: (739373, 19)
JFK out DataFrame Shape: (0, 19)
Aggregated DataFrame Shape: (0, 4)
Empty DataFrame
Columns: [Date, Hour, DOLocationID, vehicle_count]
Index: []
create results file...
file saved..
------------------------------------------------
Processing green_tripdata_2018-12.csv
DataFrame Shape: (685373, 19)
JFK out DataFrame Shape: (0, 19)
Aggregated DataFrame Shape: (0, 4)
Empty DataFrame
Columns: [Date, Hour, DOLocationID, vehicle_count]
Index: []
append to results...
file saved..
------------------------------------------------
Processing green_tripdata_2018-07.csv
DataFrame Shape: (684455, 19)
JFK out DataFrame Shape: (0, 19)
Aggregated DataFrame Shape: (0, 4)
Empty DataFrame
Columns: [Date, Hour, DOLocationID, vehicle_count]
Index: []
append to results...
file saved..
------------------------------------------------
Processing green_tripdata_2018-11.csv
DataFrame Shape: (656573, 19)
JFK out DataFrame Shape: (0, 19)
Aggregate

DataFrame Shape: (21596443, 7)
JFK out DataFrame Shape: (2190, 8)
Aggregated DataFrame Shape: (2106, 4)
         Date  Hour  DOLocationID  vehicle_count
0  2018-07-01     0            41              1
1  2018-07-01     0            79              1
2  2018-07-01     0           140              2
append to results...
file saved..
------------------------------------------------
Processing fhv_tripdata_2018-06.csv
DataFrame Shape: (21135283, 7)
JFK out DataFrame Shape: (2069, 8)
Aggregated DataFrame Shape: (1992, 4)
         Date  Hour  DOLocationID  vehicle_count
0  2018-06-01     0            13              1
1  2018-06-01     0           107              1
2  2018-06-01     0           113              1
append to results...
file saved..
------------------------------------------------
Processing fhv_tripdata_2018-12.csv
DataFrame Shape: (23854144, 7)
JFK out DataFrame Shape: (2680, 8)
Aggregated DataFrame Shape: (2557, 4)
         Date  Hour  DOLocationID  vehicle_count
0  2018-1

### Further processing

In [21]:
def getcCompleteGridDf(minDate,maxDate, locations):
    minDate = [int(x) for x in minDate.split('-')]
    maxDate = [int(x) for x in maxDate.split('-')]
    sdate = date(minDate[0], minDate[1], minDate[2])   
    edate = date(maxDate[0], maxDate[1], maxDate[2])    

    delta = edate - sdate       
    days = []
    for i in range(delta.days + 1):
        days.append(sdate + timedelta(days=i))
    hours = list(range(24))
    print(len(days))
    print(len(hours))
    
    combList = list(itertools.product(*[days,hours,locations]))
    dfList = [{'Date':d, 'Hour':h, 'DOLocationID':l} for d,h,l in combList]
 
    dateHourDf = pd.DataFrame(dfList)
    dateHourDf['Date'] = pd.to_datetime(dateHourDf['Date']).dt.date
    return dateHourDf

In [22]:
processedDf = pd.read_csv(processedFile)
processedDf.head(2)

Unnamed: 0,Date,Hour,DOLocationID,vehicle_count
0,2018-04-17,16,66,1
1,2018-08-03,8,80,1


In [23]:
processedDf.shape

(27026, 4)

In [24]:
# ensuring proper grouping since files were grouped by independently
processedDf = processedDf.groupby(['Date', 'Hour', 'DOLocationID']).sum().reset_index()
processedDf.shape

(27013, 4)

In [25]:
# sanity checks
validYears = [2018]
processedDf = processedDf[processedDf.Date.apply(lambda x: int(x.split('-')[0]) in validYears)]

validMonths = list(range(1,13))
processedDf = processedDf[processedDf.Date.apply(lambda x: int(x.split('-')[1]) in validMonths)]

processedDf.shape    

(27013, 4)

In [26]:
minDate, maxDate = (processedDf.Date.min(), processedDf.Date.max()) 
#v_types = list(set(processedDf.vehicle_type))
locations = list(set(processedDf.DOLocationID))

#print(len(v_types))
print(len(locations))

dateHourDf = getcCompleteGridDf(minDate,maxDate,locations)
dateHourDf.shape

228
365
24


(1997280, 3)

In [27]:
dateHourDf['Date'] = pd.to_datetime(dateHourDf['Date'])
processedDf['Date'] = pd.to_datetime(processedDf['Date'])

In [28]:
mergedDf = pd.merge(dateHourDf,processedDf, on=['Date', 'Hour', 'DOLocationID'], how='left')
mergedDf.fillna(0, inplace=True)
mergedDf['Date'] = mergedDf['Date'].dt.date
print(mergedDf.shape)
mergedDf.head(3)

(1997280, 4)


Unnamed: 0,DOLocationID,Date,Hour,vehicle_count
0,3,2018-01-01,0,0.0
1,4,2018-01-01,0,0.0
2,5,2018-01-01,0,0.0


In [29]:
# sanity check
print(processedDf.vehicle_count.sum())
print(mergedDf.vehicle_count.sum())

28296
28296.0


In [30]:
# sanity check for size of new dataframe
366*258*24

2266272

In [77]:
# fraction of combinations that have data. Data is very scarce !!!
2332298/4520160

0.5159768680754664

In [62]:
mergedDf['Date'] = mergedDf['Date'].astype('str')
mergedDf['Hour'] = mergedDf['Hour'].astype('str')
mergedDf['DOLocationID'] = mergedDf['DOLocationID'].astype('str')

In [70]:
JFK = pd.read_csv("/Users/hemingyi/Documents/UrbanTemporalNetworks/processedData/JfkVehiceByHour.csv")
JFK['Date'] = JFK['Date'].astype('str')
JFK['Hour'] = JFK['Hour'].astype('str')
JFK['DOLocationID'] = JFK['DOLocationID'].astype('str')
JFK = JFK[JFK['DOLocationID']!='1']
del JFK['vehicle_count']

In [71]:
JFK.head(2)

Unnamed: 0,DOLocationID,Date,Hour
1,2,2018-01-01,0
2,3,2018-01-01,0


In [72]:
mergedDf.head(2)

Unnamed: 0,DOLocationID,Date,Hour,vehicle_count
0,3,2018-01-01,0,0.0
1,4,2018-01-01,0,0.0


In [75]:
EWR = JFK.merge(mergedDf,on=['DOLocationID','Date','Hour'],how='outer')
EWR = EWR.fillna(0)
EWR.to_csv(processedFile,index=False)