In [1]:
import csv
import datetime
import os
import pickle

import pandas as pd
import tqdm

In [None]:
# Year of the trip export to analyse, and the CSV path derived from it.
YEAR = 2019
_dir = 'data/Taxi Data/{}_Yellow_Taxi_Trip_Data.csv'.format(YEAR)

In [None]:
# Read only the four columns used below, then parse the pickup timestamp text
# (month/day/year, 12-hour clock with AM/PM) into datetimes.
colstouse = [
    'tpep_pickup_datetime',
    'tpep_dropoff_datetime',
    'PULocationID',
    'DOLocationID',
]
taxiDF = pd.read_csv(_dir, usecols=colstouse).assign(
    tpep_pickup_datetime=lambda frame: pd.to_datetime(
        frame['tpep_pickup_datetime'],
        format='%m/%d/%Y %I:%M:%S %p',
    )
)

In [8]:
def clean_year_error(df, year):
    """Keep only the rows whose pickup time falls inside calendar year ``year``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``'tpep_pickup_datetime'`` column that is already
        datetime-typed (it is compared against timestamps).
    year : int or str
        Calendar year to keep, e.g. ``2019``.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` with ``Jan 1 00:00:00 <= pickup < Jan 1 of year + 1``.
        Rows whose pickup time is missing are dropped, since the comparisons
        evaluate to False for them.
    """
    year = int(year)
    window_start = pd.Timestamp(year=year, month=1, day=1)
    # Half-open upper bound: an inclusive "Dec 31 23:59:59" bound would drop
    # pickups carrying a fractional second inside the final second of the year.
    window_end = pd.Timestamp(year=year + 1, month=1, day=1)
    pickup = df['tpep_pickup_datetime']
    in_year = (pickup >= window_start) & (pickup < window_end)
    return df[in_year]
    

In [None]:
# Restrict the loaded trips to pickups inside YEAR, then show the result.
taxiDF_cleaned_1 = clean_year_error(df=taxiDF, year=YEAR)
taxiDF_cleaned_1

In [None]:
# Pickup counts per hourly bin for pickup zone 68.
(
    taxiDF_cleaned_1
    .loc[taxiDF_cleaned_1['PULocationID'] == 68]
    .groupby(pd.Grouper(key='tpep_pickup_datetime', axis=0, freq='H', sort=True))
    .size()
)

In [None]:
def read_by_zone(year, zone):
    """Return the raw CSV rows of one year's trip file that belong to ``zone``.

    The file ``data/Taxi Data/<year>_Yellow_Taxi_Trip_Data.csv`` is streamed
    row by row, and only rows whose field at position 7 equals ``zone`` are
    kept, so the whole file never has to be loaded into a DataFrame.

    Parameters
    ----------
    year : int or str
        Year used to build the CSV file name.
    zone : int or str
        Zone id to keep. It is compared as text, because ``csv.reader``
        yields every field as a string.

    Returns
    -------
    pandas.DataFrame
        One row per matching CSV line, all values as strings, with default
        integer column labels. The header line is not used for column names.
        The frame is empty when nothing matches.

    Raises
    ------
    FileNotFoundError
        If the year's CSV file does not exist.
    """
    # Field position tested against ``zone``.
    # NOTE(review): presumably the PULocationID column - confirm against the file header.
    zone_field = 7
    # Compare text with text: an int ``zone`` would never equal a CSV string field.
    wanted = str(zone)
    csv_path = 'data/Taxi Data/' + str(year) + '_Yellow_Taxi_Trip_Data.csv'
    # newline='' is what the csv module expects for files handed to csv.reader.
    with open(csv_path, newline='') as input_file:
        desired_rows = [
            row
            for row in csv.reader(input_file)
            # Blank or short lines have no such field; skip them instead of raising IndexError.
            if len(row) > zone_field and row[zone_field] == wanted
        ]
    return pd.DataFrame(desired_rows)

In [None]:
# Pull the 2017 rows for zone 23 straight from the CSV and show them.
taxiDF_2017_23 = read_by_zone(year=2017, zone=23)
taxiDF_2017_23

In [None]:
# Show taxiDF_cleaned_1 as this cell's output.
taxiDF_cleaned_1

In [7]:
def process_a_year_and_save(year, freq):
    """Count pickups per time bin for every pickup zone of one year and pickle them.

    Reads ``data/Taxi Data/<year>_Yellow_Taxi_Trip_Data.csv``, parses the
    pickup timestamps, drops rows outside ``year`` with ``clean_year_error``,
    and for each ``PULocationID`` writes a Series of trip counts per ``freq``
    bin to ``data/Taxi Data/<year>_<zone>_taxi_dataframe_<freq>``.

    Parameters
    ----------
    year : int or str
        Year of the trip file to process.
    freq : str
        Bin width handed to ``pd.Grouper`` (the notebook calls this with ``'2H'``).

    Returns
    -------
    None
        The results are only written to disk.

    Notes
    -----
    The file names written here end in ``_<freq>``, while ``fill_timestamps``
    in this notebook loads names without that suffix.
    """
    year_prefix = 'data/Taxi Data/' + str(year)
    df = pd.read_csv(
        year_prefix + '_Yellow_Taxi_Trip_Data.csv',
        usecols=['tpep_pickup_datetime', 'tpep_dropoff_datetime', 'PULocationID', 'DOLocationID'],
    )
    df['tpep_pickup_datetime'] = pd.to_datetime(df['tpep_pickup_datetime'], format='%m/%d/%Y %I:%M:%S %p')
    df = clean_year_error(df, year)
    # Split by zone once instead of re-masking the whole frame for every zone.
    for zone, zone_trips in tqdm.tqdm(df.groupby('PULocationID')):
        # The grouper works on the default axis (rows); no explicit axis argument is needed.
        sizeGroupByZone = zone_trips.groupby(
            pd.Grouper(key='tpep_pickup_datetime', freq=freq, sort=True)
        ).size()
        savefile = year_prefix + '_' + str(zone) + '_taxi_dataframe_' + str(freq)
        # Context manager so the handle is flushed and closed for every zone.
        with open(savefile, 'wb') as out_file:
            pickle.dump(sizeGroupByZone, out_file)

In [None]:
# Build and save the per-zone pickup counts for 2019 in two-hour bins.
process_a_year_and_save(year=2019, freq='2H')

In [6]:
def fill_timestamps(year, zone):
    """Load one zone's saved counts for ``year`` and zero-pad them to the full year.

    The pickle at ``data/Taxi Data/<year>_<zone>_taxi_dataframe`` is loaded
    and extended with zero entries at hourly steps, from Jan 1 00:00 up to
    just before its first timestamp and from just after its last timestamp to
    Dec 31 23:00. Gaps between the first and last timestamp are not filled.

    Parameters
    ----------
    year : int
        Calendar year; used for the file name and the padding bounds.
    zone : int or str
        Zone id; used for the file name.

    Returns
    -------
    pandas.Series
        The loaded values plus the zero padding, sorted by timestamp.
        # assumes the pickle holds a Series indexed by timestamps - TODO confirm

    Raises
    ------
    FileNotFoundError
        If the pickle for this year and zone does not exist.

    Notes
    -----
    The file name loaded here has no frequency suffix, whereas
    ``process_a_year_and_save`` in this notebook writes names that end in
    ``_<freq>``.
    """
    # NOTE(review): pickle.load can execute code from the file; only load
    # files this notebook produced.
    with open('data/Taxi Data/'+str(year)+'_'+str(zone)+'_taxi_dataframe', 'rb') as in_file:
        counts = pickle.load(in_file)

    # A Timedelta step avoids depending on a textual frequency alias.
    one_hour = pd.Timedelta(hours=1)
    year_start = pd.Timestamp(year=year, month=1, day=1, hour=0, minute=0, second=0)
    year_end = pd.Timestamp(year=year, month=12, day=31, hour=23, minute=0, second=0)
    index_name = counts.index.name

    if len(counts) == 0:
        # Nothing recorded at all: min()/max() would be NaT, so return a year of zeros.
        whole_year = pd.date_range(year_start, year_end, freq=one_hour, name=index_name)
        return pd.Series(0, index=whole_year, dtype='int64', name=counts.name)

    first_seen, last_seen = counts.index.min(), counts.index.max()
    before = pd.date_range(year_start, first_seen - one_hour, freq=one_hour, name=index_name)
    after = pd.date_range(last_seen + one_hour, year_end, freq=one_hour, name=index_name)
    missing = before.append(after)
    if len(missing) > 0:
        # Add all padding in one concat instead of enlarging the Series label by label.
        padding = pd.Series(0, index=missing, dtype=counts.dtype, name=counts.name)
        counts = pd.concat([counts, padding])
    return counts.sort_index(ascending=True)

def concatenate_and_save():
    """Stitch the padded yearly counts of every zone together and pickle the values.

    For each zone id from 1 to 265, the 2017-2020 series returned by
    ``fill_timestamps`` are concatenated in year order, and the resulting
    values array is written to
    ``data/Taxi Data/Concat/_<zone>_taxi_dataframe_Concat``.

    A zone is skipped entirely when the input file for any of its years is
    missing. Problems writing the output are not swallowed.

    Returns
    -------
    None
        The results are only written to disk.
    """
    out_dir = 'data/Taxi Data/Concat/'
    # Create the output folder up front; otherwise every write would fail.
    os.makedirs(out_dir, exist_ok=True)
    for zone in tqdm.trange(1, 266):
        # Only the loading step is best-effort. Keeping the write outside the
        # try stops an output-side FileNotFoundError from being mistaken for
        # "no data for this zone".
        try:
            DFs = [fill_timestamps(year, zone) for year in [2017, 2018, 2019, 2020]]
        except FileNotFoundError:
            continue
        concatDF = pd.concat(DFs)
        with open(out_dir + '_' + str(zone) + '_taxi_dataframe_Concat', 'wb') as out_file:
            pickle.dump(concatDF.values, out_file)
        

In [9]:
# Run the per-zone concatenation; it writes its results under
# data/Taxi Data/Concat/ and has no return value.
concatenate_and_save()

100%|████████████████████████████████████████████████████████████████████████████████| 265/265 [00:40<00:00,  6.60it/s]


In [12]:
# Spot-check: load the concatenated values saved for one zone and show them.
zone = 100
# Context manager so the file handle is closed once the array is loaded.
with open('data/Taxi Data/Concat/' + '_' + str(zone) + '_taxi_dataframe_Concat', 'rb') as concat_file:
    concat_ = pickle.load(concat_file)
concat_

array([100, 155, 148, ...,  15,  24,  15], dtype=int64)

In [None]:
# Load the complete 2017 and 2018 trip files (all columns).
taxiDF2017, taxiDF2018 = (
    pd.read_csv('data/Taxi Data/' + str(trip_year) + '_Yellow_Taxi_Trip_Data.csv')
    for trip_year in (2017, 2018)
)

In [None]:
# Stack the two yearly frames row-wise, 2017 first.
taxiDF2017_2018 = pd.concat(objs=[taxiDF2017, taxiDF2018], axis=0)

In [None]:
# Scratch check of date_range bounds: hourly stamps from 01:00 up to one hour
# before 12:00 on 2019-01-01.
one_hour_step = pd.Timedelta(hours=1)
start = pd.Timestamp(year=2019, month=1, day=1, hour=1, minute=0, second=0)
end = pd.Timestamp(year=2019, month=1, day=1, hour=12, minute=0, second=0) - one_hour_step
# A Timedelta step is used instead of the 'H' alias, which recent pandas
# releases flag as deprecated.
pd.date_range(start, end, freq=one_hour_step)

## Scratch cells for testing

In [None]:
# Latest pickup timestamp that survived the year filter.
taxiDF_cleaned_1.loc[:, 'tpep_pickup_datetime'].max()

In [None]:
# Pickup timestamp of the row labelled 0, kept for inspection.
pickuptime = taxiDF.loc[0, 'tpep_pickup_datetime']

pickuptime

In [None]:
# Drop the row(s) whose pickup time is exactly 2002-12-31 23:06:55.
taxiDF = taxiDF.loc[taxiDF['tpep_pickup_datetime'] != '2002-12-31 23:06:55']

In [None]:
# Parse one sample timestamp in the CSV's text format (12-hour clock with AM/PM).
date_string = '01/01/2020 12:47:41 AM'
_datetime = datetime.datetime.strptime(
    date_string,
    '%m/%d/%Y %I:%M:%S %p',
)

In [None]:
# Does the pickup time of the row labelled 2 equal the parsed sample timestamp?
taxiDF.loc[2, 'tpep_pickup_datetime'] == _datetime

In [None]:
# Trip counts per two-hour pickup bin across all zones.
(
    taxiDF
    .groupby(pd.Grouper(key='tpep_pickup_datetime', axis=0, freq='2H', sort=True))
    .size()
)