## Combine the Jan-17 to Jun-17 taxi trip DataFrames for pickups originating at LaGuardia Airport (LGA, PULocationID = 138)

In [1]:
from tqdm import tqdm
import glob
import pandas as pd
from pandas.tseries.holiday import USFederalHolidayCalendar as calendar

In [2]:
# glob.glob returns paths in arbitrary (filesystem-dependent) order; sort them
# so the files are always processed, and later concatenated, in the same order.
csvFiles = sorted(glob.glob("../raw_data/taxidata/*.csv"))

In [3]:
# PULocationID value used to select LaGuardia Airport pickups.
LGA_LOCATION_ID = 138

# One filtered DataFrame per input CSV; concatenated in a later cell.
combined_df = []

for file in tqdm(csvFiles):
    df = pd.read_csv(file, index_col=None, header=0)
    # Keep only the LaGuardia pickups. reset_index(drop=True) discards the old
    # row labels directly, instead of materialising them as an "index" column
    # and dropping it again (which would remove a real column if a CSV ever
    # contained one named "index").
    df = df.loc[df["PULocationID"] == LGA_LOCATION_ID, :].reset_index(drop=True)
    combined_df.append(df)

100%|██████████| 6/6 [05:07<00:00, 50.93s/it]


In [4]:
# Stack the per-file frames row-wise into a single frame with a fresh
# 0..n-1 row index.
final_df = pd.concat(objs=combined_df, axis=0, ignore_index=True)

In [5]:
# Sanity check: (rows, columns) of the combined frame.
final_df.shape

(1471188, 17)

### Cleaning Taxi Data

In [6]:
# Baseline (rows, columns) before any cleaning step is applied.
final_df.shape

(1471188, 17)

In [7]:
# List the available columns before deciding which ones to keep.
final_df.columns

Index(['VendorID', 'tpep_pickup_datetime', 'tpep_dropoff_datetime',
       'passenger_count', 'trip_distance', 'RatecodeID', 'store_and_fwd_flag',
       'PULocationID', 'DOLocationID', 'payment_type', 'fare_amount', 'extra',
       'mta_tax', 'tip_amount', 'tolls_amount', 'improvement_surcharge',
       'total_amount'],
      dtype='object')

#### Removing the columns which are not required for analysis

In [8]:
# Columns that are not required for the analysis.
columns_to_drop = [
    "VendorID",
    "RatecodeID",
    "store_and_fwd_flag",
    "payment_type",
    "fare_amount",
    "extra",
    "mta_tax",
    "tip_amount",
    "tolls_amount",
    "improvement_surcharge",
]

# Reassign rather than mutate with inplace=True. errors="ignore" keeps the cell
# safe to re-run once the columns have already been removed (a second run
# would otherwise raise KeyError).
final_df = final_df.drop(columns=columns_to_drop, errors="ignore")

In [9]:
# Confirm the column count after removing the unused columns.
final_df.shape

(1471188, 7)

In [10]:
# Preview the first rows of the trimmed frame.
final_df.head()

Unnamed: 0,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,PULocationID,DOLocationID,total_amount
0,2017-01-01 00:00:05,2017-01-01 00:15:36,1,8.47,138,262,38.55
1,2017-01-01 00:00:12,2017-01-01 00:09:48,1,5.3,138,192,17.3
2,2017-01-01 00:00:17,2017-01-01 00:19:40,2,13.97,138,181,50.44
3,2017-01-01 00:00:21,2017-01-01 00:12:09,1,6.9,138,80,21.3
4,2017-01-01 00:00:33,2017-01-01 00:31:58,2,10.5,138,230,42.34


#### Adding a column containing trip duration in seconds

In [11]:
# Parse the pickup/dropoff timestamp columns into datetimes; the parsed values
# are used only for this computation and are not written back to the frame.
pickup_ts = pd.to_datetime(final_df['tpep_pickup_datetime'])
dropoff_ts = pd.to_datetime(final_df['tpep_dropoff_datetime'])

# Elapsed time between pickup and dropoff, expressed in seconds.
elapsed = dropoff_ts.sub(pickup_ts)
final_df['trip_duration'] = elapsed.dt.total_seconds()

In [12]:
# Inspect the column dtypes, including the newly added trip_duration column.
final_df.dtypes

tpep_pickup_datetime      object
tpep_dropoff_datetime     object
passenger_count            int64
trip_distance            float64
PULocationID               int64
DOLocationID               int64
total_amount             float64
trip_duration            float64
dtype: object

#### Adding a US federal holiday flag for each pickup date

In [13]:
# Pickup timestamps truncated to midnight, i.e. one value per calendar day.
# .dt.normalize() stays in datetime64 and avoids the slow round trip through
# Python date objects that .dt.date followed by pd.to_datetime performs.
dr = pd.to_datetime(final_df['tpep_pickup_datetime']).dt.normalize()

# US federal holidays falling between the earliest and latest pickup date.
cal = calendar()
holidays = cal.holidays(start=dr.min(), end=dr.max())

# True where the pickup date is one of those holidays.
final_df['holiday'] = dr.isin(holidays)

In [14]:
# Write the cleaned frame as a gzip-compressed CSV, without the row index.
output_path = '../clean_data/TaxiData_Jan17-Jun17.gz'
final_df.to_csv(output_path, index=False, compression='gzip')