In [None]:
from constants import MAX_SPEED, TASK1_OUT_ROOT, TASK1_SCHEMA, CLEAN_PARQUET_DATA_ROOT, REASONABLE_MAX_TRIP_DURATION, REASONABLE_MAX_YEAR, REASONABLE_MIN_TRIP_DURATION, REASONABLE_MIN_YEAR, REASONABLE_PRICE_MAX, REASONABLE_PRICE_MIN, NYC_MOST_EAST_LONGITUDE, NYC_MOST_WEST_LONGITUDE, NYC_MOST_NORTH_LATITUDE, NYC_MOST_SOUTH_LATITUDE
from utils import write_parquet_sequentially
import dask.dataframe as dd
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from glob import glob
from tqdm import tqdm

# Collect every parquet file below <TASK1_OUT_ROOT>/all in a stable (sorted) order.
# `recursive=True` is required for "**" to span directory levels; without it
# glob treats "**" like a plain "*" and only matches files that sit exactly one
# directory below "all".
parquet_files = sorted(
    glob(os.path.join(TASK1_OUT_ROOT, "all", "**", "*.parquet"), recursive=True)
)
print(f"Found {len(parquet_files)} parquet files in {TASK1_OUT_ROOT}/all")

# Fail here with a clear message instead of a later IndexError on `dfs[0]`.
if not parquet_files:
    raise FileNotFoundError(
        f"No parquet files found under {os.path.join(TASK1_OUT_ROOT, 'all')}"
    )

Found 193 parquet files in data/task1/all


In [2]:
# Open each parquet file as its own dask DataFrame (one list entry per file).
dfs = list(map(dd.read_parquet, parquet_files))

# Sanity check: columns and dtypes of the first frame, then the number of frames.
print(dfs[0].columns, dfs[0].dtypes, len(dfs), sep="\n")

Index(['vendor_id', 'pickup_datetime', 'dropoff_datetime', 'passenger_count',
       'trip_distance', 'rate_code_id', 'store_and_fwd_flag', 'payment_type',
       'fare_amount', 'extra', 'mta_tax', 'tip_amount', 'tolls_amount',
       'improvement_surcharge', 'total_amount', 'congestion_surcharge',
       'airport_fee', 'pickup_longitude', 'pickup_latitude',
       'dropoff_longitude', 'dropoff_latitude', 'year'],
      dtype='object')
vendor_id                          int8
pickup_datetime          datetime64[ns]
dropoff_datetime         datetime64[ns]
passenger_count                   uint8
trip_distance                   float32
rate_code_id                      uint8
store_and_fwd_flag                 int8
payment_type                      uint8
fare_amount                     float32
extra                           float32
mta_tax                         float32
tip_amount                      float32
tolls_amount                    float32
improvement_surcharge           float32


## Cleaning
Filter rows based on:
- pickup/dropoff datetimes s.t.:
    - pickup is before dropoff at all times
    - if the pickup and dropoff fall in different years, we keep only the rows where the pickup was on December 31 and the dropoff was on January 1 of the following year.
- trip distance s.t.:
    - is positive
    - is within achievable range given 80mph for the given distance and computed trip duration based on dropoff - pickup.
- trip duration s.t.:
    - is positive (at least 1 min)
    - is within generous bounds (1 min to 4 hours)
- pickup/dropoff longitude/latitude
    - rows where any coordinate is NaN (not given) are dropped
    - are in reasonable bounds based on bbox given by: https://www.nyc.gov/assets/planning/download/pdf/data-maps/open-data/meta_nhood.pdf 
- prices:
    - positive for mandatory charges, non-negative for tips/surcharges
    - total_amount has to be equal to or larger than the sum of all charges (tips/extras not included)

In [None]:
def _filter_valid_trips(df):
    """Return `df` restricted to rows that pass every cleaning rule.

    The rules cover four groups: pickup/dropoff datetimes, trip
    distance/duration, coordinates, and prices. A row is kept only if it
    satisfies all of them. The `year` column is overwritten with the pickup
    year (int16) and the index is reset on the result.

    NOTE(review): every rule is a plain comparison, so a NaN in any of the
    compared columns (e.g. a surcharge that is missing for some files) makes
    the rule False and drops the row - confirm the upstream step fills
    missing charges if such rows should survive.
    """
    # `year` is always derived from the pickup timestamp.
    df['year'] = df['pickup_datetime'].dt.year.astype(np.int16)

    pickup = df['pickup_datetime']
    dropoff = df['dropoff_datetime']

    # ---- Pickup/dropoff datetimes ----
    year_gap = dropoff.dt.year - pickup.dt.year
    # The only accepted year change: picked up on Dec 31, dropped off on Jan 1.
    crosses_new_year = (
        (year_gap == 1)
        & (dropoff.dt.month == 1)
        & (dropoff.dt.day == 1)
        & (pickup.dt.month == 12)
        & (pickup.dt.day == 31)
    )
    year_in_range = df['year'].between(REASONABLE_MIN_YEAR, REASONABLE_MAX_YEAR, inclusive='both')
    datetimes_ok = (pickup < dropoff) & ((year_gap == 0) | crosses_new_year) & year_in_range

    # ---- Trip distance / duration ----
    duration_hours = (dropoff - pickup).dt.total_seconds() / (60 * 60)
    # Upper bound on the distance that can be covered in that time at MAX_SPEED.
    max_distance = duration_hours * MAX_SPEED
    # The duration limits are divided by 60 to compare against hours
    # (presumably they are given in minutes - verify in constants).
    duration_ok = duration_hours.between(
        REASONABLE_MIN_TRIP_DURATION / 60,
        REASONABLE_MAX_TRIP_DURATION / 60,
    )
    # Distance must lie strictly between 0 and the achievable maximum.
    trips_ok = df['trip_distance'].between(0, max_distance, inclusive='neither') & duration_ok

    # ---- Coordinates ----
    pickup_lat, dropoff_lat = df['pickup_latitude'], df['dropoff_latitude']
    pickup_lon, dropoff_lon = df['pickup_longitude'], df['dropoff_longitude']
    lat_present = pickup_lat.notnull() & dropoff_lat.notnull()
    lon_present = pickup_lon.notnull() & dropoff_lon.notnull()
    lat_in_bbox = (
        pickup_lat.between(NYC_MOST_SOUTH_LATITUDE, NYC_MOST_NORTH_LATITUDE)
        & dropoff_lat.between(NYC_MOST_SOUTH_LATITUDE, NYC_MOST_NORTH_LATITUDE)
    )
    lon_in_bbox = (
        pickup_lon.between(NYC_MOST_WEST_LONGITUDE, NYC_MOST_EAST_LONGITUDE)
        & dropoff_lon.between(NYC_MOST_WEST_LONGITUDE, NYC_MOST_EAST_LONGITUDE)
    )
    coordinates_ok = lat_present & lon_present & lat_in_bbox & lon_in_bbox

    # ---- Prices ----
    total = df['total_amount']
    fare_ok = df['fare_amount'] > 2.5  # initial charge
    components_ok = (
        (df['extra'] >= 0)
        & (df['mta_tax'] >= 0)
        & (df['tip_amount'] >= 0)
        & (df['tolls_amount'] >= 0)
        & (df['improvement_surcharge'] >= 0)
        & (df['congestion_surcharge'] >= 0)
        & (df['airport_fee'] >= 0)
    )
    # `extra` and `tip_amount` are deliberately left out of this sum.
    charged_sum = (
        df['fare_amount']
        + df['mta_tax']
        + df['improvement_surcharge']
        + df['tolls_amount']
        + df['congestion_surcharge']
        + df['airport_fee']
    )
    # The sum must be positive and must not exceed the recorded total.
    sum_ok = charged_sum.between(0, total, inclusive='right')
    total_ok = (total > 0) & total.between(REASONABLE_PRICE_MIN, REASONABLE_PRICE_MAX, inclusive='both')
    prices_ok = fare_ok & components_ok & total_ok & sum_ok

    # ---- Combine all rules ----
    keep = datetimes_ok & trips_ok & coordinates_ok & prices_ok
    return df[keep].reset_index(drop=True)


# Replace every frame in `dfs` with its cleaned version.
for i in tqdm(range(len(dfs))):
    dfs[i] = _filter_valid_trips(dfs[i])


 17%|█▋        | 32/193 [00:00<00:01, 109.65it/s]

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31


 32%|███▏      | 62/193 [00:00<00:01, 130.45it/s]

32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65


 48%|████▊     | 92/193 [00:00<00:00, 117.61it/s]

66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108


 64%|██████▎   | 123/193 [00:01<00:00, 99.48it/s] 

109
110
111
112
113
114
115
116
117
118
119
120
121
122


 80%|████████  | 155/193 [00:01<00:00, 124.50it/s]

123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170


100%|██████████| 193/193 [00:01<00:00, 112.01it/s]

171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192





## Write the clean dataset to parquet files

In [None]:
# Make sure the output directory exists before anything is written into it.
os.makedirs(CLEAN_PARQUET_DATA_ROOT, exist_ok=True)

# Write parquet (all): hand the cleaned frames to the project's writer helper.
print(f"Writing Parquet (all) to {CLEAN_PARQUET_DATA_ROOT}...")
write_parquet_sequentially(
    dfs,
    CLEAN_PARQUET_DATA_ROOT,
    partition_on=['year'],
    schema=TASK1_SCHEMA,
    row_group_size=2_000_000,
)

Writing Parquet (all) to data/clean2...


  2%|▏         | 3/192 [00:23<24:32,  7.79s/it]