In [None]:
import geopandas as gpd
import dask
import h5py
import numpy as np
import pandas as pd
import dask.dataframe as dd
from dask.distributed import LocalCluster
import os
from paths import ROOT, YEARS, TASK1_OUT_ROOT, get_files, TAXI_ZONES_SHAPEFILE, ZONES_TO_CENTROIDS_MAPPING_CSV, TASK1_SCHEMA, TASK1_NP_SCHEMA
# Enable pandas' opt-in 'future.no_silent_downcasting' option for this session.
pd.set_option('future.no_silent_downcasting', True)

# Local cluster: start a single-worker Dask cluster and get a client connected to it.
cluster = LocalCluster(n_workers=1)
client = cluster.get_client()
# Print the dashboard URL of this cluster.
print(cluster.dashboard_link)

In [2]:
# Flatten the per-year file lists into one list, keeping the order of YEARS.
# A comprehension avoids the quadratic copying of `sum(list_of_lists, [])`.
files = [path for year in YEARS for path in get_files(ROOT, year)]
# One lazy Dask dataframe per input file; dfs[i] always corresponds to files[i].
dfs = [dd.read_parquet(f) for f in files]

# Maps every historical spelling of a column to the canonical name used by the
# rest of the notebook. Each key points straight at its final name, so the
# result does not depend on the order of the entries.
consistency_mapping = {
    "End_Lat": "dropoff_latitude",
    "End_Lon": "dropoff_longitude",
    "Start_Lat": "pickup_latitude",
    "Start_Lon": "pickup_longitude",
    "Fare_Amt": "fare_amount",
    "Tip_Amt": "tip_amount",
    "Tolls_Amt": "tolls_amount",
    "Total_Amt": "total_amount",
    "Passenger_Count": "passenger_count",
    "Payment_Type": "payment_type",
    "Rate_Code": "rate_code_id",
    "rate_code": "rate_code_id",
    "RatecodeID": "rate_code_id",
    "Trip_Distance": "trip_distance",
    "Trip_Dropoff_DateTime": "dropoff_datetime",
    "Trip_Pickup_DateTime": "pickup_datetime",
    "tpep_dropoff_datetime": "dropoff_datetime",
    "tpep_pickup_datetime": "pickup_datetime",
    "Airport_fee": "airport_fee",
    "VendorID": "vendor_id",
    "vendor_name": "vendor_id",
    "surcharge": "extra",
    "store_and_forward": "store_and_fwd_flag",
}

# Rename each frame once, using only the entries that apply to its columns.
# A single rename keeps the Dask task graph small compared with one rename
# per matching column.
for i, frame in enumerate(dfs):
    present_columns = set(frame.columns)
    renames = {
        old_col: new_col
        for old_col, new_col in consistency_mapping.items()
        if old_col in present_columns
    }
    if renames:
        dfs[i] = frame.rename(columns=renames)

### Mapping column values for consistency

In [3]:
def map_vendor(value):
    """Normalise a raw vendor value to an integer vendor id.

    The column used to be called ``vendor_name`` and held short text codes;
    newer data holds numeric ids. Both forms are accepted.

    Args:
        value: Raw cell value: a legacy vendor code ("CMT", "DDS", "VTS"),
            a number, a numeric string, or a missing value (None/NaN).

    Returns:
        int: 1, 2, 6 or 7 for a recognised vendor, otherwise -1. Missing or
        unparseable input returns -1 instead of raising.
    """
    _old_vendorname2id = {
        "CMT": 1,
        "DDS": 2,  # DDS and VTS were merged (Verifone) and later also merged with Curb Mobility
        "VTS": 2,
    }
    valid_vendor_ids = (1, 2, 6, 7)

    # Look up with the same normalised key that is tested for membership.
    key = str(value).strip()
    if key in _old_vendorname2id:
        return _old_vendorname2id[key]

    # Numeric ids may arrive as int, float or strings such as "1.0".
    try:
        numeric = float(value)
    except (TypeError, ValueError):
        return -1  # None or arbitrary text: invalid vendor id
    if numeric in valid_vendor_ids:
        return int(numeric)
    return -1  # Invalid vendor id (includes NaN)
    
def map_rate_code(value):
    """Normalise a raw rate code to one of the valid ``rate_code_id`` values.

    Valid codes are 1-6 and 99 (missing or unknown). Every other value,
    including missing or unparseable input, is mapped to 99.

    Args:
        value: Raw cell value: a number, a numeric string, or a missing value.

    Returns:
        int: The rate code if valid, otherwise 99.
    """
    valid_rate_code_ids = (1, 2, 3, 4, 5, 6, 99)
    try:
        numeric = float(value)
    except (TypeError, ValueError):
        return 99  # None or arbitrary text: unknown rate code
    if numeric in valid_rate_code_ids:
        # Convert from the parsed float so strings like "5.0" work too.
        return int(numeric)
    return 99  # Invalid rate code id (includes NaN)
    
def map_store_and_fwd_flag(value):
    """Normalise the store-and-forward flag to an integer.

    Args:
        value: Raw cell value: "Y"/"N" (surrounding whitespace is ignored),
            a numeric 0/1 (number or numeric string), or a missing value.

    Returns:
        int: 1 for "Y" or 1, 0 for "N" or 0, and -1 for anything else
        (missing, unparseable, or out of range).
    """
    letter_mapping = {
        "Y": 1,
        "N": 0,
    }
    # Use the stripped key for both the membership test and the lookup.
    key = str(value).strip()
    if key in letter_mapping:
        return letter_mapping[key]

    try:
        numeric = int(float(value))
    except (TypeError, ValueError, OverflowError):
        # None, arbitrary text, NaN (ValueError) or infinity (OverflowError).
        return -1
    if numeric in (0, 1):
        return numeric
    return -1

def map_payment_type(value):
    """Normalise a raw payment type to its integer code.

    Text labels are matched case-insensitively after stripping whitespace.
    Numeric codes 0-6 are passed through.

    Args:
        value: Raw cell value: a text label (e.g. "Credit", "CSH"), a number,
            a numeric string, or a missing value.

    Returns:
        int: The payment type code; 5 (unknown) for anything unrecognised,
        including missing or unparseable input.
    """
    payment_type_mapping = {
        "credit": 1,
        "crd": 1,
        "cre": 1,
        "cash": 2,
        "csh": 2,
        "cas": 2,
        "noc": 3,
        "no charge": 3,
        "no": 3,
        "dis": 4,
        "dispute": 4,
        "na": 5,
    }
    valid_vals = (0, 1, 2, 3, 4, 5, 6)

    key = str(value).strip().lower()
    if key in payment_type_mapping:
        return payment_type_mapping[key]

    try:
        numeric = float(value)
    except (TypeError, ValueError):
        return 5  # unknown label or missing value
    if numeric in valid_vals:
        return int(numeric)
    return 5  # unknown

def get_locationid_to_centroid(shapefile):
    """Build a lookup table from taxi zone id to zone centroid coordinates.

    Args:
        shapefile: Path to the taxi zones shapefile; it must provide a
            ``LocationID`` attribute and zone geometries.

    Returns:
        A table with one row per zone and the columns ``LocationID`` (int),
        ``centroid_lat`` (float) and ``centroid_lng`` (float), with the
        centroid expressed in WGS84 (EPSG:4326) degrees.
    """
    # Load zones as GeoDataFrame
    zones = gpd.read_file(shapefile)

    # Reproject to NYC's local projected CRS (US feet) for correct geometry math
    zones_projected = zones.to_crs("EPSG:2263")

    # Calculate centroid in projected space
    zones_projected['centroid'] = zones_projected.geometry.centroid

    # Convert centroid back to WGS84 (lat/lon)
    centroids_wgs84 = zones_projected.set_geometry('centroid').to_crs("EPSG:4326")

    # Extract lat/lng from centroid geometry
    zones['centroid_lat'] = centroids_wgs84.geometry.y
    zones['centroid_lng'] = centroids_wgs84.geometry.x

    # Select the lookup columns and cast them in one step. `astype` returns a
    # new frame, so nothing is assigned into a slice of `zones`, which would
    # trigger pandas' chained-assignment warning.
    return zones[['LocationID', 'centroid_lat', 'centroid_lng']].astype({
        'LocationID': int,
        'centroid_lat': float,
        'centroid_lng': float,
    })


# Build the zone-id -> centroid lookup, ordered by ascending LocationID with a fresh index.
_zone_centroids = get_locationid_to_centroid(TAXI_ZONES_SHAPEFILE)
locationid_to_centers_df = _zone_centroids.sort_values(
    by='LocationID',
    ascending=True,
    ignore_index=True,
)
# Save the lookup as CSV (without the index) at the configured mapping path.
locationid_to_centers_df.to_csv(ZONES_TO_CENTROIDS_MAPPING_CSV, index=False)


# Bring every per-file frame to the common layout. All operations stay lazy
# (Dask) and are only executed when the frames are written out.
for i, f in enumerate(files):
    # Normalise categorical codes; `meta` tells Dask the resulting dtype.
    dfs[i]['vendor_id'] = dfs[i]['vendor_id'].map(map_vendor, meta=('vendor_id', 'int8'))
    dfs[i]['rate_code_id'] = dfs[i]['rate_code_id'].map(map_rate_code, meta=('rate_code_id', 'int8'))
    dfs[i]['store_and_fwd_flag'] = dfs[i]['store_and_fwd_flag'].map(map_store_and_fwd_flag, meta=('store_and_fwd_flag', 'int8'))
    dfs[i]['payment_type'] = dfs[i]['payment_type'].map(map_payment_type, meta=('payment_type', 'int8'))

    # Files that carry zone ids instead of coordinates get the zone centroid
    # as pickup/dropoff position. The id column and the helper `LocationID`
    # key are dropped right after each merge, so no stray key column is left
    # behind even if only one of the two id columns is present.
    for id_col, prefix in (('PULocationID', 'pickup'), ('DOLocationID', 'dropoff')):
        if id_col not in dfs[i].columns:
            continue
        dfs[i][id_col] = dfs[i][id_col].astype(int)
        dfs[i] = (
            dfs[i]
            .merge(locationid_to_centers_df, how='left', left_on=id_col, right_on='LocationID')
            .rename(columns={
                'centroid_lat': f'{prefix}_latitude',
                'centroid_lng': f'{prefix}_longitude',
            })
            .drop(columns=['LocationID', id_col])
        )

    # Fee columns missing from a file are created with a value of 0.0.
    fee_columns = ('airport_fee', 'congestion_surcharge', 'improvement_surcharge')
    for fee_col in fee_columns:
        if fee_col not in dfs[i].columns:
            dfs[i][fee_col] = 0.0

    # The year is parsed from the file name: the first four characters of its
    # third underscore-separated token.
    # NOTE(review): presumably names look like `<a>_<b>_YYYY-MM...`; verify against get_files.
    year = int(os.path.basename(f).split('_')[2][:4])
    dfs[i]['year'] = year
    dfs[i]['year'] = dfs[i]['year'].astype(np.int16)

    dfs[i]['mta_tax'] = dfs[i]['mta_tax'].fillna(0.0)

    # Parse datetimes; errors='raise' makes unparseable timestamps fail loudly.
    dfs[i]['pickup_datetime'] = dd.to_datetime(dfs[i]['pickup_datetime'], errors='raise')
    dfs[i]['dropoff_datetime'] = dd.to_datetime(dfs[i]['dropoff_datetime'], errors='raise')

    # Missing monetary values count as 0.0; missing passenger counts as 0.
    for money_col in fee_columns + ('extra',):
        dfs[i][money_col] = dfs[i][money_col].fillna(0.0).astype(float)
    dfs[i]['passenger_count'] = dfs[i]['passenger_count'].fillna(0).astype(int)

### Concatenating and storing

In [4]:
# Create one output folder per export flavour; existing folders are left as they are.
for out_subdir in ("one_year", "five_years", "all"):
    os.makedirs(os.path.join(TASK1_OUT_ROOT, out_subdir), exist_ok=True)

### DTypes

In [None]:
# apply TASK1_NP_SCHEMA schema to dfs
for i, f in enumerate(files):
    dfs[i] = dfs[i].astype(TASK1_NP_SCHEMA)

vendor_id                          int8
pickup_datetime          datetime64[ns]
dropoff_datetime         datetime64[ns]
passenger_count                   uint8
trip_distance                   float32
pickup_longitude                float32
pickup_latitude                 float32
rate_code_id                      uint8
store_and_fwd_flag                 int8
dropoff_longitude               float32
dropoff_latitude                float32
payment_type                      uint8
fare_amount                     float32
extra                           float16
mta_tax                         float16
tip_amount                      float16
tolls_amount                    float16
total_amount                    float32
airport_fee                     float16
congestion_surcharge            float16
improvement_surcharge           float16
year                             uint16
dtype: object
vendor_id                          int8
pickup_datetime          datetime64[ns]
dropoff_datetime         d

### Parquet (all)

In [None]:
# Keyword arguments shared by every write into the "all" dataset.
parquet_options = dict(
    path=os.path.join(TASK1_OUT_ROOT, "all"),
    partition_on=['year'],
    engine='pyarrow',
    schema=TASK1_SCHEMA,
    write_index=False,
)

# The last frame is written first, without append; every other frame is then
# appended to the same dataset in list order.
dfs[-1].to_parquet(**parquet_options)

for frame in dfs[:-1]:
    frame.to_parquet(append=True, **parquet_options)


  return arr.astype(dtype, copy=True)
  return arr.astype(dtype, copy=True)
  return arr.astype(dtype, copy=True)
  return arr.astype(dtype, copy=True)


### CSV (5 years)

In [None]:
# The 60 frames that precede the final entry of `dfs`.
# NOTE(review): the slice skips dfs[-1], presumably a lone month of the following year,
# so the 60 frames would be 2020-2024 - verify against YEARS / get_files.
five_year_frames = dfs[-(12*5)-1:-1]
five_year_csv = os.path.join(TASK1_OUT_ROOT, "five_years", "2020_2024.csv")

# Concatenate row-wise and write a single CSV file without the index.
dd.to_csv(
    df=dd.concat(five_year_frames, axis=0),
    filename=five_year_csv,
    single_file=True,
    index=False,
)

### CSV and HDF (1 year)
- The HDF file is written with h5py because dask's HDF writer has a drawback: it can only save in the "table" format, which produces a larger file than the equivalent CSV.

In [None]:
# The 12 frames that precede the final entry of `dfs`.
# NOTE(review): the slice skips dfs[-1], presumably a lone month of the following year,
# so the 12 frames would be 2024 - verify against YEARS / get_files.
one_year_frames = dfs[-13:-1]
one_year_csv = os.path.join(TASK1_OUT_ROOT, "one_year", "2024.csv")

# Concatenate row-wise and write a single CSV file without the index.
dd.to_csv(
    df=dd.concat(one_year_frames, axis=0),
    filename=one_year_csv,
    single_file=True,
    index=False,
)

In [None]:
# Same 12-frame selection as the one-year CSV export, concatenated row-wise (still lazy).
df = dd.concat(dfs[-13:-1], axis=0)

# h5py can't handle datetimes directly, so datetime64[ns] columns are stored as int64.
datetime_columns = list(df.select_dtypes(include=['datetime64[ns]']))
for col in datetime_columns:
    df[col] = df[col].astype('int64')

# Materialise the whole year in memory as a pandas frame.
df = df.compute()

# h5py can't handle dataframes directly either: copy the columns into a NumPy
# structured array with one field per column, keeping each column's dtype.
dtype = np.dtype([(name, column.dtype) for name, column in df.items()])
structured_array = np.empty(len(df), dtype=dtype)
for name, column in df.items():
    structured_array[name] = column.to_numpy()

# Write everything as a single gzip-compressed dataset.
h5_path = os.path.join(TASK1_OUT_ROOT, "one_year", "2024.h5")
with h5py.File(h5_path, 'w') as h5f:
    h5f.create_dataset('taxidata', data=structured_array, compression='gzip')