In [1]:
import geopandas as gpd
import pandas as pd
import dask.dataframe as dd
import os
from paths import ROOT, YEARS, TASK1_OUT_ROOT, get_files, TAXI_ZONES_SHAPEFILE, ZONES_TO_CENTROIDS_MAPPING_CSV

In [2]:
# Collect every parquet file for the configured years and open each one lazily.
files = [path for year in YEARS for path in get_files(ROOT, year)]
dfs = [dd.read_parquet(f) for f in files]

# Source column name -> canonical column name.
# Every entry points straight at the final name, so the result does not depend
# on the iteration order of this dict (no chained renames).
consistency_mapping = {
    "End_Lat": "dropoff_latitude",
    "End_Lon": "dropoff_longitude",
    "Start_Lat": "pickup_latitude",
    "Start_Lon": "pickup_longitude",
    "Fare_Amt": "fare_amount",
    "Tip_Amt": "tip_amount",
    "Tolls_Amt": "tolls_amount",
    "Total_Amt": "total_amount",
    "Passenger_Count": "passenger_count",
    "Payment_Type": "payment_type",
    "Rate_Code": "rate_code_id",
    "rate_code": "rate_code_id",
    "RatecodeID": "rate_code_id",
    "Trip_Distance": "trip_distance",
    "Trip_Dropoff_DateTime": "dropoff_datetime",
    "Trip_Pickup_DateTime": "pickup_datetime",
    "tpep_dropoff_datetime": "dropoff_datetime",
    "tpep_pickup_datetime": "pickup_datetime",
    "Airport_fee": "airport_fee",
    "VendorID": "vendor_id",
    "vendor_name": "vendor_id",
    "surcharge": "extra",
    "store_and_forward": "store_and_fwd_flag",
}

for i in range(len(dfs)):
    # Rename only the columns this file actually has, in a single call.
    present_columns = set(dfs[i].columns)
    renames = {
        old_col: new_col
        for old_col, new_col in consistency_mapping.items()
        if old_col in present_columns
    }
    if renames:
        dfs[i] = dfs[i].rename(columns=renames)
    # Print the resulting schema so differences between files are easy to spot.
    cols = sorted(dfs[i].columns)
    print(cols)

['dropoff_datetime', 'dropoff_latitude', 'dropoff_longitude', 'extra', 'fare_amount', 'mta_tax', 'passenger_count', 'payment_type', 'pickup_datetime', 'pickup_latitude', 'pickup_longitude', 'rate_code_id', 'store_and_fwd_flag', 'tip_amount', 'tolls_amount', 'total_amount', 'trip_distance', 'vendor_id']
['dropoff_datetime', 'dropoff_latitude', 'dropoff_longitude', 'extra', 'fare_amount', 'mta_tax', 'passenger_count', 'payment_type', 'pickup_datetime', 'pickup_latitude', 'pickup_longitude', 'rate_code_id', 'store_and_fwd_flag', 'tip_amount', 'tolls_amount', 'total_amount', 'trip_distance', 'vendor_id']
['dropoff_datetime', 'dropoff_latitude', 'dropoff_longitude', 'extra', 'fare_amount', 'mta_tax', 'passenger_count', 'payment_type', 'pickup_datetime', 'pickup_latitude', 'pickup_longitude', 'rate_code_id', 'store_and_fwd_flag', 'tip_amount', 'tolls_amount', 'total_amount', 'trip_distance', 'vendor_id']
['dropoff_datetime', 'dropoff_latitude', 'dropoff_longitude', 'extra', 'fare_amount', 'm

### Mapping column values for consistency

In [None]:
def map_vendor(value):
    """Normalise a raw vendor value to an integer vendor id.

    Args:
        value: Either a legacy vendor name ("CMT", "DDS", "VTS") or a numeric
            vendor id given as a number or numeric string.

    Returns:
        int: The vendor id (one of 1, 2, 6, 7), or -1 when the value is
        missing, non-numeric, or not a recognised id.
    """
    # vendor_id (previously vendor_name but stands for the same thing)
    _old_vendorname2id = {
        "CMT": 1,
        "DDS": 2,  # DDS and VTS were merged (Verifone) and later also merged with Curb Mobility
        "VTS": 2,
    }
    valid_vendor_ids = (1, 2, 6, 7)

    # Look the name up with the same (stripped) key that is tested for membership.
    key = str(value).strip()
    if key in _old_vendorname2id:
        return _old_vendorname2id[key]

    # Anything that is not a number (None, unknown names, ...) is invalid
    # rather than an exception that would abort the whole dask task.
    try:
        numeric = float(value)
    except (TypeError, ValueError):
        return -1

    # NaN compares unequal to everything, so it falls through to -1.
    if numeric in valid_vendor_ids:
        return int(numeric)
    return -1  # Invalid vendor id
    
def map_rate_code(value):
    """Normalise a raw rate code to one of the accepted rate code ids.

    Args:
        value: Rate code as a number or numeric string.

    Returns:
        int: The rate code when it is one of 1-6 or 99; otherwise 99
        (missing or unknown), including for non-numeric or missing input.
    """
    # rate_code_id
    # 1-6 and 99 (missing or unknown). All other values are set to 99
    valid_rate_code_ids = (1, 2, 3, 4, 5, 6, 99)

    # Non-numeric input is treated as unknown instead of raising.
    try:
        numeric = float(value)
    except (TypeError, ValueError):
        return 99

    # NaN is never equal to a valid id, so it also ends up as 99.
    if numeric in valid_rate_code_ids:
        return int(numeric)
    return 99  # Invalid rate code id
    
def map_store_and_fwd_flag(value):
    """Normalise a store-and-forward flag to an integer.

    Args:
        value: "Y"/"N" (surrounding whitespace allowed), a 0/1 number or
            numeric string, or a missing value.

    Returns:
        int: 1 for "Y" or 1, 0 for "N" or 0, and -1 for missing or
        unrecognised values.
    """
    # store_and_fwd_flag
    letter_mapping = {
        "Y": 1,
        "N": 0,
    }

    # Use the stripped key for both the membership test and the lookup;
    # looking up the unstripped text raised KeyError for padded flags.
    key = str(value).strip()
    if key in letter_mapping:
        return letter_mapping[key]

    if pd.isna(value):
        return -1

    # Any other non-numeric text is unknown rather than an error.
    try:
        numeric = float(value)
    except (TypeError, ValueError):
        return -1

    if numeric in (0, 1):
        return int(numeric)
    return -1

def map_payment_type(value):
    """Normalise a raw payment type to an integer code.

    Text labels are matched case-insensitively after stripping whitespace.

    Args:
        value: A textual label (e.g. "Credit", "CSH", "No Charge") or a
            numeric code given as a number or numeric string.

    Returns:
        int: The mapped code for a known label, the numeric code itself when
        it is one of 0-6, and 5 (unknown) for anything else, including
        missing or unrecognised input.
    """
    # ignore case and strip
    payment_type_mapping = {
        "credit": 1,
        "crd": 1,
        "cre": 1,
        "cash": 2,
        "csh": 2,
        "cas": 2,
        "noc": 3,
        "no charge": 3,
        "no": 3,
        "dis": 4,
        "dispute": 4,
        "na": 5,
    }
    valid_vals = (0, 1, 2, 3, 4, 5, 6)

    label = str(value).strip().lower()
    if label in payment_type_mapping:
        return payment_type_mapping[label]

    # Unrecognised labels and missing values are "unknown" rather than an
    # exception that would abort the whole dask task.
    try:
        numeric = float(value)
    except (TypeError, ValueError):
        return 5

    if numeric in valid_vals:
        return int(numeric)
    return 5  # unknown

def get_locationid_to_centroid(shapefile):
    """Build a LocationID -> centroid (lat/lng) lookup table from a zone shapefile.

    Args:
        shapefile: Path to a shapefile readable by ``geopandas.read_file``
            that has a ``LocationID`` column and a geometry per zone.

    Returns:
        pandas.DataFrame: One row per zone with columns ``LocationID`` (int),
        ``centroid_lat`` (float) and ``centroid_lng`` (float), the latter two
        in EPSG:4326.
    """
    # Load zones as GeoDataFrame
    zones = gpd.read_file(shapefile)

    # Compute centroids in NYC's local projected CRS (US feet) for correct
    # geometry math, then convert the points back to WGS84 (lat/lon).
    centroids_projected = zones.to_crs("EPSG:2263").geometry.centroid
    centroids_wgs84 = centroids_projected.to_crs("EPSG:4326")

    # Build a fresh frame instead of assigning into a column-subset slice,
    # which avoids chained-assignment warnings and leaves `zones` untouched.
    return pd.DataFrame({
        'LocationID': zones['LocationID'].astype(int),
        'centroid_lat': centroids_wgs84.y.astype(float),
        'centroid_lng': centroids_wgs84.x.astype(float),
    })


# Compute the zone centroid lookup, ordered by LocationID with a fresh 0..n-1 index.
locationid_to_centers_df = (
    get_locationid_to_centroid(TAXI_ZONES_SHAPEFILE)
    .sort_values('LocationID', ignore_index=True)
)
# Persist the lookup (without the index column) to the configured CSV path.
locationid_to_centers_df.to_csv(ZONES_TO_CENTROIDS_MAPPING_CSV, index=False)


# Centroid lookups pre-renamed per role, built once outside the loop.
# Merging on the real key name adds no extra `LocationID` column, so nothing
# is left behind even when a file has only one of the two location-id columns.
pickup_centroids = locationid_to_centers_df.rename(columns={
    'LocationID': 'PULocationID',
    'centroid_lat': 'pickup_latitude',
    'centroid_lng': 'pickup_longitude',
})
dropoff_centroids = locationid_to_centers_df.rename(columns={
    'LocationID': 'DOLocationID',
    'centroid_lat': 'dropoff_latitude',
    'centroid_lng': 'dropoff_longitude',
})

# Column -> element-wise normaliser; every normaliser returns a small integer.
value_mappers = {
    'vendor_id': map_vendor,
    'rate_code_id': map_rate_code,
    'store_and_fwd_flag': map_store_and_fwd_flag,
    'payment_type': map_payment_type,
}

for i, f in enumerate(files):
    # Lazily map the coded columns. The explicit cast makes the computed dtype
    # match the declared int8 meta (a plain `map` would yield int64 partitions).
    for col, mapper in value_mappers.items():
        dfs[i][col] = dfs[i][col].map(mapper, meta=(col, 'int8')).astype('int8')

    # pickup: replace the zone id with the zone centroid coordinates
    if "PULocationID" in dfs[i].columns:
        dfs[i]['PULocationID'] = dfs[i]['PULocationID'].astype(int)
        dfs[i] = dfs[i].merge(pickup_centroids, how='left', on='PULocationID')

    # dropoff: same lookup, different target column names
    if "DOLocationID" in dfs[i].columns:
        dfs[i]['DOLocationID'] = dfs[i]['DOLocationID'].astype(int)
        dfs[i] = dfs[i].merge(dropoff_centroids, how='left', on='DOLocationID')

    # The zone ids are no longer needed once coordinates are attached.
    dfs[i] = dfs[i].drop(columns=['DOLocationID', 'PULocationID'], errors='ignore')

    # Fee columns that are absent from a file are added as 0.0 so that all
    # frames share the same schema.
    for fee_col in ('airport_fee', 'congestion_surcharge', 'improvement_surcharge'):
        if fee_col not in dfs[i].columns:
            dfs[i][fee_col] = 0.0

    # The year is parsed from the third '_'-separated token of the file name.
    dfs[i]['year'] = int(os.path.basename(f).split('_')[2][:4])
    dfs[i]['mta_tax'] = dfs[i]['mta_tax'].fillna(0.0)

    # parse datetimes; unparseable values become NaT
    dfs[i]['pickup_datetime'] = dd.to_datetime(dfs[i]['pickup_datetime'], errors='coerce')
    dfs[i]['dropoff_datetime'] = dd.to_datetime(dfs[i]['dropoff_datetime'], errors='coerce')

### Concatenating and storing

In [4]:
# Make sure both output folders exist before anything is written.
for out_subdir in ("one_year", "five_years"):
    os.makedirs(os.path.join(TASK1_OUT_ROOT, out_subdir), exist_ok=True)

# Stack the last 12 frames in `dfs` row-wise and write them as one CSV file.
last_twelve_ddf = dd.concat(dfs[-12:], axis=0, interleave_partitions=True)
one_year_csv_path = os.path.join(TASK1_OUT_ROOT, "one_year", "one_year.csv")
dd.to_csv(df=last_twelve_ddf, filename=one_year_csv_path, single_file=True)

['/var/home/kvovk/work/bd-project-25/data/task1/one_year/one_year.csv']

In [8]:
# Use the first frame in `dfs` for spot checks in the cells below.
test_df = dfs[0]

In [12]:
# Preview the first five rows of the spot-check frame.
display(test_df.head(5))

Unnamed: 0,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,trip_distance,pickup_longitude,pickup_latitude,rate_code_id,store_and_fwd_flag,dropoff_longitude,...,fare_amount,extra,mta_tax,tip_amount,tolls_amount,total_amount,airport_fee,congestion_surcharge,improvement_surcharge,year
0,2,2009-01-04 02:52:00,2009-01-04 03:02:00,1,2.63,-73.991957,40.721567,99,-1,-73.993803,...,8.9,0.5,,0.0,0.0,9.4,0.0,0.0,0.0,2009
1,2,2009-01-04 03:31:00,2009-01-04 03:38:00,3,4.55,-73.982102,40.73629,99,-1,-73.95585,...,12.1,0.5,,2.0,0.0,14.6,0.0,0.0,0.0,2009
2,2,2009-01-03 15:43:00,2009-01-03 15:57:00,5,10.35,-74.002587,40.739748,99,-1,-73.869983,...,23.7,0.0,,4.74,0.0,28.44,0.0,0.0,0.0,2009
3,2,2009-01-01 20:52:58,2009-01-01 21:14:00,1,5.0,-73.974267,40.790955,99,-1,-73.996558,...,14.9,0.5,,3.05,0.0,18.45,0.0,0.0,0.0,2009
4,2,2009-01-24 16:18:23,2009-01-24 16:24:56,1,0.4,-74.00158,40.719382,99,-1,-74.008378,...,3.7,0.0,,0.0,0.0,3.7,0.0,0.0,0.0,2009


In [17]:
# Show rows that contain at least one missing value.
# NOTE: `.compute()` materialises every matching row in memory.
test_df[test_df.isna().any(axis=1)].compute()

Unnamed: 0,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,trip_distance,pickup_longitude,pickup_latitude,rate_code_id,store_and_fwd_flag,dropoff_longitude,...,fare_amount,extra,mta_tax,tip_amount,tolls_amount,total_amount,airport_fee,congestion_surcharge,improvement_surcharge,year
0,2,2009-01-04 02:52:00,2009-01-04 03:02:00,1,2.63,-73.991957,40.721567,99,-1,-73.993803,...,8.9,0.5,,0.00,0.0,9.40,0.0,0.0,0.0,2009
1,2,2009-01-04 03:31:00,2009-01-04 03:38:00,3,4.55,-73.982102,40.736290,99,-1,-73.955850,...,12.1,0.5,,2.00,0.0,14.60,0.0,0.0,0.0,2009
2,2,2009-01-03 15:43:00,2009-01-03 15:57:00,5,10.35,-74.002587,40.739748,99,-1,-73.869983,...,23.7,0.0,,4.74,0.0,28.44,0.0,0.0,0.0,2009
3,2,2009-01-01 20:52:58,2009-01-01 21:14:00,1,5.00,-73.974267,40.790955,99,-1,-73.996558,...,14.9,0.5,,3.05,0.0,18.45,0.0,0.0,0.0,2009
4,2,2009-01-24 16:18:23,2009-01-24 16:24:56,1,0.40,-74.001580,40.719382,99,-1,-74.008378,...,3.7,0.0,,0.00,0.0,3.70,0.0,0.0,0.0,2009
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14092408,2,2009-01-27 14:36:00,2009-01-27 14:46:00,5,0.89,-73.982013,40.743330,99,-1,-73.994328,...,6.5,0.0,,0.00,0.0,6.50,0.0,0.0,0.0,2009
14092409,2,2009-01-27 13:56:00,2009-01-27 14:02:00,1,1.94,-73.972788,40.761988,99,-1,-73.951477,...,8.1,0.0,,1.90,0.0,10.00,0.0,0.0,0.0,2009
14092410,1,2009-01-23 08:39:44,2009-01-23 09:02:15,1,3.80,-73.977467,40.751861,99,-1,-74.009913,...,14.5,0.0,,0.00,0.0,14.50,0.0,0.0,0.0,2009
14092411,2,2009-01-24 23:05:00,2009-01-24 23:15:00,3,3.85,-73.981295,40.753000,99,-1,-73.949453,...,10.9,0.5,,0.00,0.0,11.40,0.0,0.0,0.0,2009


In [None]:
# Inspect the column dtypes declared for the spot-check frame.
test_df.dtypes