In [1]:
import polars as pl
import os

In [2]:
# Folder (relative to the working directory) that the raw CSV inputs are read from.
RAW_DATA_FOLDER = "raw_data"
# Folder (relative to the working directory) that the cleaned stops CSV is written to.
STAGING_DATA_FOLDER = "staging_data"

In [3]:
# Distinct (Stop Id, IVR Number) pairs taken from the raw ridership file.
ridership = (
    pl.scan_csv(os.path.join(RAW_DATA_FOLDER, "Ridership.csv"))
    .select(["Stop Id", "IVR Number"])
    .with_columns(
        # Both columns are handled as text here; commas (presumably thousands
        # separators such as "1,234") are removed before the integer casts.
        pl.col("Stop Id").str.replace_all(",", "").cast(pl.UInt16),
        pl.col("IVR Number").str.replace_all(",", "").cast(pl.UInt32),
    )
    .with_columns(
        # A missing IVR number, or one below 60000 (including 0), is replaced
        # by "Stop Id" + 60000.
        pl.when(pl.col("IVR Number").is_null() | (pl.col("IVR Number") < 60000))
        # Widen to UInt32 before adding: a UInt16 "Stop Id" above 5535 would
        # exceed the UInt16 maximum (65535) once 60000 is added.
        .then(pl.col("Stop Id").cast(pl.UInt32) + 60000)
        .otherwise(pl.col("IVR Number"))
        .alias("IVR Number")
    )
    .unique()  # keep each (Stop Id, IVR Number) combination once
)

In [4]:
# Distinct (Stop_ID, LAT_Num, LON_Num) rows from the Feb 2020 file, with the
# id cast to UInt16 and both coordinates cast to Float32.
feb2020lf = (
    pl.scan_csv(os.path.join(RAW_DATA_FOLDER, "Feb2020_RBS_Final.csv"))
    .select(
        pl.col("Stop_ID").cast(pl.UInt16),
        pl.col("LAT_Num").cast(pl.Float32),
        pl.col("LON_Num").cast(pl.Float32),
    )
    .unique()
)

In [5]:
# Distinct (Stop ID text, Lat, Long) rows from the 2022 ridership-by-stop file.
# Despite its name, "Stop ID text" holds the IVR number, hence the UInt32 cast.
oct2022lf = (
    pl.scan_csv(os.path.join(RAW_DATA_FOLDER, "RidershipbyStop2022.csv"))
    .select(
        pl.col("Stop ID text").cast(pl.UInt32),
        pl.col("Lat").cast(pl.Float32),
        pl.col("Long").cast(pl.Float32),
    )
    .unique()
)

In [6]:
# Distinct (stop_id, stop_lat, stop_lon) rows from stops.csv, with the id cast
# to UInt16 and both coordinates cast to Float32.
stops = (
    pl.scan_csv(os.path.join(RAW_DATA_FOLDER, "stops.csv"))
    .select(
        pl.col("stop_id").cast(pl.UInt16),
        pl.col("stop_lat").cast(pl.Float32),
        pl.col("stop_lon").cast(pl.Float32),
    )
    .unique()
)

In [7]:
tolerance = 0.0001  # in degrees; roughly 10 meters on the ground
# Tolerance: two latitudes (or two longitudes) that differ by no more than this
# amount can be considered equal.
# Example: 121.987987 - 121.987969 = 0.000018, which is smaller than 0.0001.

In [8]:
# Attach coordinates from both sources to every ridership (Stop Id, IVR Number)
# pair: the Feb 2020 data is matched on the stop id, the Oct 2022 data on the
# IVR number. Left joins keep pairs that have no match in either source.
joined_lf = (
    ridership
    .join(feb2020lf, left_on="Stop Id", right_on="Stop_ID", how="left")
    .join(oct2022lf, left_on="IVR Number", right_on="Stop ID text", how="left")
)

In [9]:
# Reconcile the Feb 2020 coordinates (LAT_Num / LON_Num) with the Oct 2022
# coordinates (Lat / Long), independently for latitude and longitude:
#   - if the Feb 2020 value is present, keep it;
#   - otherwise fall back to the Oct 2022 value (which may itself be null).
# The earlier nested when/then/otherwise compared the two sources against
# `tolerance`, but both of its branches returned the Feb 2020 value, so the
# comparison never affected the result; coalesce expresses the same rule.
# NOTE(review): values that disagree by more than the tolerance are NOT set to
# null or flagged -- confirm that preferring Feb 2020 is the intended behaviour.
joined_lf = joined_lf.with_columns(
    pl.coalesce([pl.col("LAT_Num"), pl.col("Lat")]).alias("Latitude"),
    pl.coalesce([pl.col("LON_Num"), pl.col("Long")]).alias("Longitude"),
).select(["Stop Id", "IVR Number", "Latitude", "Longitude"])

In [10]:
# Attach the stops.csv coordinates (stop_lat / stop_lon) to each row by stop id;
# the left join keeps rows that have no match in `stops`.
joined_lf = joined_lf.join(
    stops,
    left_on="Stop Id",
    right_on="stop_id",
    how="left",
)

In [11]:
# Fill remaining gaps from stops.csv, independently for latitude and longitude:
#   - if a coordinate is already present, keep it;
#   - otherwise fall back to stop_lat / stop_lon (which may itself be null).
# The earlier nested when/then/otherwise compared the values against
# `tolerance`, but both of its branches returned the existing value, so the
# comparison never affected the result; coalesce expresses the same rule.
# NOTE(review): values that disagree with stops.csv by more than the tolerance
# are NOT set to null or flagged -- confirm this is the intended behaviour.
# "IVR Number" is dropped by the final select, leaving one row per joined row.
joined_lf = joined_lf.with_columns(
    pl.coalesce([pl.col("Latitude"), pl.col("stop_lat")]).alias("Latitude"),
    pl.coalesce([pl.col("Longitude"), pl.col("stop_lon")]).alias("Longitude"),
).select(["Stop Id", "Latitude", "Longitude"])

In [12]:
# Indirect null check: count() reports the number of non-null values in each
# column, so equal counts mean Latitude and Longitude have exactly as many
# non-null values as "Stop Id".
# NOTE(review): null_count() would report the nulls per column directly.
joined_lf.count().collect()

Stop Id,Latitude,Longitude
u32,u32,u32
3860,3860,3860


In [13]:
# Make sure the output folder exists so the export also works on a fresh checkout.
os.makedirs(STAGING_DATA_FOLDER, exist_ok=True)
# Execute the lazy query and write the reconciled stop coordinates to CSV.
joined_lf.collect(streaming=True).write_csv(
    os.path.join(STAGING_DATA_FOLDER, "stops.csv")
)