In [1]:
import polars as pl
import os

In [2]:
# Folder the staged input CSVs (ridership, climate, stops, dates) are scanned from.
STAGING_DATA_FOLDER = "staging_data"
# Folder the enriched ridership.csv is written to.
CLEAN_DATA_FOLDER = "clean_data"

In [3]:
# Build lazy scans over the three staged tables that are used as-is.
ridership, climate, stops = (
    pl.scan_csv(os.path.join(STAGING_DATA_FOLDER, csv_name))
    for csv_name in ("ridership.csv", "climate.csv", "stops.csv")
)

# dates.csv stores each calendar day in its "id" column. The value is parsed
# with the "%Y%m%d" format and cast back to a string so that the resulting
# "Date" column has the same str dtype as the "Date" column in ridership.
dates = pl.scan_csv(os.path.join(STAGING_DATA_FOLDER, "dates.csv")).select(
    pl.col("id")
    .cast(pl.String)
    .str.to_date("%Y%m%d")  # string -> pl.Date using the given format
    .cast(pl.String)
    .alias("Date")
)

In [4]:
# Peek at the first five rows of the ridership scan.
ridership.head(5).collect()

Block,Line,Service,Direction Number,Sequence,Stop Id,Date,On,Off
i64,i64,i64,i64,i64,i64,str,i64,i64
0,901,1,1,13,4806,"""2015-08-14""",13,0
0,901,1,1,13,4806,"""2015-08-17""",14,0
0,901,1,1,13,4806,"""2015-08-18""",11,0
0,901,1,1,13,4806,"""2015-08-20""",14,0
0,901,1,1,13,4806,"""2015-08-21""",12,0


In [5]:
# Peek at the first five rows of the stops scan.
stops.head(5).collect()

Stop Id,Latitude,Longitude
i64,f64,f64
4045,37.358547,-121.86032
5180,37.269485,-121.81747
2031,37.417297,-122.07814
3522,37.404118,-121.866486
743,37.307266,-121.90355


In [6]:
# Peek at the first five parsed dates.
dates.head(5).collect()

Date
str
"""2014-01-01"""
"""2014-01-02"""
"""2014-01-03"""
"""2014-01-04"""
"""2014-01-05"""


In [7]:
# Cross join: pair every date with every stop (one row per date/stop combination).
stops_with_dates = dates.join(stops, how="cross")
stops_with_dates.head(5).collect()

Date,Stop Id,Latitude,Longitude
str,i64,f64,f64
"""2014-01-01""",4045,37.358547,-121.86032
"""2014-01-01""",5180,37.269485,-121.81747
"""2014-01-01""",2031,37.417297,-122.07814
"""2014-01-01""",3522,37.404118,-121.866486
"""2014-01-01""",743,37.307266,-121.90355


In [8]:
def distance(lat1, long1, lat2, long2):
    """Return the planar (Euclidean) distance between two coordinate pairs.

    Haversine distance is the usual choice for latitude/longitude, but the
    coordinates compared in this notebook are close to each other, so the
    simpler Euclidean form is used instead.

    The result is in the same units as the inputs (degrees for raw
    latitude/longitude); longitude differences are not scaled by latitude.
    Only arithmetic operators are applied, so the arguments may be plain
    numbers or polars expressions.
    """
    delta_lat = lat1 - lat2
    delta_long = long1 - long2
    squared_sum = delta_lat ** 2 + delta_long ** 2
    return squared_sum ** 0.5

In [9]:
# For every stop/date pair, keep the weather readings of the closest climate
# station that has a row for that date.
stops_with_dates_and_climate = (
    # Both sides carry Latitude/Longitude; the climate copies get the
    # "_station" suffix so they can be told apart.
    stops_with_dates.join(climate, on="Date", suffix="_station")
    .with_columns(
        Distance=distance(
            pl.col("Latitude"),
            pl.col("Longitude"),
            pl.col("Latitude_station"),
            pl.col("Longitude_station"),
        )
    )
    # Distance is the last sort key, so within each stop/date group the
    # closest station comes first and first() keeps exactly that row.
    .sort("Stop Id", "Date", "Latitude", "Longitude", "Distance")
    .group_by("Stop Id", "Date", "Latitude", "Longitude", maintain_order=True)
    .first()
    .select("Stop Id", "Date", "Latitude", "Longitude", "Tmax", "Tmin", "Prcp")
)
stops_with_dates_and_climate.head(5).collect()

Stop Id,Date,Latitude,Longitude,Tmax,Tmin,Prcp
i64,str,f64,f64,f64,f64,f64
1,"""2014-01-01""",37.353054,-121.93667,16.7,1.7,0.0
1,"""2014-01-02""",37.353054,-121.93667,18.3,3.3,0.0
1,"""2014-01-03""",37.353054,-121.93667,17.8,4.4,0.0
1,"""2014-01-04""",37.353054,-121.93667,18.9,3.3,0.0
1,"""2014-01-05""",37.353054,-121.93667,18.9,1.7,0.0


In [10]:
# Attach stop coordinates and weather to every ridership row, then replace the
# string "Date" with numeric "Year" and "Day" (day of year) columns.
# NOTE(review): this rebinds `ridership`; re-running the cell on its own fails
# because the "Date" join key is no longer present after the first run.
ridership = (
    ridership.join(stops_with_dates_and_climate, on=["Date", "Stop Id"], how="left")
    .select(
        pl.col("Date").cast(pl.Date).dt.year().alias("Year"),
        pl.col("Date").cast(pl.Date).dt.ordinal_day().alias("Day"),  # 1-based day of year
        "Block",
        "Line",
        "Service",
        "Direction Number",
        # "From Time",
        "Sequence",
        "Stop Id",
        "Latitude",
        "Longitude",
        "Tmax",
        "Tmin",
        "Prcp",
        "On",
        "Off",
    )
    .sort(
        "Year",
        "Day",
        "Block",
        "Line",
        "Service",
        "Direction Number",
        # "From Time",
        "Sequence",
    )
)
ridership.head(5).collect()

Year,Day,Block,Line,Service,Direction Number,Sequence,Stop Id,Latitude,Longitude,Tmax,Tmin,Prcp,On,Off
i32,i16,i64,i64,i64,i64,i64,i64,f64,f64,f64,f64,f64,i64,i64
2014,1,131,181,3,0,1,3413,37.330624,-121.902336,16.7,1.7,0.0,14,0
2014,1,131,181,3,0,3,3213,37.335556,-121.89013,16.7,1.7,0.0,63,4
2014,1,131,181,3,0,4,3214,37.338448,-121.89229,16.7,1.7,0.0,3,0
2014,1,131,181,3,0,5,22,37.35186,-121.90212,16.7,1.7,0.0,9,1
2014,1,131,181,3,0,6,5317,37.413536,-121.89989,16.7,1.7,0.0,52,15


In [11]:
# Create the output folder if it is missing so the notebook also runs on a
# fresh checkout; exist_ok=True makes this a no-op when it is already there.
os.makedirs(CLEAN_DATA_FOLDER, exist_ok=True)

# Execute the lazy pipeline with the streaming engine and write the result as CSV.
ridership.collect(streaming=True).write_csv(
    os.path.join(CLEAN_DATA_FOLDER, "ridership.csv")
)