In [1]:
import polars as pl
import datetime
import os

In [2]:
# Folder the raw input CSV (Ridership.csv) is read from.
RAW_DATA_FOLDER = "raw_data"
# Folder the cleaned, aggregated ridership.csv is written to.
STAGING_DATA_FOLDER = "staging_data"

In [3]:
# Keep only the columns used downstream; the remaining raw columns were
# either uninformative or correlated with the ones kept here.
ridership_csv_path = os.path.join(RAW_DATA_FOLDER, "Ridership.csv")
kept_columns = [
    "Date",
    "Block",
    "Line",
    "Service",
    "Direction Number",
    "From Time",
    "On",
    "Off",
    "Stop Id",
    "Sequence",
]

# scan_csv builds a lazy query; data is only read when .collect() is called.
ridership = pl.scan_csv(ridership_csv_path).select(kept_columns)
ridership.head().collect()

Date,Block,Line,Service,Direction Number,From Time,On,Off,Stop Id,Sequence
str,str,i64,i64,i64,i64,i64,i64,str,i64
"""09/30/2016 12:…","""7,080""",77,1,1,59940,13,20,"""3,951""",13
"""09/26/2016 12:…","""7,771""",77,1,0,67200,0,1,"""3,915""",29
"""09/19/2016 12:…","""122""",72,1,0,63785,1,2,"""2,945""",37
"""09/05/2016 12:…","""6,810""",68,3,0,55620,1,0,"""158""",32
"""09/17/2016 12:…","""549""",522,2,1,58560,0,4,"""519""",29


In [4]:
# Peek at the first rows whose "On" value is negative.
negative_on = pl.col("On").lt(0)
ridership.filter(negative_on).head().collect()

Date,Block,Line,Service,Direction Number,From Time,On,Off,Stop Id,Sequence
str,str,i64,i64,i64,i64,i64,i64,str,i64
"""08/10/2014 12:…","""9,021""",900,3,1,66840,-1,0,"""4,734""",2
"""09/11/2014 12:…","""9,021""",900,1,1,61260,-1,0,"""4,733""",1
"""09/11/2014 12:…","""9,021""",900,1,1,33660,-4,0,"""4,733""",1
"""07/14/2014 12:…","""9,021""",900,1,1,35460,-1,0,"""4,733""",1
"""07/22/2014 12:…","""9,021""",900,1,1,24660,-1,0,"""4,734""",2


In [5]:
# Correct the column dtypes and put the columns in their working order.


def _comma_string_to_u16(name):
    """Build an expression turning a "2,112"-style string column into UInt16."""
    return pl.col(name).str.replace_all(",", "").cast(pl.UInt16)


ridership = ridership.select(
    # Parse the timestamp string, then keep only its calendar date.
    pl.col("Date").str.strptime(pl.Datetime, "%m/%d/%Y %I:%M:%S %p").dt.date(),
    # "Block" holds numbers formatted with a comma (string, e.g. "2,112").
    _comma_string_to_u16("Block"),
    # Lower-precision integer types are used to save memory.
    pl.col("Line").cast(pl.UInt16),
    pl.col("Service").cast(pl.UInt8),
    pl.col("Direction Number").cast(pl.UInt8),
    pl.col("From Time").cast(pl.UInt32),
    # "Stop Id" is comma-formatted in the same way as "Block".
    _comma_string_to_u16("Stop Id"),
    pl.col("Sequence").cast(pl.UInt8),
    # "On" / "Off" stay signed: the raw data contains negative "On" values.
    pl.col("On").cast(pl.Int16),
    pl.col("Off").cast(pl.Int16),
)
ridership.head().collect()

Date,Block,Line,Service,Direction Number,From Time,Stop Id,Sequence,On,Off
date,u16,u16,u8,u8,u32,u16,u8,i16,i16
2016-09-30,7080,77,1,1,59940,3951,13,13,20
2016-09-26,7771,77,1,0,67200,3915,29,0,1
2016-09-19,122,72,1,0,63785,2945,37,1,2
2016-09-05,6810,68,3,0,55620,158,32,1,0
2016-09-17,549,522,2,1,58560,519,29,0,4


In [6]:
# List every row whose "On" value is negative, using the corrected dtypes.
negative_on = pl.col("On").lt(0)
ridership.filter(negative_on).collect()

Date,Block,Line,Service,Direction Number,From Time,Stop Id,Sequence,On,Off
date,u16,u16,u8,u8,u32,u16,u8,i16,i16
2014-08-10,9021,900,3,1,66840,4734,2,-1,0
2014-09-11,9021,900,1,1,61260,4733,1,-1,0
2014-09-11,9021,900,1,1,33660,4733,1,-4,0
2014-07-14,9021,900,1,1,35460,4733,1,-1,0
2014-07-22,9021,900,1,1,24660,4734,2,-1,0
…,…,…,…,…,…,…,…,…,…
2015-04-27,9021,900,1,1,39060,4734,2,-1,0
2015-04-07,9021,900,1,1,35460,4733,1,-1,0
2015-04-13,9021,900,1,1,76860,4734,2,-2,0
2015-04-05,9021,900,3,1,72360,4733,1,-1,0


In [7]:
# Keep only the rows whose "On" value is zero or positive.
ridership = ridership.filter(pl.col("On").ge(0))

In [8]:
# Sanity check: after the filter above, no negative "On" rows should remain.
remaining_negative = ridership.filter(pl.col("On").lt(0))
remaining_negative.collect()

Date,Block,Line,Service,Direction Number,From Time,Stop Id,Sequence,On,Off
date,u16,u16,u8,u8,u32,u16,u8,i16,i16


In [9]:
# DISABLED: this cell is entirely commented out and has no effect on the pipeline.
# It would move rows whose "From Time" exceeds 86400 onto the next calendar date
# and subtract 86400 from "From Time" (presumably seconds since midnight; confirm).
# NOTE(review): the code uses `timedelta`, but this notebook only imports the
# `datetime` module, so re-enabling it needs `datetime.timedelta(days=1)`.
# NOTE(review): the condition is `> 86400`, so a value of exactly 86400 is left
# unchanged; confirm that is intended before re-enabling.
# ridership = ridership.with_columns(
#     pl.when(pl.col("From Time") > 86400)
#     .then(pl.col("Date") + timedelta(days=1))
#     .otherwise(pl.col("Date"))
#     .alias("Date"),
#     pl.when(pl.col("From Time") > 86400)
#     .then(pl.col("From Time") - 86400)
#     .otherwise(pl.col("From Time"))
#     .alias("From Time"),
# )

In [10]:
# ridership.filter(pl.col("From Time") > 86400).collect()

In [11]:
# ridership.filter((pl.col("Block") == 9012) & (pl.col("Sequence") == 41)).collect()

In [12]:
# Preview the data in key order so that rows sharing the same block, line,
# service, direction, sequence, stop and date sit next to each other.
sort_keys = [
    "Block",
    "Line",
    "Service",
    "Direction Number",
    "Sequence",
    "Stop Id",
    "Date",
    "From Time",
]
# Show the key columns first, followed by the two count columns.
preview = ridership.select(sort_keys + ["On", "Off"]).sort(sort_keys)
preview.head(10).collect()

Block,Line,Service,Direction Number,Sequence,Stop Id,Date,From Time,On,Off
u16,u16,u8,u8,u8,u16,date,u32,i16,i16
0,901,1,1,13,4806,2015-08-14,58080,6,0
0,901,1,1,13,4806,2015-08-14,59940,7,0
0,901,1,1,13,4806,2015-08-17,59940,2,0
0,901,1,1,13,4806,2015-08-17,61680,12,0
0,901,1,1,13,4806,2015-08-18,58080,5,0
0,901,1,1,13,4806,2015-08-18,59940,6,0
0,901,1,1,13,4806,2015-08-20,58080,1,0
0,901,1,1,13,4806,2015-08-20,59940,9,0
0,901,1,1,13,4806,2015-08-20,61680,4,0
0,901,1,1,13,4806,2015-08-21,58080,5,0


In [13]:
# Sum "On" and "Off" over all rows that share the same keys.
# "From Time" is not a key, so it is dropped and rows differing only in
# "From Time" are merged into one.
group_keys = [
    "Block",
    "Line",
    "Service",
    "Direction Number",
    "Sequence",
    "Stop Id",
    "Date",
]
summed_counts = [pl.col(name).sum().alias(name) for name in ("On", "Off")]

# Sort by the same keys so the result has a stable row order.
ridership = ridership.group_by(group_keys).agg(summed_counts).sort(group_keys)
ridership.head(10).collect()

Block,Line,Service,Direction Number,Sequence,Stop Id,Date,On,Off
u16,u16,u8,u8,u8,u16,date,i64,i64
0,901,1,1,13,4806,2015-08-14,13,0
0,901,1,1,13,4806,2015-08-17,14,0
0,901,1,1,13,4806,2015-08-18,11,0
0,901,1,1,13,4806,2015-08-20,14,0
0,901,1,1,13,4806,2015-08-21,12,0
0,901,1,1,13,4806,2015-08-24,11,0
0,901,1,1,13,4806,2015-08-25,31,0
0,901,1,1,13,4806,2015-08-26,18,0
0,901,1,1,13,4806,2015-08-27,23,0
0,901,1,1,13,4806,2015-08-28,9,0


In [14]:
# Run the lazy pipeline and write the result to the staging folder.
# The folder is created first because write_csv does not create missing parent
# directories; without this the cell fails when "staging_data" does not exist.
os.makedirs(STAGING_DATA_FOLDER, exist_ok=True)
ridership.collect(streaming=True).write_csv(os.path.join(STAGING_DATA_FOLDER, "ridership.csv"))