In [1]:
import polars as pl 
import os

In [2]:
# Collect the full path of every CSV file in the ./data folder (relative to the
# current working directory). os.listdir returns entries in arbitrary order, so
# the paths are sorted to make the processing order reproducible between runs.
data_path = os.path.join(os.getcwd(), 'data')
files = sorted(
    os.path.join(data_path, f)
    for f in os.listdir(data_path)
    if f.endswith('.csv')
)

In [24]:
# Define the functions
def clean_data(df: pl.DataFrame, year: int, month: int) -> pl.DataFrame:
    """Keep only rows with positive trip values that fall in the given period.

    A row is kept when passenger_count, trip_distance, fare_amount and
    total_amount are all strictly greater than zero and its "year" and
    "month" columns equal the supplied values.

    Args:
        df: Frame containing the columns referenced above.
        year: Value the "year" column must equal.
        month: Value the "month" column must equal.

    Returns:
        The filtered frame.
    """
    positive_values = (
        (pl.col("passenger_count") > 0)
        & (pl.col("trip_distance") > 0)
        & (pl.col("fare_amount") > 0)
        & (pl.col("total_amount") > 0)
    )
    in_period = (pl.col("year") == year) & (pl.col("month") == month)
    return df.filter(positive_values & in_period)

def feature_eng(df: pl.DataFrame) -> pl.DataFrame:
    """Add calendar columns derived from the pickup timestamp, then sort by it.

    Columns added from "tpep_pickup_datetime": day_of_week, weeknr,
    hour_of_day, month and year.

    Args:
        df: Frame with a "tpep_pickup_datetime" datetime column.

    Returns:
        The frame with the extra columns, sorted by "tpep_pickup_datetime".
    """
    pickup = pl.col("tpep_pickup_datetime")
    calendar_features = [
        pickup.dt.weekday().alias("day_of_week"),
        pickup.dt.week().alias("weeknr"),
        pickup.dt.hour().alias("hour_of_day"),
        pickup.dt.month().alias("month"),
        pickup.dt.year().alias("year"),
    ]
    return df.with_columns(calendar_features).sort("tpep_pickup_datetime")



In [21]:
# Columns kept in the train/test frames: feature columns (X) and target column (y).
X_cols = [
    "trip_distance",
    "passenger_count",
    "PULocationID",
    "DOLocationID",
    "RatecodeID",
    "hour_of_day",
    "day_of_week",
    "weeknr",
]
y_cols = ["total_amount"]

In [22]:
# Empty placeholder frames for the training and test sets; the file-processing
# loop further down fills them.
train, test = pl.DataFrame(), pl.DataFrame()

In [27]:
# Explicit dtype for each CSV column, handed to the CSV reader as `dtypes`.
# Built from ordered (column, dtype) pairs, so key order matches the listing.
schema = dict([
    ("VendorID", pl.Int64),
    ("tpep_pickup_datetime", pl.Datetime),
    ("tpep_dropoff_datetime", pl.Datetime),
    ("passenger_count", pl.Int64),
    ("trip_distance", pl.Float64),
    ("RatecodeID", pl.Int64),
    ("store_and_fwd_flag", pl.Categorical),
    ("PULocationID", pl.Int64),
    ("DOLocationID", pl.Int64),
    ("payment_type", pl.Int64),
    ("fare_amount", pl.Float64),
    ("extra", pl.Float64),
    ("mta_tax", pl.Float64),
    ("tip_amount", pl.Float64),
    ("tolls_amount", pl.Float64),
    ("improvement_surcharge", pl.Float64),
    ("total_amount", pl.Float64),
    ("congestion_surcharge", pl.Float64),
])

In [29]:
# Per-file results are gathered in lists and combined once after the loop.
# Building `train`/`test` from scratch here keeps the cell idempotent: running
# it again cannot append onto frames left over from an earlier run.
train_parts = []
test_parts = []

for file in files:
    # File names look like "<prefix>tripdata_YYYY-MM.csv". Parse the base name
    # only, so nothing in the directory part of the path can confuse the split.
    period = os.path.basename(file).split('tripdata_')[1]
    year_part, month_part = period.split('-')[:2]
    file_year = int(year_part)
    file_month = int(month_part.split('.')[0])

    print(f"reading file: {file}")
    raw = pl.read_csv(file,dtypes=schema, try_parse_dates=True)
    print(f"shape: {raw.shape}")

    # feature_eng runs first: clean_data filters on the year/month columns it adds.
    _df = feature_eng(raw)
    _df = clean_data(_df, year=file_year, month=file_month)
    _df = _df.select(X_cols + y_cols)
    _df = _df.drop_nulls()

    # Files from 2019 go to the training set; every other year goes to the test set.
    if file_year == 2019:
        print("used for training")
        train_parts.append(_df)
    else:
        print("used for testing")
        test_parts.append(_df)

# rechunk=False stacks the per-file frames without copying them into one
# contiguous buffer, which keeps peak memory close to the vstack approach.
train = pl.concat(train_parts, rechunk=False) if train_parts else pl.DataFrame()
test = pl.concat(test_parts, rechunk=False) if test_parts else pl.DataFrame()

reading file: d:\Python\General\scania\lambda-sagemaker\data\yellow_tripdata_2019-01.csv
shape: (7667792, 18)
used for training
reading file: d:\Python\General\scania\lambda-sagemaker\data\yellow_tripdata_2019-02.csv
shape: (7019375, 18)
used for training
reading file: d:\Python\General\scania\lambda-sagemaker\data\yellow_tripdata_2019-03.csv
shape: (7832545, 18)
used for training
reading file: d:\Python\General\scania\lambda-sagemaker\data\yellow_tripdata_2019-04.csv
shape: (7433139, 18)
used for training
reading file: d:\Python\General\scania\lambda-sagemaker\data\yellow_tripdata_2019-05.csv
shape: (7565261, 18)
used for training
reading file: d:\Python\General\scania\lambda-sagemaker\data\yellow_tripdata_2019-06.csv
shape: (6941024, 18)
used for training
reading file: d:\Python\General\scania\lambda-sagemaker\data\yellow_tripdata_2019-07.csv
shape: (6310419, 18)
used for training
reading file: d:\Python\General\scania\lambda-sagemaker\data\yellow_tripdata_2019-08.csv
shape: (6073357

In [30]:
# Show the first rows of the training frame as a quick sanity check.
train.head()

trip_distance,passenger_count,PULocationID,DOLocationID,RatecodeID,hour_of_day,day_of_week,weeknr,total_amount
f64,i64,i64,i64,i64,u32,u32,u32,f64
7.37,2,237,264,1,0,2,1,24.8
1.73,6,263,74,1,0,2,1,8.3
0.6,1,80,112,1,0,2,1,6.3
1.53,1,231,148,1,0,2,1,10.3
3.2,1,114,79,1,0,2,1,32.75


## Reflection on time saved
### 8 min with pandas vs. 4 min with polars