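# Baseline

Reference scores for the `rate` target: constant mean/median predictions and the `shifted_week_rate` column, each evaluated with MAPE on the validation set.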

### imports

In [2]:
import polars as pl
from polars import col as c
import os
import matplotlib.pyplot as plt
import numpy as np
from datetime import datetime
from numpy.typing import ArrayLike, NDArray

### constants

In [3]:
# Name of the data directory; joined onto "../" when building input file paths below.
DATA_DIR = "dataset"

### functions

In [4]:
def mape_f(y_true: ArrayLike, y_pred: ArrayLike) -> np.floating:
    """Return the mean absolute percentage error (MAPE), in percent.

    Computes ``mean(|(y_true - y_pred) / y_true|) * 100`` and rounds the
    result to two decimal places.

    Both inputs are converted to float64 before the subtraction so that
    unsigned-integer inputs cannot wrap around when a prediction is larger
    than its target.

    Args:
        y_true: Observed values. MAPE is undefined where an observed value
            is zero; such entries yield ``inf``/``nan`` in the result.
        y_pred: Predicted values, broadcastable against ``y_true``.

    Returns:
        The MAPE as a percentage, rounded to 2 decimals.
    """
    y_true = np.asarray(y_true, dtype=np.float64)
    y_pred = np.asarray(y_pred, dtype=np.float64)
    metric = np.mean(np.abs((y_true - y_pred) / y_true)) * 100
    return round(metric, 2)

### data

In [5]:
# Both CSV splits store pickup_date as text; parse it into a datetime column.
pickup_ts = c("pickup_date").str.to_datetime("%Y-%m-%d %H:%M:%S")

path = os.path.join("../", DATA_DIR, "train.csv")
train = pl.read_csv(path).with_columns(pickup_ts)

path = os.path.join("../", DATA_DIR, "validation.csv")
validation = pl.read_csv(path).with_columns(pickup_ts)

# Display one random training row as a quick look at the parsed schema.
train.sample()

rate,valid_miles,transport_type,weight,pickup_date,origin_kma,destination_kma
f64,f64,str,f64,datetime[μs],str,str
2.5633,650.192,"""MKPFX""",39950.0,2021-12-02 10:42:00,"""QGHCU""","""EKGTE"""


In [6]:
# Latest calendar date present in the training split.
train.get_column("pickup_date").dt.date().max()

datetime.date(2022, 9, 5)

### mean / median baseline

In [15]:
# Constant predictors computed over every training row, rounded to 3 decimals.
median_value = round(train.get_column("rate").median(), 3)
mean_value = round(train.get_column("rate").mean(), 3)

In [16]:
# Same constant predictors, restricted to training rows picked up after the
# start of 2022. The filter is evaluated once and shared by both statistics.
# NOTE(review): the comparison is strict, so a pickup stamped exactly
# 2022-01-01 00:00:00 is excluded — confirm that is intended.
recent_rate = train.filter(c("pickup_date") > datetime(2022, 1, 1))["rate"]
recent_median_value = round(recent_rate.median(), 3)
recent_mean_value = round(recent_rate.mean(), 3)

In [17]:
# One line per statistic: label, whole-train value, recent-only value.
for label, overall, recent in (
    ("median:\t", median_value, recent_median_value),
    ("mean:\t", mean_value, recent_mean_value),
):
    print(label, overall, recent)

median:	 4.574 4.416
mean:	 5.222 5.029


In [20]:
# Attach each constant prediction to the validation frame as a Float64 column
# named after the statistic it holds.
baseline = validation.with_columns(
    [
        pl.lit(value).cast(pl.Float64).alias(name)
        for name, value in {
            "median_value": median_value,
            "mean_value": mean_value,
            "recent_median_value": recent_median_value,
            "recent_mean_value": recent_mean_value,
        }.items()
    ]
)

In [27]:
# Score every constant-prediction column against the observed rate.
for label, column in (
    ("median_value:\t\t", "median_value"),
    ("mean_value:\t\t", "mean_value"),
    ("recent_median_value:\t", "recent_median_value"),
    ("recent_mean_value:\t", "recent_mean_value"),
):
    print(label, mape_f(baseline["rate"], baseline[column]))

median_value:		 31.74
mean_value:		 34.85
recent_median_value:	 31.57
recent_mean_value:	 33.54


### baseline by categories

In [7]:
# Load the parquet version of the validation split; this rebinds `validation`,
# replacing the frame read from validation.csv earlier in the notebook.
path = os.path.join("../", DATA_DIR, "validation.parquet")
validation = pl.read_parquet(source=path)

In [10]:
# Display one random row of the parquet-backed validation frame.
validation.sample(n=1)

rate,valid_miles,transport_type,weight,pickup_date,origin_kma,destination_kma,month,weekday,week,year,date,is_holiday_next_week,is_holiday,weight_mile,weight_per_mile,kma_pair,kma_transport_median,shifted_week_rate
f64,f64,str,f64,datetime[μs],str,str,i8,i8,i8,i32,date,i64,i64,f64,f64,str,f64,f64
6.412,6.671675,"""MKPFX""",37100.0,2022-09-06 08:42:00,"""MJJOV""","""PEXPT""",9,2,36,2022,2022-09-06,0,0,29299000.0,46.978837,"""MJJOV_PEXPT""",6.412,6.5258


In [13]:
# Count missing values per column.
validation.select(pl.all().null_count())

rate,valid_miles,transport_type,weight,pickup_date,origin_kma,destination_kma,month,weekday,week,year,date,is_holiday_next_week,is_holiday,weight_mile,weight_per_mile,kma_pair,kma_transport_median,shifted_week_rate
u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,180


In [14]:
# Replace missing shifted_week_rate values with that column's mean so that
# every validation row has a value to score.
validation = validation.with_columns(
    pl.col("shifted_week_rate").fill_null(strategy="mean"),
)

In [15]:
# MAPE obtained when shifted_week_rate itself is used as the prediction.
print(
    "shifted_week_rate:",
    mape_f(
        validation.get_column("rate"),
        validation.get_column("shifted_week_rate"),
    ),
)

shifted_week_rate: 12.89
