### imports

In [1]:
import polars as pl

pl.Config.set_tbl_rows(20)
pl.Config.set_fmt_str_lengths(100)

from polars import col as c
import os
import matplotlib.pyplot as plt
import numpy as np
import holidays
from datetime import datetime, timedelta
from numpy.typing import ArrayLike, NDArray

### constants

In [2]:
# Folder name for the CSV inputs and parquet outputs; every path in this
# notebook is built as os.path.join("../", DATA_DIR, <file>).
DATA_DIR = "dataset"
# Dict-like holiday calendar; the date features below only use membership
# tests of the form `some_date in US_HOLIDAYS`.
US_HOLIDAYS = holidays.US()  # this is a dict-like object

### functions

In [3]:
def mape_f(y_true: ArrayLike, y_pred: ArrayLike) -> np.floating:
    """Return the mean absolute percentage error, in percent, rounded to 2 decimals.

    Args:
        y_true: Observed values; each one is used as the denominator of its own
            relative error, and no guard against zeros is applied.
        y_pred: Predicted values, aligned element-wise with ``y_true``.

    Returns:
        ``mean(|(y_true - y_pred) / y_true|) * 100`` rounded to two decimals.
    """
    actual = np.array(y_true)
    predicted = np.array(y_pred)
    relative_error = (actual - predicted) / actual
    percent = np.mean(np.abs(relative_error)) * 100
    return round(percent, 2)

### data

In [4]:
def _read_split(name: str) -> pl.DataFrame:
    """Read ``../<DATA_DIR>/<name>.csv`` and parse ``pickup_date`` into a datetime."""
    csv_path = os.path.join("../", DATA_DIR, f"{name}.csv")
    return pl.read_csv(csv_path).with_columns(
        c("pickup_date").str.to_datetime("%Y-%m-%d %H:%M:%S")
    )


# The three splits share one file layout and one timestamp format, so they all
# go through the same loader instead of three copies of the read/parse chain.
train = _read_split("train")
validation = _read_split("validation")
test = _read_split("test")
train.sample()

rate,valid_miles,transport_type,weight,pickup_date,origin_kma,destination_kma
f64,f64,str,f64,datetime[μs],str,str
4.1694,1765.4421,"""MKPFX""",16026.15,2020-09-02 10:42:00,"""HRQLD""","""JESUD"""


### fill null

In [5]:
train.null_count()

rate,valid_miles,transport_type,weight,pickup_date,origin_kma,destination_kma
u32,u32,u32,u32,u32,u32,u32
0,0,0,80,0,0,0


In [6]:
validation.null_count()

rate,valid_miles,transport_type,weight,pickup_date,origin_kma,destination_kma
u32,u32,u32,u32,u32,u32,u32
0,0,0,0,0,0,0


In [7]:
test.null_count()

valid_miles,transport_type,weight,pickup_date,origin_kma,destination_kma
u32,u32,u32,u32,u32,u32
0,0,0,0,0,0


In [8]:
# Only `train` has missing weights (80 rows); validation and test report no
# nulls in the cells above, so nothing is imputed there. The fill value is the
# mean of the whole `train` column, i.e. it is taken before `train` is later
# split into train_df and eval_df.
train = train.with_columns(c("weight").fill_null(strategy="mean"))

### date features

In [9]:
def is_holiday_next_days(x, days: int = 3, calendar=None) -> int:
    """Return 1 if any of the ``days`` days after ``x`` is a holiday, else 0.

    The day ``x`` itself is not inspected; only ``x + 1 day`` through
    ``x + days`` are, in that order, stopping at the first hit.

    Args:
        x: The reference day. It must support ``x + timedelta(days=k)``.
        days: How many days ahead to look. Defaults to 3, the horizon the
            feature was built with.
        calendar: Any container supporting ``day in calendar``. When omitted,
            the notebook-level ``US_HOLIDAYS`` calendar is used.

    Returns:
        ``1`` when a holiday falls inside the look-ahead window, ``0`` otherwise.
    """
    # `is None` rather than truthiness: an explicitly passed empty calendar
    # must be honoured, not replaced by the default one.
    if calendar is None:
        calendar = US_HOLIDAYS
    upcoming = (x + timedelta(days=offset) for offset in range(1, days + 1))
    return int(any(day in calendar for day in upcoming))

In [10]:
def add_date_features(df: pl.DataFrame) -> pl.DataFrame:
    """Derive calendar and holiday columns from ``pickup_date``.

    Added columns, in order: ``month``, ``weekday``, ``week``, ``year``,
    ``date`` (the calendar day of the pickup), ``is_holiday_next_week`` and
    ``is_holiday``.

    ``is_holiday_next_week`` is whatever ``is_holiday_next_days`` returns for
    the pickup day, so its look-ahead horizon is set by that function rather
    than by the column name. ``is_holiday`` is 1 when the pickup day itself is
    in ``US_HOLIDAYS``.

    Args:
        df: Frame with a datetime ``pickup_date`` column.

    Returns:
        A new frame with the columns above appended.
    """
    pickup = c("pickup_date")
    pickup_day = pickup.dt.date()

    def _is_holiday(day) -> int:
        return 1 if day in US_HOLIDAYS else 0

    # TODO (carried over from the original "make trigonometric" note):
    # presumably a cyclical sin/cos encoding of these integer parts was planned.
    calendar_parts = [
        pickup.dt.month().alias("month"),
        pickup.dt.weekday().alias("weekday"),
        pickup.dt.week().alias("week"),
        pickup.dt.year().alias("year"),
        pickup_day.alias("date"),
    ]

    # Both flags are evaluated in Python, one call per row.
    holiday_flags = [
        pickup_day.map_elements(
            function=is_holiday_next_days,
            return_dtype=pl.Int64,
        ).alias("is_holiday_next_week"),
        pickup_day.map_elements(
            function=_is_holiday,
            return_dtype=pl.Int64,
        ).alias("is_holiday"),
    ]

    return df.with_columns(*calendar_parts, *holiday_flags)

In [11]:
train = add_date_features(train)
validation = add_date_features(validation)
test = add_date_features(test)
train.head()

rate,valid_miles,transport_type,weight,pickup_date,origin_kma,destination_kma,month,weekday,week,year,date,is_holiday_next_week,is_holiday
f64,f64,str,f64,datetime[μs],str,str,i8,i8,i8,i32,date,i64,i64
4.7203,521.8451,"""MKPFX""",9231.75,2019-11-10 10:42:00,"""OMUOI""","""LFUHN""",11,7,45,2019,2019-11-10,1,0
4.9005,532.6675,"""MKPFX""",11754.95,2019-11-10 10:42:00,"""OMUOI""","""LFUHN""",11,7,45,2019,2019-11-10,1,0
4.7018,523.9188,"""MKPFX""",9603.2,2019-11-10 10:42:00,"""OMUOI""","""LFUHN""",11,7,45,2019,2019-11-10,1,0
4.6388,562.8296,"""MKPFX""",8789.05,2019-11-10 10:42:00,"""OMUOI""","""LFUHN""",11,7,45,2019,2019-11-10,1,0
5.0206,519.8782,"""MKPFX""",9597.5,2019-11-10 10:42:00,"""OMUOI""","""LFUHN""",11,7,45,2019,2019-11-10,1,0


### weight/mile features

In [12]:
def add_weight_mile_features(df: pl.DataFrame) -> pl.DataFrame:
    """Add weight/distance interaction columns and log-scale ``valid_miles``.

    Appends ``weight_mile`` (weight * miles) and ``weight_per_mile``
    (weight / miles), and overwrites ``valid_miles`` with its logarithm.

    Note that the function is not idempotent: calling it again on its own
    output takes the log of an already log-scaled ``valid_miles``.

    Args:
        df: Frame with numeric ``weight`` and ``valid_miles`` columns.

    Returns:
        A new frame with the two extra columns and the rescaled ``valid_miles``.
    """
    weight, miles = c("weight"), c("valid_miles")
    product = (weight * miles).alias("weight_mile")
    ratio = (weight / miles).alias("weight_per_mile")
    # All expressions of one with_columns call read the incoming frame, so the
    # product and ratio above are built from raw miles even though the column
    # itself is replaced by its log in the same call.
    log_miles = miles.log()
    return df.with_columns(product, ratio, log_miles)

In [13]:
train = add_weight_mile_features(train)
validation = add_weight_mile_features(validation)
test = add_weight_mile_features(test)
train.head()

rate,valid_miles,transport_type,weight,pickup_date,origin_kma,destination_kma,month,weekday,week,year,date,is_holiday_next_week,is_holiday,weight_mile,weight_per_mile
f64,f64,str,f64,datetime[μs],str,str,i8,i8,i8,i32,date,i64,i64,f64,f64
4.7203,6.257371,"""MKPFX""",9231.75,2019-11-10 10:42:00,"""OMUOI""","""LFUHN""",11,7,45,2019,2019-11-10,1,0,4817500.0,17.690594
4.9005,6.277897,"""MKPFX""",11754.95,2019-11-10 10:42:00,"""OMUOI""","""LFUHN""",11,7,45,2019,2019-11-10,1,0,6261500.0,22.068082
4.7018,6.261337,"""MKPFX""",9603.2,2019-11-10 10:42:00,"""OMUOI""","""LFUHN""",11,7,45,2019,2019-11-10,1,0,5031300.0,18.329558
4.6388,6.332977,"""MKPFX""",8789.05,2019-11-10 10:42:00,"""OMUOI""","""LFUHN""",11,7,45,2019,2019-11-10,1,0,4946700.0,15.615828
5.0206,6.253595,"""MKPFX""",9597.5,2019-11-10 10:42:00,"""OMUOI""","""LFUHN""",11,7,45,2019,2019-11-10,1,0,4989500.0,18.461055


### kma_pair_feature

In [14]:
def add_kma_pair_feature(df: pl.DataFrame) -> pl.DataFrame:
    """Add lane identifiers and per-day row counts for origins, destinations and lanes.

    New columns:
        * ``kma_pair``: ``"<origin>_<destination>"``, direction preserved.
        * ``kma_track``: the same two codes sorted before joining, so A -> B
          and B -> A share one key.
        * ``count_deliveries_from_kma``, ``count_deliveries_to_kma``,
          ``count_deliveries_kma_pair``: number of non-null ``valid_miles``
          rows sharing the row's ``date`` and, respectively, its origin,
          destination or directed lane.

    The counts are taken within the frame that is passed in.

    Args:
        df: Frame with ``origin_kma``, ``destination_kma``, ``valid_miles``
            and ``date`` columns.

    Returns:
        A new frame with the five columns above appended; the origin and
        destination columns are kept.
    """
    endpoints = pl.concat_list("origin_kma", "destination_kma")
    with_lane_keys = df.with_columns(
        endpoints.list.join("_").alias("kma_pair"),
        endpoints.list.sort().list.join("_").alias("kma_track"),
    )

    def _rows_per_day(key: str, name: str) -> pl.Expr:
        # Window count over (date, key), broadcast back to every row.
        return c("valid_miles").count().over("date", key).alias(name)

    return with_lane_keys.with_columns(
        _rows_per_day("origin_kma", "count_deliveries_from_kma"),
        _rows_per_day("destination_kma", "count_deliveries_to_kma"),
        _rows_per_day("kma_pair", "count_deliveries_kma_pair"),
    )

In [15]:
train = add_kma_pair_feature(train)
validation = add_kma_pair_feature(validation)
test = add_kma_pair_feature(test)
train.head()

rate,valid_miles,transport_type,weight,pickup_date,origin_kma,destination_kma,month,weekday,week,year,date,is_holiday_next_week,is_holiday,weight_mile,weight_per_mile,kma_pair,kma_track,count_deliveries_from_kma,count_deliveries_to_kma,count_deliveries_kma_pair
f64,f64,str,f64,datetime[μs],str,str,i8,i8,i8,i32,date,i64,i64,f64,f64,str,str,u32,u32,u32
4.7203,6.257371,"""MKPFX""",9231.75,2019-11-10 10:42:00,"""OMUOI""","""LFUHN""",11,7,45,2019,2019-11-10,1,0,4817500.0,17.690594,"""OMUOI_LFUHN""","""LFUHN_OMUOI""",5,5,5
4.9005,6.277897,"""MKPFX""",11754.95,2019-11-10 10:42:00,"""OMUOI""","""LFUHN""",11,7,45,2019,2019-11-10,1,0,6261500.0,22.068082,"""OMUOI_LFUHN""","""LFUHN_OMUOI""",5,5,5
4.7018,6.261337,"""MKPFX""",9603.2,2019-11-10 10:42:00,"""OMUOI""","""LFUHN""",11,7,45,2019,2019-11-10,1,0,5031300.0,18.329558,"""OMUOI_LFUHN""","""LFUHN_OMUOI""",5,5,5
4.6388,6.332977,"""MKPFX""",8789.05,2019-11-10 10:42:00,"""OMUOI""","""LFUHN""",11,7,45,2019,2019-11-10,1,0,4946700.0,15.615828,"""OMUOI_LFUHN""","""LFUHN_OMUOI""",5,5,5
5.0206,6.253595,"""MKPFX""",9597.5,2019-11-10 10:42:00,"""OMUOI""","""LFUHN""",11,7,45,2019,2019-11-10,1,0,4989500.0,18.461055,"""OMUOI_LFUHN""","""LFUHN_OMUOI""",5,5,5


### split train into train_df and eval_df

In [16]:
# Hold out the tail of `train` as an evaluation window. The cutoff is computed
# once so the two filters cannot drift apart.
EVAL_WINDOW_DAYS = 22
eval_start = train["pickup_date"].max().date() - timedelta(days=EVAL_WINDOW_DAYS)

# Rows strictly before `eval_start` are used for fitting; rows on or after it
# (the cutoff day through the latest pickup day) form the evaluation split.
train_df = train.filter(c("pickup_date").dt.date() < eval_start)
eval_df = train.filter(c("pickup_date").dt.date() >= eval_start)

### kma features

In [17]:
# Median rate per directed lane (kma_pair) and transport type, computed from
# train_df only, so rows of the held-out eval window do not contribute.
kma_transport_median = (
    train_df.group_by("kma_pair", "transport_type")
    .agg(
        c("rate").count().alias("kma_transport_count"),
        c("rate").median().alias("kma_transport_median"),
    )
    # The group size is used only to order the rows (smallest groups first);
    # it is dropped right after, so it never reaches the joined features.
    .sort("kma_transport_count")
    # .filter(c("kma_transport_count") >= 5)
    .drop("kma_transport_count")
)
kma_transport_median.head()

kma_pair,transport_type,kma_transport_median
str,str,f64
"""HBILN_PNBXA""","""MKPFX""",3.9827
"""QGHCU_BFHYB""","""GJROY""",4.0947
"""XYHVH_WWRQI""","""MKPFX""",2.0203
"""FPZNC_AVEJW""","""MKPFX""",3.9074
"""DRRUD_QUERU""","""GJROY""",8.6261


In [18]:
# Median miles and median rate per undirected lane (kma_track), from train_df.
# `valid_miles` was already replaced by its log in add_weight_mile_features,
# so kma_track_miles_median is a median of log-scaled miles.
kma_track_median = train_df.group_by("kma_track").agg(
    c("valid_miles").median().alias("kma_track_miles_median"),
    c("rate").median().alias("kma_track_rate_median"),
)

In [19]:
def add_kma_median(df: pl.DataFrame) -> pl.DataFrame:
    """Join the lane-level median lookups onto ``df``.

    Uses the notebook-level ``kma_transport_median`` table (keyed by
    ``kma_pair`` and ``transport_type``) and ``kma_track_median`` table (keyed
    by ``kma_track``). Both joins are left joins, so every input row is kept;
    keys that are missing from a lookup table get 0 instead of null.

    Args:
        df: Frame with ``kma_pair``, ``transport_type`` and ``kma_track`` columns.

    Returns:
        A new frame with ``kma_transport_median``, ``kma_track_miles_median``
        and ``kma_track_rate_median`` appended.
    """
    with_lane_median = df.join(
        kma_transport_median,
        on=["kma_pair", "transport_type"],
        how="left",
    )
    with_lane_median = with_lane_median.with_columns(
        c("kma_transport_median").fill_null(0),
    )

    with_track_medians = with_lane_median.join(
        kma_track_median,
        on=["kma_track"],
        how="left",
    )
    return with_track_medians.with_columns(
        c("kma_track_miles_median").fill_null(0),
        c("kma_track_rate_median").fill_null(0),
    )

In [20]:
# The lookup tables were built from train_df only; here they are joined onto
# every split, including train_df itself and the full `train` frame (which also
# contains the eval-window rows).
train_df = add_kma_median(train_df)
eval_df = add_kma_median(eval_df)
train = add_kma_median(train)
validation = add_kma_median(validation)
test = add_kma_median(test)
train_df.head()

rate,valid_miles,transport_type,weight,pickup_date,origin_kma,destination_kma,month,weekday,week,year,date,is_holiday_next_week,is_holiday,weight_mile,weight_per_mile,kma_pair,kma_track,count_deliveries_from_kma,count_deliveries_to_kma,count_deliveries_kma_pair,kma_transport_median,kma_track_miles_median,kma_track_rate_median
f64,f64,str,f64,datetime[μs],str,str,i8,i8,i8,i32,date,i64,i64,f64,f64,str,str,u32,u32,u32,f64,f64,f64
4.7203,6.257371,"""MKPFX""",9231.75,2019-11-10 10:42:00,"""OMUOI""","""LFUHN""",11,7,45,2019,2019-11-10,1,0,4817500.0,17.690594,"""OMUOI_LFUHN""","""LFUHN_OMUOI""",5,5,5,4.6315,6.286283,4.6315
4.9005,6.277897,"""MKPFX""",11754.95,2019-11-10 10:42:00,"""OMUOI""","""LFUHN""",11,7,45,2019,2019-11-10,1,0,6261500.0,22.068082,"""OMUOI_LFUHN""","""LFUHN_OMUOI""",5,5,5,4.6315,6.286283,4.6315
4.7018,6.261337,"""MKPFX""",9603.2,2019-11-10 10:42:00,"""OMUOI""","""LFUHN""",11,7,45,2019,2019-11-10,1,0,5031300.0,18.329558,"""OMUOI_LFUHN""","""LFUHN_OMUOI""",5,5,5,4.6315,6.286283,4.6315
4.6388,6.332977,"""MKPFX""",8789.05,2019-11-10 10:42:00,"""OMUOI""","""LFUHN""",11,7,45,2019,2019-11-10,1,0,4946700.0,15.615828,"""OMUOI_LFUHN""","""LFUHN_OMUOI""",5,5,5,4.6315,6.286283,4.6315
5.0206,6.253595,"""MKPFX""",9597.5,2019-11-10 10:42:00,"""OMUOI""","""LFUHN""",11,7,45,2019,2019-11-10,1,0,4989500.0,18.461055,"""OMUOI_LFUHN""","""LFUHN_OMUOI""",5,5,5,4.6315,6.286283,4.6315


In [21]:
train_df.select("rate", "kma_transport_median").corr()

rate,kma_transport_median
f64,f64
1.0,0.838627
0.838627,1.0


In [22]:
mape_f(validation["rate"], validation["kma_transport_median"])

21.16

In [23]:
mape_f(validation["rate"], validation["kma_track_rate_median"])

20.65

### lag features

#### rolling diff by "date", "kma_pair", "transport_type"

In [24]:
# Scratch inspection of a single lane and transport type, ordered by weight.
# `s` is not referenced by any later cell in this notebook.
s = train_df.filter(
    c("kma_pair") == "HRQLD_AWWEE",
    c("transport_type") == "MKPFX",
    # c("date") == datetime.strptime("2021-09-19", "%Y-%m-%d"),
).sort("weight")
# s

In [25]:
# Spread of rates inside each (date, lane, transport type) group: the standard
# deviation of `rate`, with groups whose std is null removed (presumably
# single-row groups) and the rest sorted from least to most dispersed.
(
    train_df.group_by("date", "kma_pair", "transport_type")
    .agg(
        c("rate").std(),
    )
    .drop_nulls()
    .sort("rate")
    # .describe([0.9, 0.99])
)

date,kma_pair,transport_type,rate
date,str,str,f64
2021-10-25,"""RONUZ_AWWEE""","""GJROY""",0.0
2021-12-19,"""PNBXA_QUERU""","""MKPFX""",0.0
2020-08-19,"""YXTDU_JESUD""","""MKPFX""",0.0
2019-11-24,"""NUTZC_NSBMC""","""GJROY""",0.0
2021-03-18,"""DNDBK_HRQLD""","""MKPFX""",0.0
2020-06-08,"""FPZNC_PEXPT""","""MKPFX""",0.0
2022-03-20,"""EPXAM_NTODX""","""MKPFX""",0.0
2022-02-15,"""SQSHO_QWBPO""","""MKPFX""",0.0
2020-09-17,"""RCDSS_FPZNC""","""MKPFX""",0.0
2022-08-02,"""CBZDP_NTODX""","""MKPFX""",0.0


In [26]:
# Exploratory: neither `df` nor `rate_trend_1` is used by any later cell in
# this notebook.
df = train_df.sort("date")

# Compute rate trend (difference between consecutive rates)
# Rows are ordered by date first, so within each origin/destination/transport
# group the diff is taken against the previous row in date order.
df = df.with_columns(
    pl.col("rate")
    .diff()
    .over(["origin_kma", "destination_kma", "transport_type"])
    .alias("rate_trend_1")
)

In [27]:
def add_kma_pair_transport_lag_feature(
    train_df: pl.DataFrame,
    eval_df: pl.DataFrame,
) -> pl.DataFrame:
    """Attach a lagged ``rolling_rate_diff`` column to ``eval_df``.

    The history frame and the frame being scored are stacked, with the scored
    frame's ``rate`` blanked out. For every (kma_pair, transport_type) the
    pipeline then:

    1. takes the median rate per date,
    2. differences it against the previous row of the same lane,
    3. takes a rolling median of those differences over 7 rows (a row is a
       date that has data for the lane, not a calendar day),
    4. shifts the result by one row so a date never sees its own value,
    5. forward-fills, which carries the last known value onto the blanked rows.

    NOTE: a later cell defines another function with this exact name (it
    produces ``shifted_week_rate``). Once that cell has run, this definition is
    no longer reachable by name.

    Args:
        train_df: History frame. Its column list decides which columns of
            ``eval_df`` are stacked, so ``eval_df`` must contain all of them
            except ``rate``, which is created here.
        eval_df: Frame to enrich. Any ``rate`` it carries is ignored when the
            feature is built.

    Returns:
        ``eval_df`` left-joined with ``rolling_rate_diff`` on date, kma_pair
        and transport_type. Lanes without usable history keep a null.
    """
    lane = ["kma_pair", "transport_type"]
    cols = train_df.columns

    # Blank the targets of the scored frame. The null literal takes the dtype of
    # the history `rate` column so both frames stack with identical schemas,
    # rather than relying on an untyped null column being coerced by vstack.
    masked_eval = eval_df.with_columns(
        pl.lit(None, dtype=train_df.schema["rate"]).alias("rate"),
    ).select(cols)

    rolling_rate_diff = (
        train_df.vstack(masked_eval)
        .group_by("date", *lane)
        .agg(
            c("rate").median(),
        )
        # Every window function below depends on this date ordering.
        .sort("date")
        .with_columns(
            c("rate").diff().over(lane).alias("rate_diff"),
        )
        .with_columns(
            c("rate_diff")
            .rolling_median(window_size=7, center=False, min_samples=1)
            .over(lane)
            .alias("rolling_rate_diff"),
        )
        .with_columns(
            c("rolling_rate_diff").shift(1).over(lane).alias("rolling_rate_diff"),
        )
        .with_columns(
            c("rolling_rate_diff").forward_fill().over(lane),
        )
        .select("date", *lane, "rolling_rate_diff")
    )

    return eval_df.join(
        rolling_rate_diff,
        on=["date", *lane],
        how="left",
    )

In [28]:
eval_df = add_kma_pair_transport_lag_feature(train_df, eval_df)

In [29]:
# Order-sensitive: `train` and `validation` are stacked here, so they must have
# the same columns. That holds only while `validation` has not yet received
# rolling_rate_diff, which happens in the next cell.
test = add_kma_pair_transport_lag_feature(train.vstack(validation), test)

In [30]:
validation = add_kma_pair_transport_lag_feature(train, validation)

In [31]:
# Training-side counterpart of the rolling_rate_diff lag feature: the same
# steps, applied to train_df alone (no rows with a blanked rate are stacked on).
rolling_rate_diff = (
    # One row per (date, lane, transport type) holding that day's median rate.
    train_df.group_by("date", "kma_pair", "transport_type")
    .agg(
        c("rate").median(),
    )
    # The window functions below depend on this date ordering.
    .sort("date")
    # Change of the daily median versus the previous row of the same lane.
    .with_columns(
        c("rate").diff().over(["kma_pair", "transport_type"]).alias("rate_diff"),
    )
    # Rolling median over 7 rows; a row is a date that has data for the lane,
    # not a calendar day.
    .with_columns(
        c("rate_diff")
        .rolling_median(window_size=7, center=False, min_samples=1)
        .over("kma_pair", "transport_type")
        .alias("rolling_rate_diff"),
    )
    # Shift by one row so a date only sees values from earlier dates.
    .with_columns(
        c("rolling_rate_diff")
        .shift(1)
        .over("kma_pair", "transport_type")
        .alias("rolling_rate_diff"),
    )
    # Carry the last known value forward over any remaining gaps; leading rows
    # of a lane have nothing to carry and stay null.
    .with_columns(
        c("rolling_rate_diff").forward_fill().over("kma_pair", "transport_type")
    )
    .select(
        "date",
        "kma_pair",
        "transport_type",
        "rolling_rate_diff",
    )
)

In [32]:
train_df = train_df.join(
    rolling_rate_diff,
    on=["date", "kma_pair", "transport_type"],
    how="left",
)

In [33]:
train_df.null_count()

rate,valid_miles,transport_type,weight,pickup_date,origin_kma,destination_kma,month,weekday,week,year,date,is_holiday_next_week,is_holiday,weight_mile,weight_per_mile,kma_pair,kma_track,count_deliveries_from_kma,count_deliveries_to_kma,count_deliveries_kma_pair,kma_transport_median,kma_track_miles_median,kma_track_rate_median,rolling_rate_diff
u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,17096


#### rolling median of the rate by "kma_pair", "transport_type"

In [34]:
# Lagged rate level per lane and transport type, built from train_df alone.
shifted_week_rate = (
    # One row per (date, lane, transport type) holding that day's median rate.
    train_df.group_by("date", "kma_pair", "transport_type")
    .agg(
        c("rate").median(),
    )
    # The window functions below depend on this date ordering.
    .sort("date")
    # Rolling median over 7 rows; a row is a date that has data for the lane,
    # not a calendar day.
    .with_columns(
        c("rate")
        .rolling_median(window_size=7, center=False, min_samples=1)
        .over("kma_pair", "transport_type")
        .alias("rolling_rate"),
    )
    # Shift by one row so a date never sees a window containing its own rate.
    .with_columns(
        c("rolling_rate")
        .shift(1)
        .over("kma_pair", "transport_type")
        .alias("shifted_week_rate"),
    )
    # Carry the last known value forward; the first row of each lane has no
    # earlier value and stays null.
    .with_columns(
        c("shifted_week_rate").forward_fill().over("kma_pair", "transport_type")
    )
    .select(
        "date",
        "kma_pair",
        "transport_type",
        "shifted_week_rate",
    )
)

# Left join keeps every train_df row; rows without a lag value get a null.
transformed_train_df = train_df.join(
    shifted_week_rate,
    on=["date", "kma_pair", "transport_type"],
    how="left",
)

In [35]:
transformed_train_df.shape

(289693, 26)

In [36]:
transformed_train_df.null_count()

rate,valid_miles,transport_type,weight,pickup_date,origin_kma,destination_kma,month,weekday,week,year,date,is_holiday_next_week,is_holiday,weight_mile,weight_per_mile,kma_pair,kma_track,count_deliveries_from_kma,count_deliveries_to_kma,count_deliveries_kma_pair,kma_transport_median,kma_track_miles_median,kma_track_rate_median,rolling_rate_diff,shifted_week_rate
u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,17096,10005


In [37]:
transformed_train_df = transformed_train_df.drop_nulls()

In [38]:
def add_kma_pair_transport_lag_feature(
    train_df: pl.DataFrame,
    eval_df: pl.DataFrame,
) -> pl.DataFrame:
    """Attach a lagged ``shifted_week_rate`` column to ``eval_df``.

    The history frame and the frame being scored are stacked, with the scored
    frame's ``rate`` blanked out. For every (kma_pair, transport_type) the
    pipeline then:

    1. takes the median rate per date,
    2. takes a rolling median of it over 7 rows (a row is a date that has data
       for the lane, not a calendar day),
    3. shifts the result by one row so a date never sees its own value,
    4. forward-fills, which carries the last known value onto the blanked rows.

    NOTE: this reuses the name of the function defined earlier in the notebook
    that produces ``rolling_rate_diff``, and replaces it from this cell on.

    Args:
        train_df: History frame. Its column list decides which columns of
            ``eval_df`` are stacked, so ``eval_df`` must contain all of them
            except ``rate``, which is created here.
        eval_df: Frame to enrich. Any ``rate`` it carries is ignored when the
            feature is built.

    Returns:
        ``eval_df`` left-joined with ``shifted_week_rate`` on date, kma_pair
        and transport_type. Lanes without usable history keep a null.
    """
    lane = ["kma_pair", "transport_type"]
    cols = train_df.columns

    # Blank the targets of the scored frame. The null literal takes the dtype of
    # the history `rate` column so both frames stack with identical schemas,
    # rather than relying on an untyped null column being coerced by vstack.
    masked_eval = eval_df.with_columns(
        pl.lit(None, dtype=train_df.schema["rate"]).alias("rate"),
    ).select(cols)

    shifted_week_rate = (
        train_df.vstack(masked_eval)
        .group_by("date", *lane)
        .agg(
            c("rate").median(),
        )
        # Every window function below depends on this date ordering.
        .sort("date")
        .with_columns(
            c("rate")
            .rolling_median(window_size=7, center=False, min_samples=1)
            .over(lane)
            .alias("rolling_rate"),
        )
        .with_columns(
            c("rolling_rate").shift(1).over(lane).alias("shifted_week_rate"),
        )
        .with_columns(
            c("shifted_week_rate").forward_fill().over(lane),
        )
        .select("date", *lane, "shifted_week_rate")
    )

    return eval_df.join(
        shifted_week_rate,
        on=["date", *lane],
        how="left",
    )

In [39]:
transformed_eval_df = add_kma_pair_transport_lag_feature(train_df, eval_df)

In [40]:
transformed_validation = add_kma_pair_transport_lag_feature(train, validation)

In [43]:
# History for `test` is train_df + eval_df + validation. The stacking relies on
# all three frames already carrying rolling_rate_diff (added by earlier cells),
# so that their columns line up.
transformed_test = add_kma_pair_transport_lag_feature(
    train_df.vstack(eval_df).vstack(validation), test
)

In [45]:
transformed_eval_df.null_count()

rate,valid_miles,transport_type,weight,pickup_date,origin_kma,destination_kma,month,weekday,week,year,date,is_holiday_next_week,is_holiday,weight_mile,weight_per_mile,kma_pair,kma_track,count_deliveries_from_kma,count_deliveries_to_kma,count_deliveries_kma_pair,kma_transport_median,kma_track_miles_median,kma_track_rate_median,rolling_rate_diff,shifted_week_rate
u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,342,173


In [43]:
transformed_validation.null_count()

rate,valid_miles,transport_type,weight,pickup_date,origin_kma,destination_kma,month,weekday,week,year,date,is_holiday_next_week,is_holiday,weight_mile,weight_per_mile,kma_pair,kma_track,count_deliveries_from_kma,count_deliveries_to_kma,count_deliveries_kma_pair,kma_transport_median,kma_track_miles_median,kma_track_rate_median,shifted_week_rate
u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,180


In [47]:
# Rows whose lane has no usable history have no lag values: a missing
# shifted_week_rate is replaced by the mean of that column within the same
# frame, and a missing rolling_rate_diff is treated as "no trend" (0).
# Both frames have rolling_rate_diff only because eval_df and validation were
# given that column by the earlier lag-feature cells.
transformed_validation = transformed_validation.with_columns(
    c("shifted_week_rate").fill_null(strategy="mean"),
    c("rolling_rate_diff").fill_null(0),
)
transformed_eval_df = transformed_eval_df.with_columns(
    c("shifted_week_rate").fill_null(strategy="mean"),
    c("rolling_rate_diff").fill_null(0),
)

In [48]:
# MAPE of using the lagged rate level directly as the prediction, per split.
# Rows with any remaining null are excluded before scoring.
for label, frame in (
    ("transformed_eval_df \t", transformed_eval_df),
    ("transformed_validation \t", transformed_validation),
):
    complete_rows = frame.drop_nulls()
    metric = mape_f(complete_rows["rate"], complete_rows["shifted_week_rate"])
    print(label, metric)

transformed_eval_df 	 14.35
transformed_validation 	 12.89


### save dataframes

In [49]:
# Output file name -> frame to persist; written in this order next to the
# input CSVs.
outputs = {
    "train_df.parquet": transformed_train_df,
    "eval_df.parquet": transformed_eval_df,
    "validation.parquet": transformed_validation,
    "test.parquet": transformed_test,
}

for filename, frame in outputs.items():
    path = os.path.join("../", DATA_DIR, filename)
    frame.write_parquet(path)