In [1]:
import os
from typing import Optional

import joblib
import polars as pl
from catboost import CatBoostClassifier, CatBoostRegressor

from config import (
    DATA_FOLDER, DURATION_POLARS_COL, DURATION_SPARK_COL,
    CATEGORICAL_FEATURE, IS_SUCCESS_POLARS_COL, IS_SUCCESS_SPARK_COL,
    FEATURES, MODELS_FOLDER, TEST_FILE_NAME, TRAIN_FILE_NAME
)


def choose_engine(row):
    """Return the label ("polars", "spark" or "none") for one pivoted row.

    ``row`` is a mapping that holds, for each engine, a success flag and a
    duration under the column names imported from ``config``.

    Rules:
        * exactly one engine succeeded -> that engine;
        * neither engine succeeded     -> ``"none"``;
        * both engines succeeded       -> the engine with the strictly
          smaller duration (a tie goes to ``"spark"``).

    A success flag that is ``None`` (e.g. no measurement exists for that
    engine) is treated as "did not succeed", and a ``None`` duration never
    wins the comparison, so such rows get a label instead of failing with
    ``TypeError`` on ``None < float``.
    """
    # `== 1` is False for None, so a missing flag counts as a failure.
    polars_ok = row[IS_SUCCESS_POLARS_COL] == 1
    spark_ok = row[IS_SUCCESS_SPARK_COL] == 1

    if not polars_ok and not spark_ok:
        return "none"
    if polars_ok != spark_ok:
        return "polars" if polars_ok else "spark"

    # Both engines succeeded: the faster one wins.
    polars_duration = row[DURATION_POLARS_COL]
    spark_duration = row[DURATION_SPARK_COL]
    if polars_duration is None or spark_duration is None:
        # Cannot compare; prefer the engine that has a measured duration,
        # falling back to "spark" like the tie-break below.
        return "polars" if spark_duration is None and polars_duration is not None else "spark"
    return "polars" if polars_duration < spark_duration else "spark"


# Collapse repeated runs into one row per (feature combination, engine):
# mean duration and minimum success flag. With 0/1 flags the minimum is 1 only
# when every run in the group succeeded (assumes 0/1 flags -- TODO confirm).
agg = (
    pl.read_csv(os.path.join(DATA_FOLDER, f"{TRAIN_FILE_NAME}.csv"))
    .group_by([*FEATURES, "engine"])
    .agg([
        pl.mean("duration").alias("duration"),
        pl.min("success").alias("success")
    ])
)

# Go wide: one row per feature combination, with a duration column and a
# success column for every distinct value of "engine".
pivoted = agg.pivot(values=["duration", "success"], index=FEATURES, on="engine")

clean_df = (
    pivoted
    .with_columns(
        pl.struct(pivoted.columns)
        # choose_engine always returns a plain string. Declaring the dtype
        # silences the Polars "map_elements without return_dtype" warning and
        # avoids relying on dtype inference from the first results.
        .map_elements(choose_engine, return_dtype=pl.String)
        .alias("label")
    )
    .with_columns([
        # log1p-transformed durations, stored next to the raw ones under a
        # "_log" suffix.
        pl.col(DURATION_POLARS_COL).log1p().alias(f"{DURATION_POLARS_COL}_log"),
        pl.col(DURATION_SPARK_COL).log1p().alias(f"{DURATION_SPARK_COL}_log"),
    ])
)

# Persist the labelled, log-augmented table alongside the raw training file.
clean_df.write_csv(os.path.join(DATA_FOLDER, f"{TRAIN_FILE_NAME}_cleaned_log.csv"))


def train_oom_classifier(df: pl.DataFrame, suc_column: str) -> Optional[CatBoostClassifier]:
    """Train and save a CatBoost classifier predicting ``suc_column`` from FEATURES.

    Args:
        df: Frame containing the FEATURES columns and ``suc_column``.
        suc_column: Name of the target column (a success flag).

    Returns:
        The fitted classifier, or ``None`` when the target has fewer than two
        distinct non-null values. In that case a message is printed, nothing
        is trained and no file is written.

    Side effects:
        Creates the output directory if needed and writes the fitted model
        with ``joblib.dump`` to
        ``MODELS_FOLDER/oom_{suc_column}_{TRAIN_FILE_NAME}_model.cbm``.
    """
    # Rows without a target value carry no training signal. Dropping them
    # keeps a null from being counted as a "class" in the check below and
    # from reaching CatBoost as a NaN label.
    labelled = df.filter(pl.col(suc_column).is_not_null())

    uniq = labelled[suc_column].unique().to_list()
    if len(uniq) < 2:
        # Covers both the single-class case and an empty/all-null target.
        print(f"Skipping classifier for {suc_column}, only one class: {uniq}")
        return None

    # Resolve the output location up front so a bad path fails before the
    # (comparatively slow) training instead of after it.
    model_path = os.path.join(MODELS_FOLDER, f"oom_{suc_column}_{TRAIN_FILE_NAME}_model.cbm")
    os.makedirs(os.path.dirname(model_path) or ".", exist_ok=True)

    model = CatBoostClassifier(
        iterations=500,
        depth=6,
        learning_rate=0.05,
        loss_function="Logloss",
        # Re-weight classes to compensate for imbalance between them.
        auto_class_weights="Balanced",
        verbose=False,
    )

    model.fit(
        labelled[FEATURES].to_pandas(),
        labelled[suc_column].to_pandas(),
        cat_features=CATEGORICAL_FEATURE,
    )

    # NOTE(review): this is a joblib pickle despite the ".cbm" extension, so
    # it must be read back with joblib.load, not CatBoost's load_model.
    joblib.dump(model, model_path)
    return model


# One success/failure classifier per engine. Either name ends up as None when
# the corresponding success column holds a single class (training is skipped).
oom_polars_model = train_oom_classifier(df=clean_df, suc_column=IS_SUCCESS_POLARS_COL)
oom_spark_model = train_oom_classifier(df=clean_df, suc_column=IS_SUCCESS_SPARK_COL)


def train_log_regression(df: pl.DataFrame, suc_column: str, dur_column: str) -> CatBoostRegressor:
    """Train and save a CatBoost regressor for the ``{dur_column}_log`` target.

    Only rows where ``suc_column`` equals 1 are used for training. Note that
    this is ordinary RMSE regression on a log-scale duration column, not
    logistic regression, despite the ``logreg_`` prefix of the saved file.

    Args:
        df: Frame containing the FEATURES columns, ``suc_column`` and a
            ``{dur_column}_log`` column.
        suc_column: Name of the success-flag column used to select rows.
        dur_column: Base name of the duration column; the target is the
            column with the ``_log`` suffix appended.

    Returns:
        The fitted regressor. Its predictions are on the same (log) scale as
        the target column.

    Side effects:
        Creates the output directory if needed and writes the fitted model
        with ``joblib.dump`` to
        ``MODELS_FOLDER/logreg_{dur_column}_{TRAIN_FILE_NAME}_model.cbm``.
    """
    target_column = f"{dur_column}_log"

    # Filter once and derive both the feature matrix and the target from the
    # same frame, so the two are guaranteed to stay row-aligned.
    successful = df.filter(pl.col(suc_column) == 1)
    X = successful.select(FEATURES).to_pandas()
    y = successful[target_column].to_pandas()

    # Resolve the output location up front so a bad path fails before the
    # (comparatively slow) training instead of after it.
    model_path = os.path.join(MODELS_FOLDER, f"logreg_{dur_column}_{TRAIN_FILE_NAME}_model.cbm")
    os.makedirs(os.path.dirname(model_path) or ".", exist_ok=True)

    model = CatBoostRegressor(
        iterations=1500,
        depth=6,
        learning_rate=0.03,
        loss_function="RMSE",
        verbose=False
    )

    model.fit(
        X,
        y,
        cat_features=CATEGORICAL_FEATURE
    )

    # NOTE(review): this is a joblib pickle despite the ".cbm" extension, so
    # it must be read back with joblib.load, not CatBoost's load_model.
    joblib.dump(model, model_path)
    return model


# One duration regressor per engine, each fitted only on the rows where that
# engine's success flag is 1 and targeting its "_log" duration column.
polars_time_model = train_log_regression(
    df=clean_df,
    suc_column=IS_SUCCESS_POLARS_COL,
    dur_column=DURATION_POLARS_COL,
)
spark_time_model = train_log_regression(
    df=clean_df,
    suc_column=IS_SUCCESS_SPARK_COL,
    dur_column=DURATION_SPARK_COL,
)

  .with_columns(pl.struct(pivoted.columns).map_elements(choose_engine).alias("label"))


Skipping classifier for success_spark, only one class: [1]
