# Init

In [1]:
import os
from typing import Optional

import joblib
import polars as pl
from catboost import CatBoostClassifier, CatBoostRegressor

from config import (
    DATA_FOLDER, DURATION_POLARS_COL, DURATION_SPARK_COL,
    CATEGORICAL_FEATURE, IS_SUCCESS_POLARS_COL, IS_SUCCESS_SPARK_COL,
    FEATURES, MODELS_FOLDER, TEST_FILE_NAME, TRAIN_FILE_NAME
)
# xg-boost

In [None]:
# pl.read_csv(os.path.join(DATA_FOLDER, f"{TRAIN_FILE_NAME}.csv")).filter(pl.col('engine') == 'polars').filter(pl.col("memory") < pl.col("data_size")).filter(pl.col('success') == 1)

# Data preparation

In [2]:
def choose_engine(row):
    """Return the engine label for one pivoted benchmark row.

    Args:
        row: Mapping holding the per-engine success flags and durations,
            keyed by the column names imported from ``config``.

    Returns:
        ``"spark"`` or ``"polars"`` when exactly one engine succeeded,
        ``"none"`` when both failed, otherwise the engine with the strictly
        smaller duration (a tie resolves to ``"spark"``).
    """
    polars_ok = row[IS_SUCCESS_POLARS_COL]
    spark_ok = row[IS_SUCCESS_SPARK_COL]

    if polars_ok == 0:
        if spark_ok == 1:
            return "spark"
        if spark_ok == 0:
            return "none"
    elif polars_ok == 1 and spark_ok == 0:
        return "polars"

    # No clear winner from the success flags alone: compare durations.
    if row[DURATION_POLARS_COL] < row[DURATION_SPARK_COL]:
        return "polars"
    return "spark"


# Collapse repeated benchmark runs into one row per (features, engine):
# the duration is averaged, and taking the minimum of the success flag means a
# single failed run marks the whole configuration as failed (assuming the flag
# is 0/1, which is how choose_engine interprets it).
agg = (
    pl.read_csv(os.path.join(DATA_FOLDER, f"{TRAIN_FILE_NAME}.csv"))
    .group_by([*FEATURES, "engine"])
    .agg([
        pl.mean("duration").alias("duration"),
        pl.min("success").alias("success")
    ])
)
# Spread the per-engine duration/success values into separate columns so each
# feature combination becomes a single row.
pivoted = agg.pivot(values=["duration", "success"], index=FEATURES, on="engine")

clean_df = (
    pivoted
    .with_columns(
        pl.struct(pivoted.columns)
        # choose_engine always returns a string; declaring the dtype avoids the
        # polars warning about map_elements without return_dtype and keeps the
        # column type from being inferred from the data.
        .map_elements(choose_engine, return_dtype=pl.String)
        .alias("label")
    )
)
clean_df.write_csv(os.path.join(DATA_FOLDER, f"{TRAIN_FILE_NAME}_cleaned.csv"))

  .with_columns(pl.struct(pivoted.columns).map_elements(choose_engine).alias("label"))


In [10]:
# Load the labelled training table from its CSV file on disk.
cleaned_train_path = os.path.join(DATA_FOLDER, f"{TRAIN_FILE_NAME}_cleaned.csv")
clean_df = pl.read_csv(cleaned_train_path)

# Модель класифікації OOM

In [4]:
def train_oom_classifier(df: pl.DataFrame, suc_column: str) -> Optional[CatBoostClassifier]:
    """Fit a binary CatBoost classifier that predicts ``suc_column`` from FEATURES.

    Args:
        df: Frame containing the FEATURES columns and ``suc_column``.
        suc_column: Name of the success-flag column used as the target.

    Returns:
        The fitted classifier, or ``None`` (after printing a notice) when the
        target column holds exactly one distinct value and there is nothing to
        learn.

    Side effects:
        Serialises the fitted model with ``joblib.dump`` into MODELS_FOLDER.
        The file carries a ``.cbm`` extension but is a joblib dump, so it must
        be loaded with ``joblib.load``.
    """
    classes = df[suc_column].unique().to_list()
    if len(classes) == 1:
        print(f"Skipping classifier for {suc_column}, only one class: {classes}")
        return None

    hyper_params = dict(
        iterations=500,
        depth=6,
        learning_rate=0.05,
        loss_function="Logloss",
        auto_class_weights="Balanced",
        verbose=False,
    )
    classifier = CatBoostClassifier(**hyper_params)

    train_x = df[FEATURES].to_pandas()
    train_y = df[suc_column].to_pandas()
    classifier.fit(train_x, train_y, cat_features=CATEGORICAL_FEATURE)

    model_path = os.path.join(MODELS_FOLDER, f"oom_{suc_column}_{TRAIN_FILE_NAME}_model.cbm")
    joblib.dump(classifier, model_path)
    return classifier

## Polars

In [5]:
# Success classifier for Polars; None when the target column has a single class.
oom_polars_model = train_oom_classifier(df=clean_df, suc_column=IS_SUCCESS_POLARS_COL)

## Spark

In [6]:
# Success classifier for Spark; None when the target column has a single class.
oom_spark_model = train_oom_classifier(df=clean_df, suc_column=IS_SUCCESS_SPARK_COL)

Skipping classifier for success_spark, only one class: [1]


# Модель регресії часу

In [12]:
def train_regression(df: pl.DataFrame, suc_column: str, dur_column: str) -> CatBoostRegressor:
    """Fit a CatBoost regressor that predicts ``dur_column`` from FEATURES.

    Only rows whose ``suc_column`` equals 1 are used for training.

    Args:
        df: Frame containing the FEATURES columns, ``suc_column`` and ``dur_column``.
        suc_column: Success-flag column used to select the training rows.
        dur_column: Duration column used as the regression target.

    Returns:
        The fitted regressor.

    Side effects:
        Serialises the fitted model with ``joblib.dump`` into MODELS_FOLDER
        (a joblib dump despite the ``.cbm`` extension).
    """
    successful_runs = df.filter(pl.col(suc_column) == 1)
    train_x = successful_runs.select(FEATURES).to_pandas()
    train_y = successful_runs.select(dur_column).to_pandas()

    regressor = CatBoostRegressor(
        iterations=1500,
        depth=8,
        learning_rate=0.3,
        loss_function="RMSE",
        verbose=False,
    )
    regressor.fit(train_x, train_y, cat_features=CATEGORICAL_FEATURE)

    model_path = os.path.join(MODELS_FOLDER, f"reg_{dur_column}_{TRAIN_FILE_NAME}_model.cbm")
    joblib.dump(regressor, model_path)
    return regressor

## Polars

In [28]:
# Duration regressor for Polars, fitted on rows where the Polars flag equals 1.
polars_time_model = train_regression(
    df=clean_df,
    suc_column=IS_SUCCESS_POLARS_COL,
    dur_column=DURATION_POLARS_COL,
)

## Spark

In [13]:
# Duration regressor for Spark, fitted on rows where the Spark flag equals 1.
spark_time_model = train_regression(
    df=clean_df,
    suc_column=IS_SUCCESS_SPARK_COL,
    dur_column=DURATION_SPARK_COL,
)

# Модель класифікації повна

In [3]:
# Distinct engine labels present in the training data.
uniq = clean_df['label'].unique().to_list()

# choose_engine can emit three labels ("polars", "spark", "none"). Logloss is a
# binary objective, so switch to MultiClass whenever more than two labels are
# actually present; with two labels the setup is unchanged.
label_loss_function = "Logloss" if len(uniq) <= 2 else "MultiClass"

model = CatBoostClassifier(
    iterations=5000,
    depth=10,
    learning_rate=0.05,
    loss_function=label_loss_function,
    auto_class_weights="Balanced",
    # Alternative settings kept for reference (currently disabled):
    # l2_leaf_reg=6,
    # random_strength=1.8,
    # bootstrap_type="Bayesian",
    # bagging_temperature=0.3,
    # verbose=False
)

# Train directly on the engine label using the same feature columns as the
# other models.
model.fit(
    clean_df[FEATURES].to_pandas(),
    clean_df['label'].to_pandas(),
    cat_features=CATEGORICAL_FEATURE,
)
# Stored as a joblib dump (despite the .cbm extension); load with joblib.load.
joblib.dump(model, os.path.join(MODELS_FOLDER, f"oom_label_{TRAIN_FILE_NAME}_model.cbm"))

0:	learn: 0.6377501	total: 218ms	remaining: 18m 12s
1:	learn: 0.5662353	total: 318ms	remaining: 13m 14s
2:	learn: 0.5023411	total: 470ms	remaining: 13m 2s
3:	learn: 0.4508894	total: 546ms	remaining: 11m 21s
4:	learn: 0.4044037	total: 644ms	remaining: 10m 42s
5:	learn: 0.3683500	total: 757ms	remaining: 10m 30s
6:	learn: 0.3343812	total: 888ms	remaining: 10m 33s
7:	learn: 0.3066561	total: 1.01s	remaining: 10m 33s
8:	learn: 0.2941110	total: 1.05s	remaining: 9m 42s
9:	learn: 0.2724453	total: 1.14s	remaining: 9m 30s
10:	learn: 0.2520931	total: 1.24s	remaining: 9m 21s
11:	learn: 0.2368541	total: 1.32s	remaining: 9m 7s
12:	learn: 0.2213125	total: 1.45s	remaining: 9m 16s
13:	learn: 0.2084054	total: 1.57s	remaining: 9m 20s
14:	learn: 0.1967831	total: 1.7s	remaining: 9m 24s
15:	learn: 0.1896413	total: 1.78s	remaining: 9m 13s
16:	learn: 0.1820758	total: 1.86s	remaining: 9m 5s
17:	learn: 0.1738053	total: 1.98s	remaining: 9m 7s
18:	learn: 0.1721778	total: 2s	remaining: 8m 44s
19:	learn: 0.1704443	t

['data\\models\\oom_label_train_benchmarks_3000_new_model.cbm']

# Testing

In [10]:
def recommend_engine(input_dict: dict) -> str:
    """Recommend an engine for a single workload description.

    The success classifiers are consulted first; the duration regressors are
    only used when both engines are predicted to succeed.

    Args:
        input_dict: Mapping from feature name to value. It must contain every
            column listed in FEATURES; key order and extra keys are ignored.

    Returns:
        ``"spark"`` or ``"polars"`` when only that engine is predicted to
        succeed, ``"none"`` when both are predicted to fail, otherwise the
        engine with the strictly smaller predicted duration (ties resolve to
        ``"spark"``).

    Notes:
        Relies on the notebook-level objects ``oom_polars_model``,
        ``oom_spark_model``, ``polars_time_model``, ``spark_time_model`` and
        ``clean_df`` having been created by earlier cells.
    """
    # Align the columns with the layout the models were trained on
    # (df[FEATURES]) and convert to pandas once instead of once per prediction.
    x = pl.DataFrame([input_dict]).select(FEATURES).to_pandas()

    # A missing classifier means the training data held a single outcome for
    # that engine, so that constant value is used as the prediction.
    if oom_polars_model is None:
        oom_polars = clean_df[IS_SUCCESS_POLARS_COL].unique().to_list()[0]
    else:
        oom_polars = oom_polars_model.predict(x)[0]

    if oom_spark_model is None:
        oom_spark = clean_df[IS_SUCCESS_SPARK_COL].unique().to_list()[0]
    else:
        oom_spark = oom_spark_model.predict(x)[0]

    if oom_polars == 0 and oom_spark == 1:
        return "spark"

    if oom_spark == 0 and oom_polars == 1:
        return "polars"

    if oom_polars == 0 and oom_spark == 0:
        return "none"

    # Both engines are expected to succeed: pick the faster predicted one.
    pred_polars_time = polars_time_model.predict(x)[0]
    pred_spark_time = spark_time_model.predict(x)[0]

    if pred_polars_time < pred_spark_time:
        return "polars"
    return "spark"


In [24]:
# Example workload used to sanity-check the trained models.
input_data = dict(
    task_name="group_by",
    data_size=2500,
    row_count=7600,
    column_count=120,
    cpu=16,
    memory=64,
    instance_count=1,
)
x = pl.DataFrame([input_data])
x_pandas = x.to_pandas()

# Predicted durations per engine, followed by the final recommendation.
print(polars_time_model.predict(x_pandas)[0])
print(spark_time_model.predict(x_pandas)[0])
print(recommend_engine(input_data))

1754.0666469743983
5651.518996863086
polars
