- [previous file - EDA](2022-03-31_train-test_EDA.ipynb)
- [previous file - sklearn experiments](2022-04-12_experiments-sklearn.ipynb)

## imports

In [1]:
# !pip install catboost lightgbm xgboost optuna

In [2]:
from warnings import filterwarnings
import os
import pickle

import joblib
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import RobustScaler, LabelEncoder
from sklearn.ensemble import (
    RandomForestRegressor,
    ExtraTreesRegressor,
    StackingRegressor,
)
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
import xgboost as xgb
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
import optuna
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_absolute_percentage_error

sns.set()
filterwarnings("ignore")

  from pandas import MultiIndex, Int64Index


## reading data

In [3]:
# Root folder (or URL prefix) that holds the prepared parquet files.
path_to_data = "data"
# path_to_data = "https://github.com/XelorR/sf_project_6/raw/master/data"

# Pre-model snapshots produced by the earlier notebooks.
train_path = f"{path_to_data}/2022-04-08_train_pre-model.parquet"
test_path = f"{path_to_data}/2022-04-08_test_pre-model.parquet"
train_raw = pd.read_parquet(train_path)
test_raw = pd.read_parquet(test_path)

tuple(frame.shape for frame in (train_raw, test_raw))

((115367, 30), (34686, 28))

## functions

In [4]:
def train_or_load(clf, X, y, filepath: str = "model.joblib", complevel: int = 9):
    """
    Fit a model and cache it on disk, or load the cached model if it exists.

    The serialization backend is chosen from the file extension:
    ``.joblib`` uses joblib, ``.pkl`` / ``.pickle`` use pickle.

    clf - model to fit (must expose ``fit(X, y)``); unused when a cache exists
    X - dataset
    y - labels
    filepath - where the serialized model is saved / loaded from
    complevel - compression level (0-9) for joblib, ignored for pickle

    Returns the model loaded from ``filepath`` when that file exists,
    otherwise ``clf`` after fitting.
    Raises ValueError for an unsupported extension, before any fitting is done.

    NOTE: joblib and pickle can execute arbitrary code while loading, so only
    point ``filepath`` at files you created yourself.
    """
    if filepath.endswith(".joblib"):
        load = joblib.load

        def dump(model, handle):
            joblib.dump(model, handle, compress=complevel)

    elif filepath.endswith((".pkl", ".pickle")):
        load, dump = pickle.load, pickle.dump
    else:
        # Previously this case fell through and silently returned None.
        raise ValueError(
            f"Unsupported model file extension in {filepath!r}; "
            "expected .joblib, .pkl or .pickle"
        )

    if os.path.exists(filepath):
        with open(filepath, "rb") as f:
            return load(f)

    clf.fit(X, y)

    # Write to a temporary file and rename it afterwards, so an interrupted
    # dump never leaves a truncated cache that the next run would try to load.
    tmp_path = f"{filepath}.tmp"
    try:
        with open(tmp_path, "wb") as f:
            dump(clf, f)
        os.replace(tmp_path, filepath)
    except BaseException:
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
        raise
    return clf


def submit(hold_out: pd.DataFrame, model, name="submission"):
    """
    Write a submission CSV with the model's predictions used as-is.

    hold_out - feature matrix passed to ``model.predict``
    model - fitted estimator
    name - output file name without the ``.csv`` extension

    The template is read from ``{path_to_data}/sample_submission.csv``
    (``path_to_data`` is a notebook-level variable); its ``price`` column is
    set to the predictions and the result is saved as ``{name}.csv``.
    """
    predicted_price = model.predict(hold_out)
    template = pd.read_csv(f"{path_to_data}/sample_submission.csv")
    template.assign(price=predicted_price).to_csv(f"{name}.csv", index=False)


def submit_log(hold_out: pd.DataFrame, model, name="submission"):
    """
    Write a submission CSV for a model that predicts log-price.

    hold_out - feature matrix passed to ``model.predict``
    model - fitted estimator whose output is exponentiated before saving
    name - output file name without the ``.csv`` extension

    The template is read from ``{path_to_data}/sample_submission.csv``
    (``path_to_data`` is a notebook-level variable); its ``price`` column is
    set to ``exp(prediction)`` and the result is saved as ``{name}.csv``.
    """
    log_price = model.predict(hold_out)
    template = pd.read_csv(f"{path_to_data}/sample_submission.csv")
    template.assign(price=np.exp(log_price)).to_csv(f"{name}.csv", index=False)

## encoding

In [5]:
# Tag the rows so the joint frame can be split back after encoding.
train_raw["train/test"] = "train"
test_raw["train/test"] = "test"

# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat builds the same stacked frame (original row labels are kept).
data = pd.concat([train_raw, test_raw])
# Assign the result back: a chained `fillna(..., inplace=True)` on a column
# selection is not guaranteed to modify the parent frame.
data["ptc"] = data["ptc"].fillna("Оригинал")

object_cols = data.select_dtypes("object").columns.tolist()
data[object_cols] = data[object_cols].astype(str)

# Scale every non-object column except the target.
# NOTE(review): each scaler is fit on train and test rows together.
for col in set(data.select_dtypes(exclude=("object")).columns) - {"price"}:
    data[col] = RobustScaler().fit_transform(data[col].values.reshape(-1, 1)).ravel()

# High-cardinality column: integer codes instead of one-hot columns.
for col in ["model_name"]:
    data[col] = LabelEncoder().fit_transform(data[col].astype("str"))

data = pd.get_dummies(
    data,
    columns=[
        "vehicle_transmission",
        "vendor",
        "brand",
        "fuel_type",
        "body_type",
        "color",
        "ptc",
        "drive",
        "wheel",
        "age_cat",
    ],
)

train = data.loc[data["train/test"] == "train"]

# NOTE(review): 0.86 presumably aligns the price level of the "jane" sample
# with the "sokolov" sample - confirm and document where the factor comes from.
JANE_PRICE_FACTOR = 0.86

# .copy() gives an independent frame, so the price adjustment below does not
# write into a slice of `data` (SettingWithCopyWarning in the original).
train_jane = train.loc[train["sample"] == "jane"].copy()
train_sokolov = train.loc[train["sample"] == "sokolov"]
train_jane["price"] = train_jane["price"] * JANE_PRICE_FACTOR
# Same row order as before (jane rows first), so the seeded split downstream
# selects the same rows.
train = pd.concat([train_jane, train_sokolov])

train = train.drop(columns=["sample", "description", "train/test"])
test = data.loc[data["train/test"] == "test"].drop(
    columns=["sample", "description", "train/test", "price"]
)

## preparing to train models

In [6]:
# Folder for the serialized model cache. exist_ok makes the cell safe to
# re-run and removes the separate check-then-create step.
os.makedirs("models", exist_ok=True)

In [7]:
# Hold out part of the training rows for validation (default test_size);
# the fixed seed keeps the split stable between runs.
X_train, X_valid, y_train, y_valid = train_test_split(
    train.drop(columns="price"),
    train["price"],
    shuffle=True,
    random_state=42,
)
tuple(part.shape for part in (X_train, y_train, X_valid, y_valid))

((86525, 112), (86525,), (28842, 112), (28842,))

## base models

### lightgbm

[notebook with related experiments](model_LightGBM_optuna.ipynb)

#### lightgbm_v1_optuned, kaggle 17.44099

In [8]:
model_name = "lightgbm_v1_optuned"

# LightGBM trained on raw prices; the model is fitted once and then reused
# from models/<model_name>.joblib.
lightgbm_v1_optuned = train_or_load(
    LGBMRegressor(
        bagging_fraction=0.9079273070338828,
        bagging_freq=4,
        feature_fraction=0.716472706585253,
        lambda_l1=0.0007127314011370048,
        lambda_l2=1.4991431139899208e-08,
        learning_rate=0.24273738931459424,
        min_child_samples=27,
        num_leaves=129,
        random_state=42,
        # `silent` was deprecated in LightGBM 3.3 and later removed;
        # verbose=-1 is the supported way to suppress training logs.
        verbose=-1,
    ),
    X_train,
    y_train,
    f"models/{model_name}.joblib",
)

In [9]:
# Validation MAPE (the model predicts raw prices), then the submission file.
valid_pred = lightgbm_v1_optuned.predict(X_valid)
valid_mape = mean_absolute_percentage_error(y_valid, valid_pred)
print(model_name, valid_mape)
submit(test, lightgbm_v1_optuned, name=model_name)

lightgbm_v1_optuned 0.1562352982059385


#### lightgbm_v2_optuned, kaggle 13.36829

In [10]:
model_name = "lightgbm_v2_optuned"

# LightGBM trained on log-price; predictions must be exponentiated before use.
lightgbm_v2_optuned = train_or_load(
    LGBMRegressor(
        learning_rate=0.2200394016092361,
        lambda_l1=3.6405456215002115e-08,
        lambda_l2=3.9256724979441087,
        num_leaves=251,
        feature_fraction=0.7849386830734889,
        bagging_fraction=0.999471799816821,
        bagging_freq=7,
        min_child_samples=5,
        random_state=42,
        # `silent` was deprecated in LightGBM 3.3 and later removed;
        # verbose=-1 is the supported way to suppress training logs.
        verbose=-1,
    ),
    X_train,
    np.log(y_train),
    f"models/{model_name}.joblib",
)

In [11]:
# The model predicts log-price: convert back before scoring, then submit.
valid_pred = np.exp(lightgbm_v2_optuned.predict(X_valid))
valid_mape = mean_absolute_percentage_error(y_valid, valid_pred)
print(model_name, valid_mape)
submit_log(test, lightgbm_v2_optuned, name=model_name)

lightgbm_v2_optuned 0.1266083430966481


#### lightgbm_v3_optuned, kaggle 13.39206

In [12]:
model_name = "lightgbm_v3_optuned"

# Third tuned LightGBM configuration, trained on log-price.
lightgbm_v3_optuned = train_or_load(
    clf=LGBMRegressor(
        learning_rate=0.2034225924278744,
        lambda_l1=1.6905457446408715e-07,
        lambda_l2=3.410817513919556,
        num_leaves=237,
        feature_fraction=0.8139002011435048,
        bagging_fraction=0.9996914517711281,
        bagging_freq=2,
        min_child_samples=5,
    ),
    X=X_train,
    y=np.log(y_train),
    filepath=f"models/{model_name}.joblib",
)

In [13]:
# The model predicts log-price: convert back before scoring, then submit.
valid_pred = np.exp(lightgbm_v3_optuned.predict(X_valid))
valid_mape = mean_absolute_percentage_error(y_valid, valid_pred)
print(model_name, valid_mape)
submit_log(test, lightgbm_v3_optuned, name=model_name)

lightgbm_v3_optuned 0.12681490786747857


### xgboost

[notebook with related experiments](2022-04-12_experiments-sklearn.ipynb)

#### xgb_v1_manual, kaggle 11.68799

In [14]:
model_name = "xgb_v1_manual"

# Hand-tuned XGBoost regressor, trained on log-price.
xgb_v1_manual = train_or_load(
    clf=xgb.XGBRegressor(
        objective="reg:squarederror",
        n_estimators=1000,
        learning_rate=0.05,
        max_depth=12,
        colsample_bytree=0.5,
        alpha=1,
        random_state=42,
        n_jobs=-1,
    ),
    X=X_train,
    y=np.log(y_train),
    filepath=f"models/{model_name}.joblib",
)

In [15]:
# The model predicts log-price: convert back before scoring, then submit.
valid_pred = np.exp(xgb_v1_manual.predict(X_valid))
valid_mape = mean_absolute_percentage_error(y_valid, valid_pred)
print(model_name, valid_mape)
submit_log(test, xgb_v1_manual, name=model_name)

xgb_v1_manual 0.1196754335163977


#### xgb_v2_optuned, kaggle 11.74262

In [16]:
model_name = "xgb_v2_optuned"

# Optuna-tuned XGBoost regressor, trained on log-price and cached under
# models/<model_name>.joblib.
xgb_v2_optuned = train_or_load(
    xgb.XGBRegressor(
        verbosity=0,
        tree_method="hist",
        random_state=42,
        # NOTE(review): `silent` is a legacy flag that XGBoost replaced with
        # `verbosity` (already set above) - confirm the installed version
        # still accepts it, or drop it.
        silent=True,
        n_jobs=-1,
        **{
            "alpha": 0.044903341303693216,
            "booster": "gbtree",
            "colsample_bytree": 0.2782856821187278,
            # NOTE(review): XGBoost documents `eta` as an alias of
            # `learning_rate`, yet the two are given very different values
            # here - confirm which one the installed version actually applies.
            "eta": 3.4353303842042365e-08,
            "gamma": 0.00048033429580361897,
            "grow_policy": "depthwise",
            "lambda": 4.940599898474283e-07,
            "learning_rate": 0.029621998365714833,
            "max_depth": 15,
            "min_child_weight": 6,
            "n_estimators": 736,
            "objective": "reg:squarederror",
            "subsample": 0.7203114713416401,
        },
    ),
    X_train,
    np.log(y_train),
    f"models/{model_name}.joblib",
)

In [17]:
# The model predicts log-price: convert back before scoring, then submit.
valid_pred = np.exp(xgb_v2_optuned.predict(X_valid))
valid_mape = mean_absolute_percentage_error(y_valid, valid_pred)
print(model_name, valid_mape)
submit_log(test, xgb_v2_optuned, name=model_name)

xgb_v2_optuned 0.11972912524466967


### extra trees

[notebook with related experiments](model_ExtraTrees_optuna.ipynb)

#### etr_v1_manual, kaggle 14.41262

In [18]:
model_name = "etr_v1_manual"

# Hand-tuned extra-trees regressor, trained on log-price.
etr_v1_manual = train_or_load(
    ExtraTreesRegressor(
        n_estimators=800,
        min_samples_split=2,
        min_samples_leaf=1,
        # "auto" was deprecated in scikit-learn 1.1 and removed in 1.3; for
        # regressors it meant "use all features", which 1.0 states explicitly.
        max_features=1.0,
        max_depth=15,
        bootstrap=True,
        random_state=42,
        n_jobs=-1,
        verbose=0,
    ),
    X_train,
    np.log(y_train),
    f"models/{model_name}.joblib",
)

In [19]:
# The model predicts log-price: convert back before scoring, then submit.
valid_pred = np.exp(etr_v1_manual.predict(X_valid))
valid_mape = mean_absolute_percentage_error(y_valid, valid_pred)
print(model_name, valid_mape)
submit_log(test, etr_v1_manual, name=model_name)

etr_v1_manual 0.1376278076519815


#### etr_v2_optuned, kaggle 11.82102

In [20]:
model_name = "etr_v2_optuned"

# Optuna-tuned extra-trees regressor, trained on log-price.
etr_v2_optuned = train_or_load(
    ExtraTreesRegressor(
        n_estimators=936,
        min_samples_split=3,
        min_samples_leaf=1,
        max_samples=0.9894458395539251,
        # "auto" was deprecated in scikit-learn 1.1 and removed in 1.3; for
        # regressors it meant "use all features", which 1.0 states explicitly.
        max_features=1.0,
        max_depth=None,
        bootstrap=True,
        random_state=42,
        n_jobs=-1,
        verbose=0,
    ),
    X_train,
    np.log(y_train),
    f"models/{model_name}.joblib",
)

In [None]:
# The model predicts log-price: convert back before scoring, then submit.
valid_pred = np.exp(etr_v2_optuned.predict(X_valid))
valid_mape = mean_absolute_percentage_error(y_valid, valid_pred)
print(model_name, valid_mape)
submit_log(test, etr_v2_optuned, name=model_name)

etr_v2_optuned 0.1243004166689037


#### etr_v3_default, kaggle 11.08465

In [None]:
model_name = "etr_v3_default"

# Default hyperparameters, trained on log-price. The seed is fixed (as for
# the other models in this notebook) so that a refit without the cached file
# produces the same forest every time.
etr_v3_default = train_or_load(
    ExtraTreesRegressor(random_state=42),
    X_train,
    np.log(y_train),
    f"models/{model_name}.joblib",
)

In [None]:
# The model predicts log-price: convert back before scoring, then submit.
valid_pred = np.exp(etr_v3_default.predict(X_valid))
valid_mape = mean_absolute_percentage_error(y_valid, valid_pred)
print(model_name, valid_mape)
submit_log(test, etr_v3_default, name=model_name)

etr_v3_default 0.12964593868253957


### random forest

[notebook with related experiments](model_RandomForest_optuna.ipynb)

#### rf_v1_manual, kaggle 12.43493

In [None]:
model_name = "rf_v1_manual"

# Hand-tuned random forest, trained on log-price.
rf_v1_manual = train_or_load(
    clf=RandomForestRegressor(
        n_estimators=800,
        max_depth=None,
        max_features="log2",
        min_samples_split=2,
        min_samples_leaf=1,
        bootstrap=True,
        random_state=42,
    ),
    X=X_train,
    y=np.log(y_train),
    filepath=f"models/{model_name}.joblib",
)

In [None]:
# The model predicts log-price: convert back before scoring, then submit.
valid_pred = np.exp(rf_v1_manual.predict(X_valid))
valid_mape = mean_absolute_percentage_error(y_valid, valid_pred)
print(model_name, valid_mape)
submit_log(test, rf_v1_manual, name=model_name)

rf_v1_manual 0.1315218319297354


#### rf_v2_optuned, kaggle 12.18921

In [None]:
model_name = "rf_v2_optuned"

# Optuna-tuned random forest, trained on log-price.
rf_v2_optuned = train_or_load(
    RandomForestRegressor(
        n_estimators=450,
        min_samples_split=4,
        min_samples_leaf=2,
        max_samples=0.9899165552020569,
        # "auto" was deprecated in scikit-learn 1.1 and removed in 1.3; for
        # regressors it meant "use all features", which 1.0 states explicitly.
        max_features=1.0,
        random_state=42,
        max_depth=None,
        bootstrap=True,
    ),
    X_train,
    np.log(y_train),
    f"models/{model_name}.joblib",
)

In [None]:
# The model predicts log-price: convert back before scoring, then submit.
valid_pred = np.exp(rf_v2_optuned.predict(X_valid))
valid_mape = mean_absolute_percentage_error(y_valid, valid_pred)
print(model_name, valid_mape)
submit_log(test, rf_v2_optuned, name=model_name)

rf_v2_optuned 0.12683767196612786


#### rf_v3_default, kaggle 11.97721

In [None]:
model_name = "rf_v3_default"

# Default hyperparameters, trained on log-price. The seed is fixed (as for
# the other models in this notebook) so that a refit without the cached file
# produces the same forest every time.
rf_v3_default = train_or_load(
    RandomForestRegressor(random_state=42),
    X_train,
    np.log(y_train),
    f"models/{model_name}.joblib",
)

In [None]:
# The model predicts log-price: convert back before scoring, then submit.
valid_pred = np.exp(rf_v3_default.predict(X_valid))
valid_mape = mean_absolute_percentage_error(y_valid, valid_pred)
print(model_name, valid_mape)
submit_log(test, rf_v3_default, name=model_name)

rf_v3_default 0.12747472477338362


## meta models

### blending

#### blending_v1, kaggle 11.89414

In [None]:
# Base models whose log-price predictions on the validation split become the
# meta-model features (one column per base model).
model_names = ["xgb_v1_manual", "etr_v2_optuned", "rf_v3_default"]
models = [xgb_v1_manual, etr_v2_optuned, rf_v3_default]

meta_df = pd.DataFrame(
    {label: estimator.predict(X_valid) for label, estimator in zip(model_names, models)}
)

meta_df.sample(3, random_state=42)

Unnamed: 0,xgb_v1_manual,etr_v2_optuned,rf_v3_default
5576,12.531177,12.63903,12.606403
6108,13.544144,13.576177,13.569565
7934,13.246906,13.215294,13.222257


In [None]:
# Meta-model maps the base models' log-price predictions to the raw price.
meta_model = CatBoostRegressor(random_state=42, silent=True)
meta_model.fit(meta_df, y_valid)

# Same feature columns, computed on the hold-out (test) set.
meta_for_hold_out = pd.DataFrame(
    {label: estimator.predict(test) for label, estimator in zip(model_names, models)}
)

# Preview column only; it is dropped again before the submission is written.
meta_for_hold_out["preds"] = meta_model.predict(meta_for_hold_out)
meta_for_hold_out.sample(3, random_state=42)

Unnamed: 0,xgb_v1_manual,etr_v2_optuned,rf_v3_default,preds
19026,13.114839,13.050964,13.051629,489133.462439
3337,12.092997,12.02187,12.045096,171220.717123
11807,13.74321,13.761394,13.721139,935459.984616


In [None]:
# The meta-model predicts raw prices, so the plain `submit` helper is used.
submit(
    hold_out=meta_for_hold_out.drop(columns=["preds"]),
    model=meta_model,
    name="blending_v1",
)

#### blending_v2, kaggle 11.50377

In [None]:
# Same blend as v1, but the meta-model is trained on log-price.
meta_model = CatBoostRegressor(random_state=42, silent=True)
meta_model.fit(meta_df, np.log(y_valid))

# Base-model predictions on the hold-out (test) set.
meta_for_hold_out = pd.DataFrame(
    {label: estimator.predict(test) for label, estimator in zip(model_names, models)}
)

# Preview column only; it is dropped again before the submission is written.
meta_for_hold_out["preds"] = np.exp(meta_model.predict(meta_for_hold_out))
meta_for_hold_out.sample(3, random_state=42)

Unnamed: 0,xgb_v1_manual,etr_v2_optuned,rf_v3_default,preds
19026,13.114839,13.050964,13.051629,486516.406756
3337,12.092997,12.02187,12.045096,171045.145424
11807,13.74321,13.761394,13.721139,925643.343998


In [None]:
# The meta-model predicts log-price, so `submit_log` exponentiates the output.
submit_log(
    hold_out=meta_for_hold_out.drop(columns=["preds"]),
    model=meta_model,
    name="blending_v2",
)

#### blending_v3, kaggle 11.59602

In [None]:
# v3 adds a plain linear regression (fit on log-price) to the base models.
lr_log_price = LinearRegression().fit(X_train, np.log(y_train))

model_names = ["xgb_v1_manual", "etr_v2_optuned", "rf_v3_default", "lr"]
models = [xgb_v1_manual, etr_v2_optuned, rf_v3_default, lr_log_price]

meta_df = pd.DataFrame(
    {label: estimator.predict(X_valid) for label, estimator in zip(model_names, models)}
)

meta_df.sample(3, random_state=42)

Unnamed: 0,xgb_v1_manual,etr_v2_optuned,rf_v3_default,lr
5576,12.531177,12.63903,12.606403,12.500602
6108,13.544144,13.576177,13.569565,13.809972
7934,13.246906,13.215294,13.222257,12.996579


In [None]:
# CatBoost meta-model on the four base-model columns, trained on log-price.
meta_model = CatBoostRegressor(random_state=42, silent=True)
meta_model.fit(meta_df, np.log(y_valid))

# Base-model predictions on the hold-out (test) set.
meta_for_hold_out = pd.DataFrame(
    {label: estimator.predict(test) for label, estimator in zip(model_names, models)}
)

# Preview column only; it is dropped again before the submission is written.
meta_for_hold_out["preds"] = np.exp(meta_model.predict(meta_for_hold_out))
meta_for_hold_out.sample(3, random_state=42)

Unnamed: 0,xgb_v1_manual,etr_v2_optuned,rf_v3_default,lr,preds
19026,13.114839,13.050964,13.051629,13.21598,492556.443835
3337,12.092997,12.02187,12.045096,12.104591,170250.150848
11807,13.74321,13.761394,13.721139,13.636175,929929.070964


In [None]:
# The meta-model predicts log-price, so `submit_log` exponentiates the output.
submit_log(
    hold_out=meta_for_hold_out.drop(columns=["preds"]),
    model=meta_model,
    name="blending_v3",
)

#### blending_v4, kaggle 12.27095

In [None]:
# Back to the three-model set used in v1/v2 (v3 had added the linear model).
model_names = ["xgb_v1_manual", "etr_v2_optuned", "rf_v3_default"]
models = [xgb_v1_manual, etr_v2_optuned, rf_v3_default]

meta_df = pd.DataFrame(
    {label: estimator.predict(X_valid) for label, estimator in zip(model_names, models)}
)

meta_df.sample(3, random_state=42)

Unnamed: 0,xgb_v1_manual,etr_v2_optuned,rf_v3_default
5576,12.531177,12.63903,12.606403
6108,13.544144,13.576177,13.569565
7934,13.246906,13.215294,13.222257


In [None]:
# v4 swaps the CatBoost meta-model for extra trees, trained on log-price.
meta_model = ExtraTreesRegressor(random_state=42)
meta_model.fit(meta_df, np.log(y_valid))

# Base-model predictions on the hold-out (test) set.
meta_for_hold_out = pd.DataFrame(
    {label: estimator.predict(test) for label, estimator in zip(model_names, models)}
)

# Preview column only; it is dropped again before the submission is written.
meta_for_hold_out["preds"] = np.exp(meta_model.predict(meta_for_hold_out))
meta_for_hold_out.sample(3, random_state=42)

Unnamed: 0,xgb_v1_manual,etr_v2_optuned,rf_v3_default,preds
19026,13.114839,13.050964,13.051629,487100.304442
3337,12.092997,12.02187,12.045096,182856.5398
11807,13.74321,13.761394,13.721139,959368.900737


In [None]:
# The meta-model predicts log-price, so `submit_log` exponentiates the output.
submit_log(
    hold_out=meta_for_hold_out.drop(columns=["preds"]),
    model=meta_model,
    name="blending_v4",
)

### stacking