- [previous file - EDA](2022-03-31_train-test_EDA.ipynb)
- [next file - models with pre-tuned parameters and ensembles](2022-04-15_ensemble.ipynb)

## imports

In [4]:
from warnings import filterwarnings

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import RobustScaler, LabelEncoder
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, StackingRegressor
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
import xgboost as xgb
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
import optuna
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_absolute_percentage_error

from lib.model_related import *

sns.set()
filterwarnings("ignore")

## reading data

In [5]:
# Load the train and test feature tables saved as parquet files under data/.
train_raw, test_raw = map(
    pd.read_parquet,
    (
        "data/2022-04-08_train_pre-model.parquet",
        "data/2022-04-08_test_pre-model.parquet",
    ),
)

# Cell result: (rows, columns) of each table.
train_raw.shape, test_raw.shape

((115367, 30), (34686, 28))

In [6]:
def submit(hold_out: pd.DataFrame, model, name="submission"):
    """Write a submission CSV whose ``price`` column holds the model's predictions.

    Parameters
    ----------
    hold_out : pd.DataFrame
        Feature frame passed unchanged to ``model.predict``.
    model
        Any object exposing ``predict``.
    name : str, default "submission"
        Stem of the output file; the result is written to ``<name>.csv``
        without the index.

    Notes
    -----
    The file layout comes from ``data/sample_submission.csv``; its ``price``
    column is replaced (or created) with the predictions.
    """
    predicted_prices = model.predict(hold_out)
    (
        pd.read_csv("data/sample_submission.csv")
        .assign(price=predicted_prices)
        .to_csv(f"{name}.csv", index=False)
    )
    
    
def submit_log(hold_out: pd.DataFrame, model, name="submission"):
    """Write a submission CSV for a model that predicts log-prices.

    Parameters
    ----------
    hold_out : pd.DataFrame
        Feature frame passed unchanged to ``model.predict``.
    model
        Any object exposing ``predict``; its output is exponentiated with
        ``np.exp`` before being stored.
    name : str, default "submission"
        Stem of the output file; the result is written to ``<name>.csv``
        without the index.

    Notes
    -----
    The file layout comes from ``data/sample_submission.csv``; its ``price``
    column is replaced (or created) with the back-transformed predictions.
    """
    log_prices = model.predict(hold_out)
    (
        pd.read_csv("data/sample_submission.csv")
        .assign(price=np.exp(log_prices))
        .to_csv(f"{name}.csv", index=False)
    )

## encoding

In [7]:
# Tag every row with its origin so the joint frame can be split back into
# train and test after the shared encoding steps.
train_raw["train/test"] = "train"
test_raw["train/test"] = "test"

# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and
# removed in pandas 2.0. Row labels are kept as-is, exactly as append did.
data = pd.concat([train_raw, test_raw])
# Assign the filled column back: fillna(inplace=True) on a column selection is
# a chained assignment that is not guaranteed to write through to `data`.
data["ptc"] = data["ptc"].fillna("Оригинал")

# Cast every object column to str so the categorical encoders below see a
# single value type per column.
object_cols = data.select_dtypes("object").columns.tolist()
data[object_cols] = data[object_cols].astype(str)

# Scale each non-object feature on its own; the target "price" is not scaled.
# NOTE(review): the scaler is fitted on train and test rows together.
numeric_cols = [
    col for col in data.select_dtypes(exclude="object").columns if col != "price"
]
for col in numeric_cols:
    # fit_transform needs a 2-D input and returns (n, 1); flatten it back to
    # 1-D before storing it as a column.
    data[col] = RobustScaler().fit_transform(data[col].values.reshape(-1, 1)).ravel()

# Integer-encode the model name; the remaining categoricals are one-hot encoded.
for col in ["model_name"]:
    data[col] = LabelEncoder().fit_transform(data[col].astype("str"))

data = pd.get_dummies(
    data,
    columns=[
        "vehicle_transmission",
        "vendor",
        "brand",
        "fuel_type",
        "body_type",
        "color",
        "ptc",
        "drive",
        "wheel",
        "age_cat",
    ],
)

train = data.loc[data["train/test"] == "train"]

# .assign returns a new frame, so the price adjustment no longer writes into a
# slice of `data` (the original in-place assignment was a SettingWithCopy case).
# NOTE(review): 0.86 is an unexplained multiplier applied to "jane" prices only;
# presumably it aligns the price level of the two sources - confirm.
train_jane = train.loc[train["sample"] == "jane"].assign(
    price=lambda frame: frame["price"] * 0.86
)
train_sokolov = train.loc[train["sample"] == "sokolov"]

# Bookkeeping columns that must not reach the models.
helper_cols = ["sample", "description", "train/test"]

# Same row order as before: all "jane" rows first, then all "sokolov" rows.
train = pd.concat([train_jane, train_sokolov]).drop(columns=helper_cols)
test = data.loc[data["train/test"] == "test"].drop(columns=helper_cols + ["price"])

## modelling

In [8]:
# Hold out part of the training rows for validation. No test_size is passed,
# so scikit-learn's default split proportion applies.
X_train, X_valid, y_train, y_valid = train_test_split(
    train.drop(columns="price"),
    train["price"],
    shuffle=True,
    random_state=42,
)
# Cell result: shapes of the four splits.
X_train.shape, y_train.shape, X_valid.shape, y_valid.shape

((86525, 112), (86525,), (28842, 112), (28842,))

## model tuning

### lightgbm

In [20]:
def objective(trial):
    """Optuna objective: mean cross-validated score of one LightGBM configuration.

    Samples a set of LightGBM hyperparameters from ``trial``, evaluates an
    ``LGBMRegressor`` built from them with 8-fold cross-validation on the
    notebook-level ``X_train`` / ``y_train`` and returns the mean of the fold
    scores.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyperparameters.

    Returns
    -------
    float
        Mean ``neg_mean_absolute_percentage_error`` over the folds. The score
        is the negated MAPE, so larger is better and the study has to maximise.
    """
    # suggest_float (with log=True for the log-scaled ranges) replaces the
    # deprecated suggest_uniform / suggest_loguniform; names and ranges are
    # unchanged.
    params = {
        "objective": "regression",
        "metric": "mape",
        "learning_rate": trial.suggest_float("learning_rate", 0.001, 1.0),
        "lambda_l1": trial.suggest_float("lambda_l1", 1e-8, 10.0, log=True),
        "lambda_l2": trial.suggest_float("lambda_l2", 1e-8, 10.0, log=True),
        "num_leaves": trial.suggest_int("num_leaves", 2, 256),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.4, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.4, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 1, 7),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
        "verbosity": -1,
    }

    # "verbosity": -1 already silences LightGBM, so the deprecated `silent`
    # constructor argument is not passed.
    gbm = LGBMRegressor(**params)
    cv_neg_mape = cross_val_score(
        gbm,
        X_train,
        y_train,
        cv=8,
        scoring="neg_mean_absolute_percentage_error",
        n_jobs=-1,
    )

    return np.mean(cv_neg_mape)


# The study is stored in a local SQLite file; with load_if_exists=True a
# re-run continues the study named "LGBMRegressor" instead of starting over.
# The objective returns negated MAPE, hence direction="maximize".
study = optuna.create_study(
    study_name="LGBMRegressor",
    storage="sqlite:///LGBMRegressor.db",
    load_if_exists=True,
    direction="maximize",
)
# Run 1000 more trials of the objective on this study.
study.optimize(objective, n_trials=1000)

[32m[I 2022-04-14 15:43:16,930][0m Using an existing study with name 'LGBMRegressor' instead of creating a new one.[0m
[32m[I 2022-04-14 15:43:25,432][0m Trial 445 finished with value: -0.14180395741799445 and parameters: {'learning_rate': 0.19347733786437052, 'lambda_l1': 3.355228574586955e-05, 'lambda_l2': 1.1601895771957918e-05, 'num_leaves': 252, 'feature_fraction': 0.8970063024698147, 'bagging_fraction': 0.9606519036808351, 'bagging_freq': 5, 'min_child_samples': 7}. Best is trial 365 with value: -0.1408362612546663.[0m
[32m[I 2022-04-14 15:43:33,612][0m Trial 446 finished with value: -0.14215296021720059 and parameters: {'learning_rate': 0.15531498091829857, 'lambda_l1': 5.7423118976627957e-05, 'lambda_l2': 3.860936427136809e-06, 'num_leaves': 242, 'feature_fraction': 0.9507236756761598, 'bagging_fraction': 0.9823924219102652, 'bagging_freq': 5, 'min_child_samples': 5}. Best is trial 365 with value: -0.1408362612546663.[0m
[32m[I 2022-04-14 15:43:39,859][0m Trial 447 f

In [18]:
# Cell result: hyperparameters of the best trial stored in the study.
study.best_params

{'bagging_fraction': 0.9079273070338828,
 'bagging_freq': 4,
 'feature_fraction': 0.716472706585253,
 'lambda_l1': 0.0007127314011370048,
 'lambda_l2': 1.4991431139899208e-08,
 'learning_rate': 0.24273738931459424,
 'min_child_samples': 27,
 'num_leaves': 129}

#### Version 1

In [19]:
# Version 1: LightGBM fitted on the raw price with hard-coded hyperparameters
# (the same values as the `study.best_params` output shown above) and a fixed seed.
# `verbosity=-1` replaces the deprecated `silent=True` constructor argument and
# matches the setting used in the tuning objective.
lightgbm_optuned = LGBMRegressor(
    bagging_fraction=0.9079273070338828,
    bagging_freq=4,
    feature_fraction=0.716472706585253,
    lambda_l1=0.0007127314011370048,
    lambda_l2=1.4991431139899208e-08,
    learning_rate=0.24273738931459424,
    min_child_samples=27,
    num_leaves=129,
    random_state=42,
    verbosity=-1,
).fit(X_train, y_train)

# Report MAPE on the held-out validation split, then write the submission file.
print(
    "lightgbm_optuned",
    mean_absolute_percentage_error(y_valid, lightgbm_optuned.predict(X_valid)),
)
submit(test, lightgbm_optuned, "lightgbm_optuned")

lightgbm_optuned 0.1562352982059385


#### Version 2

In [23]:
# Version 2: LightGBM fitted on log(price) with hard-coded hyperparameters and
# a fixed seed. Predictions are mapped back with np.exp before scoring, and
# submit_log applies the same back-transform for the submission file.
# NOTE(review): "1899" presumably is the Optuna trial these values came from - confirm.
# `verbosity=-1` replaces the deprecated `silent=True` constructor argument and
# matches the setting used in the tuning objective.
lightgbm_optuned_1899 = LGBMRegressor(
    learning_rate=0.2200394016092361,
    lambda_l1=3.6405456215002115e-08,
    lambda_l2=3.9256724979441087,
    num_leaves=251,
    feature_fraction=0.7849386830734889,
    bagging_fraction=0.999471799816821,
    bagging_freq=7,
    min_child_samples=5,
    random_state=42,
    verbosity=-1,
).fit(X_train, np.log(y_train))

# Report MAPE on the held-out validation split in the original price scale.
print(
    "lightgbm_optuned_1899_log",
    mean_absolute_percentage_error(
        y_valid, np.exp(lightgbm_optuned_1899.predict(X_valid))
    ),
)
submit_log(test, lightgbm_optuned_1899, "lightgbm_optuned_log_1899")

lightgbm_optuned_1899_log 0.1266083430966481


#### Version 3

In [24]:
# Version 3: LightGBM fitted on log(price) with a second hard-coded set of
# hyperparameters. Predictions are mapped back with np.exp before scoring, and
# submit_log applies the same back-transform for the submission file.
# NOTE(review): unlike Versions 1 and 2, no random_state is passed here.
# NOTE(review): "1258" presumably is the Optuna trial these values came from - confirm.
lightgbm_optuned_1258 = LGBMRegressor(
    learning_rate=0.2034225924278744,
    lambda_l1=1.6905457446408715e-07,
    lambda_l2=3.410817513919556,
    num_leaves=237,
    feature_fraction=0.8139002011435048,
    bagging_fraction=0.9996914517711281,
    bagging_freq=2,
    min_child_samples=5,
).fit(X_train, np.log(y_train))

# Report MAPE on the held-out validation split in the original price scale.
print(
    "lightgbm_optuned_1258_log",
    mean_absolute_percentage_error(
        y_valid, np.exp(lightgbm_optuned_1258.predict(X_valid))
    ),
)
submit_log(test, lightgbm_optuned_1258, "lightgbm_optuned_log_1258")

lightgbm_optuned_1258_log 0.12681490786747857
