## imports

In [None]:
# %pip install "pycaret<3" catboost lightgbm  # run once on a fresh environment; setup() below uses options that exist only in PyCaret 2.x

In [None]:
from warnings import filterwarnings

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import RobustScaler, LabelEncoder
from pycaret.regression import *

# Apply seaborn's default theme to every matplotlib figure in this notebook.
sns.set()
# Silence all warnings for the rest of the session (this also hides
# deprecation warnings raised by the libraries used below).
filterwarnings("ignore")

  defaults = yaml.load(f)


## pre-saved data loading

In [None]:
# Download the pre-processed train and test snapshots (parquet files hosted on
# GitHub), in that order, and bind them to train_raw / test_raw.
train_raw, test_raw = (
    pd.read_parquet(parquet_url)
    for parquet_url in (
        "https://github.com/XelorR/sf_project_6/raw/master/data/2022-04-08_train_pre-model.parquet",
        "https://github.com/XelorR/sf_project_6/raw/master/data/2022-04-08_test_pre-model.parquet",
    )
)

# Display both shapes as the cell output.
train_raw.shape, test_raw.shape

((115367, 30), (34686, 28))

## encoding features

In [None]:
# Tag each frame with its origin so the two can be stacked, cleaned together,
# and split apart again at the end of this cell.
train_raw["train/test"] = "train"
test_raw["train/test"] = "test"

# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat performs the same row-wise stacking (original indexes are kept).
data = pd.concat([train_raw, test_raw])

# Assign the filled column back instead of calling fillna(inplace=True) on a
# column selection, which may act on a temporary copy and leave `data` unchanged.
data["ptc"] = data["ptc"].fillna("Оригинал")

# Cast every object-dtype column to str so each one holds a single Python type.
# Note: missing values in those columns turn into strings such as "nan".
object_cols = data.select_dtypes("object").columns.tolist()
data[object_cols] = data[object_cols].astype(str)

# Manual scaling / encoding experiments, left disabled:

# for col in data.select_dtypes(exclude=("object")).columns:
#     data[col] = RobustScaler().fit_transform(data[col].values.reshape(-1, 1)).reshape(-1, 1)

# data = pd.get_dummies(data, columns=["vehicle_transmission", "vendor", "brand", "fuel_type", "body_type", "color", "ptc", "drive", "wheel", "age_cat"])

# for col in ["model_name"]:
#     data[col] = LabelEncoder().fit_transform(data[col].astype("str"))

# Split back into train / test and drop the bookkeeping columns; the test
# frame additionally loses the target column "price".
helper_cols = ["sample", "description", "train/test"]
train = data.loc[data["train/test"] == "train"].drop(columns=helper_cols)
test = data.loc[data["train/test"] == "test"].drop(columns=helper_cols + ["price"])

# Every row was tagged either "train" or "test", so nothing may be lost here.
assert len(train) + len(test) == len(data)

## pycaret setup

In [None]:
# Initialise the PyCaret regression experiment on the training frame.
# session_id pins the random seed: without it PyCaret picks a new seed on each
# run, so the scores would differ between runs. 7271 is the seed reported in
# the setup summary captured below.
s = setup(
    data=train,
    target="price",
    session_id=7271,
    date_features=["parsed_date"],
    high_cardinality_features=["model_name"],
    normalize=True,
    transformation=True,
    remove_outliers=True,
    handle_unknown_categorical=True,
    remove_multicollinearity=True,
    # Options left disabled:
    # preprocess = False,
    # categorical_imputation = "mode",
    # imputation_type="iterative",
    # numeric_iterative_imputer = "catboost",
    # categorical_iterative_imputer = "catboost",
    feature_selection=True,
    feature_selection_threshold=0.5,
    combine_rare_levels=True,
)

Unnamed: 0,Description,Value
0,session_id,7271
1,Target,price
2,Original Data,"(115367, 28)"
3,Missing Values,False
4,Numeric Features,9
5,Categorical Features,17
6,Ordinal Features,False
7,High Cardinality Features,True
8,High Cardinality Method,frequency
9,Transformed Train Set,"(76718, 63)"


## comparing models

In [None]:
# Show the table of regressors available in PyCaret (ID, name, reference, turbo flag).
models()

Unnamed: 0_level_0,Name,Reference,Turbo
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
lr,Linear Regression,sklearn.linear_model._base.LinearRegression,True
lasso,Lasso Regression,sklearn.linear_model._coordinate_descent.Lasso,True
ridge,Ridge Regression,sklearn.linear_model._ridge.Ridge,True
en,Elastic Net,sklearn.linear_model._coordinate_descent.Elast...,True
lar,Least Angle Regression,sklearn.linear_model._least_angle.Lars,True
llar,Lasso Least Angle Regression,sklearn.linear_model._least_angle.LassoLars,True
omp,Orthogonal Matching Pursuit,sklearn.linear_model._omp.OrthogonalMatchingPu...,True
br,Bayesian Ridge,sklearn.linear_model._bayes.BayesianRidge,True
ard,Automatic Relevance Determination,sklearn.linear_model._bayes.ARDRegression,False
par,Passive Aggressive Regressor,sklearn.linear_model._passive_aggressive.Passi...,True


In [None]:
# Train and rank the candidate regressors. Because n_select=7, `best` is a
# list of the seven top-ranked models, not a single estimator.
best = compare_models(n_select=7)

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
catboost,CatBoost Regressor,154399.6,155704000000.0,388560.1,0.9557,0.2313,0.1612,18.389
et,Extra Trees Regressor,140889.2,157550400000.0,393725.2,0.954,0.2044,0.143,61.68
rf,Random Forest Regressor,143489.3,185144300000.0,421516.8,0.9479,0.2058,0.1435,69.401
lightgbm,Light Gradient Boosting Machine,182464.2,206588200000.0,442624.3,0.9424,0.2536,0.1924,0.964
gbr,Gradient Boosting Regressor,247478.1,309520000000.0,552365.7,0.911,0.3483,0.2637,15.104
dt,Decision Tree Regressor,193344.6,322296800000.0,564618.6,0.9051,0.2808,0.1942,1.029
knn,K Neighbors Regressor,216142.5,355379900000.0,587846.1,0.8996,0.2631,0.1962,17.897
br,Bayesian Ridge,598045.7,1285667000000.0,1120957.0,0.6364,0.8816,0.9602,0.572
llar,Lasso Least Angle Regression,597799.5,1285664000000.0,1120956.0,0.6364,0.8813,0.9593,0.079
ridge,Ridge Regression,598246.7,1285645000000.0,1120946.0,0.6364,0.8777,0.9601,0.06


In [None]:
# `best` is a list of fitted estimators (compare_models was called with
# n_select=7). A bare last expression lets IPython pretty-print it as the
# cell output instead of dumping it through print().
best

[<catboost.core.CatBoostRegressor object at 0x7fd07eabb750>, ExtraTreesRegressor(bootstrap=False, ccp_alpha=0.0, criterion='mse',
                    max_depth=None, max_features='auto', max_leaf_nodes=None,
                    max_samples=None, min_impurity_decrease=0.0,
                    min_impurity_split=None, min_samples_leaf=1,
                    min_samples_split=2, min_weight_fraction_leaf=0.0,
                    n_estimators=100, n_jobs=-1, oob_score=False,
                    random_state=7271, verbose=0, warm_start=False), RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=-1, oob_score=False,
                      random_st

In [None]:
# evaluate_model(best[0])  # `best` is a list of models (n_select=7); evaluate_model takes a single estimator

In [None]:
# predictions = predict_model(best[0], data=test)  # `best` is a list of models (n_select=7); predict_model takes a single estimator

In [None]:
# compare_models(n_select=7) returned a list, but save_model expects one
# fitted estimator. Passing the list stores it as the pipeline's final step,
# which cannot predict. Persist the top-ranked model (first in the list).
save_model(best[0], "2022-04-09_model")

Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=None,
          steps=[('dtypes',
                  DataTypes_Auto_infer(categorical_features=[],
                                       display_types=True, features_todrop=[],
                                       id_columns=[], ml_usecase='regression',
                                       numerical_features=[], target='price',
                                       time_features=['parsed_date'])),
                 ('imputer',
                  Simple_Imputer(categorical_strategy='not_available',
                                 fill_value_categorical=None,
                                 fill_value_numerical=None,
                                 numer...
                                         max_depth=None, max_features=None,
                                         max_leaf_nodes=None,
                                         min_impurity_decrease=0.0,
                                         min_impurity_split=None,
                                         