## imports

In [1]:
from warnings import filterwarnings

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import RobustScaler, LabelEncoder
from pycaret.regression import *

sns.set()
filterwarnings("ignore")

## pre-saved data loading

In [2]:
train_raw = pd.read_parquet("https://github.com/XelorR/sf_project_6/raw/master/data/2022-04-08_train_pre-model.parquet")
test_raw = pd.read_parquet("https://github.com/XelorR/sf_project_6/raw/master/data/2022-04-08_test_pre-model.parquet")

train_raw.shape, test_raw.shape

((115367, 30), (34686, 28))

## pre-preprocessing

In [3]:
train_raw["train/test"] = "train"
test_raw["train/test"] = "test"

data = train_raw.append(test_raw)
data["ptc"].fillna("Оригинал", inplace=True)
data[data.select_dtypes("object").columns.tolist()] = data[data.select_dtypes("object").columns.tolist()].astype(str)

train = data.loc[data["train/test"] == "train"].drop(columns=["sample", "description", "train/test"])
test = data.loc[data["train/test"] == "test"].drop(columns=["sample", "description", "train/test", "price"])

## preprocessing

In [4]:
s = setup(
    data = train,
    target="price",
    date_features=["parsed_date"],
    high_cardinality_features = ["model_name"], 
    normalize=True,
    transformation=True,
    remove_outliers = True,
    handle_unknown_categorical = True,
    remove_multicollinearity = True,
    # preprocess = False,
    # categorical_imputation = "mode",
    # imputation_type="iterative",
    # numeric_iterative_imputer = "catboost",
    # categorical_iterative_imputer = "catboost",
    # feature_selection=True,
    # feature_selection_threshold = 0.5,
    combine_rare_levels = True,
)

Unnamed: 0,Description,Value
0,session_id,3213
1,Target,price
2,Original Data,"(115367, 28)"
3,Missing Values,False
4,Numeric Features,9
5,Categorical Features,17
6,Ordinal Features,False
7,High Cardinality Features,True
8,High Cardinality Method,frequency
9,Transformed Train Set,"(76718, 89)"


## saving preprocessed datasets for future

In [6]:
train_encoded = get_config("X")
train_encoded["price"] = get_config("y")

prep_pipe = get_config("prep_pipe")
test_encoded = prep_pipe.transform(test)

train_encoded.shape, test_encoded.shape

((115367, 90), (34686, 89))

In [7]:
train_encoded.to_parquet("data/2022-04-11_train_encoded_full.parquet")
test_encoded.to_parquet("data/2022-04-11_test_encoded_full.parquet")

## comparing models

In [8]:
best = compare_models(exclude=["dummy", "ada"], n_select=7)

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
catboost,CatBoost Regressor,154520.9054,195083135484.442,418959.1588,0.9481,0.2278,0.1605,8.559
et,Extra Trees Regressor,141823.8047,204674463100.7356,431128.3114,0.9457,0.2081,0.1461,53.907
xgboost,Extreme Gradient Boosting,160054.6781,205508395008.0,431799.1062,0.9452,0.2396,0.166,10.072
rf,Random Forest Regressor,143688.3994,220588339247.8722,448960.6547,0.9415,0.2071,0.1447,40.759
lightgbm,Light Gradient Boosting Machine,183881.0534,229491187904.0496,461453.6431,0.9388,0.2578,0.1945,0.711
gbr,Gradient Boosting Regressor,251894.0976,368420090885.7047,596274.7162,0.9008,0.3496,0.2693,12.373
knn,K Neighbors Regressor,214915.6,400910644019.2,625430.4688,0.8915,0.2599,0.1934,8.935
dt,Decision Tree Regressor,190559.8928,469168695335.5873,639594.6583,0.876,0.2793,0.1938,0.73
lasso,Lasso Regression,607322.7875,1451461646745.6,1198967.875,0.6083,0.8907,0.966,4.241
ridge,Ridge Regression,607268.5188,1451459084288.0,1198966.575,0.6083,0.8879,0.9659,0.09


## prediction - first try

In [11]:
predictions = predict_model(best[0], data = test)

In [14]:
submission = pd.read_csv("data/sample_submission.csv")
submission["price"] = predictions["Label"]
submission.to_csv("submission.csv", index=False)

## saving models and variables

In [10]:
save_config("models/2022-04-11_config")
save_model(best, "models/2022-04-11_model")

Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=None,
          steps=[('dtypes',
                  DataTypes_Auto_infer(categorical_features=[],
                                       display_types=True, features_todrop=[],
                                       id_columns=[], ml_usecase='regression',
                                       numerical_features=[], target='price',
                                       time_features=['parsed_date'])),
                 ('imputer',
                  Simple_Imputer(categorical_strategy='not_available',
                                 fill_value_categorical=None,
                                 fill_value_numerical=None,
                                 numer...
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             n_estimators=100,
                                             n_iter_no_change=None,
                                         

## creating models

In [None]:
catboost = create_model("catboost", fold=8)
rf = create_model("rf", fold=8)
knn = create_model("knn", fold=8)

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,157332.8126,571484708730.0208,755966.0764,0.854,0.2299,0.1596
1,153244.9817,131571212306.1318,362727.4629,0.9618,0.2254,0.1609
2,153938.8423,139421391944.11,373391.7406,0.9647,0.2331,0.1628
3,157277.073,190313576764.3681,436249.4433,0.9456,0.2307,0.1632
4,161978.6365,226046641963.5348,475443.6265,0.9413,0.2353,0.1647
5,153782.025,144470610835.1919,380092.8976,0.9609,0.2183,0.156
6,154578.867,129693485511.01,360129.8176,0.9622,0.2298,0.1629
7,153136.1506,112601400680.9762,335561.3218,0.9687,0.2258,0.16
Mean,155658.6736,205700378591.918,434945.2983,0.9449,0.2285,0.1613
Std,2854.7883,142535298078.6476,128541.6899,0.0355,0.005,0.0026


IntProgress(value=0, description='Processing: ', max=4)

Unnamed: 0,Fold,MAE,MSE,RMSE,R2,RMSLE,MAPE


In [None]:
tuned_rf = tune_model(rf)
tuned_knn = tune_model(knn)