## imports

In [1]:
from warnings import filterwarnings

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from pycaret.regression import *

# Apply seaborn's default styling to all matplotlib figures drawn in this notebook.
sns.set()
# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings from pandas/PyCaret — consider narrowing the filter.
filterwarnings("ignore")

## pre-saved data loading

In [2]:
# Pre-model train/test snapshots (dated 2022-04-08) hosted in the project's GitHub repository.
train_url = "https://github.com/XelorR/sf_project_6/raw/master/data/2022-04-08_train_pre-model.parquet"
test_url = "https://github.com/XelorR/sf_project_6/raw/master/data/2022-04-08_test_pre-model.parquet"

# Download both parquet files in the same order as before: train first, then test.
train_raw, test_raw = [pd.read_parquet(url) for url in (train_url, test_url)]

# Display (rows, columns) of each frame as a quick sanity check.
train_raw.shape, test_raw.shape

((115367, 30), (34686, 28))

## pre-preprocessing

In [3]:
# Tag every row with its origin so the combined frame can be split back into
# train and test after the shared clean-up below.
train_raw["train/test"] = "train"
test_raw["train/test"] = "test"

# Stack train and test so the same clean-up is applied to both.
# `DataFrame.append` was deprecated in pandas 1.4 and removed in 2.0;
# `pd.concat` is the drop-in equivalent (original row indices are preserved).
data = pd.concat([train_raw, test_raw])

# Missing `ptc` values default to the "Оригинал" ("original") category.
# Assign the result back instead of calling `fillna(..., inplace=True)` on a
# column selection, which is not guaranteed to modify `data` itself.
data["ptc"] = data["ptc"].fillna("Оригинал")

# Cast every object-typed column to plain strings.
# NOTE(review): any remaining missing values in these columns become the
# literal string "nan" — confirm that is intended.
object_cols = data.select_dtypes("object").columns.tolist()
data[object_cols] = data[object_cols].astype(str)

# Split back into train/test and drop the helper columns that must not reach
# the model; the test part additionally loses the `price` target column.
split_helper_cols = ["sample", "description", "train/test"]
train = data.loc[data["train/test"] == "train"].drop(columns=split_helper_cols)
test = data.loc[data["train/test"] == "test"].drop(columns=split_helper_cols + ["price"])

## preprocessing

In [4]:
# Initialise the PyCaret regression experiment on the prepared training frame.
# Everything created afterwards (create_model / tune_model / stack_models)
# uses the preprocessing configured here.
s = setup(
    data=train,
    target="price",
    # Pin the experiment seed. Without it PyCaret draws a random session_id on
    # every run (6413 in the recorded run, see the setup summary), which makes
    # the train/hold-out split and CV folds differ between executions.
    session_id=6413,
    date_features=["parsed_date"],
    # `model_name` is treated as a high-cardinality feature; the setup summary
    # reports the "frequency" method for it.
    high_cardinality_features=["model_name"],
    normalize=True,
    transformation=True,
    remove_outliers=True,
    handle_unknown_categorical=True,
    remove_multicollinearity=True,
    # Options tried and currently disabled:
    # preprocess = False,
    # categorical_imputation = "mode",
    # imputation_type="iterative",
    # numeric_iterative_imputer = "catboost",
    # categorical_iterative_imputer = "catboost",
    # feature_selection=True,
    # feature_selection_threshold = 0.5,
    combine_rare_levels=True,
)

Unnamed: 0,Description,Value
0,session_id,6413
1,Target,price
2,Original Data,"(115367, 28)"
3,Missing Values,False
4,Numeric Features,9
5,Categorical Features,17
6,Ordinal Features,False
7,High Cardinality Features,True
8,High Cardinality Method,frequency
9,Transformed Train Set,"(76718, 90)"


## creating models

In [5]:
# Train a CatBoost regressor inside the configured experiment; the table below
# lists the metrics for each of the 10 cross-validation folds.
catboost = create_model("catboost")

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,152002.5215,116378660312.6858,341143.1669,0.9626,0.2252,0.1616
1,156673.6439,153996767563.6994,392424.2189,0.9568,0.2387,0.1666
2,149425.0129,125676773189.121,354509.2004,0.9643,0.2274,0.1579
3,150142.3321,102343853447.1975,319912.259,0.9663,0.2277,0.1607
4,153405.8104,144059012708.66,379551.0673,0.9599,0.2316,0.1592
5,156590.6446,148252457271.6293,385035.6571,0.9627,0.2228,0.1552
6,155956.7997,709213773784.8662,842148.3087,0.8117,0.2318,0.1604
7,154134.781,227314202558.9358,476774.7923,0.94,0.2286,0.1567
8,154417.6015,152200688635.9,390129.0666,0.9608,0.2274,0.16
9,160853.7654,243250478932.2458,493204.2974,0.9264,0.2344,0.1623


In [6]:
# Train a LightGBM regressor with default settings; this is the baseline that
# the tuning section later tries to improve on.
lightgbm = create_model("lightgbm")

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,177319.9655,143723849993.429,379109.285,0.9538,0.2512,0.1908
1,185158.7029,193465157867.7304,439846.7436,0.9457,0.264,0.2005
2,176917.1378,158978194542.0402,398720.6974,0.9548,0.2549,0.1879
3,178497.4406,137423384644.0428,370706.6018,0.9548,0.2536,0.1917
4,177463.883,160929078666.2304,401159.6673,0.9552,0.2462,0.1823
5,183405.4431,167097915605.5264,408776.1192,0.958,0.2579,0.1869
6,184674.9911,757687588195.0922,870452.5192,0.7988,0.2554,0.1932
7,178658.725,259891630217.5882,509795.675,0.9314,0.2461,0.1855
8,178825.0541,159773607962.3896,399716.9098,0.9588,0.2512,0.1924
9,184968.7149,243992456442.471,493955.9256,0.9261,0.2588,0.1905


In [7]:
# Train a random forest regressor ("rf" estimator ID) and report per-fold metrics.
rf = create_model("rf")

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,142875.625,134277716043.5105,366439.2392,0.9569,0.2058,0.1431
1,144960.4827,171992501188.4998,414719.7863,0.9518,0.2145,0.148
2,137781.1552,130220308847.7251,360860.5116,0.963,0.2068,0.1466
3,135558.7029,92740325921.3341,304532.9636,0.9695,0.2085,0.148
4,139543.3919,137296527828.6858,370535.461,0.9618,0.201,0.1399
5,142227.4781,167450632028.7688,409207.3216,0.9579,0.2017,0.1377
6,147165.816,715692401116.5433,845986.0526,0.81,0.2093,0.1438
7,149803.4085,254819212697.8726,504796.2091,0.9327,0.2085,0.1441
8,140849.3264,149150138536.1176,386199.6097,0.9616,0.2083,0.1464
9,143074.2032,229690694502.8298,479260.5706,0.9305,0.2097,0.1449


In [8]:
# Train a k-nearest-neighbours regressor ("knn" estimator ID) and report per-fold metrics.
knn = create_model("knn")

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,210183.2188,266336845824.0,516078.3438,0.9144,0.2577,0.1925
1,216416.7969,354883239936.0,595720.75,0.9005,0.2689,0.2027
2,202568.6406,219669954560.0,468689.625,0.9376,0.261,0.1938
3,208227.2812,272442769408.0,521960.5,0.9103,0.2579,0.1948
4,209045.3594,257177436160.0,507126.6562,0.9284,0.2556,0.1893
5,215488.7812,297429925888.0,545371.375,0.9253,0.262,0.1929
6,210604.9688,832086671360.0,912187.875,0.7791,0.2636,0.1933
7,214175.2188,572843687936.0,756864.375,0.8487,0.2601,0.1915
8,212325.2656,313772998656.0,560154.4375,0.9191,0.2535,0.191
9,213075.0469,373914632192.0,611485.625,0.8868,0.2638,0.1966


In [9]:
# Train a linear regression model ("lr" estimator ID); it is later passed to
# stack_models alongside the tree-based and KNN models.
lr = create_model("lr")

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,589259.125,953328795648.0,976385.5625,0.6937,0.8891,0.9384
1,598627.0,1329666064384.0,1153111.5,0.6271,0.8882,0.9481
2,582515.3125,1371203305472.0,1170983.875,0.6102,0.8988,0.9309
3,589420.6875,980886683648.0,990397.25,0.6772,0.9191,0.992
4,598185.75,1365668003840.0,1168618.0,0.6197,0.8901,0.9254
5,611441.4375,1564390850560.0,1250756.125,0.6069,0.8744,0.9113
6,597233.3125,1593659621376.0,1262402.375,0.5769,0.908,0.9774
7,590620.6875,1692782559232.0,1301069.75,0.553,0.8987,0.9613
8,611019.25,1572587569152.0,1254028.5,0.5947,0.8939,0.9895
9,600755.1875,1170315149312.0,1081811.0,0.6457,0.9119,0.9732


## model tuning

In [12]:
# Hyperparameter search for the LightGBM model using Optuna: 100 iterations,
# optimising MAPE.
tuned_lightgbm = tune_model(lightgbm, search_library="optuna", n_iter=100, optimize = "MAPE")

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,144825.6355,131169448046.8542,362173.2293,0.9579,0.2252,0.1522
1,148078.3325,210965304702.7144,459309.5957,0.9408,0.2327,0.1551
2,144679.5695,242369814984.7287,492310.6895,0.9311,0.2384,0.1465
3,139488.9744,102291002400.5382,319829.6459,0.9663,0.2278,0.15
4,146407.1756,268805687469.6126,518464.7408,0.9251,0.2152,0.1437
5,152015.3462,272840638747.5416,522341.4963,0.9314,0.2107,0.144
6,149708.7516,723369802957.169,850511.4949,0.808,0.2165,0.1483
7,151690.5115,454264323513.7657,673991.3379,0.8801,0.2173,0.1436
8,151183.5539,258739856066.7448,508664.7777,0.9333,0.2151,0.1512
9,153988.8326,259323139127.7986,509237.8021,0.9215,0.23,0.1542


In [13]:
# Hyperparameter search for the random forest using Optuna, optimising MAPE.
# choose_better=True is requested so the result should not be worse than the
# untuned `rf` — TODO confirm against the installed PyCaret version.
# NOTE(review): in the recorded run this cell was stopped with a
# KeyboardInterrupt, so `tuned_rf` was never assigned in that session.
tuned_rf = tune_model(rf, choose_better = True, optimize = "MAPE", search_library="optuna")

IntProgress(value=0, description='Processing: ', max=7)

Unnamed: 0,Fold,MAE,MSE,RMSE,R2,RMSLE,MAPE


[32m[I 2022-04-12 08:12:44,879][0m Searching the best hyperparameters using 76718 samples...[0m
exception calling callback for <Future at 0x7f12e81b1bd0 state=finished raised ShutdownExecutorError>
Traceback (most recent call last):
  File "/home/user/Documents/sf_project_6/env_ml/lib64/python3.7/site-packages/joblib/externals/loky/_base.py", line 625, in _invoke_callbacks
    callback(self)
  File "/home/user/Documents/sf_project_6/env_ml/lib64/python3.7/site-packages/joblib/parallel.py", line 359, in __call__
    self.parallel.dispatch_next()
  File "/home/user/Documents/sf_project_6/env_ml/lib64/python3.7/site-packages/joblib/parallel.py", line 792, in dispatch_next
    if not self.dispatch_one_batch(self._original_iterator):
  File "/home/user/Documents/sf_project_6/env_ml/lib64/python3.7/site-packages/joblib/parallel.py", line 859, in dispatch_one_batch
    self._dispatch(tasks)
  File "/home/user/Documents/sf_project_6/env_ml/lib64/python3.7/site-packages/joblib/parallel.py", 

KeyboardInterrupt: 

In [None]:
# Hyperparameter search for the KNN model using Optuna, optimising MAPE.
# NOTE(review): this cell has no execution count, i.e. it was not run in the saved notebook.
tuned_knn = tune_model(knn, optimize = "MAPE", search_library="optuna")

## ensemble - stacking

In [None]:
# Build a stacking ensemble from the five models created above.
# NOTE(review): requires `tuned_rf` and `tuned_knn`, whose tuning cells did not
# complete in the recorded session — run them to completion first.
stacker = stack_models(estimator_list=[catboost, tuned_lightgbm, tuned_rf, tuned_knn, lr])

In [None]:
# Persist the stacking ensemble under the name "stacker".
save_model(stacker, "stacker")

In [None]:
# Persist the tuned random forest under the name "tuned_rf".
save_model(tuned_rf, "tuned_rf")

In [None]:
# Persist the tuned KNN model under the name "tuned_knn".
save_model(tuned_knn, "tuned_knn")

In [14]:
# Persist the tuned LightGBM model together with its transformation pipeline
# under the name "tuned_lightgbm"; the returned pipeline is echoed as cell output.
save_model(tuned_lightgbm, "tuned_lightgbm")

Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=None,
          steps=[('dtypes',
                  DataTypes_Auto_infer(categorical_features=[],
                                       display_types=True, features_todrop=[],
                                       id_columns=[], ml_usecase='regression',
                                       numerical_features=[], target='price',
                                       time_features=['parsed_date'])),
                 ('imputer',
                  Simple_Imputer(categorical_strategy='not_available',
                                 fill_value_categorical=None,
                                 fill_value_numerical=None,
                                 numer...
                                importance_type='split',
                                learning_rate=0.07138121645973822, max_depth=-1,
                                min_child_samples=78, min_child_weight=0.001,
                                min_split_gain=0.726386549714634,
                               

In [15]:
# Persist the CatBoost model together with its transformation pipeline under
# the name "catboost".
save_model(catboost, "catboost")

Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=None,
          steps=[('dtypes',
                  DataTypes_Auto_infer(categorical_features=[],
                                       display_types=True, features_todrop=[],
                                       id_columns=[], ml_usecase='regression',
                                       numerical_features=[], target='price',
                                       time_features=['parsed_date'])),
                 ('imputer',
                  Simple_Imputer(categorical_strategy='not_available',
                                 fill_value_categorical=None,
                                 fill_value_numerical=None,
                                 numer...
                 ('clean_names', Clean_Colum_Names()),
                 ('feature_select', 'passthrough'),
                 ('fix_multi',
                  Fix_multicollinearity(correlation_with_target_preference=None,
                                        correlation_with_target_threshold=0.0,
               

In [16]:
# Persist the linear regression model together with its transformation
# pipeline under the name "lr".
save_model(lr, "lr")

Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=None,
          steps=[('dtypes',
                  DataTypes_Auto_infer(categorical_features=[],
                                       display_types=True, features_todrop=[],
                                       id_columns=[], ml_usecase='regression',
                                       numerical_features=[], target='price',
                                       time_features=['parsed_date'])),
                 ('imputer',
                  Simple_Imputer(categorical_strategy='not_available',
                                 fill_value_categorical=None,
                                 fill_value_numerical=None,
                                 numer...
                 ('clean_names', Clean_Colum_Names()),
                 ('feature_select', 'passthrough'),
                 ('fix_multi',
                  Fix_multicollinearity(correlation_with_target_preference=None,
                                        correlation_with_target_threshold=0.0,
               

exception calling callback for <Future at 0x7f12e8072410 state=finished raised ShutdownExecutorError>
Traceback (most recent call last):
  File "/home/user/Documents/sf_project_6/env_ml/lib64/python3.7/site-packages/joblib/externals/loky/_base.py", line 625, in _invoke_callbacks
    callback(self)
  File "/home/user/Documents/sf_project_6/env_ml/lib64/python3.7/site-packages/joblib/parallel.py", line 359, in __call__
    self.parallel.dispatch_next()
  File "/home/user/Documents/sf_project_6/env_ml/lib64/python3.7/site-packages/joblib/parallel.py", line 792, in dispatch_next
    if not self.dispatch_one_batch(self._original_iterator):
  File "/home/user/Documents/sf_project_6/env_ml/lib64/python3.7/site-packages/joblib/parallel.py", line 859, in dispatch_one_batch
    self._dispatch(tasks)
  File "/home/user/Documents/sf_project_6/env_ml/lib64/python3.7/site-packages/joblib/parallel.py", line 777, in _dispatch
    job = self._backend.apply_async(batch, callback=cb)
  File "/home/user/D