In [3]:
!pip install shap

Collecting shap
  Downloading shap-0.41.0-cp39-cp39-win_amd64.whl (435 kB)
     -------------------------------------- 435.6/435.6 kB 4.6 MB/s eta 0:00:00
Collecting slicer==0.0.7
  Downloading slicer-0.0.7-py3-none-any.whl (14 kB)
Installing collected packages: slicer, shap
Successfully installed shap-0.41.0 slicer-0.0.7




In [5]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import re
import seaborn as sns

from tqdm.notebook import tqdm

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import classification_report, mean_absolute_percentage_error

from sklearn.preprocessing import OneHotEncoder
from catboost import CatBoostRegressor, CatBoostClassifier, Pool, cv

try:
    import optuna
except Exception:
    ! pip install optuna
    import optuna
    
import shap

pd.set_option("display.max_columns", None)

Collecting optuna
  Using cached optuna-3.0.5-py3-none-any.whl (348 kB)
Collecting cliff
  Using cached cliff-4.1.0-py3-none-any.whl (81 kB)
Collecting alembic>=1.5.0
  Using cached alembic-1.9.1-py3-none-any.whl (210 kB)
Collecting colorlog
  Using cached colorlog-6.7.0-py2.py3-none-any.whl (11 kB)
Collecting cmaes>=0.8.2
  Using cached cmaes-0.9.1-py3-none-any.whl (21 kB)
Collecting Mako
  Using cached Mako-1.2.4-py3-none-any.whl (78 kB)
Collecting autopage>=0.4.0
  Using cached autopage-0.5.1-py3-none-any.whl (29 kB)
Collecting stevedore>=2.0.1
  Using cached stevedore-4.1.1-py3-none-any.whl (50 kB)
Collecting PrettyTable>=0.7.2
  Using cached prettytable-3.5.0-py3-none-any.whl (26 kB)
Collecting cmd2>=1.0.0
  Using cached cmd2-2.4.2-py3-none-any.whl (147 kB)
Collecting pbr!=2.1.0,>=2.0.0
  Using cached pbr-5.11.0-py2.py3-none-any.whl (112 kB)
Installing collected packages: PrettyTable, pbr, Mako, colorlog, cmd2, cmaes, autopage, stevedore, alembic, cliff, optuna
Successfully instal



In [6]:
df = pd.read_csv('df_engineering_new.csv')

# Baseline - Human learning

In [7]:
SEED = 26

In [8]:
df.columns.tolist()

['Price',
 'Num_of_rooms',
 'Total_area',
 'Floor',
 'Ceiling_height',
 'Repair',
 'Decoration',
 'Sale_method',
 'House_type',
 'Passenger_lift',
 'Cargo_lift',
 'Participation_type',
 'lat_object',
 'lon_object',
 'комендантский проспект_dist',
 'крестовский остров_dist',
 'шушары_dist',
 'парнас_dist',
 'купчино_dist',
 'улица дыбенко_dist',
 'девяткино_dist',
 'проспект ветеранов_dist',
 'Price_m_2',
 'Price_m_2_cat',
 'Balcony_convoul',
 'Deal_type_convoul',
 'In_house_convoul',
 'Yard_convoul',
 'Furniture_convoul',
 'Windows_convoul',
 'Parking_convoul',
 'Bathroom_convoul',
 'Appliances_convoul',
 'Room_type_convoul',
 'Publication_date_parsed',
 'Years_house',
 'Cargo_lift_no_info',
 'Passenger_lift_no_info']

In [9]:
train, test = train_test_split(df, test_size=.2, random_state=SEED)

y = ['Price_m_2']

square_categories = [0, 30, 40, 50, 60, 70, float('inf')]

train['Total_area_cat'] = pd.cut(train['Total_area'], square_categories)
test['Total_area_cat'] = pd.cut(test['Total_area'], square_categories)

learning_mask = train.groupby(by='Total_area_cat')[y].agg('median').reset_index()

test = test.merge(learning_mask, how='left', on='Total_area_cat', suffixes=('', '_pred'))

mean_absolute_percentage_error(test['Price_m_2'], test['Price_m_2_pred'])

0.23202139825698953

## Catboost

In [10]:
def ohe_transformer(df, cat_features, encoder, encoder_columns):
    ohe = encoder.transform(df[cat_features])
    ohe = pd.DataFrame(ohe, columns=encoder_columns, index=df.index)
    ohe = df.drop(cat_features, axis=1).join(ohe)#.astype('int')
    return ohe

In [11]:
def ohe(train, test, cat_features):
    encoder = OneHotEncoder(drop="first", sparse=False)
    encoder.fit(train[cat_features])
    encoder_columns = ["_".join(x.split()) for x in encoder.get_feature_names_out()]
    return ohe_transformer(
        train, cat_features, encoder, encoder_columns
    ), ohe_transformer(test, cat_features, encoder, encoder_columns)

### Classifier

In [12]:
train, test = train_test_split(df, test_size=.2, random_state=SEED)

In [13]:
y_cl = ['Price_m_2_cat']

cat_features_cb = [
    'Repair', 
    'Decoration',
    'Sale_method',
    'House_type', 
    'Participation_type', 
    ]

train_ohe, test_ohe = ohe(train, test, cat_features_cb)

train_cl, train_reg = train_test_split(train_ohe, test_size=.5, random_state=SEED)

In [17]:
X_cl = train_cl.drop(y + y_cl + ['Price'], axis=1).columns.tolist()

pool_train_cl = Pool(
    train_cl[X_cl], 
    train_cl[y_cl],
    )

CatBoostError: Bad value for num_feature[non_default_doc_idx=0,feature_idx=26]="2022-10-22": Cannot convert 'b'2022-10-22'' to float

In [16]:
params = {
    "learning_rate": 0.2,
    'custom_loss':['Accuracy'],
    "loss_function": "MultiClass",
    #"loss_function": "Logloss",
    "random_seed": SEED,
    "early_stopping_rounds": 200,
    #"auto_class_weights": "Balanced",    
    #'class_names': cost_categories_labels,
    'classes_count': len(cost_categories_labels),
}

cv_cl = cv(
    pool=pool_train_cl,
    params=params,
    fold_count=5,
    shuffle=True,
    verbose=250,
    return_models=True,
)

NameError: name 'cost_categories_labels' is not defined