# Импорт данных

In [1]:
import pandas as pd
import numpy as np

from sklearn.tree import DecisionTreeRegressor, plot_tree
from sklearn.ensemble import RandomForestClassifier
from catboost import CatBoostRegressor, Pool
from sklearn.model_selection import GridSearchCV

from BorutaShap import BorutaShap

ImportError: cannot import name 'binom_test' from 'scipy.stats' (D:\.Distrib\anaconda3\Lib\site-packages\scipy\stats\__init__.py)

In [None]:
# Load the processed training features (without the 'target' and 'index'
# columns), the separately stored target table, and the test set.
train = (
    pd.read_csv('processed_data.csv')
    .drop(columns=['target', 'index'])
)
target = pd.read_csv('input_data/train_target.csv')
test = pd.read_csv('test_data.csv')

# Quick sanity check of the (rows, columns) of both frames.
print(test.shape, train.shape)

In [None]:
# Notebook display: render the training frame as the cell output.
train

In [None]:
# For Boruta: collect every object/category column of the training frame
# and remove those columns from both the training and the test frame.
cat_columns = list(
    train.select_dtypes(include=['object', 'category']).columns
)

train, test = (frame.drop(columns=cat_columns) for frame in (train, test))

# Модели

In [None]:
from sklearn.model_selection import train_test_split

# Cast every object-typed column to str and remember the positions of the
# object columns so they can be declared as categorical for CatBoost.
features = train.columns
categorical_features = train[features].select_dtypes(include=['object']).columns
train = train.astype({name: str for name in categorical_features})
is_object_column = train.dtypes == 'object'
categorical_features_indices = np.where(is_object_column)[0]

# Train / hold-out split.
# NOTE(review): test_size=0.85 keeps only 15% of the rows for training;
# confirm this is intentional and not a swapped 0.15.
X_train, X_test, y_train, y_test = train_test_split(
    train, target['target'], test_size=0.85, random_state=42,
)

# The model is trained and evaluated on the log1p-transformed target.
y_train, y_test = (np.log1p(part) for part in (y_train, y_test))

# CatBoost pools; the hold-out pool carries no label because it is only
# used for prediction.
train_pool = Pool(
    data=X_train,
    label=y_train,
    cat_features=categorical_features_indices,
)
test_pool = Pool(data=X_test, cat_features=categorical_features_indices)

In [None]:
# CatBoost regressor configuration; keyword arguments are grouped by purpose.
model_3 = CatBoostRegressor(
    # Objective and reported metric.
    loss_function='RMSE',
    eval_metric='R2',
    # Boosting schedule.
    iterations=500,
    learning_rate=0.05,
    boosting_type='Plain',
    bootstrap_type='MVS',
    # Tree growth.
    grow_policy='Lossguide',
    max_leaves=64,
    min_data_in_leaf=20,
    score_function='L2',
    # Data description, reproducibility and logging.
    cat_features=categorical_features_indices,
    random_seed=42,
    verbose=1,
)

In [None]:
# Fit CatBoost on the training pool (features, log1p target and categorical
# feature indices are all carried by the pool).
model_3.fit(X=train_pool)

In [None]:
# Hold-out RMSE of the CatBoost regressor; both the predictions and y_test
# are on the log1p scale.
holdout_predictions = model_3.predict(test_pool)
squared_errors = (y_test - holdout_predictions) ** 2
rmse = np.sqrt(np.mean(squared_errors))
print(f"RMSE: {rmse:.4f}")

# Boruta

In [None]:
# Boruta-SHAP feature selection.
# Author's note: the model should not be trained before it is handed over.
#
# Fixes relative to the previous version of this cell:
#   * the selector is bound to the name `boruta_shap`, which is the name the
#     rest of the cell uses (it was bound to `boruta` -> NameError);
#   * BorutaShap takes the estimator through `model=`; `estimator=` and
#     `n_estimators=` belong to BorutaPy and are not accepted here;
#   * Subset() returns the data restricted to the accepted features, so the
#     feature *names* are taken from its columns.
boruta_shap = BorutaShap(
    model=model_3,               # estimator whose importances are evaluated
    importance_measure='shap',   # rank features by SHAP values
    classification=False,        # regression task
)

boruta_shap.fit(
    X=X_train,
    y=y_train,
    n_trials=50,                 # number of Boruta iterations
    sample=False,                # do not subsample the rows
    train_or_test='test',        # evaluate importances on a held-out split
    normalize=True,              # normalize the importance values
)

# Names of the accepted features, as a plain list usable for column indexing.
selected_features = boruta_shap.Subset().columns.tolist()
print("Отобранные признаки:", selected_features)

In [None]:
# Restrict both halves of the split to the features accepted by Boruta.
X_train, X_test = X_train[selected_features], X_test[selected_features]

In [None]:
# Categorical features that survived the selection. The previous version
# referenced the undefined names `cat_features`, `X_selected` and `y`; the
# names used here are the ones defined earlier in the notebook. A list
# comprehension (instead of a set intersection) keeps a deterministic order.
selected_names = set(selected_features)
new_cat_features = [
    name for name in categorical_features if name in selected_names
]

# Training pool on the reduced feature set: X_train has already been
# restricted to the selected columns, y_train is the log1p target.
pool = Pool(X_train, y_train, cat_features=new_cat_features)

# Train the final model on the selected features.
# NOTE(review): model_3 was constructed with positional cat_features for the
# full feature set; if any categorical column is ever kept, confirm CatBoost
# accepts the differing declaration in this pool.
model_3.fit(pool)

In [None]:
# Hold-out RMSE of the refitted CatBoost regressor (log1p scale).
# The earlier `test_pool` was built from the full feature set, so it no
# longer matches the model trained on the selected features; a fresh pool is
# built from the current X_test instead.
eval_cat_features = [
    name for name in categorical_features if name in X_test.columns
]
selected_test_pool = Pool(data=X_test, cat_features=eval_cat_features)

rmse = np.sqrt(np.mean((y_test - model_3.predict(selected_test_pool)) ** 2))
print(f"RMSE: {rmse:.4f}")

In [None]:
# Keep only the selected feature columns of the test frame.
# NOTE(review): every other column (e.g. 'index' or 'id') is discarded here
# unless it is part of selected_features — confirm later cells do not rely
# on those columns still being present.
test = test[selected_features]

# Сабмит

In [None]:
# Remove the helper 'index' column when it is still present. After the
# feature-selection step the frame may no longer contain it, and an
# unconditional drop would raise KeyError.
test = test.drop(columns='index', errors='ignore')

# Submission ids. Feature selection may have removed the 'id' column from
# `test`; in that case it is re-read from the raw test file. Only columns
# (never rows) of `test` are filtered in this notebook, so row order matches.
# NOTE(review): assumes 'test_data.csv' contains an 'id' column — confirm.
if 'id' in test.columns:
    submission_ids = test['id']
else:
    submission_ids = pd.read_csv('test_data.csv', usecols=['id'])['id']

# Declare categorical features by name, restricted to columns that are still
# present; positional indices computed on the full training frame would not
# match the reduced column order.
submit_cat_features = [
    col for col in categorical_features if col in test.columns
]
for col in submit_cat_features:
    test[col] = test[col].astype(str)

submit_test_pool = Pool(data=test, cat_features=submit_cat_features)

# Predict on the log1p scale and invert the transform; expm1 is the exact
# inverse of log1p and is more accurate than exp(x) - 1 near zero.
test_predict = model_3.predict(submit_test_pool)
test_full_predict = np.expm1(test_predict)

# Assemble and write the submission file.
submission = pd.DataFrame({
    'id': submission_ids.to_numpy(),
    'target': test_full_predict,
})
submission.to_csv('submission.csv', index=False)
pd.read_csv('submission.csv').shape  # 127756 2