# In [None]:
import numpy as np
import pandas as pd
from boruta import BorutaPy
from catboost import CatBoostRegressor, Pool
from sklearn.model_selection import train_test_split

# --- 1. Data loading ---

# Features: the processed training table without its 'target' and 'index' columns.
train = pd.read_csv('processed_data.csv').drop(['target', 'index'], axis=1)
target = pd.read_csv('input_data/train_target.csv')
test = pd.read_csv('test_data.csv')

print(test.shape, train.shape)

# Fail early with a clear message instead of a cryptic error inside the split.
# NOTE(review): assumes train_target.csv is row-aligned with processed_data.csv — confirm.
if len(train) != len(target):
    raise ValueError(
        f"Число строк в признаках ({len(train)}) и в целевой переменной "
        f"({len(target)}) не совпадает"
    )

# NOTE(review): the label column name is not visible from this script; 'target'
# is assumed, with the last column of the target file as a fallback — confirm.
target_column = 'target' if 'target' in target.columns else target.columns[-1]

# Plain NumPy arrays are used on purpose:
#   * the positional slicing `X_train[:, mask]` below only works on ndarrays;
#   * a 1-D `y` keeps `y_test - y_pred` element-wise in the RMSE computation.
# NOTE(review): presumably all processed features are numeric — verify.
X = train.to_numpy()
y = target[target_column].to_numpy()

# Train/validation split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# --- 2. Feature selection with Boruta ---
# The tree count is given as `n_estimators` (a CatBoost alias of `iterations`)
# because BorutaPy with n_estimators='auto' sets `n_estimators` on the estimator
# via set_params(); CatBoost refuses to train when both aliases are set.
# NOTE(review): BorutaPy may also pass its RandomState object to the estimator
# as `random_state`; confirm the installed CatBoost/Boruta versions accept it.
catboost_model = CatBoostRegressor(n_estimators=100, verbose=False)

boruta_selector = BorutaPy(
    estimator=catboost_model,
    n_estimators='auto',
    verbose=2,
    random_state=42,
    max_iter=50
)
boruta_selector.fit(X_train, y_train)

# Boolean mask of the confirmed features
selected_features = boruta_selector.support_
print("Отобранные признаки (индексы):", np.where(selected_features)[0])

# A model cannot be trained on zero columns; stop with an explicit message.
if not np.any(selected_features):
    raise RuntimeError(
        "Boruta не подтвердил ни одного признака; "
        "обучение модели на отобранных признаках невозможно."
    )

# --- 3. Train CatBoost on the selected features ---
X_train_selected = X_train[:, selected_features]
X_test_selected = X_test[:, selected_features]

train_pool = Pool(X_train_selected, y_train)
test_pool = Pool(X_test_selected, y_test)

final_model = CatBoostRegressor(iterations=100, verbose=False)
final_model.fit(train_pool)

# Prediction and RMSE with Boruta-selected features
y_pred = final_model.predict(test_pool)
rmse_boruta = np.sqrt(np.mean((y_test - y_pred) ** 2))
print(f"RMSE с Boruta: {rmse_boruta:.4f}")

# --- 4. Baseline: the same model on all features ---
full_model = CatBoostRegressor(iterations=100, verbose=False)
full_model.fit(X_train, y_train)

y_pred_full = full_model.predict(X_test)
rmse_full = np.sqrt(np.mean((y_test - y_pred_full) ** 2))
print(f"RMSE без Boruta: {rmse_full:.4f}")

# --- 5. Compare the results ---
if rmse_boruta < rmse_full:
    print("✅ Boruta улучшил модель!")
else:
    print("❌ Boruta не дал улучшения. Попробуйте уменьшить max_iter или использовать select_features() из CatBoost.")