In [6]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.preprocessing import StandardScaler, LabelEncoder, RobustScaler
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.pipeline import Pipeline
import warnings
warnings.filterwarnings('ignore')

# === Data loading ===
train, test = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

# Pull the Id columns out of both frames: they are kept for the submission
# file and are not used as model features.
train_id = train.pop('Id')
test_id = test.pop('Id')

# === Target: log1p of SalePrice, removed from the feature frame ===
y = np.log1p(train.pop('SalePrice'))

# === Stack train and test so both receive identical preprocessing ===
n_train = train.shape[0]  # the first n_train rows of `data` are the training rows
data = pd.concat([train, test], ignore_index=True)

# === Feature engineering ===

# Area / count totals.  DataFrame.sum(axis=1) skips NaN, so a missing
# component counts as 0 instead of turning the whole total into NaN.
data['TotalSF'] = data[['GrLivArea', 'TotalBsmtSF', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF']].sum(axis=1)
data['TotalBath'] = data[['FullBath', 'HalfBath', 'BsmtFullBath', 'BsmtHalfBath']].sum(axis=1)
data['TotalPorchSF'] = data[['OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch']].sum(axis=1)
data['TotalRoom'] = data[['TotRmsAbvGrd', 'BedroomAbvGr', 'KitchenAbvGr']].sum(axis=1)

# Ages, measured against a fixed reference year
current_year = 2025
data['Age'] = current_year - data['YearBuilt']
data['RemodAge'] = current_year - data['YearRemodAdd']
# Age - RemodAge == YearRemodAdd - YearBuilt, i.e. the number of years between
# construction and remodelling.  Assign the filled result back instead of a
# chained `fillna(inplace=True)`, which may operate on a temporary copy.
data['AgeSinceRemodel'] = (data['Age'] - data['RemodAge']).fillna(0)

# Presence flags
data['HasGarage'] = (data['GarageCars'] > 0).astype(int)
data['HasBasement'] = (data['TotalBsmtSF'] > 0).astype(int)
data['HasFireplace'] = (data['Fireplaces'] > 0).astype(int)
data['HasPool'] = (data['PoolArea'] > 0).astype(int)
# Missing values are still NaN at this point (they are replaced by 'None'
# only later), and `NaN != 'None'` is True, so NaN must be excluded explicitly;
# otherwise every row would be flagged as having a fence / an alley.
data['HasFence'] = (data['Fence'].notna() & (data['Fence'] != 'None')).astype(int)
data['HasAlley'] = (data['Alley'].notna() & (data['Alley'] != 'None')).astype(int)
data['HasWoodDeck'] = (data['WoodDeckSF'] > 0).astype(int)
data['Has3SsnPorch'] = (data['3SsnPorch'] > 0).astype(int)
data['HasOpenPorch'] = (data['OpenPorchSF'] > 0).astype(int)

# Area ratios.  A zero denominator is mapped to NaN so the ratio becomes NaN
# rather than +/-inf.
nonzero_total_sf = data['TotalSF'].replace(0, np.nan)
data['LivingAreaRatio'] = data['GrLivArea'] / data['LotArea'].replace(0, np.nan)
data['BasementToTotalSF'] = data['TotalBsmtSF'] / nonzero_total_sf
data['GarageToTotalSF'] = data['GarageArea'] / nonzero_total_sf

# Quality and condition (copies of the source columns plus their product)
data['OverallQuality'] = data['OverallQual']
data['OverallCondition'] = data['OverallCond']
data['QualCond'] = data['OverallQuality'] * data['OverallCondition']

# Categorical features whose missing values are replaced by the literal 'None'
categorical_cols = ['Alley', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
                    'Electrical', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond',
                    'PoolQC', 'Fence', 'MiscFeature', 'MasVnrType']
# Assign the filled frame back: a chained `data[col].fillna(..., inplace=True)`
# may modify a temporary copy and leave `data` unchanged.
data[categorical_cols] = data[categorical_cols].fillna('None')

# Integer-encode those columns (the encoder is refitted for every column)
le = LabelEncoder()
for col in categorical_cols:
    data[col] = le.fit_transform(data[col])

# One-hot encode the nominal columns
data = pd.get_dummies(data, columns=['Neighborhood', 'MSZoning', 'Street', 'LotShape', 'LandContour', 'Utilities',
                                      'LotConfig', 'LandSlope', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl',
                                      'Exterior1st', 'Exterior2nd', 'MasVnrType', 'Foundation', 'Heating', 'CentralAir',
                                      'Electrical', 'Functional', 'GarageType', 'PavedDrive', 'SaleType', 'SaleCondition'])

# Any text column not named in either list above would otherwise reach the
# regressors as raw strings.  With no `columns=` argument get_dummies encodes
# every remaining object/string/category column and leaves the rest untouched.
data = pd.get_dummies(data)

# Missing numeric values: fill with the median of the TRAINING rows only, so
# that no statistic of the test set leaks into preprocessing.
num_cols = data.select_dtypes(include=[np.number]).columns
train_medians = data.iloc[:n_train][num_cols].median()
data[num_cols] = data[num_cols].fillna(train_medians)

# Scaling with RobustScaler, also fitted on the training rows only and then
# applied to all rows.
scaler = RobustScaler()
scaler.fit(data.iloc[:n_train][num_cols])
data[num_cols] = scaler.transform(data[num_cols])

# Split back into train and test
X_train = data.iloc[:n_train]
X_test = data.iloc[n_train:]

# === Stacking ===
# First-level learners, given as (name, estimator) pairs
base_models = [
    (
        'xgb',
        XGBRegressor(n_estimators=1000, learning_rate=0.01, max_depth=6, random_state=42),
    ),
    (
        'lgbm',
        LGBMRegressor(n_estimators=1000, learning_rate=0.01, max_depth=6, random_state=42),
    ),
    (
        'catb',
        CatBoostRegressor(n_estimators=1000, depth=6, random_state=42, silent=True),
    ),
]

# Second-level model: a regressor fitted on the predictions of the base models
meta_model = LinearRegression()

# Hand-rolled stacking ensemble
class StackingRegressor(BaseEstimator, RegressorMixin):
    """Two-level stacking regressor.

    The meta-model is trained on out-of-fold predictions of the base models,
    i.e. on predictions for rows the predicting model has not been fitted on.
    Training it on in-sample predictions would let it learn from targets the
    base models have already memorised.

    Args:
        base_models: list of ``(name, estimator)`` pairs.  The estimators are
            cloned before fitting, so the passed objects are never modified.
        meta_model: estimator fitted on the stacked base-model predictions
            (also cloned).
        n_folds: number of KFold splits used to build the out-of-fold
            predictions.

    Attributes set by ``fit``:
        models: list of ``(name, fitted_estimator)`` pairs, each refitted on
            the full training data.
        meta_model_: the fitted clone of ``meta_model``.
    """

    def __init__(self, base_models, meta_model, n_folds=5):
        self.base_models = base_models
        self.meta_model = meta_model
        self.n_folds = n_folds
        self.models = []

    @staticmethod
    def _rows(data, idx):
        """Select the rows ``idx`` from a pandas object or an array-like."""
        if hasattr(data, 'iloc'):
            return data.iloc[idx]
        return np.asarray(data)[idx]

    def fit(self, X, y):
        """Fit the base models and the meta-model; returns ``self``."""
        # Local imports: the surrounding cell does not import `clone`.
        from sklearn.base import clone
        from sklearn.model_selection import KFold

        # 1) Out-of-fold predictions: one column per base model.
        oof = np.zeros((X.shape[0], len(self.base_models)))
        splitter = KFold(n_splits=self.n_folds, shuffle=True, random_state=42)
        for fit_idx, holdout_idx in splitter.split(X):
            X_fit, y_fit = self._rows(X, fit_idx), self._rows(y, fit_idx)
            X_holdout = self._rows(X, holdout_idx)
            for col, (_, model) in enumerate(self.base_models):
                fold_model = clone(model)
                fold_model.fit(X_fit, y_fit)
                oof[holdout_idx, col] = fold_model.predict(X_holdout)

        # 2) The meta-model learns how to combine the base predictions.
        self.meta_model_ = clone(self.meta_model)
        self.meta_model_.fit(oof, y)

        # 3) Refit every base model on all rows for use in predict().  The
        #    list is rebuilt from scratch, so calling fit() again replaces the
        #    previous models instead of appending to them.
        fitted = []
        for name, model in self.base_models:
            final_model = clone(model)
            final_model.fit(X, y)
            fitted.append((name, final_model))
        self.models = fitted
        return self

    def predict(self, X):
        """Predict by feeding the base-model predictions to the meta-model."""
        if not self.models:
            from sklearn.exceptions import NotFittedError
            raise NotFittedError("This StackingRegressor instance is not fitted yet; call fit() first.")

        # One column of predictions per fitted base model
        stacked = np.column_stack([model.predict(X) for _, model in self.models])
        return self.meta_model_.predict(stacked)

# === Training and evaluation ===
stacker = StackingRegressor(meta_model=meta_model, base_models=base_models)

# 5-fold cross-validation.  The scorer yields negated MSE values, so the sign
# is flipped before taking the square root.
cv_scores = cross_val_score(
    stacker,
    X_train,
    y,
    scoring='neg_mean_squared_error',
    cv=5,
)
rmse_scores = np.sqrt(np.negative(cv_scores))
print(f"CV RMSE: {rmse_scores.mean():.4f} (+/- {rmse_scores.std() * 2:.4f})")

# Refit on the complete training set
stacker.fit(X_train, y)

# Predict the test set and undo the log1p transform applied to the target
preds = np.expm1(stacker.predict(X_test))

# === Submission ===
submission = pd.DataFrame({'Id': test_id, 'SalePrice': preds})
submission.to_csv('submission_stacking.csv', index=False)
print("Submission saved!")

XGBoostError: 
XGBoost Library (libxgboost.dylib) could not be loaded.
Likely causes:
  * OpenMP runtime is not installed
    - vcomp140.dll or libgomp-1.dll for Windows
    - libomp.dylib for Mac OSX
    - libgomp.so for Linux and other UNIX-like OSes
    Mac OSX users: Run `brew install libomp` to install OpenMP runtime.

  * You are running 32-bit Python on a 64-bit OS

Error message(s): ["dlopen(/opt/miniconda3/lib/python3.13/site-packages/xgboost/lib/libxgboost.dylib, 0x0006): Library not loaded: @rpath/libomp.dylib\n  Referenced from: <636BF463-1886-392D-B8B3-6011C44DCEE9> /opt/miniconda3/lib/python3.13/site-packages/xgboost/lib/libxgboost.dylib\n  Reason: tried: '/opt/homebrew/opt/libomp/lib/libomp.dylib' (no such file), '/System/Volumes/Preboot/Cryptexes/OS/opt/homebrew/opt/libomp/lib/libomp.dylib' (no such file), '/opt/homebrew/opt/libomp/lib/libomp.dylib' (no such file), '/System/Volumes/Preboot/Cryptexes/OS/opt/homebrew/opt/libomp/lib/libomp.dylib' (no such file), '/opt/miniconda3/lib/python3.13/lib-dynload/../../libomp.dylib' (no such file), '/opt/miniconda3/bin/../lib/libomp.dylib' (no such file)"]


In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor, StackingRegressor
from sklearn.linear_model import Ridge
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_error
import optuna
from optuna.samplers import TPESampler
import warnings
warnings.filterwarnings('ignore')

# === Data loading ===
train, test = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

# Pull the Id columns out of both frames; they are kept for the submission
train_id = train.pop('Id')
test_id = test.pop('Id')

# === Target: log1p of SalePrice, removed from the feature frame ===
y = np.log1p(train.pop('SalePrice'))

# === Stack train and test into one frame ===
n_train = train.shape[0]  # the first n_train rows of `data` are the training rows
data = pd.concat([train, test], ignore_index=True)

# === Feature engineering (example) ===
# (replace with your own code if needed)

# TargetEncoder is used below but is not imported at the top of this cell.
from category_encoders import TargetEncoder  # pip install category_encoders

# Example engineered features
data['TotalSF'] = data['GrLivArea'] + data['TotalBsmtSF'] + data['GarageArea']
data['Age'] = 2025 - data['YearBuilt']
data['HasGarage'] = (data['GarageCars'] > 0).astype(int)
data['HasBasement'] = (data['TotalBsmtSF'] > 0).astype(int)

# Categorical columns that are integer-encoded directly in the frame
categorical_cols = ['Alley', 'BsmtQual', 'BsmtCond', 'FireplaceQu', 'GarageType', 'PoolQC']

# Columns encoded later, inside the ColumnTransformer
one_hot_coder = ['Neighborhood', 'MSZoning', 'Street', 'LotShape', 'BldgType']
target_coder = ['Exterior1st', 'Exterior2nd', 'KitchenQual', 'Functional']

# Replace missing values in every non-numeric column by the literal 'None'.
# The filled frame is assigned back: a chained
# `data[col].fillna(..., inplace=True)` may modify a temporary copy only.
text_cols = data.select_dtypes(exclude=[np.number]).columns
data[text_cols] = data[text_cols].fillna('None')

# Integer-encode `categorical_cols` plus every other non-numeric column that
# the ColumnTransformer does not handle.  With remainder='passthrough' such a
# column would otherwise be forwarded to the regressors as raw strings.
handled_by_transformer = set(one_hot_coder) | set(target_coder)
ordinal_cols = categorical_cols + [
    name for name in text_cols
    if name not in handled_by_transformer and name not in categorical_cols
]
le = LabelEncoder()
for col in ordinal_cols:
    data[col] = le.fit_transform(data[col])

# Fill missing numeric values with the median of the training rows; not every
# estimator in the stack is guaranteed to accept NaN.
num_cols = data.select_dtypes(include=[np.number]).columns
data[num_cols] = data[num_cols].fillna(data.iloc[:n_train][num_cols].median())

# Preprocessor: one-hot and target encoding, every other column passes through.
# handle_unknown='ignore' encodes a category that was absent from the fitting
# fold as an all-zero row instead of raising during cross-validation.
preprocessor = ColumnTransformer(
    [
        ('ohe', OneHotEncoder(sparse_output=False, handle_unknown='ignore'), one_hot_coder),
        ('target', TargetEncoder(), target_coder),
    ],
    remainder='passthrough',
    verbose_feature_names_out=False
)

# === Optuna objective ===
def objective(trial):
    """Score one Optuna trial of the stacking pipeline.

    Samples hyperparameters for the CatBoost, LightGBM and random-forest base
    learners and for the Ridge meta-model, builds the preprocessing + stacking
    pipeline and evaluates it with shuffled 5-fold cross-validation on the
    training rows of the module-level ``data`` frame.

    Args:
        trial: the Optuna trial used to sample the hyperparameters.

    Returns:
        Mean validation RMSE over the folds.  The module-level target ``y`` is
        ``log1p(SalePrice)``, so this value is the RMSLE of the prices.
    """
    # KFold is not imported at the top of this cell, so import it here.
    from sklearn.model_selection import KFold

    # CatBoost hyperparameters
    cat_params = {
        'iterations': trial.suggest_int('cat_iterations', 200, 800),
        'learning_rate': trial.suggest_float('cat_learning_rate', 1e-3, 0.1, log=True),
        'depth': trial.suggest_int('cat_depth', 4, 10),
        'random_state': 42,
        'verbose': 0
    }

    # LightGBM hyperparameters
    lgbm_params = {
        'n_estimators': trial.suggest_int('lgbm_n_estimators', 200, 600),
        'max_depth': trial.suggest_int('lgbm_max_depth', 4, 10),
        'learning_rate': trial.suggest_float('lgbm_learning_rate', 1e-3, 0.1, log=True),
        'random_state': 42
    }

    # Random-forest hyperparameters
    rf_params = {
        'n_estimators': trial.suggest_int('rf_n_estimators', 200, 600),
        'max_depth': trial.suggest_int('rf_max_depth', 4, 10),
        'random_state': 42
    }

    # Ridge (meta-model) regularisation strength
    ridge_alpha = trial.suggest_float('ridge_alpha', 0.1, 10.0)

    # Base models
    base_models = [
        ('catboost', CatBoostRegressor(**cat_params)),
        ('lgbm', LGBMRegressor(**lgbm_params)),
        ('rf', RandomForestRegressor(**rf_params))
    ]

    # Meta-model
    meta_model = Ridge(alpha=ridge_alpha)

    # Stacking ensemble
    stacking_regressor = StackingRegressor(
        estimators=base_models,
        final_estimator=meta_model,
        cv=5,
        n_jobs=-1,
        passthrough=False
    )

    # Full pipeline: preprocessing followed by the stack
    ml_stack_pipe = Pipeline([
        ('preprocessor', preprocessor),
        ('stack', stacking_regressor)
    ])

    # Only the training rows are needed here; the test rows are never scored.
    X_train = data[:n_train]

    # Manual K-fold loop: refit the whole pipeline on each training split and
    # score it on the held-out split.
    scores = []
    kf = KFold(n_splits=5, shuffle=True, random_state=42)

    for train_idx, val_idx in kf.split(X_train):
        X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
        y_tr, y_val = y.iloc[train_idx], y.iloc[val_idx]

        ml_stack_pipe.fit(X_tr, y_tr)
        preds = ml_stack_pipe.predict(X_val)

        # RMSE on the log1p scale (== RMSLE of the prices)
        rmse = np.sqrt(mean_squared_error(y_val, preds))
        scores.append(rmse)

    return np.mean(scores)

# === Run Optuna ===
study = optuna.create_study(
    study_name="HousePrices_Stacking_Optuna",
    direction="minimize",  # the objective returns an RMSLE, lower is better
    sampler=TPESampler(seed=42)
)

# `n_trials` needs an explicit value.  50 is a moderate starting budget;
# raise it to 100+ for a more thorough search.
study.optimize(objective, n_trials=50)

# === Report the best trial ===
print("=== Лучшие параметры ===")
print(study.best_params)
print(f"Лучшее значение RMSLE: {study.best_value:.4f}")

# === Rebuild the stack with the best hyperparameters found by the study ===
best_params = study.best_params

# Base learners: tuned values are looked up in `best_params` by their
# prefixed names, the remaining arguments are fixed.
cat_best = CatBoostRegressor(
    **{arg: best_params[f'cat_{arg}'] for arg in ('iterations', 'learning_rate', 'depth')},
    verbose=0,
    random_state=42,
)

lgbm_best = LGBMRegressor(
    **{arg: best_params[f'lgbm_{arg}'] for arg in ('n_estimators', 'max_depth', 'learning_rate')},
    random_state=42,
)

rf_best = RandomForestRegressor(
    **{arg: best_params[f'rf_{arg}'] for arg in ('n_estimators', 'max_depth')},
    random_state=42,
)

# Meta-model
meta_model = Ridge(alpha=best_params['ridge_alpha'])

# Stacking ensemble built from the tuned models
base_models = [
    ('catboost', cat_best),
    ('lgbm', lgbm_best),
    ('rf', rf_best),
]
stacking_regressor = StackingRegressor(
    estimators=base_models,
    final_estimator=meta_model,
    passthrough=False,
    n_jobs=-1,
    cv=5,
)

# Pipeline: preprocessing followed by the stack
ml_stack_pipe = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('stack', stacking_regressor),
])

# Fit on the complete training part of `data`
X_train, X_test = data[:n_train], data[n_train:]
ml_stack_pipe.fit(X_train, y)

# Predict the test part and undo the log1p transform applied to the target
preds = np.expm1(ml_stack_pipe.predict(X_test))

# === Submission ===
submission = pd.DataFrame({'Id': test_id, 'SalePrice': preds})
submission.to_csv('submission_optuna_stacking.csv', index=False)
print("✅ Submission saved!")

In [8]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor, StackingRegressor
from sklearn.linear_model import Ridge
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from sklearn.model_selection import train_test_split


# The import list at the top of this cell does not cover the classes used
# below, so they are imported here to make the cell runnable on its own.
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from category_encoders import TargetEncoder  # pip install category_encoders

# `one_hot_coder` and `target_coder` are the column-name lists defined in an
# earlier cell.  handle_unknown='ignore' encodes a category that was absent
# when the encoder was fitted as an all-zero row instead of raising.
preprocessor = ColumnTransformer(
    [
        ('ohe_hot_coder', OneHotEncoder(sparse_output=False, handle_unknown='ignore'), one_hot_coder),
        ('target_coder', TargetEncoder(), target_coder),

    ],
    verbose_feature_names_out = False,
    remainder = 'passthrough'
)



# First-level learners as (name, estimator) pairs, and the Ridge meta-model
base_models = [
    ('catboost', CatBoostRegressor(iterations=300, verbose=0, random_state=42)),
    ('lgbm', LGBMRegressor(n_estimators=300, random_state=42)),
    ('rf', RandomForestRegressor(n_estimators=300, random_state=42))]
meta_model = Ridge(alpha=1.0)

stacking_regressor = StackingRegressor(
    estimators=base_models,
    final_estimator=meta_model,
    cv=5,                # cross-validation used to generate the out-of-fold predictions
    n_jobs=-1,
    passthrough=False    # if True, the original features are appended to the base-model predictions
)

# Full pipeline: preprocessing followed by the stacking ensemble
ml_stack_pipe = Pipeline(
    [
        ('preprocessor', preprocessor),
        ('stack', stacking_regressor)
    ]
)

OSError: dlopen(/opt/miniconda3/lib/python3.13/site-packages/lightgbm/lib/lib_lightgbm.dylib, 0x0006): Library not loaded: @rpath/libomp.dylib
  Referenced from: <D44045CD-B874-3A27-9A61-F131D99AACE4> /opt/miniconda3/lib/python3.13/site-packages/lightgbm/lib/lib_lightgbm.dylib
  Reason: tried: '/opt/homebrew/opt/libomp/lib/libomp.dylib' (no such file), '/System/Volumes/Preboot/Cryptexes/OS/opt/homebrew/opt/libomp/lib/libomp.dylib' (no such file), '/opt/local/lib/libomp/libomp.dylib' (no such file), '/System/Volumes/Preboot/Cryptexes/OS/opt/local/lib/libomp/libomp.dylib' (no such file), '/opt/homebrew/opt/libomp/lib/libomp.dylib' (no such file), '/System/Volumes/Preboot/Cryptexes/OS/opt/homebrew/opt/libomp/lib/libomp.dylib' (no such file), '/opt/local/lib/libomp/libomp.dylib' (no such file), '/System/Volumes/Preboot/Cryptexes/OS/opt/local/lib/libomp/libomp.dylib' (no such file), '/opt/miniconda3/lib/python3.13/lib-dynload/../../libomp.dylib' (no such file), '/opt/miniconda3/bin/../lib/libomp.dylib' (no such file)

In [None]:

import optuna
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from category_encoders import TargetEncoder  # pip install category_encoders
from sklearn.ensemble import RandomForestRegressor, StackingRegressor
from sklearn.linear_model import Ridge
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_log_error, make_scorer
import warnings
warnings.filterwarnings('ignore')

# --- 1. Подготовка данных (пример) ---
# Предположим, у вас есть X_train, y_train
# one_hot_coder = [...]  # список категориальных столбцов для OHE
# target_coder = [...]   # список категориальных столбцов для TargetEncoder


# --- 2. RMSLE metric (the metric of the House Prices competition) ---
def rmsle_score(y_true, y_pred):
    """Return the root mean squared logarithmic error.

    Computes ``sqrt(mean((log1p(y_pred) - log1p(y_true)) ** 2))``.

    Negative predictions are clipped to 0 before the logarithm is taken, so a
    single negative prediction from a regressor yields a (poor) finite score
    instead of an exception that would invalidate the whole evaluation.

    Args:
        y_true: array-like of non-negative target values.
        y_pred: array-like of predictions with the same shape as ``y_true``.

    Returns:
        The RMSLE as a float.

    Raises:
        ValueError: if the inputs are empty, differ in shape, or ``y_true``
            contains negative values.
    """
    true_arr = np.asarray(y_true, dtype=float)
    pred_arr = np.asarray(y_pred, dtype=float)

    if true_arr.shape != pred_arr.shape:
        raise ValueError(
            f"y_true and y_pred must have the same shape, got {true_arr.shape} and {pred_arr.shape}"
        )
    if true_arr.size == 0:
        raise ValueError("RMSLE is undefined for empty input")
    if (true_arr < 0).any():
        raise ValueError("RMSLE cannot be used when targets contain negative values")

    # A negative prediction is treated as a prediction of 0.
    pred_arr = np.clip(pred_arr, 0.0, None)

    log_diff = np.log1p(pred_arr) - np.log1p(true_arr)
    return float(np.sqrt(np.mean(np.square(log_diff))))


# Scorer wrapper around rmsle_score; greater_is_better=False marks it as a loss.
rmsle_scorer = make_scorer(rmsle_score, greater_is_better=False)

# Note: make_scorer flips the sign of a loss, so the Optuna objective minimises (-mean_cv_score)


# --- 3. Optuna objective ---
def objective(trial):
    """Score one Optuna trial of the stacking pipeline.

    Samples hyperparameters for the CatBoost, LightGBM and random-forest base
    learners and for the Ridge meta-model, builds the preprocessing + stacking
    pipeline and evaluates it with 5-fold cross-validation.

    Reads ``X_train``, ``y_train``, ``one_hot_coder``, ``target_coder`` and
    ``rmsle_scorer`` from the enclosing module scope.

    Args:
        trial: the Optuna trial used to sample the hyperparameters.

    Returns:
        Mean cross-validated RMSLE (positive; lower is better).
    """
    # --- Hyperparameters of the base models ---
    catboost_params = {
        'iterations': trial.suggest_int('catboost_iterations', 100, 500),
        'learning_rate': trial.suggest_float('catboost_lr', 0.01, 0.3, log=True),
        'depth': trial.suggest_int('catboost_depth', 4, 10),
        'l2_leaf_reg': trial.suggest_float('catboost_l2', 1e-2, 10.0, log=True),
        'random_strength': trial.suggest_float('catboost_random_strength', 1e-2, 1.0, log=True),
        'bagging_temperature': trial.suggest_float('catboost_bagging_temp', 0.0, 1.0),
        'verbose': 0,
        'random_state': 42
    }

    lgbm_params = {
        'n_estimators': trial.suggest_int('lgbm_n_estimators', 100, 500),
        'learning_rate': trial.suggest_float('lgbm_lr', 0.01, 0.3, log=True),
        'max_depth': trial.suggest_int('lgbm_max_depth', 3, 10),
        'num_leaves': trial.suggest_int('lgbm_num_leaves', 8, 64),
        'subsample': trial.suggest_float('lgbm_subsample', 0.5, 1.0),
        # Row subsampling is only applied when the bagging frequency is > 0;
        # without it the tuned `subsample` value would have no effect.
        'subsample_freq': 1,
        'colsample_bytree': trial.suggest_float('lgbm_colsample', 0.5, 1.0),
        'reg_alpha': trial.suggest_float('lgbm_reg_alpha', 1e-4, 10.0, log=True),
        'reg_lambda': trial.suggest_float('lgbm_reg_lambda', 1e-4, 10.0, log=True),
        'random_state': 42,
        'n_jobs': -1
    }

    rf_params = {
        'n_estimators': trial.suggest_int('rf_n_estimators', 100, 500),
        # suggest_int needs two integer bounds; `None` is not a valid upper
        # bound.  30 is a generous cap for the tree depth.
        'max_depth': trial.suggest_int('rf_max_depth', 3, 30),
        'min_samples_split': trial.suggest_int('rf_min_samples_split', 2, 20),
        'min_samples_leaf': trial.suggest_int('rf_min_samples_leaf', 1, 10),
        'max_features': trial.suggest_float('rf_max_features', 0.1, 1.0),
        'random_state': 42,
        'n_jobs': -1
    }

    # --- Hyperparameter of the meta-model (Ridge) ---
    ridge_alpha = trial.suggest_float('ridge_alpha', 1e-4, 10.0, log=True)

    # --- Pipeline settings (optional) ---
    # Only one choice is offered, so this effectively fixes sparse_output=False
    # while still recording the setting among the trial parameters.
    ohe_sparse = trial.suggest_categorical('ohe_sparse', [False])

    # --- Assemble the pipeline for the sampled parameters ---
    # handle_unknown='ignore' encodes a category that is missing from a
    # training fold as an all-zero row instead of raising during validation.
    preprocessor = ColumnTransformer(
        [
            ('ohe_hot_coder', OneHotEncoder(sparse_output=ohe_sparse, handle_unknown='ignore'), one_hot_coder),
            ('target_coder', TargetEncoder(), target_coder),
        ],
        verbose_feature_names_out=False,
        remainder='passthrough'
    )

    base_models = [
        ('catboost', CatBoostRegressor(**catboost_params)),
        ('lgbm', LGBMRegressor(**lgbm_params)),
        ('rf', RandomForestRegressor(**rf_params))
    ]
    meta_model = Ridge(alpha=ridge_alpha)

    stacking_regressor = StackingRegressor(
        estimators=base_models,
        final_estimator=meta_model,
        cv=5,
        n_jobs=-1,
        passthrough=False
    )

    ml_stack_pipe = Pipeline([
        ('preprocessor', preprocessor),
        ('stack', stacking_regressor)
    ])

    # --- Cross-validation on the training data ---
    # NOTE(review): rmsle_scorer applies log1p itself, so y_train is presumably
    # the raw SalePrice rather than an already log-transformed target - confirm.
    cv_scores = cross_val_score(ml_stack_pipe, X_train, y_train, cv=5, scoring=rmsle_scorer)
    mean_cv_score = -np.mean(cv_scores)  # the scorer returns negated RMSLE; flip it back

    return mean_cv_score