In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

# Важная настройка для корректной настройки pipeline!
import sklearn
sklearn.set_config(transform_output="pandas")

# Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.base import BaseEstimator, TransformerMixin

# Preprocessing
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from sklearn.preprocessing import OneHotEncoder, StandardScaler, RobustScaler, MinMaxScaler, OrdinalEncoder
from category_encoders.target_encoder import TargetEncoder

from sklearn.model_selection import GridSearchCV, KFold

# for model learning
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score

#models
from sklearn.neighbors import KNeighborsClassifier, RadiusNeighborsClassifier
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, BaggingClassifier
from catboost import CatBoostRegressor

# Metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error


# tunning hyperparamters model
import optuna


In [2]:
train = pd.read_csv("/home/savely/ds_bootcamp/ds-phase-1/House_Prices_project/data/train.csv")
test = pd.read_csv("/home/savely/ds_bootcamp/ds-phase-1/House_Prices_project/data/test.csv")
submission = pd.read_csv("/home/savely/ds_bootcamp/ds-phase-1/House_Prices_project/data/sample_submission.csv")

In [None]:
# Предположим, что train и test уже загружены
X, y = train.drop("SalePrice", axis=1), train["SalePrice"]
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

# Определение списков признаков
num_features = X.select_dtypes(exclude='object')
cat_features = X.select_dtypes(include='object')

# Обработка пропущенных значений
nan_num = pd.DataFrame(data={'nan':  num_features.isna().sum()})
nan_cat = pd.DataFrame(data={'nan':  cat_features.isna().sum()})

drop_nan_cat = nan_cat[nan_cat['nan']>500]['nan'].index.to_list()

num_trans = nan_num[nan_num['nan']>0]['nan'].index.to_list()
cat_trans = nan_cat[nan_cat['nan']>0]['nan'].index.to_list()

drop = [
    'Alley', 'MasVnrType', 'FireplaceQu', 'PoolQC', 'Fence', 'MiscFeature',
    'MoSold', '3SsnPorch', 'BsmtFinSF2', 'BsmtHalfBath', 'MiscVal', 'Id',
    'LowQualFinSF', 'YrSold'
]

num_imp_avg = ["MasVnrArea", "BsmtFinSF1", "BsmtUnfSF", "TotalBsmtSF", "BsmtFullBath", "GarageCars", "GarageArea"]
num_imp_no = ["LotFrontage", "GarageYrBlt"]
cat_imp_mode = ["Functional", "Utilities"]
cat_imp_no = [
    "GarageCond", "GarageType", "GarageFinish", "GarageQual", "BsmtFinType2",
    "BsmtExposure", "BsmtQual", "BsmtCond", "BsmtFinType1", "Electrical"
]

my_imputer = ColumnTransformer(
    transformers=[
        ("drop_features", "drop", drop),
        ("num_imp_avg", SimpleImputer(strategy="mean"), num_imp_avg),
        ("cat_imp_mode", SimpleImputer(strategy="most_frequent"), cat_imp_mode),
        ("cat_imp_no", SimpleImputer(strategy="constant", fill_value="missing"), cat_imp_no),
        ("num_imp_no", SimpleImputer(strategy="constant", fill_value=-1), num_imp_no),
    ],
    verbose_feature_names_out=False,
    remainder="passthrough",
)

# Кодирование категориальных признаков
mas_one_hot_encoder = ['ExterQual', 'HeatingQC', 'ExterCond', 'BsmtCond', 'BldgType', 'LotShape', 'LotConfig', 'GarageFinish', 'KitchenQual', 'PavedDrive', 'BsmtQual', 'BsmtExposure', 'LandSlope', 'LandContour', 'MSZoning']
mas_ordinal_encoder = ['Street', 'Utilities', 'CentralAir']
mas_target_encoder = ['RoofStyle', 'Exterior1st', 'GarageQual', 'Foundation', 'Exterior2nd', 'HouseStyle', 'Condition2', 'GarageType', 'Neighborhood', 'GarageCond', 'BsmtFinType2', 'Heating', 'Functional', 'RoofMatl', 'Condition1', 'Electrical', 'SaleCondition', 'SaleType', 'BsmtFinType1']

my_encoder = ColumnTransformer(
    [
        ("ordinalEncoder", OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), mas_ordinal_encoder),
        ("oneHotEncoder", OneHotEncoder(handle_unknown='ignore', sparse_output=False), mas_one_hot_encoder),
        ("targetencoder", TargetEncoder(), mas_target_encoder)
    ],
    verbose_feature_names_out=False,
    remainder='passthrough'
)

# Масштабирование числовых признаков
standart_scaler_columns = ['1stFlrSF', '2ndFlrSF', 'BedroomAbvGr', 'BsmtFinSF1', 'BsmtFullBath', 'BsmtUnfSF', 'EnclosedPorch', 'Fireplaces', 'FullBath', 'GarageArea', 'GarageCars', 'GarageYrBlt', 'GrLivArea', 'HalfBath', 'KitchenAbvGr', 'LotArea', 'LotFrontage', 'MSSubClass', 'MasVnrArea', 'OpenPorchSF', 'OverallCond', 'OverallQual', 'PoolArea', 'ScreenPorch', 'TotRmsAbvGrd', 'TotalBsmtSF', 'WoodDeckSF', 'YearBuilt', 'YearRemodAdd']

my_scaler = ColumnTransformer(
    [
        ("scaling_num_columns", StandardScaler(), standart_scaler_columns)
    ],
    verbose_feature_names_out=False,
    remainder="passthrough"
)

# Создание пайплайна
preprocessor = Pipeline(
    [
        ("imputer", my_imputer),
        ("encoder", my_encoder),
        ("scaler", my_scaler)
    ]
)

# Обучение пайплайна
preprocessor.fit(X_train, np.log(y_train))

# Определение функции для Optuna
def objective(trial):
    params = {
        'iterations': trial.suggest_int('iterations', 500, 1500),
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.3),
        'depth': trial.suggest_int('depth', 4, 10),
        'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1e-8, 100),
        'random_strength': trial.suggest_uniform('random_strength', 1e-8, 10),
        'bagging_temperature': trial.suggest_uniform('bagging_temperature', 0.0, 1.0),
        'border_count': trial.suggest_int('border_count', 32, 255),
        'verbose': False
    }

    ml_pipeline = Pipeline(
        [
            ("preprocessor", preprocessor),
            ("model", CatBoostRegressor(**params))
        ]
    )

    ml_pipeline.fit(X_train, np.log(y_train))
    y_pred = ml_pipeline.predict(X_valid)
    msle = mean_squared_error(np.log(y_valid), y_pred)

    return msle

# Оптимизация с помощью Optuna
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=200)

# Лучшие параметры
best_params = study.best_params
print("Best parameters:", best_params)

# Обучение модели с лучшими параметрами
ml_pipeline = Pipeline(
    [
        ("preprocessor", preprocessor),
        ("model", CatBoostRegressor(**best_params))
    ]
)
ml_pipeline.fit(X_train, np.log(y_train))

# Предсказание на тестовых данных
testy_preds = ml_pipeline.predict(test)

# Оценка качества
print('MSLE:', mean_squared_error(np.log(submission['SalePrice']), testy_preds))

# Сохранение результатов
result = pd.DataFrame({'Id': test['Id'], 'SalePrice': np.exp(testy_preds)})
result.to_csv('submission_new_optuna.csv', index=False)

[I 2025-03-13 22:57:07,527] A new study created in memory with name: no-name-7783e268-b4e8-45a4-8a9f-b6de3561ddb3
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.3),
  'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1e-8, 100),
  'random_strength': trial.suggest_uniform('random_strength', 1e-8, 10),
  'bagging_temperature': trial.suggest_uniform('bagging_temperature', 0.0, 1.0),
[I 2025-03-13 22:57:29,340] Trial 0 finished with value: 0.02018230573136654 and parameters: {'iterations': 1310, 'learning_rate': 0.04995205552994851, 'depth': 9, 'l2_leaf_reg': 7.834982364026298e-07, 'random_strength': 5.026706855606484, 'bagging_temperature': 0.7968451360176834, 'border_count': 100}. Best is trial 0 with value: 0.02018230573136654.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.3),
  'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1e-8, 100),
  'random_strength': trial.suggest_uniform('random_strength', 1e-8, 10),
  'bagging_temperatur

Best parameters: {'iterations': 1381, 'learning_rate': 0.04909330950239233, 'depth': 4, 'l2_leaf_reg': 0.2255407192268056, 'random_strength': 3.1920059019831255, 'bagging_temperature': 0.4952625793593215, 'border_count': 43}
0:	learn: 0.3796087	total: 940us	remaining: 1.3s
1:	learn: 0.3715148	total: 1.89ms	remaining: 1.31s
2:	learn: 0.3618029	total: 2.66ms	remaining: 1.22s
3:	learn: 0.3527184	total: 3.4ms	remaining: 1.17s
4:	learn: 0.3432428	total: 4.28ms	remaining: 1.18s
5:	learn: 0.3351726	total: 5.05ms	remaining: 1.16s
6:	learn: 0.3269732	total: 5.92ms	remaining: 1.16s
7:	learn: 0.3184411	total: 6.77ms	remaining: 1.16s
8:	learn: 0.3107321	total: 7.57ms	remaining: 1.15s
9:	learn: 0.3035420	total: 8.63ms	remaining: 1.18s
10:	learn: 0.2959141	total: 11.4ms	remaining: 1.42s
11:	learn: 0.2884247	total: 13.2ms	remaining: 1.5s
12:	learn: 0.2831113	total: 14.3ms	remaining: 1.5s
13:	learn: 0.2768083	total: 15.4ms	remaining: 1.5s
14:	learn: 0.2702704	total: 16.3ms	remaining: 1.48s
15:	learn: 