In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from sklearn.linear_model import Lasso, ElasticNet

from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import root_mean_squared_error

In [168]:
# Load the training data and fill the gaps in three numeric columns.
df = pd.read_csv('train.csv')
# Missing lot frontage -> column mean (computed over the whole file).
df['LotFrontage'] = df['LotFrontage'].fillna(df['LotFrontage'].mean())
# Missing veneer area -> 0.
df['MasVnrArea'] = df['MasVnrArea'].fillna(0)
# Missing garage year -> placeholder value 10000
# (presumably marks houses without a garage -- TODO confirm).
df['GarageYrBlt'] = df['GarageYrBlt'].fillna(10000)
# Author's note: nothing else is filled in here.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1460 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

In [169]:
# Categorical columns, split into two groups. Both groups are one-hot encoded
# below; the split is only kept because other cells refer to these names.
label_cols = [
    'MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'LotConfig',
    'LandSlope', 'Condition1', 'BldgType', 'HouseStyle', 'Exterior1st',
    'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
    'BsmtFinType2', 'CentralAir', 'KitchenQual', 'FireplaceQu', 'GarageType',
    'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive', 'PoolQC',
    'SaleCondition',
]
onehot_cols = [
    'Utilities', 'Neighborhood', 'Condition2', 'RoofStyle', 'RoofMatl',
    'Exterior2nd', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'Heating',
    'HeatingQC', 'Electrical', 'Functional', 'Fence', 'MiscFeature',
    'SaleType',
]
str_cols = onehot_cols + label_cols

# Features removed before modelling. The list is identical to the one printed
# by the permutation-importance cell further down in this notebook.
drop_feature = [
    'BsmtCond', '3SsnPorch', 'RoofStyle', 'BsmtFullBath', 'BsmtFinType2',
    'BldgType', 'GarageQual', 'EnclosedPorch', 'MasVnrType', 'Functional',
    'FireplaceQu', 'MSSubClass', 'Electrical', 'MasVnrArea', 'Fence',
    'Exterior2nd', 'Exterior1st',
]

# Keep the order of `str_cols` when choosing the columns to encode. A set
# difference iterates strings in hash order, which changes between interpreter
# sessions, so the resulting feature order would not be reproducible.
_dropped = set(drop_feature)
_encode_cols = [col for col in str_cols if col not in _dropped]

# NOTE: `df` is overwritten here, so this cell cannot be run twice in a row
# (the second run would try to drop columns that are already gone).
df = pd.get_dummies(df.drop(columns=drop_feature), columns=_encode_cols)

In [170]:
# Hold out 20% of the rows for validation; the target is the natural log of SalePrice.
X_train, X_val, y_train, y_val = train_test_split(
    df.drop(columns=['Id', 'SalePrice']),
    np.log(df['SalePrice']),
    test_size=0.2,
    random_state=42,
)


In [173]:
# Gradient-boosted trees (XGBoost) with fixed hyperparameters, fitted on the
# training split and scored by RMSE on the validation split (log-price scale).
xgbr_model = XGBRegressor(
    n_estimators=656,
    max_depth=3,
    learning_rate=0.03157223564603808,
    subsample=0.8737643695824285,
    colsample_bytree=0.8977579145799933,
    reg_alpha=0.00733063035498116,
    reg_lambda=6.713846748376149,
    min_child_weight=2,
    random_state=42,
)
xgbr_model.fit(X_train, y_train)
root_mean_squared_error(y_val, xgbr_model.predict(X_val))

0.12346106004789488

In [154]:
# Смотрим как изменяется модель при изменении признаков
from sklearn.inspection import permutation_importance

# Permutation importance of every feature for the fitted XGBoost model,
# measured on the validation split with 10 shuffles per feature.
result = permutation_importance(
    xgbr_model,
    X_val,
    y_val,
    n_repeats=10,
    random_state=42,
)

# One row per feature, most important first.
importance_df = pd.DataFrame(
    {'feature': X_val.columns, 'importance': result.importances_mean}
)
importance_df = importance_df.sort_values(by='importance', ascending=False)

print(importance_df)


                 feature  importance
15             GrLivArea    0.155703
3            OverallQual    0.155073
11           TotalBsmtSF    0.028449
5              YearBuilt    0.023235
4            OverallCond    0.017550
..                   ...         ...
126     Electrical_FuseA   -0.000266
93   Exterior2nd_MetalSd   -0.000296
7             MasVnrArea   -0.000296
139           Fence_GdWo   -0.000313
205  Exterior1st_BrkFace   -0.000377

[287 rows x 2 columns]


In [155]:
# Find redundant features.
def _source_feature(column):
    """Return the categorical column a dummy column came from, else the column itself."""
    prefix = column.rsplit('_', 1)[0]
    return prefix if prefix in str_cols else column


# Map each (possibly one-hot) column back to its original feature name.
importance_df['start_feature'] = importance_df['feature'].map(_source_feature)

# Sum the importances per original feature, most important first.
_importance_by_feature = (
    importance_df[['importance', 'start_feature']]
    .groupby('start_feature')
    .agg(lambda scores: scores.sum())
    .sort_values(by='importance', ascending=False)
)

# Features whose summed importance is negative are the ones to drop.
# This overwrites the notebook-level `drop_feature` list.
drop_feature = list(_importance_by_feature.query('importance<0').index)

In [156]:
# Display the features selected for removal.
drop_feature

['BsmtCond',
 '3SsnPorch',
 'RoofStyle',
 'BsmtFullBath',
 'BsmtFinType2',
 'BldgType',
 'GarageQual',
 'EnclosedPorch',
 'MasVnrType',
 'Functional',
 'FireplaceQu',
 'MSSubClass',
 'Electrical',
 'MasVnrArea',
 'Fence',
 'Exterior2nd',
 'Exterior1st']

In [176]:
# LightGBM regressor with fixed hyperparameters, fitted on the training split
# and scored by RMSE on the validation split (log-price scale).
# NOTE(review): with max_depth=2 the num_leaves=37 limit is presumably never
# reached, and LightGBM reportedly ignores `subsample` unless `subsample_freq`
# is also set -- confirm against the LightGBM parameter docs.
lgbm_model = LGBMRegressor(
    n_estimators=1969,
    max_depth=2,
    num_leaves=37,
    learning_rate=0.028872442872394172,
    subsample=0.8224941244239035,
    colsample_bytree=0.5911390140527809,
    reg_alpha=0.33710269270900395,
    reg_lambda=1.4505375123078616e-07,
    min_child_samples=8,
    random_state=42,
)
lgbm_model.fit(X_train, y_train)
root_mean_squared_error(y_val, lgbm_model.predict(X_val))

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001863 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2910
[LightGBM] [Info] Number of data points in the train set: 1168, number of used features: 149
[LightGBM] [Info] Start training from score 12.030652


0.12618743121374612

In [178]:
# Lasso regression using the best parameters printed by the Optuna search
# cell below, scored by RMSE on the validation split (log-price scale).
lasso_model = Lasso(
    alpha=0.00016964172244575732,
    fit_intercept=True,
    max_iter=8704,
    tol=0.00027696118295903035,
    selection='cyclic',
    random_state=42,
)
lasso_model.fit(X_train, y_train)
root_mean_squared_error(y_val, lasso_model.predict(X_val))

0.11945510323439838

In [177]:
import optuna
#Тут подбирал параметры, каждый раз меняя код
def objective(trial):
    """Optuna objective: validation RMSE of a Lasso model with sampled hyperparameters.

    Args:
        trial: Optuna trial used to sample `alpha`, `max_iter`, `tol` and
            `selection`.

    Returns:
        RMSE between `y_val` and the model's predictions on `X_val`.
        Missing values in both feature matrices are replaced by 0 first.
    """
    # Keyword arguments are evaluated left to right, so the parameters are
    # suggested in the order alpha, max_iter, tol, selection.
    lasso = Lasso(
        alpha=trial.suggest_float('alpha', 1e-4, 10.0, log=True),
        max_iter=trial.suggest_int('max_iter', 1000, 10000),
        tol=trial.suggest_float('tol', 1e-5, 1e-2, log=True),
        selection=trial.suggest_categorical('selection', ['cyclic', 'random']),
        fit_intercept=True,
        random_state=42,
    )
    lasso.fit(X_train.fillna(0), y_train)

    val_pred = lasso.predict(X_val.fillna(0))
    return root_mean_squared_error(y_val, val_pred)

# Run the hyperparameter search: 200 trials, minimising validation RMSE.
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=200)

# Report the best parameter set and its score.
print("Лучшие параметры:")
for param_name, param_value in study.best_params.items():
    print(f"{param_name}: {param_value}")
print(f"Лучшее RMSE: {study.best_value}")

[I 2025-08-26 17:02:09,319] A new study created in memory with name: no-name-6a0f9e4a-247c-47f8-a899-dd190185cfd2
[I 2025-08-26 17:02:09,357] Trial 0 finished with value: 0.13113339103188865 and parameters: {'alpha': 0.0008640426822287815, 'max_iter': 2242, 'tol': 4.279520630608992e-05, 'selection': 'cyclic'}. Best is trial 0 with value: 0.13113339103188865.
  model = cd_fast.enet_coordinate_descent(
[I 2025-08-26 17:02:09,641] Trial 1 finished with value: 0.12020244212569611 and parameters: {'alpha': 0.000312706194798803, 'max_iter': 1976, 'tol': 6.420595127898673e-05, 'selection': 'random'}. Best is trial 1 with value: 0.12020244212569611.
[I 2025-08-26 17:02:09,664] Trial 2 finished with value: 0.1307745297182389 and parameters: {'alpha': 0.0008355836987635953, 'max_iter': 4512, 'tol': 0.0024187937222389703, 'selection': 'cyclic'}. Best is trial 1 with value: 0.12020244212569611.
[I 2025-08-26 17:02:09,686] Trial 3 finished with value: 0.18902363940940636 and parameters: {'alpha': 0

Лучшие параметры:
alpha: 0.00016964172244575732
max_iter: 8704
tol: 0.00027696118295903035
selection: cyclic
Лучшее RMSE: 0.11945510323439838


In [181]:
#Тут подбор весов

# The fitted models and the validation split do not change between trials, so
# their predictions are computed once here instead of inside every trial.
_val_pred_lasso = lasso_model.predict(X_val)
_val_pred_xgb = xgbr_model.predict(X_val)
_val_pred_lgbm = lgbm_model.predict(X_val)


def objective_ensemble_weights(trial):
    """Optuna objective: validation RMSE of a weighted blend of the three models.

    Args:
        trial: Optuna trial used to sample one raw weight per model.

    Returns:
        RMSE between `y_val` and the blended validation prediction, where the
        raw weights are normalised to sum to 1.

    Raises:
        optuna.TrialPruned: if all three sampled weights are exactly zero, in
            which case no normalised blend exists.
    """
    # NOTE(review): the Lasso weight is sampled from [0, 0.1] while the other
    # two use [0, 1] -- confirm this asymmetry is intended.
    weight_lasso = trial.suggest_float('weight_lasso', 0.0, 0.1)
    weight_xgb = trial.suggest_float('weight_xgb', 0.0, 1.0)
    weight_lgbm = trial.suggest_float('weight_lgbm', 0.0, 1.0)

    # Normalise the weights; skip the degenerate all-zero draw instead of
    # failing with ZeroDivisionError.
    total = weight_xgb + weight_lgbm + weight_lasso
    if total == 0.0:
        raise optuna.TrialPruned()
    weight_lasso /= total
    weight_xgb /= total
    weight_lgbm /= total

    # Weighted average of the cached validation predictions.
    y_pred = (weight_lasso * _val_pred_lasso +
              weight_xgb * _val_pred_xgb +
              weight_lgbm * _val_pred_lgbm)

    return root_mean_squared_error(y_val, y_pred)

# Search for the blend weights: 1000 trials, minimising validation RMSE.
study_weights = optuna.create_study(direction='minimize')
study_weights.optimize(objective_ensemble_weights, n_trials=1000)

# The stored parameters are the raw (un-normalised) weights.
best_weights = study_weights.best_params
print("Лучшие веса:", best_weights)
print(f"Лучшее RMSE: {study_weights.best_value}")

[I 2025-08-26 17:05:21,287] A new study created in memory with name: no-name-f501d79e-8530-4fb8-ab50-552ff745e95c
[I 2025-08-26 17:05:21,349] Trial 0 finished with value: 0.12331029126952335 and parameters: {'weight_lasso': 0.008899892856878634, 'weight_xgb': 0.8818393745495692, 'weight_lgbm': 0.5466923437323968}. Best is trial 0 with value: 0.12331029126952335.
[I 2025-08-26 17:05:21,418] Trial 1 finished with value: 0.12316228600441836 and parameters: {'weight_lasso': 0.019198285609382783, 'weight_xgb': 0.7863742728778124, 'weight_lgbm': 0.5305040657831079}. Best is trial 1 with value: 0.12316228600441836.
[I 2025-08-26 17:05:21,502] Trial 2 finished with value: 0.12374693078335292 and parameters: {'weight_lasso': 0.009486964397670562, 'weight_xgb': 0.6127585130850901, 'weight_lgbm': 0.7884282286422353}. Best is trial 1 with value: 0.12316228600441836.
[I 2025-08-26 17:05:21,571] Trial 3 finished with value: 0.12312149607371653 and parameters: {'weight_lasso': 0.03869000987331253, 'w

Лучшие веса: {'weight_lasso': 0.08990821446118184, 'weight_xgb': 0.06095925639046547, 'weight_lgbm': 8.614009482042084e-05}
Лучшее RMSE: 0.11609723316820351


In [None]:
import random
#Тут подбирал какие колонки какими будут
def _load_raw_training_frame(path='train.csv'):
    """Read the raw training CSV and apply the notebook's three imputations."""
    raw = pd.read_csv(path)
    raw['LotFrontage'] = raw['LotFrontage'].fillna(raw['LotFrontage'].mean())
    raw['MasVnrArea'] = raw['MasVnrArea'].fillna(0)
    raw['GarageYrBlt'] = raw['GarageYrBlt'].fillna(10000)
    return raw


# The notebook-level `df` has already been one-hot encoded and pruned by the
# time this cell runs, so the categorical columns are no longer present in it.
# The search therefore works on its own freshly loaded raw frame.
_raw_df = _load_raw_training_frame()


def objective_cols(trial):
    """Optuna objective: validation RMSE for one choice of per-column encoding.

    For every column in `str_cols` the trial picks 0 (one-hot encoding) or
    1 (`LabelEncoder`). A default `XGBRegressor` is then fitted on an 80/20
    split of the encoded raw data.

    Args:
        trial: Optuna trial; one categorical parameter is suggested per column.

    Returns:
        RMSE of the model on the validation split (log-price scale).
    """
    # One decision per categorical column: True -> label encode, False -> one-hot.
    use_label = {
        col: trial.suggest_categorical(col, [0, 1]) == 1
        for col in str_cols
    }
    label_cols = [col for col, as_label in use_label.items() if as_label]
    onehot_cols = [col for col, as_label in use_label.items() if not as_label]

    df_new = pd.get_dummies(_raw_df, columns=onehot_cols)
    for col in label_cols:
        df_new[col] = LabelEncoder().fit_transform(_raw_df[col])

    X_train, X_val, y_train, y_val = train_test_split(
        df_new.drop(columns=['Id', 'SalePrice']),
        np.log(df_new.SalePrice),
        test_size=0.2,
        random_state=42,
    )

    # Fit a default model and score it on the held-out rows.
    model = XGBRegressor()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)

    return root_mean_squared_error(y_val, y_pred)

# Search over the per-column encoding choices: 100 trials, minimising RMSE.
study_cols = optuna.create_study(direction='minimize')
study_cols.optimize(objective_cols, n_trials=100)

# Best assignment found: column name -> 0 (one-hot) or 1 (label encoding).
best_label_cols = study_cols.best_params
print("Лучшие label_cols:", best_label_cols)
print(f"Лучшее RMSE: {study_cols.best_value}")

# Author's conclusion: in the end, no option looked better than one-hot
# encoding every column.

[I 2025-08-20 12:09:42,775] A new study created in memory with name: no-name-1cceace3-5c9b-4e95-a376-f0ad6a1eb8ca
[I 2025-08-20 12:09:43,290] Trial 0 finished with value: 0.14069955933833778 and parameters: {'MSZoning': 1, 'Street': 1, 'Alley': 1, 'LotShape': 0, 'LandContour': 1, 'Utilities': 1, 'LotConfig': 1, 'LandSlope': 1, 'Neighborhood': 0, 'Condition1': 0, 'Condition2': 1, 'BldgType': 1, 'HouseStyle': 1, 'RoofStyle': 0, 'RoofMatl': 1, 'Exterior1st': 1, 'Exterior2nd': 0, 'MasVnrType': 1, 'ExterQual': 1, 'ExterCond': 0, 'Foundation': 1, 'BsmtQual': 1, 'BsmtCond': 0, 'BsmtExposure': 1, 'BsmtFinType1': 0, 'BsmtFinType2': 1, 'Heating': 0, 'HeatingQC': 0, 'CentralAir': 1, 'Electrical': 1, 'KitchenQual': 1, 'Functional': 1, 'FireplaceQu': 1, 'GarageType': 1, 'GarageFinish': 1, 'GarageQual': 1, 'GarageCond': 1, 'PavedDrive': 1, 'PoolQC': 1, 'Fence': 0, 'MiscFeature': 0, 'SaleType': 0, 'SaleCondition': 1}. Best is trial 0 with value: 0.14069955933833778.
[I 2025-08-20 12:09:43,878] Trial 

Лучшие label_cols: {'MSZoning': 1, 'Street': 1, 'Alley': 1, 'LotShape': 1, 'LandContour': 1, 'Utilities': 0, 'LotConfig': 1, 'LandSlope': 1, 'Neighborhood': 0, 'Condition1': 1, 'Condition2': 0, 'BldgType': 1, 'HouseStyle': 1, 'RoofStyle': 0, 'RoofMatl': 0, 'Exterior1st': 1, 'Exterior2nd': 0, 'MasVnrType': 1, 'ExterQual': 1, 'ExterCond': 1, 'Foundation': 1, 'BsmtQual': 1, 'BsmtCond': 0, 'BsmtExposure': 0, 'BsmtFinType1': 0, 'BsmtFinType2': 1, 'Heating': 0, 'HeatingQC': 0, 'CentralAir': 1, 'Electrical': 0, 'KitchenQual': 1, 'Functional': 0, 'FireplaceQu': 1, 'GarageType': 1, 'GarageFinish': 1, 'GarageQual': 1, 'GarageCond': 1, 'PavedDrive': 1, 'PoolQC': 1, 'Fence': 0, 'MiscFeature': 0, 'SaleType': 0, 'SaleCondition': 1}
Лучшее RMSE: 0.1370133931166678


Полный код с уже готовыми параметрами

In [230]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from sklearn.linear_model import Lasso, ElasticNet

from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import root_mean_squared_error

# Read the training and test sets from the working directory.
train, test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))
def df_to_Xy(df):
    """Turn a raw house-price frame into a model-ready feature matrix.

    Steps: fill missing `LotFrontage` with the frame's own mean, `MasVnrArea`
    with 0 and `GarageYrBlt` with 10000; drop the columns in `drop_feature`;
    one-hot encode the remaining categorical columns.

    The input frame is not modified. Dummy columns are created in the order
    of the categorical column lists, so the output layout is the same on
    every run.

    Args:
        df: Raw frame containing `Id`, the columns listed below and,
            optionally, `SalePrice`.

    Returns:
        `(X, y)` when `SalePrice` is present, where `X` is the encoded frame
        without `Id`/`SalePrice` and `y` is the natural log of `SalePrice`;
        otherwise just `X` (the encoded frame without `Id`).
    """
    label_cols = [
        'MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'LotConfig',
        'LandSlope', 'Condition1', 'BldgType', 'HouseStyle', 'Exterior1st',
        'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
        'BsmtFinType2', 'CentralAir', 'KitchenQual', 'FireplaceQu',
        'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond',
        'PavedDrive', 'PoolQC', 'SaleCondition',
    ]
    onehot_cols = [
        'Utilities', 'Neighborhood', 'Condition2', 'RoofStyle', 'RoofMatl',
        'Exterior2nd', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'Heating',
        'HeatingQC', 'Electrical', 'Functional', 'Fence', 'MiscFeature',
        'SaleType',
    ]
    str_cols = onehot_cols + label_cols
    drop_feature = [
        'BsmtCond', '3SsnPorch', 'RoofStyle', 'BsmtFullBath', 'BsmtFinType2',
        'BldgType', 'GarageQual', 'EnclosedPorch', 'MasVnrType', 'Functional',
        'FireplaceQu', 'MSSubClass', 'Electrical', 'MasVnrArea', 'Fence',
        'Exterior2nd', 'Exterior1st',
    ]

    # Work on a copy so the caller's frame keeps its original values.
    df = df.copy()
    df['LotFrontage'] = df['LotFrontage'].fillna(df['LotFrontage'].mean())
    df['MasVnrArea'] = df['MasVnrArea'].fillna(0)
    df['GarageYrBlt'] = df['GarageYrBlt'].fillna(10000)

    # Preserve list order: iterating a set of strings follows hash order,
    # which is not stable between interpreter sessions.
    dropped = set(drop_feature)
    encode_cols = [col for col in str_cols if col not in dropped]

    df = pd.get_dummies(df.drop(columns=drop_feature), columns=encode_cols)
    if 'SalePrice' in df.columns:
        return df.drop(columns=['Id', 'SalePrice']), np.log(df.SalePrice)
    return df.drop(columns=['Id'])

X_train, y_train = df_to_Xy(train)

# Build the test matrix and align it with the training layout:
#   * reindex -> extra columns are dropped, missing ones are added as 0;
#   * fillna  -> remaining gaps become 0;
#   * astype  -> each column is cast to the dtype it has in X_train.
X_test = (
    df_to_Xy(test)
    .reindex(columns=X_train.columns, fill_value=0)
    .fillna(0)
    .astype(X_train.dtypes.to_dict())
)

# Final XGBoost model: same hyperparameters as in the validation experiment,
# now fitted on the full training matrix.
xgbr_model = XGBRegressor(
    n_estimators=656,
    max_depth=3,
    learning_rate=0.03157223564603808,
    subsample=0.8737643695824285,
    colsample_bytree=0.8977579145799933,
    reg_alpha=0.00733063035498116,
    reg_lambda=6.713846748376149,
    min_child_weight=2,
    random_state=42,
)
xgbr_model.fit(X_train, y_train)

# Final LightGBM model: same hyperparameters as in the validation experiment,
# now fitted on the full training matrix.
lgbm_model = LGBMRegressor(
    n_estimators=1969,
    max_depth=2,
    num_leaves=37,
    learning_rate=0.028872442872394172,
    subsample=0.8224941244239035,
    colsample_bytree=0.5911390140527809,
    reg_alpha=0.33710269270900395,
    reg_lambda=1.4505375123078616e-07,
    min_child_samples=8,
    random_state=42,
)
lgbm_model.fit(X_train, y_train)

# Final Lasso model: same hyperparameters as in the validation experiment,
# now fitted on the full training matrix.
lasso_model = Lasso(
    alpha=0.00016964172244575732,
    fit_intercept=True,
    max_iter=8704,
    tol=0.00027696118295903035,
    selection='cyclic',
    random_state=42,
)
lasso_model.fit(X_train, y_train)

# Raw (un-normalised) blend weights, as printed by the weight search above.
weight_lasso = 0.08990821446118184
weight_xgb = 0.06095925639046547
weight_lgbm = 8.614009482042084e-05
total = weight_lasso + weight_xgb + weight_lgbm

# Weighted average of the three models' log-price predictions on the test
# set; dividing by `total` normalises the weights to sum to 1.
_blend = weight_lasso * lasso_model.predict(X_test)
_blend = _blend + weight_xgb * xgbr_model.predict(X_test)
_blend = _blend + weight_lgbm * lgbm_model.predict(X_test)
y_pred = _blend / total


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002360 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3123
[LightGBM] [Info] Number of data points in the train set: 1460, number of used features: 155
[LightGBM] [Info] Start training from score 12.024051


In [231]:
# Convert the log-scale predictions back to prices and write the submission file.
submission = pd.DataFrame({
    'Id': test['Id'],
    'SalePrice': np.exp(y_pred),
})
submission.to_csv('sample_submission.csv', index=False)