In [39]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import root_mean_squared_error
import numpy as np
import time
import xgboost as xgb
import joblib
import os
from sklearn.linear_model import Lasso
from sklearn.feature_selection import SelectFromModel, SelectKBest, mutual_info_regression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import ParameterGrid
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression

# Обработка данных

### Обработка Null (NaN)

In [40]:
def data_processing_NAN(df, test = 0):

    print(f"Начальная размерность: {df.shape}")
    print(f"Количество пропусков: {df.isnull().sum().sum()}")

    # удаляем все столбцы, в которых слишком много пропущенных значений
    num_str = df.shape[0]
    sum_null_in_cols = df.isnull().sum()
    missing_cols = sum_null_in_cols[sum_null_in_cols > num_str * 0.1]
    df = df.drop(columns=missing_cols.index.tolist())

    print(f"Размерность после первичной обработки: {df.shape}")
    print(f"Количество пропусков: {df.isnull().sum().sum()}")

    # разделяем данные на признаки и целевую переменную

    if not test: target = np.log1p(df["SalePrice"]) # логорифмируем, чтобы выбросы не так сильно влияли
    feature = df.drop(columns=['SalePrice']) if not test else df

    # заполняем оставшиеся пропуски
    num_features = feature.select_dtypes(exclude=['object']).columns.tolist()
    cat_features = feature.select_dtypes(include=['object']).columns.tolist()

    imputer_num = KNNImputer(n_neighbors = 5, weights = "distance")
    imputer_cat = SimpleImputer(strategy="most_frequent")

    feature[num_features] = imputer_num.fit_transform(feature[num_features])
    feature[cat_features] = imputer_cat.fit_transform(feature[cat_features])

    print(f"Размерность после вторичной обработки обработки: {feature.shape}")
    print(f"Количество пропусков: {feature.isnull().sum().sum()}")

    if not test:
        return feature, target # выводим признаки и целевую переменную
    else:
        return feature


### Обработка категориальных признаков

In [41]:
def data_processing_cat_features(feature_train, feature_test):
    print('\nНачало обработки категориальных признаков')

    # --- OneHotEncoding ---
    ohe_cols = ['MSZoning', 'Street', 'LotConfig', 'BldgType', 'HouseStyle', 
                'RoofStyle', 'RoofMatl', 'Foundation', 'Heating', 'CentralAir', 
                'PavedDrive', 'SaleType', 'SaleCondition', 'Utilities']

    ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
    ohe.fit(feature_train[ohe_cols])

    ohe_train = ohe.transform(feature_train[ohe_cols])
    ohe_test = ohe.transform(feature_test[ohe_cols])

    ohe_cols_names = ohe.get_feature_names_out(ohe_cols)
    
    ohe_train_df = pd.DataFrame(ohe_train, columns=ohe_cols_names, index=feature_train.index)
    ohe_test_df = pd.DataFrame(ohe_test, columns=ohe_cols_names, index=feature_test.index)

    # --- Удаляем закодированные колонки и добавляем обратно one-hot ---
    feature_train = feature_train.drop(columns=ohe_cols)
    feature_test = feature_test.drop(columns=ohe_cols)

    feature_train = pd.concat([feature_train, ohe_train_df], axis=1)
    feature_test = pd.concat([feature_test, ohe_test_df], axis=1)

    # --- Ordinal + Label Encoding ---
    ordinal_cols = [
        'LotShape', 'LandSlope', 'ExterQual', 'ExterCond', 'HeatingQC', 'KitchenQual', 
        'Functional', 'LandContour', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 
        'BsmtFinType1', 'BsmtFinType2', 'Electrical', 'GarageType', 'GarageFinish', 
        'GarageQual', 'GarageCond'
    ]

    ordinal_mapping = [
        ['Reg', 'IR1', 'IR2', 'IR3'], 
        ['Gtl', 'Mod', 'Sev'], 
        ['Ex', 'Gd', 'TA', 'Fa'], 
        ['Ex', 'Gd', 'TA', 'Fa', 'Po'], 
        ['Ex', 'Gd', 'TA', 'Fa', 'Po'], 
        ['Ex', 'Gd', 'TA', 'Fa'], 
        ['Typ', 'Min1', 'Min2', 'Mod', 'Maj1', 'Maj2', 'Sev'],
        ['Bnk', 'Lvl', 'HLS', 'Low'],
        ['Ex', 'Gd', 'TA', 'Fa'],  
        ['TA', 'Gd', 'Fa', 'Po'], 
        ['No', 'Gd', 'Mn', 'Av'],  
        ['GLQ', 'ALQ', 'Unf', 'Rec', 'BLQ', 'LwQ'],  
        ['Unf', 'BLQ', 'ALQ', 'Rec', 'LwQ', 'GLQ'], 
        ['SBrkr', 'FuseF', 'FuseA', 'FuseP', 'Mix'],  
        ['Attchd', 'Detchd', 'BuiltIn', 'CarPort', 'Basment', '2Types'],  
        ['RFn', 'Unf', 'Fin'],  
        ['TA', 'Fa', 'Gd', 'Ex', 'Po'], 
        ['TA', 'Fa', 'Gd', 'Po', 'Ex']  
    ]

    ordinal_encoder = OrdinalEncoder(categories=ordinal_mapping)
    ordinal_encoder.fit(feature_train[ordinal_cols])

    feature_train[ordinal_cols] = ordinal_encoder.transform(feature_train[ordinal_cols])
    feature_test[ordinal_cols] = ordinal_encoder.transform(feature_test[ordinal_cols])

    # --- Label Encoding ---
    label_cols = ['Neighborhood', 'Condition1', 'Condition2', 'Exterior1st', 'Exterior2nd']
    for col in label_cols:
        le = LabelEncoder()
        le.fit(feature_train[col])  # обучаем на train
        feature_train[col] = le.transform(feature_train[col])
        feature_test[col] = le.transform(feature_test[col])  # transform на test

    # --- Проверка оставшихся категориальных признаков ---
    for title, data in [('Train', feature_train), ('Test', feature_test)]:
        cat_features = data.select_dtypes(include=['object']).columns.tolist()
        print(f"Количество не закодированных признаков в {title}: {len(cat_features)}")
        if cat_features:
            for col in cat_features:
                print(f"{title} - {col}: {data[col].unique()}")

    print("Обработка категориальных признаков закончена")
    return feature_train, feature_test

In [42]:
def data_separation(feature, target):
    print('\n', "Разделение данных на тренировочную, валидационную и тестовую выборку началось.", sep = "")
    X_train, X_test, Y_train, Y_test = train_test_split(feature, target, test_size= 0.2, random_state=42)
    Y_test = np.expm1(Y_test)
    print("Разделение данных завершено.")

    return X_train, X_test, Y_train, Y_test

In [43]:
df_train = pd.read_csv(r"data\train.csv")
df_test = pd.read_csv(r"data\test.csv")

feature_train, target = data_processing_NAN(df = df_train)
feature_test = data_processing_NAN(df = df_test, test = 1)

feature, feature_test = data_processing_cat_features(feature_train= feature_train, feature_test= feature_test)
X_train, X_test, Y_train, Y_test = data_separation(feature = feature, target = target)

Начальная размерность: (1460, 81)
Количество пропусков: 7829
Размерность после первичной обработки: (1460, 74)
Количество пропусков: 601
Размерность после вторичной обработки обработки: (1460, 73)
Количество пропусков: 0
Начальная размерность: (1459, 80)
Количество пропусков: 7878
Размерность после первичной обработки: (1459, 73)
Количество пропусков: 642
Размерность после вторичной обработки обработки: (1459, 73)
Количество пропусков: 0

Начало обработки категориальных признаков
Количество не закодированных признаков в Train: 0
Количество не закодированных признаков в Test: 0
Обработка категориальных признаков закончена

Разделение данных на тренировочную, валидационную и тестовую выборку началось.
Разделение данных завершено.


# Функция подсчета RMSE

In [44]:
def RMSE(model, select_col = X_train.columns):
    predicted = np.expm1(model.predict(X_test[select_col]))

    loss = root_mean_squared_error(predicted, Y_test)

    print(loss)

    return loss

*функция для красивых выводов признаков (понадобиться далее)

In [45]:
def print_feature(feature, title = ""):
    if title:
        print(title)

    for i in range(0, len(feature), 5):
        print(*feature[i: i + 5])

# Подбор признаков для моделей

NOTE: Какой процент брать для отбора признаков был подобран эксперементально. Далее вы поймете о чем речь.

### Подбор признаков для RF

In [46]:
def simplest_model_RF(select_col = X_train.columns, title = ""):
    start = time.time()

    if title:
        print(title, '\n')

    model = RandomForestRegressor(n_estimators=100, random_state=42)
    model.fit(X_train[select_col], Y_train)

    RMSE(model, select_col)

    end = time.time()
    print(f"time: {end - start}", '\n')

    return model

In [47]:
model_RF = simplest_model_RF(title="Модель RF для подбора признаков")

coef = model_RF.feature_importances_ * 100

table_coef = pd.DataFrame({
    'Feature': X_train.columns.tolist(),
    'Persentage importance': coef
})

best_feature_RF = table_coef[table_coef["Persentage importance"] > 0.5]["Feature"].tolist()
print_feature(feature = best_feature_RF, title = "Лучшие признаки для RF:")

Модель RF для подбора признаков 

30005.12132630982
time: 3.8871798515319824 

Лучшие признаки для RF:
LotArea Neighborhood OverallQual OverallCond YearBuilt
YearRemodAdd BsmtQual BsmtFinSF1 BsmtUnfSF TotalBsmtSF
1stFlrSF 2ndFlrSF GrLivArea Fireplaces GarageCars
GarageArea OpenPorchSF CentralAir_N CentralAir_Y


### Подбор признаков для XGB

In [48]:
X_train_ = X_train.astype({col: "int32" for col in X_train.select_dtypes(include="bool").columns})
X_test_ = X_test.astype({col: "int32" for col in X_test.select_dtypes(include="bool").columns})

In [49]:
def XGBoost_model_simplest(select_col = X_train_.columns, title = ""):
    start = time.time()

    if title:
        print(title, '\n')

    model = xgb.XGBRegressor(
        objective="reg:squarederror",  
        n_estimators=500,              
        learning_rate=0.05,           
        max_depth=6,                  
        subsample=0.8,                
        colsample_bytree=0.8,          
        random_state=42
        )

    model.fit(X_train_[select_col], Y_train)

    predicted = np.expm1(model.predict(X_test_[select_col]))

    loss = root_mean_squared_error(predicted, Y_test)

    print(f"RMSE: {loss}")

    end = time.time()
    print(f"time: {end - start}", '\n')

    return model

In [50]:
model_xgb = XGBoost_model_simplest(title="Модель XGB для подбора признаков")

coef = model_xgb.feature_importances_ * 100

table_coef = pd.DataFrame({
    'Feature': X_train.columns.tolist(),
    'Persentage importance': coef
})

feature_more_0001_per_imp = table_coef[table_coef["Persentage importance"] > 0.001]["Feature"].tolist()

best_feature_XGB = feature_more_0001_per_imp

print_feature(feature = best_feature_XGB, title = "Лучшие признаки для XGB:")

Модель XGB для подбора признаков 

RMSE: 25129.23406497067
time: 2.210637092590332 

Лучшие признаки для XGB:
Id MSSubClass LotArea LotShape LandContour
LandSlope Neighborhood Condition1 Condition2 OverallQual
OverallCond YearBuilt YearRemodAdd Exterior1st Exterior2nd
MasVnrArea ExterQual ExterCond BsmtQual BsmtCond
BsmtExposure BsmtFinType1 BsmtFinSF1 BsmtFinType2 BsmtFinSF2
BsmtUnfSF TotalBsmtSF HeatingQC Electrical 1stFlrSF
2ndFlrSF LowQualFinSF GrLivArea BsmtFullBath BsmtHalfBath
FullBath HalfBath BedroomAbvGr KitchenAbvGr KitchenQual
TotRmsAbvGrd Functional Fireplaces GarageType GarageYrBlt
GarageFinish GarageCars GarageArea GarageQual GarageCond
WoodDeckSF OpenPorchSF EnclosedPorch 3SsnPorch ScreenPorch
PoolArea MiscVal MoSold YrSold MSZoning_C (all)
MSZoning_FV MSZoning_RH MSZoning_RL MSZoning_RM LotConfig_Corner
LotConfig_CulDSac LotConfig_FR2 LotConfig_FR3 LotConfig_Inside BldgType_1Fam
BldgType_2fmCon BldgType_Duplex BldgType_Twnhs BldgType_TwnhsE HouseStyle_1.5Fin
HouseStyle

### Для KNN

In [51]:
#Выделим признаки пересечением тех признаков что имеют не нулевлй коэффициент у lasso
pipe = Pipeline([
    ("scaler", StandardScaler()),
    ("model", Lasso(alpha=0.05))
])

pipe.fit(X_train, Y_train)

coef = pipe.named_steps["model"].coef_
select_feature_lasso = X_train.columns[coef != 0]

#И лучшие признаки отобранные с помощью SelectKBest
pipe = Pipeline([
    ("scaler", StandardScaler()),
    ("selector", SelectKBest(score_func=mutual_info_regression, k=10))
])

pipe.fit(X_train, Y_train)

coef = pipe.named_steps["selector"].get_support()
select_feature_SelKBest = X_train.columns[coef != 0]

#Вот и пересечение признаков
best_feature_KNN = list(set(select_feature_lasso).intersection(set(select_feature_SelKBest)))

print(f"Выбраны признаки: {best_feature_KNN}")


Выбраны признаки: ['YearBuilt', 'GarageCars', 'KitchenQual', 'GrLivArea', 'TotalBsmtSF', 'BsmtQual', 'OverallQual', 'GarageArea']


# Создание и обучение основных моделей

### RF

In [52]:
def general_RF(save_model = 0, title = "", selected_cols = X_train.columns):
    start = time.time()

    if title:
        print(title)

    param_grid_RF = {
        'n_estimators': [40, 100, 300, 550, 1000],
        'max_depth': [7, 9, 11],
        'min_samples_split': [2, 3, 4, 6]
    }

    model_RF = RandomForestRegressor()

    grid_search_RF = GridSearchCV(model_RF, param_grid = param_grid_RF, cv = 4, scoring = "neg_root_mean_squared_error", n_jobs = -1)

    grid_search_RF.fit(X_train[selected_cols], Y_train) 

    best_param = grid_search_RF.best_params_

    print("Best params:", best_param)

    model_RF = grid_search_RF.best_estimator_

    rmse = RMSE(model_RF, select_col= selected_cols)

    # save_model = 0 - модель не сохраняется
    # save_model = 1 - модель сохраняется в любос случае
    # save_model = 2 - модель сохраняется только если модель дает лучшие результаты
    if save_model:
        if save_model == 2: 
            try: #используем try если будет ошибка, например до этого не было сохраненной модели или были другие 
                #признаки в обучении, если срабатывает, то модель сохраняется, даже если не лучше
                model_RF_ = joblib.load("model\model_RF_general_file2")
                rmse_ = RMSE(model_RF_, select_col= selected_cols)
                if rmse < rmse_:
                    joblib.dump(model_RF, "model\model_RF_general_file2")
            except:
                joblib.dump(model_RF, "model\model_RF_general_file2")
        else: # если подавалась единица 
            joblib.dump(model_RF, "model\model_RF_general_file2")
    
    end = time.time()

    print(f"time: {end - start:.2f} sec")

    return model_RF



In [53]:
model_RF = general_RF(save_model=2, title = "general_model_RF", selected_cols = best_feature_RF)

general_model_RF
Best params: {'max_depth': 11, 'min_samples_split': 4, 'n_estimators': 40}
29346.126649774447
time: 162.87 sec


In [54]:
def general_XGB(save_model=0, title="", selected_cols=X_train.columns):
    start = time.time()

    if title:
        print(title)

    # Параметры для подбора
    param_grid_XGB = {
        'n_estimators': [100, 300],
        'max_depth': [3, 5, 7],
        'learning_rate': [0.01, 0.05, 0.1],
        'subsample': [0.8, 1.0],
        'colsample_bytree': [0.8, 1.0],
        'gamma': [0, 1]
    }

    model_XGB = xgb.XGBRegressor(
        objective='reg:squarederror',
        n_jobs=-1,
        verbosity=0,
        random_state=42
    )

    grid_search_XGB = GridSearchCV(
        model_XGB,
        param_grid=param_grid_XGB,
        cv=4,
        scoring="neg_root_mean_squared_error",
        n_jobs=-1
    )

    grid_search_XGB.fit(X_val[selected_cols], Y_val)

    best_param = grid_search_XGB.best_params_

    print("Best params:", best_param)


    model_XGB = grid_search_XGB.best_estimator_

    rmse = RMSE(model_XGB, select_col=selected_cols)

    if save_model:
        if save_model == 2:
            try:
                model_XGB_ = joblib.load("model/model_XGB_general_file2")
                rmse_ = RMSE(model_XGB_, select_col=selected_cols)
                if rmse < rmse_:
                    joblib.dump(model_XGB, "model/model_XGB_general_file2")
            except:
                joblib.dump(model_XGB, "model/model_XGB_general_file2")
        else:
            joblib.dump(model_XGB, "model/model_XGB_general_file2")

    end = time.time()

    print(f"time: {end - start:.2f} sec")

    return model_XGB

In [55]:
model_XGB = general_XGB(save_model=2, title="general model XGB", selected_cols= best_feature_XGB)

general model XGB
Best params: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.05, 'max_depth': 3, 'n_estimators': 300, 'subsample': 0.8}
16195.274255747217
time: 61.53 sec


In [56]:
def general_KNN(save_model=0, title="", selected_cols=X_train.columns):
    start = time.time()

    if title:
        print(title)

    param_grid_KNN = {
        'n_neighbors': [3, 5, 7, 10],
        'weights': ['uniform', 'distance'],
        'p': [1, 2]
    }

    model_KNN = KNeighborsRegressor()

    grid_search_KNN = GridSearchCV(
        model_KNN,
        param_grid=param_grid_KNN,
        cv=4,
        scoring="neg_root_mean_squared_error",
        n_jobs=-1
    )

    grid_search_KNN.fit(X_val[selected_cols], Y_val)

    best_params = grid_search_KNN.best_params_
    print("Best params:", best_params)

    model_KNN = grid_search_KNN.best_estimator_

    rmse = RMSE(model = model_KNN, select_col= selected_cols)

    if save_model:
        if save_model == 2:
            try:
                model_KNN_ = joblib.load("model/model_KNN_general_file2")
                rmse_ = RMSE(model_KNN_, select_col=selected_cols)
                if rmse < rmse_:
                    joblib.dump(model_KNN, "model/model_KNN_general_file2")
            except:
                joblib.dump(model_KNN, "model/model_KNN_general_file2")
        else:
            joblib.dump(model_KNN, "model/model_KNN_general_file2")

    end = time.time()
    print(f"time: {end - start:.2f} sec")

    return model_KNN


In [57]:
model_KNN = general_KNN(save_model=2, title="general model KNN", selected_cols= best_feature_KNN)

general model KNN
Best params: {'n_neighbors': 10, 'p': 1, 'weights': 'distance'}
22294.731261459383
time: 0.29 sec


# Создание окончательного предикта

In [65]:
def final_predict(data, model_KNN, model_RF, model_XGB, best_feature_KNN, best_feature_RF, best_feature_XGB):
    predicted_XGB = np.expm1(model_XGB.predict(data[best_feature_XGB]))
    predicted_RF = np.expm1(model_RF.predict(data[best_feature_RF]))
    predicted_KNN = np.expm1(model_KNN.predict(data[best_feature_KNN]))

    final_predict = predicted_XGB * 0.6 + predicted_RF * 0.3 + predicted_KNN * 0.1
    
    return final_predict

In [66]:
final_pred = final_predict(X_test, model_KNN, model_RF, model_XGB, best_feature_KNN, best_feature_RF, best_feature_XGB)
print(root_mean_squared_error(final_pred, Y_test))

17869.40220610184


# Сохранение результата

In [67]:
def save_predict(df, predict):
    submission = pd.DataFrame({
    "Id": df["Id"],
    "SalePrice": predict
    })

    timestamp = time.strftime("%Y%m%d_%H%M%S")  # Формат: 20230411_153200
    file_name = f"submissions/submission_{timestamp}.csv"

    submission.to_csv(file_name, index=False)
    print(f"Файл сохранён как {file_name}")

# Теперь делаем для теста, сохраняем результаты

In [68]:
final_predict = final_predict(feature_test, model_KNN, model_RF, model_XGB, best_feature_KNN, best_feature_RF, best_feature_XGB)
save_predict(df_test, final_predict)

Файл сохранён как submissions/submission_20250411_174019.csv
