In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, RandomForestClassifier
from sklearn.metrics import mean_squared_error
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder, FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import mean_absolute_error
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_selection import SelectKBest, f_classif
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
import numpy as np

import warnings

from sklearn.tree import DecisionTreeRegressor
warnings.filterwarnings("ignore")

# Load the competition's training and test tables from the local Kaggle input folder.
train_prep, test_prep = (
    pd.read_csv(csv_path)
    for csv_path in (
        "kaggle/input/mlurfuflat/train.csv",
        "kaggle/input/mlurfuflat/test.csv",
    )
)

from sklearn.preprocessing import LabelEncoder

class MainPrec(BaseEstimator, TransformerMixin):
    """Clean the raw flat listings and impute missing or implausible values.

    The transformer drops ``id``, replaces ``timestamp`` with ``year`` and
    ``month`` columns, blanks out values that fall outside the accepted
    ranges, and fills every gap:

    * ``build_year``, ``num_room``, ``floor``, ``max_floor`` - median;
    * ``material``, ``state`` - most frequent value;
    * ``life_sq``, ``kitch_sq`` - ``full_sq`` times the median share of that
      area in ``full_sq``.

    The fill statistics are learned in ``fit`` and reused by ``transform``,
    so data transformed later (e.g. the test set) is imputed with the same
    values the model saw during training. Calling ``transform`` on an
    instance that was never fitted falls back to statistics computed from
    the frame being transformed.
    """

    # Columns grouped by the imputation strategy applied to them.
    _MEDIAN_COLUMNS = ('build_year', 'num_room', 'floor', 'max_floor')
    _MODE_COLUMNS = ('material', 'state')
    _RATIO_COLUMNS = ('life_sq', 'kitch_sq')

    def fit(self, X, y=None):
        """Learn the fill statistics from ``X``.

        Args:
            X: Raw DataFrame containing the columns used by ``transform``.
            y: Ignored; accepted for scikit-learn API compatibility.

        Returns:
            The fitted transformer itself.
        """
        self.fill_values_ = self._compute_fill_values(self._clean(X))
        return self

    def transform(self, df):
        """Return a cleaned copy of ``df`` with all gaps imputed.

        Args:
            df: Raw DataFrame with ``id``, ``timestamp``, ``full_sq`` and the
                columns listed in the class docstring.

        Returns:
            A new DataFrame; the input frame is not modified.
        """
        df = self._clean(df)

        # Unfitted use keeps the historical behaviour: statistics are taken
        # from the very frame that is being transformed.
        fill_values = getattr(self, 'fill_values_', None)
        if fill_values is None:
            fill_values = self._compute_fill_values(df)

        for column in self._MEDIAN_COLUMNS + self._MODE_COLUMNS:
            df[column] = df[column].fillna(fill_values[column])

        # Missing areas are estimated as a fixed share of the total area.
        for column in self._RATIO_COLUMNS:
            df[column] = df[column].where(
                df[column].notna(),
                df['full_sq'] * fill_values[column]
            )

        return df

    @staticmethod
    def _clean(df):
        """Derive date features and replace implausible values with NaN."""
        df = df.drop(['id'], axis=1)

        timestamps = pd.to_datetime(df['timestamp'])
        df['year'] = timestamps.dt.year
        df['month'] = timestamps.dt.month
        df = df.drop(['timestamp'], axis=1)

        # Out-of-range years become NaN *before* any median is taken, so the
        # outliers cannot skew the value that later replaces them.
        build_year = pd.to_numeric(df['build_year'], errors='coerce').astype(float)
        df['build_year'] = build_year.where(build_year.between(1900, 2024))

        # Keep room counts in [1, 6]; missing values stay missing.
        df['num_room'] = df['num_room'].where(df['num_room'].between(1, 6))

        df['life_sq'] = df['life_sq'].mask(df['life_sq'] < 6)

        df['floor'] = df['floor'].mask(df['floor'] <= 0)

        # A kitchen cannot be negative or larger than the whole flat.
        df['kitch_sq'] = df['kitch_sq'].mask(
            (df['kitch_sq'] < 0) | (df['kitch_sq'] > df['full_sq'])
        )

        df['max_floor'] = df['max_floor'].mask(
            (df['max_floor'] <= 0) | (df['max_floor'] > 66)
        )

        return df

    @classmethod
    def _compute_fill_values(cls, cleaned):
        """Compute the per-column fill statistics from an already cleaned frame."""
        fill_values = {
            column: cleaned[column].median() for column in cls._MEDIAN_COLUMNS
        }

        for column in cls._MODE_COLUMNS:
            modes = cleaned[column].mode()
            # mode() is empty when the column holds no values at all.
            fill_values[column] = modes.iloc[0] if not modes.empty else np.nan

        for column in cls._RATIO_COLUMNS:
            fill_values[column] = (cleaned[column] / cleaned['full_sq']).median()

        return fill_values

# Columns handed to the model as-is. 'year' and 'month' are produced by
# MainPrec; the remaining names are expected to be present in the input data.
numeric_features = [
    'year', 'month',
    'full_sq', 'life_sq',
    'floor', 'state', 'max_floor', 'material',
    'build_year', 'num_room', 'kitch_sq', 'full_all',
]
# Columns that are one-hot encoded.
categorical_features = ['sub_area']

# Build the preprocessing step: numeric columns pass through untouched,
# categorical ones are one-hot encoded (unseen categories become all zeros).
preprocessor = ColumnTransformer(
    transformers=[
        ('num', 'passthrough', numeric_features),
        (
            'cat',
            OneHotEncoder(sparse_output=False, handle_unknown='ignore'),
            categorical_features,
        ),
    ]
)

# Candidate models, each wrapped in the full cleaning -> encoding -> model chain.
pipelines = dict(
    catboost=Pipeline(steps=[
        ('error_finder', MainPrec()),
        ('preprocessor', preprocessor),
        (
            'model',
            CatBoostRegressor(
                iterations=900,
                learning_rate=0.1,
                depth=6,
                loss_function='MAE',
            ),
        ),
    ]),
)

# Split the training table into features and the target column.
X_train = train_prep.drop(['price_doc'], axis=1)
y_train = train_prep['price_doc']

# The test table is used as-is for prediction.
X_test = test_prep

# For local validation, hold out part of the training data with
# train_test_split and score the predictions with mean_absolute_error.

for name, pipeline in pipelines.items():
    print(f"\nОбучение {name}...")

    # Fit the whole pipeline (cleaning, encoding, model) on the training data.
    pipeline.fit(X_train, y_train)

    # Predict prices for the test rows.
    y_pred = pipeline.predict(X_test)

    # Key every prediction by the id of the test row it belongs to instead of
    # a synthetic 0..n-1 counter, which is only right when the test file
    # happens to be numbered that way.
    result = pd.DataFrame({
        'id': X_test['id'].to_numpy(),
        'price_doc': y_pred,
    })

    # NOTE: every pipeline writes to the same file, so with several entries in
    # `pipelines` only the last one's predictions remain on disk.
    result.to_csv('submission.csv', index=False)


Обучение catboost...
0:	learn: 2872244.9940683	total: 149ms	remaining: 2m 13s
1:	learn: 2776948.5711682	total: 153ms	remaining: 1m 8s
2:	learn: 2681702.8844260	total: 156ms	remaining: 46.8s
3:	learn: 2611444.7260562	total: 160ms	remaining: 35.9s
4:	learn: 2551612.3086465	total: 163ms	remaining: 29.2s
5:	learn: 2483917.1028939	total: 166ms	remaining: 24.8s
6:	learn: 2429576.7276473	total: 169ms	remaining: 21.6s
7:	learn: 2389077.8072993	total: 172ms	remaining: 19.2s
8:	learn: 2345917.8634127	total: 175ms	remaining: 17.3s
9:	learn: 2313465.6331449	total: 177ms	remaining: 15.8s
10:	learn: 2275971.3336657	total: 181ms	remaining: 14.6s
11:	learn: 2251180.9827897	total: 183ms	remaining: 13.6s
12:	learn: 2231910.5717838	total: 186ms	remaining: 12.7s
13:	learn: 2212905.6266465	total: 188ms	remaining: 11.9s
14:	learn: 2184880.7290717	total: 190ms	remaining: 11.2s
15:	learn: 2167701.2223642	total: 193ms	remaining: 10.7s
16:	learn: 2148630.5943747	total: 195ms	remaining: 10.1s
17:	learn: 2135476