In [61]:
import numpy as np
import pandas as pd
from catboost import CatBoostRegressor
from sklearn.base import BaseEstimator, TransformerMixin, clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, RandomForestClassifier
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder, FunctionTransformer

import warnings
warnings.filterwarnings("ignore")

# Load the training table and separate the regression target from the features.
train_df = pd.read_csv('train.csv')

y = train_df['price_doc']
X = train_df.drop(columns=['price_doc'])

# Fixed seed keeps the 80/20 hold-out split identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Repair of logically inconsistent values
class ErrorFinder(BaseEstimator, TransformerMixin):
    """Stateless transformer that repairs logically inconsistent columns.

    Rewrites ``life_sq``, ``kitch_sq``, ``floor``, ``num_room`` and
    ``build_year``, using ``full_sq`` and ``max_floor`` as reference columns.
    All of these columns must be present in the frame passed to ``transform``.
    """

    def fit(self, X, y=None):
        """Learn nothing; present only to satisfy the transformer API."""
        return self

    def transform(self, df):
        """Return a corrected copy of ``df``; the input frame is not modified.

        A comparison that involves a missing value is not satisfied, so such
        rows keep their original value.
        """
        # Copy first: the caller's frame (for example a train/test split) must
        # not change as a side effect of fitting or predicting with a pipeline.
        df = df.copy()

        # Living area larger than the total area: cap it at the total area.
        df['life_sq'] = np.where(df['life_sq'] > df['full_sq'],
                                 df['full_sq'],
                                 df['life_sq'])

        # Kitchen larger than the whole flat: assume 20% of the total area.
        df['kitch_sq'] = np.where(df['kitch_sq'] > df['full_sq'],
                                  df['full_sq'] * 0.2,
                                  df['kitch_sq'])

        # Floor above the top floor of the building: clamp to the top floor.
        df['floor'] = np.where(df['floor'] > df['max_floor'],
                               df['max_floor'],
                               df['floor'])

        # Zero or negative room count: fall back to a single room.
        df['num_room'] = np.where(df['num_room'] <= 0,
                                  1,
                                  df['num_room'])

        # Non-numeric build years become <NA>; the nullable Int64 dtype keeps
        # the remaining years as integers.
        df['build_year'] = pd.to_numeric(df['build_year'], errors='coerce').astype('Int64')

        # Replace infinities with NaN so that they are treated as missing.
        df = df.replace([np.inf, -np.inf], np.nan)

        return df
    
    
# Column conversion: derive calendar features from the timestamp
class ChangeColumns(BaseEstimator, TransformerMixin):
    """Stateless transformer that parses ``timestamp`` and derives date parts.

    Adds the integer columns ``quarter`` and ``year`` and stores the parsed
    datetime back into ``timestamp``.
    """

    def fit(self, X, y=None):
        """Learn nothing; present only to satisfy the transformer API."""
        return self

    def transform(self, df):
        """Return a copy of ``df`` with ``timestamp`` parsed and date parts added.

        The input frame is left untouched. ``timestamp`` is parsed with the
        ``%Y-%m-%d`` format.
        """
        # Copy first: without it the caller's frame would get its 'timestamp'
        # column overwritten and two extra columns added on every call.
        df = df.copy()

        timestamps = pd.to_datetime(df['timestamp'], format='%Y-%m-%d')
        df['timestamp'] = timestamps
        df['quarter'] = timestamps.dt.quarter.astype(int)
        df['year'] = timestamps.dt.year.astype(int)

        return df
    
class DeleteCols(BaseEstimator, TransformerMixin):
    """Stateless transformer that removes the ``id`` and ``timestamp`` columns."""

    def fit(self, X, y=None):
        """Learn nothing; present only to satisfy the transformer API."""
        return self

    def transform(self, df):
        """Return a new frame without the ``id`` and ``timestamp`` columns."""
        return df.drop(columns=['id', 'timestamp'])


# Columns fed to the model, split by how they are preprocessed.
numeric_features = [
    'year',
    'quarter',
    'full_sq',
    'life_sq',
    'floor',
    'state',
    'max_floor',
    'material',
    'build_year',
    'num_room',
    'kitch_sq',
    'full_all',
]
categorical_features = ['sub_area']

# Preprocessing step: median-impute the numeric columns and one-hot encode the
# categorical one; categories unseen during fit are ignored at predict time.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', SimpleImputer(strategy='median'), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
    ],
)

# # Функция для применения обработки ошибок
# #anomaly_transformer = FunctionTransformer(preprocess_data)

def _build_pipeline(model):
    """Return the full cleaning + preprocessing pipeline ending in ``model``."""
    return Pipeline([
        ('error_finder', ErrorFinder()),
        ('structure_fix', ChangeColumns()),
        ('deleting', DeleteCols()),
        # Each pipeline gets its own unfitted copy of the preprocessor, so
        # fitting one pipeline never overwrites the fitted state of another.
        ('preprocessor', clone(preprocessor)),
        ('model', model),
    ])


# Full pipelines, one per model under comparison
pipelines = {
    'GradientBoost': _build_pipeline(
        GradientBoostingRegressor(
            n_estimators=300,
            max_depth=6,
            min_samples_split=2,
            learning_rate=0.1,
            loss='squared_error',
        )
    ),
    'CatBoost': _build_pipeline(
        CatBoostRegressor(iterations=900, learning_rate=0.1, depth=7, loss_function='MAE')
    ),
}

# param_grid = [
# {
#     'classifier': [LogisticRegression()],
#     'classifier__C': [0.1, 1, 10],
#     'feature_selection__k': [2, 3, 4, 5],
#     'scaler': [StandardScaler(), None]
# },
# ]

# pipeline = Pipeline([
#     ('structure_fix', ChangeColumns()),
#     ('error_finder', ErrorFinder()),
#     ('preprocessor', preprocessor),
#     ('feature_selection', SelectKBest(f_classif, k=5)), # Отбор 5 лучших признаков
#     ('scaler', StandardScaler()),
#     ('classifier', LogisticRegression())
# ])

# grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='f1')
# grid_search.fit(X_train, y_train)

separator = "=" * 50

print("\n" + separator)
print("ОБУЧЕНИЕ МОДЕЛЕЙ")
print(separator)

# Hold-out MAE of every fitted pipeline, keyed by model name.
results = {}

for name, pipeline in pipelines.items():
    print(f"\nОбучение {name}...")

    # Fit on the training part, then score on the hold-out part.
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)

    mae = mean_absolute_error(y_test, y_pred)
    results[name] = mae

    print(f"{name} - MAE: {mae:,.0f}")

# Summary, best (lowest MAE) model first.
print("\n" + separator)
print("СРАВНЕНИЕ МОДЕЛЕЙ")
print(separator)

for name in sorted(results, key=results.get):
    mae = results[name]
    print(f"{name}: MAE = {mae:,.0f}")



ОБУЧЕНИЕ МОДЕЛЕЙ

Обучение GradientBoost...
GradientBoost - MAE: 1,690,917

Обучение CatBoost...
0:	learn: 2865553.7431531	total: 3.31ms	remaining: 2.98s
1:	learn: 2772817.7731777	total: 6.88ms	remaining: 3.09s
2:	learn: 2679782.5457119	total: 9.99ms	remaining: 2.99s
3:	learn: 2606881.1345403	total: 13.3ms	remaining: 2.99s
4:	learn: 2546797.9373185	total: 17.1ms	remaining: 3.05s
5:	learn: 2484627.9247065	total: 20ms	remaining: 2.98s
6:	learn: 2423840.2741957	total: 23.1ms	remaining: 2.94s
7:	learn: 2373594.5057018	total: 26.4ms	remaining: 2.94s
8:	learn: 2331824.9287030	total: 41.5ms	remaining: 4.1s
9:	learn: 2291021.3486045	total: 45.4ms	remaining: 4.04s
10:	learn: 2257018.2056889	total: 50ms	remaining: 4.04s
11:	learn: 2228163.1828047	total: 54.2ms	remaining: 4.01s
12:	learn: 2198269.2331356	total: 58.5ms	remaining: 3.99s
13:	learn: 2172984.3660694	total: 61.5ms	remaining: 3.89s
14:	learn: 2153817.7050024	total: 65.3ms	remaining: 3.85s
15:	learn: 2133692.2118693	total: 68.5ms	remain

In [None]:
exclude_column = 'sub_area'

# Boolean mask with one column per searched column: True where the value's
# string form contains the pattern character (case-insensitive).
# NOTE(review): the pattern character is reproduced exactly as written;
# confirm whether the Cyrillic or the Latin letter is the intended one.
# NOTE(review): the recorded output below shows an AttributeError from the
# .str accessor, which should not occur after astype(str) - the output may be
# stale; confirm by re-running the cell.
mask = pd.DataFrame({
    col: train_df[col].astype(str).str.contains('о', case=False, na=False)
    for col in train_df.columns
    if col != exclude_column
})

# Rows with at least one match in any column other than exclude_column.
rows_with_o = mask.any(axis=1)
print(f"Найдено строк с 'о': {rows_with_o.sum()}")

AttributeError: Can only use .str accessor with string values!

In [48]:
# Coerce build_year to numbers (unparseable entries become missing) and store
# it with the nullable integer dtype.
build_year_numeric = pd.to_numeric(train_df['build_year'], errors='coerce')
train_df['build_year'] = build_year_numeric.astype('Int64')

In [67]:
!pip install optuna

Collecting optuna
  Downloading optuna-4.6.0-py3-none-any.whl.metadata (17 kB)
Downloading optuna-4.6.0-py3-none-any.whl (404 kB)
   ---------------------------------------- 0.0/404.7 kB ? eta -:--:--
   - -------------------------------------- 10.2/404.7 kB ? eta -:--:--
   -- ------------------------------------ 30.7/404.7 kB 660.6 kB/s eta 0:00:01
   -------- ------------------------------ 92.2/404.7 kB 871.5 kB/s eta 0:00:01
   --------------- ------------------------ 153.6/404.7 kB 1.1 MB/s eta 0:00:01
   ------------------------- -------------- 256.0/404.7 kB 1.6 MB/s eta 0:00:01
   ----------------------------------- ---- 358.4/404.7 kB 1.7 MB/s eta 0:00:01
   ---------------------------------------- 404.7/404.7 kB 1.7 MB/s eta 0:00:00
Installing collected packages: optuna
Successfully installed optuna-4.6.0



[notice] A new release of pip is available: 24.0 -> 25.3
[notice] To update, run: C:\Users\Admin\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [None]:
from catboost import CatBoostRegressor
from sklearn.model_selection import RandomizedSearchCV

# Reload the training table and separate the regression target from the features.
train_df = pd.read_csv('train.csv')

y = train_df['price_doc']
X = train_df.drop(columns=['price_doc'])

# Fixed seed keeps the 80/20 hold-out split identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

import optuna
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error
from catboost import CatBoostRegressor

def objective(trial):
    """Optuna objective: fit a CatBoost pipeline and return its MAE.

    Args:
        trial: Optuna trial used to sample the CatBoost hyperparameters.

    Returns:
        Mean absolute error of the fitted pipeline on ``X_test`` / ``y_test``.
    """
    # Search space for the CatBoost hyperparameters.
    cb_params = {
        'iterations': trial.suggest_int('iterations', 500, 2000),
        'depth': trial.suggest_int('depth', 4, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.2),
        'l2_leaf_reg': trial.suggest_int('l2_leaf_reg', 1, 10),
        'random_strength': trial.suggest_float('random_strength', 0.1, 2.0),
    }

    pipeline = Pipeline([
        ('error_finder', ErrorFinder()),
        ('structure_fix', ChangeColumns()),
        ('deleting', DeleteCols()),
        # Fresh unfitted copy per trial: the shared module-level preprocessor
        # is no longer refitted (and overwritten) by every trial.
        ('preprocessor', clone(preprocessor)),
        ('model', CatBoostRegressor(**cb_params, loss_function='MAE', random_seed=42, verbose=False)),
    ])

    pipeline.fit(X_train, y_train)

    # NOTE(review): trials are scored on X_test, so the selected parameters are
    # tuned against the hold-out set and its MAE becomes an optimistic estimate;
    # consider a separate validation split or cross-validation on X_train.
    y_pred = pipeline.predict(X_test)

    return mean_absolute_error(y_test, y_pred)

# Minimise the objective's MAE over a fixed budget of trials.
n_trials = 30

study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=n_trials)

print("Лучшие параметры:", study.best_params)

[I 2025-11-30 23:51:22,134] A new study created in memory with name: no-name-76eeef10-4eda-4430-8240-de3dfabf1c9c
