In [5]:
import pandas as pd
import numpy as np
import optuna
import numpy as np
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from xgboost import XGBRegressor


In [6]:
# Read Final_df.csv into a DataFrame, using the first CSV column as the row index.
df = pd.read_csv(filepath_or_buffer='Final_df.csv', index_col=0)

In [7]:
# Keep only the rows flagged as training data.
is_train_row = df['isTrain'] == True
df_train = df[is_train_row]

# Columns removed from the feature matrix: the target itself, the split flag,
# and the remaining columns listed here that are not used as model inputs.
non_feature_columns = [
    'target', 'isTrain', 'address', 'address_rus',
    'geometry', 'buffer', 'poi_amenities',
]
categorical_features = ['city', 'atm_group']

Y = df_train['target']
# One-hot encode the categorical columns; drop_first=True drops the first
# level of each so the dummy columns are not perfectly collinear.
X = pd.get_dummies(
    df_train.drop(columns=non_feature_columns),
    columns=categorical_features,
    drop_first=True,
)

# Hold out 30% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.3, random_state=42
)

# Mean imputation: the column means are learned from the training split only
# and then applied to both splits.
imputer = SimpleImputer(strategy='mean').fit(X_train)
X_train_imputed = imputer.transform(X_train)
X_test_imputed = imputer.transform(X_test)

# Standardisation: likewise fitted on the training split only.
scaler = StandardScaler().fit(X_train_imputed)
X_train_scaled = scaler.transform(X_train_imputed)
X_test_scaled = scaler.transform(X_test_imputed)

In [8]:
# Hyperparameter search space: 3 * 3 * 3 * 2 * 2 = 108 combinations,
# each evaluated with 5-fold cross-validation below.
param_grid = dict(
    max_depth=[3, 5, 7],
    learning_rate=[0.01, 0.1, 0.2],
    n_estimators=[100, 200, 500],
    subsample=[0.8, 1.0],
    colsample_bytree=[0.8, 1.0],
)

xgb_model = XGBRegressor(objective='reg:squarederror', random_state=42)

# Exhaustive search scored by negative MSE (higher is better), using all
# available cores for the cross-validation fits.
grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
)
grid_search.fit(X_train_scaled, y_train)

print("Лучшие параметры:", grid_search.best_params_)
print("Лучший кросс-валидационный score (отрицательная MSE):", grid_search.best_score_)

# Evaluate the best estimator found by the search on the held-out split.
best_xgb = grid_search.best_estimator_
y_pred_xgb = best_xgb.predict(X_test_scaled)

mse_xgb = mean_squared_error(y_test, y_pred_xgb)
rmse_xgb = np.sqrt(mse_xgb)
r2_xgb = r2_score(y_test, y_pred_xgb)

print("\nРезультаты на тестовой выборке для лучшей модели XGBRegressor:")
for metric_label, metric_value in (
    ("MSE:", mse_xgb),
    ("RMSE:", rmse_xgb),
    ("R^2:", r2_xgb),
):
    print(metric_label, metric_value)

Лучшие параметры: {'colsample_bytree': 0.8, 'learning_rate': 0.01, 'max_depth': 7, 'n_estimators': 500, 'subsample': 0.8}
Лучший кросс-валидационный score (отрицательная MSE): -0.0020230114822195368

Результаты на тестовой выборке для лучшей модели XGBRegressor:
MSE: 0.0020574162928616306
RMSE: 0.04535875100641144
R^2: 0.7289905774606872


In [9]:
import joblib

# Save the best model to a file. The call is left as the cell's last
# expression so the notebook displays the list of written file names.
joblib.dump(value=best_xgb, filename='best_xgb_model.pkl')

['best_xgb_model.pkl']