## RandomForest Regressor Pipeline - Data Cleaning, Feature Engineering, Training & Analysis

In [None]:
# 📦 Импорт библиотек
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [None]:
# 🔹 Display settings
# Apply the seaborn grid style first, then override the default figure size
# so that every subsequent plot is 12x6 inches unless stated otherwise.
sns.set(style="whitegrid")
plt.rcParams.update({'figure.figsize': (12, 6)})

In [None]:
# 1️⃣ Load and explore the data
df = pd.read_csv('your_dataset.csv')  # replace with the path to your dataset
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in display() would only add a stray "None" output.
df.info()
# Summary statistics of the numeric columns
display(df.describe())
# Number of missing values per column
display(df.isna().sum())

In [None]:
# 📊 Feature distributions
# One histogram per numeric column of the raw (not yet cleaned) frame.
hist_options = {'bins': 30, 'figsize': (15, 10)}
df.hist(**hist_options)
plt.suptitle("Distributions of Original Features")
plt.show()

In [None]:
# 2️⃣ Feature expansion: derive coarser variants of a numeric parameter
# Example: round the numeric parameter to one decimal place
df['param_rounded'] = df['your_param'].round(1)
# Example: bin the parameter into 10 intervals.
# labels=False stores the integer bin index instead of Interval objects, so
# the column stays numeric. Without it the column is an Interval categorical
# that remains in the feature matrix (only 'target' is dropped later) and
# makes the RandomForestRegressor fit fail.
# NOTE(review): the bin edges are computed here, before the outlier filter
# runs, so extreme values still influence them.
df['param_binned'] = pd.cut(df['your_param'], bins=10, labels=False)

In [None]:
# 3️⃣ Outlier handling (1.5 * IQR rule)
# Both quartiles are fetched with a single quantile() call.
Q1, Q3 = df['your_param'].quantile([0.25, 0.75])
IQR = Q3 - Q1
low, high = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR
# Keep only rows inside [low, high]; both bounds are inclusive, and rows
# where 'your_param' is missing do not pass the check and are dropped too.
df = df[df['your_param'].between(low, high)]

In [None]:
# 4️⃣ Statistics after cleaning
# df has already been filtered at this point, so only the cleaned
# distribution of the parameter is summarised and plotted here.
cleaned = df['your_param']
display(cleaned.describe())
sns.histplot(cleaned, kde=True)
plt.title("Cleaned Distribution")
plt.show()

In [None]:
# 5️⃣ Split into training and test sets
# Every column except 'target' is used as a feature.
X, y = df.drop('target', axis=1), df['target']
# 80/20 split; the fixed seed makes the partition reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [None]:
# 6️⃣ Hyperparameter search
# Grid of 2 * 3 * 2 = 12 candidate configurations; with cv=3 that is 36 fits.
params = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5]
}
# n_jobs=-1 runs the fits on all available cores; verbose=1 logs progress.
grid = GridSearchCV(RandomForestRegressor(random_state=42), params, cv=3, n_jobs=-1, verbose=1)
grid.fit(X_train, y_train)
# print() keeps label and value on one line; display() would render the
# label as a quoted string repr in a separate output.
print("Best params:", grid.best_params_)

In [None]:
# 7️⃣ Final model and test-set predictions
# No extra training happens here: the search above uses the default
# refit=True, so the best configuration has already been refitted on the
# full training set and is exposed as best_estimator_.
model = grid.best_estimator_
# GridSearchCV.predict delegates to that same best estimator.
y_pred = grid.predict(X_test)

In [None]:
# 8️⃣ Quality evaluation on the held-out test set
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
# print() emits the plain text; display() on a str would show its quoted repr.
print(f"MAE: {mae:.2f}, MSE: {mse:.2f}, R2: {r2:.2f}")

In [None]:
# 🔍 Error and deviation plots
# Predicted-vs-actual scatter; the dashed red diagonal marks perfect
# predictions (predicted == actual) across the range of the test targets.
ax = sns.scatterplot(x=y_test, y=y_pred)
ax.set_xlabel("Actual")
ax.set_ylabel("Predicted")
ax.set_title("Actual vs Predicted")
target_range = [y_test.min(), y_test.max()]
ax.plot(target_range, target_range, '--r')
plt.show()

# Residual = actual - predicted, so positive values are under-predictions.
residuals = y_test - y_pred
sns.histplot(residuals, kde=True).set_title("Distribution of Residuals")
plt.show()

In [None]:
# 🔎 Analysis of the largest prediction errors
# Absolute residuals, ranked so that the five biggest misses come first.
errors = residuals.abs()
worst = errors.sort_values(ascending=False).head(5)
# print() emits the plain header; display() on a str would show its quoted repr.
print("Worst predictions:")
# Look the rows up in df by index label and attach the error magnitude so the
# table shows how far off each of these predictions was. The extra column
# exists only in the displayed copy; df itself is not modified.
display(df.loc[worst.index].assign(abs_error=worst))