In [38]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
import numpy as np
import pandas as pd
import pickle

# 1. Load the housing dataset from the remote CSV file
url = "https://raw.githubusercontent.com/ageron/handson-ml/master/datasets/housing/housing.csv"
df = pd.read_csv(url)

# 2. Split the frame into the target series and the feature matrix
y = df["median_house_value"]  # target variable
X = df.drop(columns=["median_house_value"])  # every remaining column is a feature

# 3. Collect the numeric columns by dtype and declare the categorical one explicitly
numerical_features = X.select_dtypes(include="number").columns
categorical_features = ["ocean_proximity"]

# 4. Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# 5. Build the preprocessing stage.
# Numeric columns: fill missing values with the column mean, then standardize.
numeric_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="mean")),
        ("scaler", StandardScaler()),
    ]
)

# Categorical columns: fill missing values with the most frequent category,
# then one-hot encode into a dense array; categories unseen during fit are
# ignored at transform time instead of raising.
categorical_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("encoder", OneHotEncoder(sparse_output=False, handle_unknown="ignore")),
    ]
)

# Route each column group through its own sub-pipeline.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_pipeline, numerical_features),
        ("cat", categorical_pipeline, categorical_features),
    ]
)

# 6. Assemble the full pipeline (preprocessing + Random Forest) and fit it
#    on the training split; fit() returns the fitted pipeline itself.
model_rf = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("regressor", RandomForestRegressor(n_estimators=100, random_state=42)),
    ]
).fit(X_train, y_train)

# 7. Predict on the held-out test split
y_pred_rf = model_rf.predict(X_test)

# 8. Evaluate the model: report each regression metric on the test split
print("Random Forest:")
for metric_label, metric_fn in (
    ("MAE:", mean_absolute_error),
    ("MSE:", mean_squared_error),
    ("R2 Score:", r2_score),
):
    print(metric_label, metric_fn(y_test, y_pred_rf))

# 9. Persist the fitted pipeline (preprocessing + regressor) to disk
with open("model.pkl", mode="wb") as model_file:
    pickle.dump(model_rf, model_file)

# Confirmation message for the user
print("Модель успешно сохранена в model.pkl")


Random Forest:
MAE: 31660.46843023256
MSE: 2393815007.8887124
R2 Score: 0.8173230611482697
Модель успешно сохранена в model.pkl
