# Классификация

In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import f1_score, average_precision_score
import numpy as np

df = pd.read_csv('placementdata.csv')

X = df.drop(columns=['StudentID', 'PlacementStatus'])
y = df['PlacementStatus']

le_dict = {}
categorical_columns = ['ExtracurricularActivities', 'PlacementTraining']
for col in categorical_columns:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col])
    le_dict[col] = le

target_le = LabelEncoder()
y_encoded = target_le.fit_transform(y)

X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded)

baseline_rf = RandomForestClassifier(random_state=42)
baseline_rf.fit(X_train, y_train)

y_pred_baseline = baseline_rf.predict(X_test)
y_pred_proba_baseline = baseline_rf.predict_proba(X_test)[:, 1]

baseline_f1 = f1_score(y_test, y_pred_baseline)
baseline_pr_auc = average_precision_score(y_test, y_pred_proba_baseline)

print(f"F1 Score: {baseline_f1:.4f}")
print(f"PR AUC: {baseline_pr_auc:.4f}")

F1 Score: 0.7459
PR AUC: 0.8299


Добавим масштабирование

In [7]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

X_train_scaled_df = pd.DataFrame(X_train_scaled, columns=X_train.columns)
X_test_scaled_df = pd.DataFrame(X_test_scaled, columns=X_test.columns)

rf_scaled = RandomForestClassifier(random_state=42)
rf_scaled.fit(X_train_scaled_df, y_train)

y_pred_scaled = rf_scaled.predict(X_test_scaled_df)
y_pred_proba_scaled = rf_scaled.predict_proba(X_test_scaled_df)[:, 1]

f1_scaled = f1_score(y_test, y_pred_scaled)
pr_auc_scaled = average_precision_score(y_test, y_pred_proba_scaled)

print(f"F1 Score: {f1_scaled:.4f}")
print(f"PR AUC: {pr_auc_scaled:.4f}")

F1 Score: 0.7410
PR AUC: 0.8297


Добавим новые признаки

In [8]:
X_with_new_features = X.copy()
X_with_new_features['Average_Marks'] = (X_with_new_features['SSC_Marks'] + X_with_new_features['HSC_Marks'] + X_with_new_features['CGPA']) / 3
X_with_new_features['Total_Activities'] = X_with_new_features['Projects'] + X_with_new_features['Internships'] + X_with_new_features['Workshops/Certifications']
X_with_new_features['CGPA_Aptitude_Score'] = X_with_new_features['CGPA'] * X_with_new_features['AptitudeTestScore']

X_train_new, X_test_new, _, _ = train_test_split(X_with_new_features, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded)

X_train_new_scaled = scaler.fit_transform(X_train_new)
X_test_new_scaled = scaler.transform(X_test_new)

X_train_new_scaled_df = pd.DataFrame(X_train_new_scaled, columns=X_train_new.columns)
X_test_new_scaled_df = pd.DataFrame(X_test_new_scaled, columns=X_test_new.columns)

rf_new_scaled = RandomForestClassifier(random_state=42)
rf_new_scaled.fit(X_train_new_scaled_df, y_train)

y_pred_new_scaled = rf_new_scaled.predict(X_test_new_scaled_df)
y_pred_proba_new_scaled = rf_new_scaled.predict_proba(X_test_new_scaled_df)[:, 1]

f1_new_scaled = f1_score(y_test, y_pred_new_scaled)
pr_auc_new_scaled = average_precision_score(y_test, y_pred_proba_new_scaled)

print(f"F1 Score: {f1_new_scaled:.4f}")
print(f"PR AUC: {pr_auc_new_scaled:.4f}")


F1 Score: 0.7428
PR AUC: 0.8371


Добавим гиперпараметры

In [9]:
rf_optimized = RandomForestClassifier(
    class_weight='balanced',
    max_depth=10,
    min_samples_leaf=2,
    min_samples_split=5,
    n_estimators=200,
    random_state=42
)
rf_optimized.fit(X_train_new_scaled_df, y_train)

y_pred_optimized = rf_optimized.predict(X_test_new_scaled_df)
y_pred_proba_optimized = rf_optimized.predict_proba(X_test_new_scaled_df)[:, 1]

f1_optimized = f1_score(y_test, y_pred_optimized)
pr_auc_optimized = average_precision_score(y_test, y_pred_proba_optimized)

print(f"F1 Score: {f1_optimized:.4f}")
print(f"PR AUC: {pr_auc_optimized:.4f}")

F1 Score: 0.7655
PR AUC: 0.8472


| Модель | F1 Score | PR AUC |
|----------|-------|--------|
| Бейзлайн | 0.7459 | 0.8299 |
| + Масштабирование | 0.7410 | 0.8297 |
| + Новые признаки | 0.7428 | 0.8371 |
| + Фиксированные гиперпараметры | 0.7655 | 0.8472 |

## Имплементация

## Бейзлайн

In [19]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import f1_score, average_precision_score

class RandomForest:
    def __init__(self, n_estimators=100, max_depth=10, min_samples_split=5, min_samples_leaf=2,
                 max_features='sqrt', class_weight=None, random_state=42):
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf
        self.max_features = max_features
        self.class_weight = class_weight
        self.random_state = random_state
        self.trees = []

    def fit(self, X, y):
        np.random.seed(self.random_state)
        self.trees = []
        n_samples = X.shape[0]

        for i in range(self.n_estimators):
            idxs = np.random.choice(n_samples, n_samples, replace=True)
            X_bootstrap = X[idxs]
            y_bootstrap = y[idxs]

            if self.max_features == 'sqrt':
                n_features = int(np.sqrt(X.shape[1]))
            elif self.max_features == 'log2':
                n_features = int(np.log2(X.shape[1]))
            else:
                n_features = X.shape[1]

            tree = RandomForestClassifier(
                n_estimators=1,
                max_depth=self.max_depth,
                min_samples_split=self.min_samples_split,
                min_samples_leaf=self.min_samples_leaf,
                max_features=n_features,
                class_weight=self.class_weight,
                random_state=self.random_state + i
            )
            tree.fit(X_bootstrap, y_bootstrap)
            self.trees.append(tree)

    def predict_proba(self, X):
        tree_predictions = []
        for tree in self.trees:
            probas = tree.predict_proba(X)
            tree_predictions.append(probas)

        avg_probas = np.mean(tree_predictions, axis=0)
        return avg_probas

    def predict(self, X):
        tree_predictions = []
        for tree in self.trees:
            predictions = tree.predict(X)
            tree_predictions.append(predictions)

        tree_predictions = np.array(tree_predictions)
        predictions = []

        for i in range(X.shape[0]):
            sample_predictions = tree_predictions[:, i]
            unique_classes, counts = np.unique(sample_predictions, return_counts=True)
            majority_class = unique_classes[np.argmax(counts)]
            predictions.append(majority_class)

        return np.array(predictions)
df = pd.read_csv('placementdata.csv')

X = df.drop(columns=['StudentID', 'PlacementStatus'])
y = df['PlacementStatus']

le_dict = {}
categorical_columns = ['ExtracurricularActivities', 'PlacementTraining']
for col in categorical_columns:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col])
    le_dict[col] = le

target_le = LabelEncoder()
y_encoded = target_le.fit_transform(y)

X_train_baseline, X_test_baseline, y_train_baseline, y_test_baseline = train_test_split(
    X, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
)

rf_baseline = RandomForest(
    n_estimators=100,
    max_depth=5,
    min_samples_split=2,
    min_samples_leaf=1,
    max_features='sqrt',
    class_weight=None,
    random_state=42
)
rf_baseline.fit(X_train_baseline.values, y_train_baseline)

y_pred_baseline = rf_baseline.predict(X_test_baseline.values)
y_pred_proba_baseline = rf_baseline.predict_proba(X_test_baseline.values)

if y_pred_proba_baseline.shape[1] > 1:
    y_pred_proba_baseline_binary = y_pred_proba_baseline[:, 1]
else:
    y_pred_proba_baseline_binary = y_pred_proba_baseline[:, 0]

f1_baseline = f1_score(y_test_baseline, y_pred_baseline)
pr_auc_baseline = average_precision_score(y_test_baseline, y_pred_proba_baseline_binary)

print(f"F1 Score: {f1_baseline:.4f}")
print(f"PR AUC: {pr_auc_baseline:.4f}")

F1 Score: 0.7572
PR AUC: 0.8511


Добавим новые признаки, масштабирование и изменим гиперпараметры

In [22]:
X_with_new_features = X.copy()
X_with_new_features['Average_Marks'] = (X_with_new_features['SSC_Marks'] + X_with_new_features['HSC_Marks'] + X_with_new_features['CGPA']) / 3
X_with_new_features['Total_Activities'] = X_with_new_features['Projects'] + X_with_new_features['Internships'] + X_with_new_features['Workshops/Certifications']
X_with_new_features['CGPA_Aptitude_Score'] = X_with_new_features['CGPA'] * X_with_new_features['AptitudeTestScore']

X_train_new, X_test_new, y_train_new, y_test_new = train_test_split(
    X_with_new_features, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
)

X_train_new_scaled = scaler.fit_transform(X_train_new)
X_test_new_scaled = scaler.transform(X_test_new)

rf_improved = RandomForest(
    n_estimators=200,
    max_depth=10,
    min_samples_split=5,
    min_samples_leaf=2,
    max_features='sqrt',
    class_weight='balanced',
    random_state=42
)
rf_improved.fit(X_train_new_scaled, y_train_new)

y_pred_improved = rf_improved.predict(X_test_new_scaled)
y_pred_proba_improved = rf_improved.predict_proba(X_test_new_scaled)

if y_pred_proba_improved.shape[1] > 1:
    y_pred_proba_improved_binary = y_pred_proba_improved[:, 1]
else:
    y_pred_proba_improved_binary = y_pred_proba_improved[:, 0]

f1_improved = f1_score(y_test_new, y_pred_improved)
pr_auc_improved = average_precision_score(y_test_new, y_pred_proba_improved_binary)

print(f"F1 Score: {f1_improved:.4f}")
print(f"PR AUC: {pr_auc_improved:.4f}")

F1 Score: 0.7635
PR AUC: 0.8485


Увеличим тестовую выборку

In [23]:
X_train_new_27, X_test_new_27, y_train_new_27, y_test_new_27 = train_test_split(
    X_with_new_features, y_encoded, test_size=0.27, random_state=42, stratify=y_encoded
)

X_train_new_scaled_27 = scaler.fit_transform(X_train_new_27)
X_test_new_scaled_27 = scaler.transform(X_test_new_27)

rf_improved_27 = RandomForest(
    n_estimators=200,
    max_depth=10,
    min_samples_split=5,
    min_samples_leaf=2,
    max_features='sqrt',
    class_weight='balanced',
    random_state=42
)
rf_improved_27.fit(X_train_new_scaled_27, y_train_new_27)

y_pred_improved_27 = rf_improved_27.predict(X_test_new_scaled_27)
y_pred_proba_improved_27 = rf_improved_27.predict_proba(X_test_new_scaled_27)

if y_pred_proba_improved_27.shape[1] > 1:
    y_pred_proba_improved_27_binary = y_pred_proba_improved_27[:, 1]
else:
    y_pred_proba_improved_27_binary = y_pred_proba_improved_27[:, 0]

f1_improved_27 = f1_score(y_test_new_27, y_pred_improved_27)
pr_auc_improved_27 = average_precision_score(y_test_new_27, y_pred_proba_improved_27_binary)

print(f"F1 Score: {f1_improved_27:.4f}")
print(f"PR AUC: {pr_auc_improved_27:.4f}")

F1 Score: 0.7702
PR AUC: 0.8547


| Модель | F1 Score | PR AUC |
|----------|-------|--------|
| Бейзлайн | 0.7572 | 0.8511 |
| + Новые признаки, масштабирование и измененные гиперпараметры | 0.7635 | 0.8485 |
| + Увеличенная тестовая выборка | 0.7702 | 0.8547 |

## Сравнение

| Модель | F1 Score | PR AUC |
|----------|-------|--------|
| Бейзлайн | 0.7459 | 0.8299 |
| + Масштабирование | 0.7410 | 0.8297 |
| + Новые признаки | 0.7428 | 0.8371 |
| + Фиксированные гиперпараметры | 0.7655 | 0.8472 |
| Имплементация бейзлайн | 0.7572 | 0.8511 |
| + Новые признаки, масштабирование и измененные гиперпараметры | 0.7635 | 0.8485 |
| + Увеличенная тестовая выборка | 0.7702 | 0.8547 |

Случайный лес оказалось тяжело значительно улучшить

# Регрессия

## Бейзлайн

In [24]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

df = pd.read_csv('possum.csv')

target_column = 'hdlngth'
X = df.drop(columns=[target_column])
y = df[target_column]

categorical_columns = ['site', 'Pop', 'sex']
label_encoders = {}

for col in categorical_columns:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col].astype(str))
    label_encoders[col] = le

X_train_no_drop, X_test_no_drop, y_train_no_drop, y_test_no_drop = train_test_split(
    X, y, test_size=0.2, random_state=42
)

rf_model_no_drop = RandomForestRegressor(random_state=42)
rf_model_no_drop.fit(X_train_no_drop, y_train_no_drop)
y_pred_no_drop = rf_model_no_drop.predict(X_test_no_drop)

mae_no_drop = mean_absolute_error(y_test_no_drop, y_pred_no_drop)
rmse_no_drop = np.sqrt(mean_squared_error(y_test_no_drop, y_pred_no_drop))
r2_no_drop = r2_score(y_test_no_drop, y_pred_no_drop)

print(f"MAE: {mae_no_drop:.3f}")
print(f"RMSE: {rmse_no_drop:.3f}")
print(f"R²: {r2_no_drop:.3f}")

MAE: 1.400
RMSE: 1.906
R²: 0.523


## Улучшение

Удаление строк с пропущенными значениями

In [26]:
data = X.join(y)
data = data.dropna()

X_clean = data.drop(columns=[target_column])
y_clean = data[target_column]

X_train, X_test, y_train, y_test = train_test_split(
    X_clean, y_clean, test_size=0.2, random_state=42
)

rf_model = RandomForestRegressor(random_state=42)
rf_model.fit(X_train, y_train)

y_pred = rf_model.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print(f"MAE: {mae:.3f}")
print(f"RMSE: {rmse:.3f}")
print(f"R²: {r2:.3f}")

MAE: 1.632
RMSE: 1.980
R²: 0.701


Добавление новых признаков

In [27]:
df_clean = df.dropna()

df_clean = df_clean.copy()
df_clean['skull_to_hdlngth_ratio'] = df_clean['skullw'] / df_clean['hdlngth']
df_clean['total_to_body_ratio'] = df_clean['totlngth'] / df_clean['hdlngth']
df_clean['tail_to_total_ratio'] = df_clean['taill'] / df_clean['totlngth']

X = df_clean.drop(columns=['hdlngth'])
y = df_clean['hdlngth']

X_encoded = X.copy()
label_encoders = {}

for col in categorical_columns:
    le = LabelEncoder()
    X_encoded[col] = le.fit_transform(X_encoded[col].astype(str))
    label_encoders[col] = le

X_train, X_test, y_train, y_test = train_test_split(
    X_encoded, y, test_size=0.2, random_state=42
)

rf_model_new_features = RandomForestRegressor(random_state=42)
rf_model_new_features.fit(X_train, y_train)

y_pred_new_features = rf_model_new_features.predict(X_test)

mae_new_features = mean_absolute_error(y_test, y_pred_new_features)
rmse_new_features = np.sqrt(mean_squared_error(y_test, y_pred_new_features))
r2_new_features = r2_score(y_test, y_pred_new_features)

print(f"MAE: {mae_new_features:.3f}")
print(f"RMSE: {rmse_new_features:.3f}")
print(f"R²: {r2_new_features:.3f}")

Модель с новыми признаками (без подбора гиперпараметров):
MAE: 1.093
RMSE: 1.450
R²: 0.840


Добавим гиперпараметры

In [29]:
rf_model_fixed = RandomForestRegressor(
    n_estimators=100,
    max_depth=10,
    min_samples_split=5,
    min_samples_leaf=2,
    max_features='log2',
    random_state=42
)
rf_model_fixed.fit(X_train, y_train)

y_pred_fixed = rf_model_fixed.predict(X_test)

mae_fixed = mean_absolute_error(y_test, y_pred_fixed)
rmse_fixed = np.sqrt(mean_squared_error(y_test, y_pred_fixed))
r2_fixed = r2_score(y_test, y_pred_fixed)

print(f"MAE: {mae_fixed:.3f}")
print(f"RMSE: {rmse_fixed:.3f}")
print(f"R²: {r2_fixed:.3f}")

MAE: 1.350
RMSE: 1.820
R²: 0.747


Выставленные параметры ухудшили метрики

Подбор с помощью GridSearchCV

In [30]:
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [6, 10, 15],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
    'max_features': ['sqrt', 'log2']
}

rf_model = RandomForestRegressor(random_state=42)

grid_search = GridSearchCV(
    estimator=rf_model,
    param_grid=param_grid,
    cv=5,
    scoring='r2',
    n_jobs=-1,
    verbose=0
)

grid_search.fit(X_train, y_train)

best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print(f"MAE: {mae:.3f}")
print(f"RMSE: {rmse:.3f}")
print(f"R²: {r2:.3f}")

MAE: 1.337
RMSE: 1.794
R²: 0.755


Метрики снова ухудшились

| Модель | R² | MAE | RMSE |
| :--- | :--- | :--- | :--- |
| Бейзлайн | 0.523 | 1.400 | 1.906 |
| + Удаление пропущенных значений | 0.701 | 1.632 | 1.980 |
| + Новые признаки | 0.840 | 1.093 | 1.450 |
| + Фиксированные гиперпараметры | 0.747 | 1.350 | 1.820 |
| + Автоподбор гиперпараметров | 0.755 | 1.337 | 1.794 |

## Имплементация

In [None]:
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

class MyRandomForestRegressor(BaseEstimator, RegressorMixin):
    def __init__(self, n_estimators=100, max_depth=None, min_samples_split=2, min_samples_leaf=1, max_features='sqrt', random_state=None):
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf
        self.max_features = max_features
        self.random_state = random_state

    def fit(self, X, y):
        if self.random_state:
            np.random.seed(self.random_state)

        X = np.array(X)
        y = np.array(y)
        n_samples, n_features = X.shape

        if self.max_features == 'sqrt':
            self.max_features_ = int(np.sqrt(n_features))
        elif isinstance(self.max_features, int):
            self.max_features_ = min(self.max_features, n_features)
        else:
            self.max_features_ = n_features

        self.trees = []

        for i in range(self.n_estimators):
            tree = DecisionTreeRegressor(
                max_depth=self.max_depth,
                min_samples_split=self.min_samples_split,
                min_samples_leaf=self.min_samples_leaf,
                max_features=self.max_features_,
                random_state=(self.random_state + i if self.random_state is not None else None)
            )

            bootstrap_indices = np.random.choice(n_samples, size=n_samples, replace=True)
            X_bootstrap = X[bootstrap_indices]
            y_bootstrap = y[bootstrap_indices]

            tree.fit(X_bootstrap, y_bootstrap)
            self.trees.append(tree)

        return self

    def predict(self, X):
        X = np.array(X)
        predictions = np.array([tree.predict(X) for tree in self.trees])
        mean_predictions = np.mean(predictions, axis=0)
        return mean_predictions

df = pd.read_csv('possum.csv')

df_clean = df.dropna()

categorical_columns = ['site', 'Pop', 'sex']

X_orig = df_clean.drop(columns=['hdlngth'])
y_orig = df_clean['hdlngth']

X_orig_encoded = X_orig.copy()
label_encoders = {}
for col in categorical_columns:
    le = LabelEncoder()
    X_orig_encoded[col] = le.fit_transform(X_orig_encoded[col].astype(str))
    label_encoders[col] = le

X_train_1, X_test_1, y_train_1, y_test_1 = train_test_split(
    X_orig_encoded, y_orig, test_size=0.2, random_state=42
)

model_1 = MyRandomForestRegressor(random_state=42)
model_1.fit(X_train_1, y_train_1)

y_pred_1 = model_1.predict(X_test_1)

mae_1 = mean_absolute_error(y_test_1, y_pred_1)
rmse_1 = np.sqrt(mean_squared_error(y_test_1, y_pred_1))
r2_1 = r2_score(y_test_1, y_pred_1)

print(f"MAE: {mae_1:.3f}")
print(f"RMSE: {rmse_1:.3f}")
print(f"R²: {r2_1:.3f}")

MAE: 1.574
RMSE: 1.997
R²: 0.696


### Улучшение бейзлайна имплементации

Добавление новых признаков

In [None]:
df_with_features = df_clean.copy()
df_with_features['skull_to_hdlngth_ratio'] = df_with_features['skullw'] / df_with_features['hdlngth']
df_with_features['total_to_body_ratio'] = df_with_features['totlngth'] / df_with_features['hdlngth']
df_with_features['tail_to_total_ratio'] = df_with_features['taill'] / df_with_features['totlngth']

X_new = df_with_features.drop(columns=['hdlngth'])
y_new = df_with_features['hdlngth']

X_new_encoded = X_new.copy()
for col in categorical_columns:
    le = label_encoders[col]
    X_new_encoded[col] = le.transform(X_new_encoded[col].astype(str))

X_train_2, X_test_2, y_train_2, y_test_2 = train_test_split(
    X_new_encoded, y_new, test_size=0.2, random_state=42
)

model_2 = MyRandomForestRegressor(random_state=42)
model_2.fit(X_train_2, y_train_2)

y_pred_2 = model_2.predict(X_test_2)

mae_2 = mean_absolute_error(y_test_2, y_pred_2)
rmse_2 = np.sqrt(mean_squared_error(y_test_2, y_pred_2))
r2_2 = r2_score(y_test_2, y_pred_2)

print(f"MAE: {mae_2:.3f}")
print(f"RMSE: {rmse_2:.3f}")
print(f"R²: {r2_2:.3f}")

MAE: 1.343
RMSE: 1.715
R²: 0.776


Добавление гиперпараметров без новых признаков

In [None]:
model_3 = MyRandomForestRegressor(
    n_estimators=100,
    max_depth=10,
    min_samples_split=5,
    min_samples_leaf=2,
    max_features='log2',
    random_state=42
)
model_3.fit(X_train_1, y_train_1)

y_pred_3 = model_3.predict(X_test_1)

mae_3 = mean_absolute_error(y_test_1, y_pred_3)
rmse_3 = np.sqrt(mean_squared_error(y_test_1, y_pred_3))
r2_3 = r2_score(y_test_1, y_pred_3)

print(f"MAE: {mae_3:.3f}")
print(f"RMSE: {rmse_3:.3f}")
print(f"R²: {r2_3:.3f}")

MAE: 1.682
RMSE: 2.081
R²: 0.669


Добавление гиперпараметров с новыми признаками

In [None]:
model_4 = MyRandomForestRegressor(
    n_estimators=100,
    max_depth=10,
    min_samples_split=5,
    min_samples_leaf=2,
    max_features='log2',
    random_state=42
)
model_4.fit(X_train_2, y_train_2)

y_pred_4 = model_4.predict(X_test_2)

mae_4 = mean_absolute_error(y_test_2, y_pred_4)
rmse_4 = np.sqrt(mean_squared_error(y_test_2, y_pred_4))
r2_4 = r2_score(y_test_2, y_pred_4)

print(f"MAE: {mae_4:.3f}")
print(f"RMSE: {rmse_4:.3f}")
print(f"R²: {r2_4:.3f}")

MAE: 1.217
RMSE: 1.627
R²: 0.798


| Модель | R² | MAE | RMSE |
| :--- | :--- | :--- | :--- |
| Бейзлайн | 0.696 | 1.574 | 1.997 |
| + Новые признаки | 0.776 | 1.343 | 1.715 |
| С гиперпараметрами (без новых признаков) | 0.669 | 1.682 | 2.081 |
| С гиперпараметрами (с новыми признаками) | 0.798 | 1.217 | 1.627 |

## Сравнение

| Библиотека / Имплементация | Модель | R² | MAE | RMSE |
| :--- | :--- | :--- | :--- | :--- |
| Библиотечная | Бейзлайн | 0.523 | 1.400 | 1.906 |
| Библиотечная | Улучшенная | 0.840 | 1.093 | 1.450 |
| Имплементация | Бейзлайн | 0.696 | 1.574 | 1.997 |
| Имплементация | Улучшенная | 0.798 | 1.217 | 1.627 |