##Классификация

## Бейзлайн из библиотеки

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score, precision_recall_curve, auc
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, average_precision_score, classification_report

df = pd.read_csv('placementdata.csv')

label_encoders = {}
binary_columns = ['ExtracurricularActivities', 'PlacementTraining', 'PlacementStatus']

for col in binary_columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

X = df.drop(columns=['StudentID', 'PlacementStatus'])
y = df['PlacementStatus'].copy()

X_orig_train, X_orig_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

X_v1_train = X_orig_train.copy()
X_v1_test = X_orig_test[X_v1_train.columns]

dt_v1 = DecisionTreeClassifier(random_state=42)
dt_v1.fit(X_v1_train, y_train)
y_pred_v1 = dt_v1.predict(X_v1_test)
y_proba_v1 = dt_v1.predict_proba(X_v1_test)[:, 1]

f1_v1 = f1_score(y_test, y_pred_v1)
precision_v1, recall_v1, _ = precision_recall_curve(y_test, y_proba_v1)
pr_auc_v1 = auc(recall_v1, precision_v1)

print(f"F1-мера: {f1_v1:.4f}")
print(f"PR AUC: {pr_auc_v1:.4f}")

F1-мера: 0.6663
PR AUC: 0.7372


## Улучшенный бейзлайн

In [2]:
X_original = df[['SSC_Marks', 'HSC_Marks', 'CGPA', 'Projects', 'Workshops/Certifications', 'AptitudeTestScore', 'Internships', 'SoftSkillsRating']].copy()
X_new = X_original.copy()

X_new['Average_Marks'] = (X_new['SSC_Marks'] + X_new['HSC_Marks'] + X_new['CGPA']) / 3
X_new['Total_Activities'] = X_new['Projects'] + X_new['Internships'] + X_new['Workshops/Certifications']
X_new['CGPA_Aptitude_Score'] = X_new['CGPA'] * X_new['AptitudeTestScore']
X_new['Projects_And_Internships'] = X_new['Projects'] + X_new['Internships']
X_new['Normalized_CGPA'] = X_new['CGPA'] / 10.0
X_new['High_Aptitude'] = (X_new['AptitudeTestScore'] > 70).astype(int)
X_new['Activity_Skill_Index'] = (X_new['Projects'] + X_new['Internships'] + X_new['Workshops/Certifications']) * X_new['SoftSkillsRating']

X_new_train, X_new_test, _, _ = train_test_split(X_new, y, test_size=0.2, random_state=42, stratify=y)

X_v2_train = X_new_train[['Activity_Skill_Index', 'Normalized_CGPA', 'High_Aptitude']].copy()
X_v2_test = X_new_test[X_v2_train.columns]
dt_v2 = DecisionTreeClassifier(random_state=42)
dt_v2.fit(X_v2_train, y_train)
y_pred_v2 = dt_v2.predict(X_v2_test)
y_proba_v2 = dt_v2.predict_proba(X_v2_test)[:, 1]

f1_v2 = f1_score(y_test, y_pred_v2)
precision_v2, recall_v2, _ = precision_recall_curve(y_test, y_proba_v2)
pr_auc_v2 = auc(recall_v2, precision_v2)

print(f"F1-мера: {f1_v2:.4f}")
print(f"PR AUC: {pr_auc_v2:.4f}")

F1-мера: 0.6941
PR AUC: 0.6934


Создание новых признаков улучшило метрику F1, но ухудшило PR AUC, нужна дальнейшая работа

In [3]:
param_grid = {
    'max_depth': [3, 5, 7, 10, None],
    'min_samples_split': [2, 10, 20],
    'min_samples_leaf': [1, 5, 10],
    'class_weight': [None, 'balanced']
}

grid_search_v3 = GridSearchCV(
    DecisionTreeClassifier(random_state=42),
    param_grid,
    cv=5,
    scoring='f1'
)
grid_search_v3.fit(X_v2_train, y_train)

best_dt_v3 = grid_search_v3.best_estimator_

y_pred_v3 = best_dt_v3.predict(X_v2_test)
y_pred_proba_v3 = best_dt_v3.predict_proba(X_v2_test)[:, 1]

f1_v3 = f1_score(y_test, y_pred_v3)
pr_auc_v3 = auc(precision_recall_curve(y_test, y_pred_proba_v3)[1], precision_recall_curve(y_test, y_pred_proba_v3)[0])

print(f"F1-мера: {f1_v3:.4f}")
print(f"PR AUC: {pr_auc_v3:.4f}")

F1-мера: 0.7308
PR AUC: 0.7992


Подбор гиперпараметров в сочетании с новыми признаками заметно улучшили обе метрики по сравнению с бейзлайном

## Имплементация бейзлайн

In [None]:
import numpy as np
import pandas as pd
from collections import Counter
import random
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import f1_score, precision_recall_curve, auc, classification_report

def entropy(y):
    if len(y) == 0:
        return 0
    counts = Counter(y)
    probs = [count / len(y) for count in counts.values()]
    return -sum(p * np.log2(p) for p in probs if p > 0)

def information_gain(y, y_left, y_right):
    n = len(y)
    n_l, n_r = len(y_left), len(y_right)
    if n_l == 0 or n_r == 0:
        return 0
    h = entropy(y)
    h_l = entropy(y_left)
    h_r = entropy(y_right)
    return h - (n_l / n) * h_l - (n_r / n) * h_r

class DecisionNode:
    def __init__(self, feature_idx=None, threshold=None, left=None, right=None, value=None):
        self.feature_idx = feature_idx
        self.threshold = threshold
        self.left = left
        self.right = right
        self.value = value

class DecisionTree:
    def __init__(self, max_depth=100, min_samples_split=2, min_samples_leaf=1):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf
        self.root = None

    def _best_split(self, X, y):
        best_gain = -1
        best_feature_idx = None
        best_threshold = None
        current_entropy = entropy(y)

        if current_entropy == 0:
            return None, None

        n_features = X.shape[1]
        for feature_idx in range(n_features):
            thresholds = np.unique(X[:, feature_idx])
            for threshold in thresholds:
                left_mask = X[:, feature_idx] <= threshold
                right_mask = ~left_mask

                y_left, y_right = y[left_mask], y[right_mask]

                if len(y_left) < self.min_samples_leaf or len(y_right) < self.min_samples_leaf:
                    continue

                gain = information_gain(y, y_left, y_right)

                if gain > best_gain:
                    best_gain = gain
                    best_feature_idx = feature_idx
                    best_threshold = threshold

        return best_feature_idx, best_threshold

    def _grow_tree(self, X, y, depth=0):
        n_samples, n_features = X.shape
        n_labels = len(np.unique(y))

        if (depth >= self.max_depth or
            n_labels == 1 or
            n_samples < self.min_samples_split):
            leaf_value = self._most_common_label(y)
            return DecisionNode(value=leaf_value)

        feature_idx, threshold = self._best_split(X, y)
        if feature_idx is None:
            leaf_value = self._most_common_label(y)
            return DecisionNode(value=leaf_value)

        left_mask = X[:, feature_idx] <= threshold
        right_mask = ~left_mask

        left_child = self._grow_tree(X[left_mask], y[left_mask], depth + 1)
        right_child = self._grow_tree(X[right_mask], y[right_mask], depth + 1)

        return DecisionNode(feature_idx=feature_idx, threshold=threshold, left=left_child, right=right_child)

    def _most_common_label(self, y):
        counter = Counter(y)
        value = counter.most_common(1)[0][0]
        return value

    def fit(self, X, y):
        self.root = self._grow_tree(X, y)
        return self

    def _predict_sample(self, x, node):
        if node.value is not None:
            return node.value

        if x[node.feature_idx] <= node.threshold:
            return self._predict_sample(x, node.left)
        else:
            return self._predict_sample(x, node.right)

    def predict(self, X):
        return np.array([self._predict_sample(x, self.root) for x in X])

df = pd.read_csv('placementdata.csv')

df_processed = df.copy()

binary_columns = ['ExtracurricularActivities', 'PlacementTraining']
for col in binary_columns:
    le = LabelEncoder()
    df_processed[col] = le.fit_transform(df_processed[col])

target_encoder = LabelEncoder()
df_processed['PlacementStatus'] = target_encoder.fit_transform(df_processed['PlacementStatus'])

X = df_processed.drop(columns=['StudentID', 'PlacementStatus'])
y = df_processed['PlacementStatus'].copy()

X_train_orig, X_test_orig, y_train_orig, y_test_orig = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

X_train_orig_np = X_train_orig.values
X_test_orig_np = X_test_orig.values
y_train_orig_np = y_train_orig.values
y_test_orig_np = y_test_orig.values

tree_simple = DecisionTree(min_samples_split=5, min_samples_leaf=2)
tree_simple.fit(X_train_orig_np, y_train_orig_np)
y_pred_simple = tree_simple.predict(X_test_orig_np)

f1_simple = f1_score(y_test_orig_np, y_pred_simple)
precision_simple, recall_simple, _ = precision_recall_curve(y_test_orig_np, y_pred_simple)
pr_auc_simple = auc(recall_simple, precision_simple)

print(f"F1-мера: {f1_simple:.4f}")
print(f"PR AUC: {pr_auc_simple:.4f}")

F1-мера: 0.6738
PR AUC: 0.7411


Имплементированный алгоритм показал примерно такие же результаты, что и бейзлайн из библиотеки

In [None]:
X_new = X.copy()

X_new['Average_Marks'] = (X_new['SSC_Marks'] + X_new['HSC_Marks'] + X_new['CGPA']) / 3
X_new['Total_Activities'] = X_new['Projects'] + X_new['Internships'] + X_new['Workshops/Certifications']
X_new['CGPA_Aptitude_Score'] = X_new['CGPA'] * X_new['AptitudeTestScore']
X_new['Projects_And_Internships'] = X_new['Projects'] + X_new['Internships']
X_new['Normalized_CGPA'] = X_new['CGPA'] / 10.0
X_new['High_Aptitude'] = (X_new['AptitudeTestScore'] > 70).astype(int)
X_new['Activity_Skill_Index'] = (X_new['Projects'] + X_new['Internships'] + X_new['Workshops/Certifications']) * X_new['SoftSkillsRating']

X_improved_df = X_new[['Activity_Skill_Index', 'Normalized_CGPA', 'High_Aptitude']].copy()

X_train_imp, X_test_imp, y_train_imp, y_test_imp = train_test_split(X_improved_df, y, test_size=0.2, random_state=42, stratify=y)

X_train_imp_np = X_train_imp.values
X_test_imp_np = X_test_imp.values
y_train_imp_np = y_train_imp.values
y_test_imp_np = y_test_imp.values

best_f1 = -1
best_params = {}
param_grid = {
    'max_depth': [3, 5, 7, 10],
    'min_samples_split': [2, 10, 20],
    'min_samples_leaf': [1, 5, 10]
}

cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

for md in param_grid['max_depth']:
    for mss in param_grid['min_samples_split']:
        for msl in param_grid['min_samples_leaf']:
            cv_f1_scores = []
            for train_idx, val_idx in cv.split(X_train_imp_np, y_train_imp_np):
                X_tr_cv, X_val_cv = X_train_imp_np[train_idx], X_train_imp_np[val_idx]
                y_tr_cv, y_val_cv = y_train_imp_np[train_idx], y_train_imp_np[val_idx]

                tree_cv = DecisionTree(max_depth=md, min_samples_split=mss, min_samples_leaf=msl)
                tree_cv.fit(X_tr_cv, y_tr_cv)
                y_pred_cv = tree_cv.predict(X_val_cv)
                cv_f1_scores.append(f1_score(y_val_cv, y_pred_cv))

            mean_f1 = np.mean(cv_f1_scores)
            if mean_f1 > best_f1:
                best_f1 = mean_f1
                best_params = {'max_depth': md, 'min_samples_split': mss, 'min_samples_leaf': msl}

tree_improved = DecisionTree(**best_params)
tree_improved.fit(X_train_imp_np, y_train_imp_np)
y_pred_improved = tree_improved.predict(X_test_imp_np)

f1_improved = f1_score(y_test_imp_np, y_pred_improved)
precision_improved, recall_improved, _ = precision_recall_curve(y_test_imp_np, y_pred_improved)
pr_auc_improved = auc(recall_improved, precision_improved)

print(f"F1-мера: {f1_improved:.4f}")
print(f"PR AUC: {pr_auc_improved:.4f}")

F1-мера: 0.7306
PR AUC: 0.7913


Новые признаки и подбор гиперпараметров повысили метрики, улучшенная имплементация на таком же уровне, что улучшенный библиотечный алгоритм

## Сравнение

| Модель | F1-мера | PR AUC |
|----------|-------|--------|
| Библиотечный бейзлайн | 0.6663 | 0.7372 |
| + Новые признаки | 0.6941 | 0.6934 |
| + Подбор гиперпараметров | 0.7308 | 0.7992 |
| Имплементация бейзлайн | 0.6738 | 0.7411 |
| + Новые признаки и подбор гиперпараметров | 0.7306 | 0.7913 |

#Регрессия

##Бейзлайн библиотечной модели решающего дерева

In [4]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split

df = pd.read_csv('possum.csv')

target_column = 'totlngth'
original_feature_columns = ['age', 'hdlngth', 'skullw', 'taill', 'footlgth', 'earconch', 'eye', 'chest', 'belly']

X_original = df[original_feature_columns]
y = df[target_column]

X_clean = X_original.fillna(X_original.median())
y_clean = y.fillna(y.median())

X_train, X_test, y_train, y_test = train_test_split(X_clean, y_clean, test_size=0.2, random_state=42)

dt_baseline = DecisionTreeRegressor(random_state=42)
dt_baseline.fit(X_train, y_train)
y_pred_baseline = dt_baseline.predict(X_test)

r2_baseline = r2_score(y_test, y_pred_baseline)
mae_baseline = mean_absolute_error(y_test, y_pred_baseline)
rmse_baseline = np.sqrt(mean_squared_error(y_test, y_pred_baseline))

print(f"R²: {r2_baseline:.3f}")
print(f"MAE: {mae_baseline:.3f} см")
print(f"RMSE: {rmse_baseline:.3f} см")

R²: -0.643
MAE: 3.524 см
RMSE: 4.265 см


По метрикам видно, что самый простой бейзлайн не справляется с задачей. Добавим категориальные признаки и фиксированные гиперпараметры.

## Улучшенный бейзлайн

In [None]:
from sklearn.preprocessing import LabelEncoder
X_impr = df.drop(columns=[target_column, 'case'])
y = df[target_column]

numeric_features = X_impr.select_dtypes(include=[np.number]).columns.tolist()
X_processed = X_impr.copy()
X_processed[numeric_features] = X_processed[numeric_features].fillna(X_processed[numeric_features].median())

categorical_features = ['site', 'Pop', 'sex']
for col in categorical_features:
    le = LabelEncoder()
    X_processed[col] = le.fit_transform(X_processed[col].astype(str))

X_train, X_test, y_train, y_test = train_test_split(X_processed, y, test_size=0.2, random_state=42)
impr_model = DecisionTreeRegressor(max_depth=3, min_samples_split=20, min_samples_leaf=10, random_state=42)
impr_model.fit(X_train, y_train)
y_pred_impr_ = impr_model.predict(X_test)

mae_impr = mean_absolute_error(y_test, y_pred_impr_)
rmse_impr = np.sqrt(mean_squared_error(y_test, y_pred_impr_))
r2_impr = r2_score(y_test, y_pred_impr_)

print(f"R²:   {r2_impr:.4f}")
print(f"MAE:  {mae_impr:.4f}")
print(f"RMSE: {rmse_impr:.4f}")

R²:   0.2765
MAE:  2.2952
RMSE: 2.8299


Добавим подбор гиперпараметров и масштабирование для улучшения модели

In [None]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
impr_model = DecisionTreeRegressor(max_depth=3, min_samples_split=20, min_samples_leaf=10, random_state=42)
impr_model.fit(X_train_scaled, y_train)
y_pred_impr_ = impr_model.predict(X_test_scaled)

mae_impr = mean_absolute_error(y_test, y_pred_impr_)
rmse_impr = np.sqrt(mean_squared_error(y_test, y_pred_impr_))
r2_impr = r2_score(y_test, y_pred_impr_)

print(f"R²:   {r2_impr:.4f}")
print(f"MAE:  {mae_impr:.4f}")
print(f"RMSE: {rmse_impr:.4f}")

R²:   0.2765
MAE:  2.2952
RMSE: 2.8299


Масштабирование не дало никакого результата

In [None]:
param_grid = {
    'max_depth': [1, 3, 5, 10, 15],
    'min_samples_split': [2, 5, 10, 15, 20],
    'min_samples_leaf': [1, 2, 5, 10]
}

grid_search = GridSearchCV(
    estimator=DecisionTreeRegressor(random_state=42),
    param_grid=param_grid,
    cv=5,
    scoring='r2',
    n_jobs=-1
)

grid_search.fit(X_train, y_train)

best_model = grid_search.best_estimator_
y_pred_best = best_model.predict(X_test)

r2_best = r2_score(y_test, y_pred_best)
mae_best = mean_absolute_error(y_test, y_pred_best)
rmse_best = np.sqrt(mean_squared_error(y_test, y_pred_best))

print("Лучшие параметры:", grid_search.best_params_)
print(f"R²:   {r2_best:.4f}")
print(f"MAE:  {mae_best:.4f}")
print(f"RMSE: {rmse_best:.4f}")

Лучшие параметры: {'max_depth': 5, 'min_samples_leaf': 1, 'min_samples_split': 15}
R²:   0.1326
MAE:  2.5582
RMSE: 3.0987


Подбор параметров с помощью GridSearchCV выдал параметры,  которые показали результат хуже по сравнению с фиксированнынми

## Имплементация

In [5]:
from collections import Counter

class SimpleDecisionTreeRegressor:
    def __init__(self, max_depth=5, min_samples_split=2, min_samples_leaf=1):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf
        self.tree = None

    def _calculate_mse(self, y):
        if len(y) == 0:
            return 0
        mean_val = np.mean(y)
        return np.mean((y - mean_val) ** 2)

    def _find_best_split(self, X, y):
        best_mse = float('inf')
        best_feature_idx = None
        best_threshold = None

        n_features = X.shape[1]
        for feature_idx in range(n_features):
            thresholds = np.unique(X[:, feature_idx])
            for threshold in thresholds:
                left_mask = X[:, feature_idx] <= threshold
                right_mask = ~left_mask

                if np.sum(left_mask) < self.min_samples_leaf or np.sum(right_mask) < self.min_samples_leaf:
                    continue

                mse_left = self._calculate_mse(y[left_mask])
                mse_right = self._calculate_mse(y[right_mask])

                weighted_mse = (np.sum(left_mask) * mse_left + np.sum(right_mask) * mse_right) / len(y)

                if weighted_mse < best_mse:
                    best_mse = weighted_mse
                    best_feature_idx = feature_idx
                    best_threshold = threshold

        return best_feature_idx, best_threshold, best_mse

    def _build_tree(self, X, y, depth=0):
        n_samples = X.shape[0]
        if (depth >= self.max_depth or
            n_samples < self.min_samples_split or
            self._calculate_mse(y) == 0):
            leaf_value = np.mean(y)
            return {'type': 'leaf', 'value': leaf_value}

        best_feature_idx, best_threshold, best_mse = self._find_best_split(X, y)

        if best_feature_idx is None:
            leaf_value = np.mean(y)
            return {'type': 'leaf', 'value': leaf_value}

        left_mask = X[:, best_feature_idx] <= best_threshold
        right_mask = ~left_mask

        left_subtree = self._build_tree(X[left_mask], y[left_mask], depth + 1)
        right_subtree = self._build_tree(X[right_mask], y[right_mask], depth + 1)

        return {
            'type': 'split',
            'feature_idx': best_feature_idx,
            'threshold': best_threshold,
            'left': left_subtree,
            'right': right_subtree
        }

    def fit(self, X, y):
        if not isinstance(X, np.ndarray):
            X = X.values
        if not isinstance(y, np.ndarray):
            y = y.values
        self.tree = self._build_tree(X, y)

    def _predict_sample(self, x, node):
        if node['type'] == 'leaf':
            return node['value']

        if x[node['feature_idx']] <= node['threshold']:
            return self._predict_sample(x, node['left'])
        else:
            return self._predict_sample(x, node['right'])

    def predict(self, X):
        if not isinstance(X, np.ndarray):
            X = X.values
        predictions = [self._predict_sample(x, self.tree) for x in X]
        return np.array(predictions)

df = pd.read_csv('possum.csv')

target_column = 'totlngth'
original_feature_columns = ['age', 'hdlngth', 'skullw', 'taill', 'footlgth', 'earconch', 'eye', 'chest', 'belly']

X_original = df[original_feature_columns]
y = df[target_column]

X_clean = X_original.fillna(X_original.median())
y_clean = y.fillna(y.median())

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_clean, y_clean, test_size=0.2, random_state=42)

custom_tree = SimpleDecisionTreeRegressor(max_depth=5, min_samples_split=5, min_samples_leaf=2)
custom_tree.fit(X_train, y_train)

y_pred = custom_tree.predict(X_test)

from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

print(f"R²: {r2:.4f}")
print(f"MAE: {mae:.4f}")
print(f"RMSE: {rmse:.4f}")


R²: -0.1763
MAE: 3.0010
RMSE: 3.6085


Как уже выяснилось при исследовании библиотечной модели, значимыми являются гиперпараметры

In [6]:
class SimpleDecisionTreeRegressor:
    def __init__(self, max_depth=5, min_samples_split=5, min_samples_leaf=2):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf
        self.tree = None

    def _calculate_mse(self, y):
        if len(y) == 0:
            return 0
        mean_val = np.mean(y)
        return np.mean((y - mean_val) ** 2)

    def _find_best_split(self, X, y):
        best_mse = float('inf')
        best_feature_idx = None
        best_threshold = None

        n_features = X.shape[1]
        for feature_idx in range(n_features):
            thresholds = np.unique(X[:, feature_idx])
            for threshold in thresholds:
                left_mask = X[:, feature_idx] <= threshold
                right_mask = ~left_mask

                if np.sum(left_mask) < self.min_samples_leaf or np.sum(right_mask) < self.min_samples_leaf:
                    continue

                mse_left = self._calculate_mse(y[left_mask])
                mse_right = self._calculate_mse(y[right_mask])

                weighted_mse = (np.sum(left_mask) * mse_left + np.sum(right_mask) * mse_right) / len(y)

                if weighted_mse < best_mse:
                    best_mse = weighted_mse
                    best_feature_idx = feature_idx
                    best_threshold = threshold

        return best_feature_idx, best_threshold, best_mse

    def _build_tree(self, X, y, depth=0):
        n_samples = X.shape[0]
        if (depth >= self.max_depth or
            n_samples < self.min_samples_split or
            self._calculate_mse(y) == 0):
            leaf_value = np.mean(y)
            return {'type': 'leaf', 'value': leaf_value}

        best_feature_idx, best_threshold, best_mse = self._find_best_split(X, y)

        if best_feature_idx is None:
            leaf_value = np.mean(y)
            return {'type': 'leaf', 'value': leaf_value}

        left_mask = X[:, best_feature_idx] <= best_threshold
        right_mask = ~left_mask

        left_subtree = self._build_tree(X[left_mask], y[left_mask], depth + 1)
        right_subtree = self._build_tree(X[right_mask], y[right_mask], depth + 1)

        return {
            'type': 'split',
            'feature_idx': best_feature_idx,
            'threshold': best_threshold,
            'left': left_subtree,
            'right': right_subtree
        }

    def fit(self, X, y):
        if not isinstance(X, np.ndarray):
            X = X.values
        if not isinstance(y, np.ndarray):
            y = y.values
        self.tree = self._build_tree(X, y)

    def _predict_sample(self, x, node):
        if node['type'] == 'leaf':
            return node['value']

        if x[node['feature_idx']] <= node['threshold']:
            return self._predict_sample(x, node['left'])
        else:
            return self._predict_sample(x, node['right'])

    def predict(self, X):
        if not isinstance(X, np.ndarray):
            X = X.values
        predictions = [self._predict_sample(x, self.tree) for x in X]
        return np.array(predictions)

df = pd.read_csv('possum.csv')

target_column = 'totlngth'
original_feature_columns = ['age', 'hdlngth', 'skullw', 'taill', 'footlgth', 'earconch', 'eye', 'chest', 'belly']

X_original = df[original_feature_columns]
y = df[target_column]

X_clean = X_original.fillna(X_original.median())
y_clean = y.fillna(y.median())

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_clean, y_clean, test_size=0.2, random_state=42)

custom_tree = SimpleDecisionTreeRegressor(max_depth=4, min_samples_split=10, min_samples_leaf=5)
custom_tree.fit(X_train, y_train)

y_pred = custom_tree.predict(X_test)

from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

print(f"R²: {r2:.4f}")
print(f"MAE: {mae:.4f}")
print(f"RMSE: {rmse:.4f}")


R²: 0.2431
MAE: 2.2594
RMSE: 2.8945


## Сравнение

| Модель | R² | MAE | RMSE |
|----------|-------|--------|--------|
| Бейзлайн | -0.643 | 3.524 | 4.265 |
| + Фиксированные параметры | 0.2765 | 2.2952 | 2.8299 |
| + Масштабирование | 0.2765 | 2.2952 | 2.8299 |
| + Подбор гиперпараметров | 0.1326 | 2.5582 | 3.0987 |
| Имплементация | -0.1763 | 3.0010 | 3.6085 |
| Имплементация с подходящими параметрами | 0.2431 | 2.2594 | 2.8945 |

Лучше всего себя показал бейзлайн с фиксированными параметрами. Подбор гиперпараметров с помощью GridSearchCV оказался неоптимален, что контринтуитивно. Возможно, дело в маленькой выборке данных.