In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import logging
import optuna
import warnings
from sklearn.utils import all_estimators
from sklearn.model_selection import cross_val_score, GridSearchCV, StratifiedKFold, train_test_split
from sklearn.metrics import r2_score, mean_absolute_percentage_error
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.inspection import permutation_importance
from sklearn.exceptions import ConvergenceWarning
from sklearn.ensemble import VotingRegressor, StackingRegressor
from sklearn.linear_model import RidgeCV, Ridge
from optuna.samplers import TPESampler

optuna.logging.set_verbosity(logging.WARNING)
warnings.filterwarnings('ignore')
warnings.simplefilter("ignore", category=ConvergenceWarning)

In [3]:
df = pd.read_csv('final_datasets/enhanced_anxiety_dataset.csv')
# df = pd.read_csv('cleaned_enhanced_anxiety_dataset.csv')
target_column = 'Anxiety Level (1-10)'
df.drop(target_column, axis=1, inplace=True)

print("Total rows:", df.shape[0])
print("Total columns:", df.shape[1])
print("Target column:", target_column)

Total rows: 11000
Total columns: 18
Target column: Anxiety Level (1-10)


In [3]:
df_2 = df.copy()

In [4]:
df.dtypes

Age                                    int64
Gender                                object
Occupation                            object
Sleep Hours                          float64
Physical Activity (hrs/week)         float64
Caffeine Intake (mg/day)               int64
Alcohol Consumption (drinks/week)      int64
Smoking                               object
Family History of Anxiety             object
Stress Level (1-10)                    int64
Heart Rate (bpm)                       int64
Breathing Rate (breaths/min)           int64
Sweating Level (1-5)                   int64
Dizziness                             object
Medication                            object
Therapy Sessions (per month)           int64
Recent Major Life Event               object
Diet Quality (1-10)                    int64
Anxiety Level (1-10)                 float64
dtype: object

In [5]:
categorical_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()
label_encoders = {}
df_encoded = df.copy()

for col in categorical_cols:
    le = LabelEncoder()
    df_encoded[col] = le.fit_transform(df[col].astype(str))
    label_encoders[col] = le
    print(f"Закодовано {col}: {len(le.classes_)} унікальних значень")

Закодовано Gender: 3 унікальних значень
Закодовано Occupation: 13 унікальних значень
Закодовано Smoking: 2 унікальних значень
Закодовано Family History of Anxiety: 2 унікальних значень
Закодовано Dizziness: 2 унікальних значень
Закодовано Medication: 2 унікальних значень
Закодовано Recent Major Life Event: 2 унікальних значень


In [6]:
df_encoded.dtypes

Age                                    int64
Gender                                 int32
Occupation                             int32
Sleep Hours                          float64
Physical Activity (hrs/week)         float64
Caffeine Intake (mg/day)               int64
Alcohol Consumption (drinks/week)      int64
Smoking                                int32
Family History of Anxiety              int32
Stress Level (1-10)                    int64
Heart Rate (bpm)                       int64
Breathing Rate (breaths/min)           int64
Sweating Level (1-5)                   int64
Dizziness                              int32
Medication                             int32
Therapy Sessions (per month)           int64
Recent Major Life Event                int32
Diet Quality (1-10)                    int64
Anxiety Level (1-10)                 float64
dtype: object

In [7]:
numeric_cols = df_encoded.select_dtypes(include='number').columns.tolist()
numeric_cols

['Age',
 'Gender',
 'Occupation',
 'Sleep Hours',
 'Physical Activity (hrs/week)',
 'Caffeine Intake (mg/day)',
 'Alcohol Consumption (drinks/week)',
 'Smoking',
 'Family History of Anxiety',
 'Stress Level (1-10)',
 'Heart Rate (bpm)',
 'Breathing Rate (breaths/min)',
 'Sweating Level (1-5)',
 'Dizziness',
 'Medication',
 'Therapy Sessions (per month)',
 'Recent Major Life Event',
 'Diet Quality (1-10)',
 'Anxiety Level (1-10)']

In [8]:
numeric_cols = df_encoded.select_dtypes(include='number').columns
numeric_cols = numeric_cols.drop(target_column, errors='ignore')
scaler = StandardScaler()
df_encoded[numeric_cols] = scaler.fit_transform(df_encoded[numeric_cols])

print("Масштабування числових ознак завершено")
print(f"Середнє значення після масштабування:\n{df_encoded[numeric_cols].mean()}")
print(f"\nСтандартне відхилення після масштабування:\n{df_encoded[numeric_cols].std()}")

Масштабування числових ознак завершено
Середнє значення після масштабування:
Age                                 -1.401707e-16
Gender                              -6.201100e-17
Occupation                           6.039613e-17
Sleep Hours                         -9.818409e-17
Physical Activity (hrs/week)        -7.492996e-17
Caffeine Intake (mg/day)             1.724681e-16
Alcohol Consumption (drinks/week)   -1.111030e-16
Smoking                              4.844610e-18
Family History of Anxiety            5.296773e-17
Stress Level (1-10)                 -3.035955e-17
Heart Rate (bpm)                     1.821573e-16
Breathing Rate (breaths/min)         2.306034e-16
Sweating Level (1-5)                -1.498599e-16
Dizziness                           -1.446923e-16
Medication                           2.067033e-17
Therapy Sessions (per month)         5.748937e-17
Recent Major Life Event             -1.873249e-17
Diet Quality (1-10)                  3.229740e-17
dtype: float64

Стандар

In [9]:
numeric_cols

Index(['Age', 'Gender', 'Occupation', 'Sleep Hours',
       'Physical Activity (hrs/week)', 'Caffeine Intake (mg/day)',
       'Alcohol Consumption (drinks/week)', 'Smoking',
       'Family History of Anxiety', 'Stress Level (1-10)', 'Heart Rate (bpm)',
       'Breathing Rate (breaths/min)', 'Sweating Level (1-5)', 'Dizziness',
       'Medication', 'Therapy Sessions (per month)', 'Recent Major Life Event',
       'Diet Quality (1-10)'],
      dtype='object')

In [10]:
df_encoded.dtypes

Age                                  float64
Gender                               float64
Occupation                           float64
Sleep Hours                          float64
Physical Activity (hrs/week)         float64
Caffeine Intake (mg/day)             float64
Alcohol Consumption (drinks/week)    float64
Smoking                              float64
Family History of Anxiety            float64
Stress Level (1-10)                  float64
Heart Rate (bpm)                     float64
Breathing Rate (breaths/min)         float64
Sweating Level (1-5)                 float64
Dizziness                            float64
Medication                           float64
Therapy Sessions (per month)         float64
Recent Major Life Event              float64
Diet Quality (1-10)                  float64
Anxiety Level (1-10)                 float64
dtype: object

In [11]:
df_encoded

Unnamed: 0,Age,Gender,Occupation,Sleep Hours,Physical Activity (hrs/week),Caffeine Intake (mg/day),Alcohol Consumption (drinks/week),Smoking,Family History of Anxiety,Stress Level (1-10),Heart Rate (bpm),Breathing Rate (breaths/min),Sweating Level (1-5),Dizziness,Medication,Therapy Sessions (per month),Recent Major Life Event,Diet Quality (1-10),Anxiety Level (1-10)
0,-0.849359,-1.211023,-1.597525,-0.530115,-0.132478,-0.725727,0.052442,0.950496,-1.065213,1.415627,1.332415,-1.348395,0.657245,-1.031777,0.970260,0.262107,0.977881,0.628018,5.0
1,0.435061,1.237062,0.539177,-0.367176,1.508891,-0.594517,-0.299086,0.950496,0.938779,-1.659122,-1.669040,0.395834,-0.772538,0.969201,-1.030651,-0.195977,-1.022619,0.973428,3.0
2,1.795036,0.013019,0.806265,-1.344810,0.414645,-1.167696,-1.002141,-1.052082,0.938779,-1.659122,0.004849,1.364850,-0.057646,-1.031777,-1.030651,-0.654060,0.977881,-1.444441,1.0
3,-1.529347,-1.211023,1.073353,-0.693054,-0.077766,0.510405,-0.650614,0.950496,-1.065213,-0.634206,-0.283753,-0.766985,-0.057646,-1.031777,-1.030651,-1.112144,-1.022619,-1.444441,2.0
4,0.661724,-1.211023,0.806265,1.262214,-0.351328,-0.269946,-1.002141,0.950496,-1.065213,-1.659122,0.408890,-0.379379,0.657245,0.969201,0.970260,-0.654060,-1.022619,-0.753622,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10995,-1.302684,-1.211023,-0.529174,-0.448645,0.086371,1.932992,-0.123322,0.950496,-1.065213,0.732349,0.004849,1.364850,-1.487429,0.969201,0.970260,-0.654060,-1.022619,-0.753622,6.0
10996,0.737278,1.237062,1.607528,-0.041298,0.359932,-1.533701,1.282788,0.950496,-1.065213,0.390710,0.235730,-0.766985,-0.057646,-1.031777,-1.030651,-0.195977,-1.022619,0.628018,3.0
10997,-0.849359,0.013019,0.539177,0.040172,2.165439,-0.877654,0.755497,-1.052082,-1.065213,0.732349,-1.091837,-0.960788,-1.487429,0.969201,0.970260,-0.195977,0.977881,0.628018,4.0
10998,0.963941,1.237062,-1.597525,-0.774523,-0.132478,-0.263041,-0.299086,-1.052082,-1.065213,-0.634206,1.216974,1.364850,-0.057646,0.969201,0.970260,-0.654060,0.977881,-1.099031,4.0


In [12]:
X = df_encoded.drop(columns=[target_column])
y = df_encoded[target_column]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print(f"\nРозмір тренувальної вибірки: {X_train.shape}")
print(f"Розмір тестової вибірки: {X_test.shape}")

In [13]:
df_encoded

Unnamed: 0,Age,Gender,Occupation,Sleep Hours,Physical Activity (hrs/week),Caffeine Intake (mg/day),Alcohol Consumption (drinks/week),Smoking,Family History of Anxiety,Stress Level (1-10),Heart Rate (bpm),Breathing Rate (breaths/min),Sweating Level (1-5),Dizziness,Medication,Therapy Sessions (per month),Recent Major Life Event,Diet Quality (1-10),Anxiety Level (1-10)
0,-0.849359,-1.211023,-1.597525,-0.530115,-0.132478,-0.725727,0.052442,0.950496,-1.065213,1.415627,1.332415,-1.348395,0.657245,-1.031777,0.970260,0.262107,0.977881,0.628018,5.0
1,0.435061,1.237062,0.539177,-0.367176,1.508891,-0.594517,-0.299086,0.950496,0.938779,-1.659122,-1.669040,0.395834,-0.772538,0.969201,-1.030651,-0.195977,-1.022619,0.973428,3.0
2,1.795036,0.013019,0.806265,-1.344810,0.414645,-1.167696,-1.002141,-1.052082,0.938779,-1.659122,0.004849,1.364850,-0.057646,-1.031777,-1.030651,-0.654060,0.977881,-1.444441,1.0
3,-1.529347,-1.211023,1.073353,-0.693054,-0.077766,0.510405,-0.650614,0.950496,-1.065213,-0.634206,-0.283753,-0.766985,-0.057646,-1.031777,-1.030651,-1.112144,-1.022619,-1.444441,2.0
4,0.661724,-1.211023,0.806265,1.262214,-0.351328,-0.269946,-1.002141,0.950496,-1.065213,-1.659122,0.408890,-0.379379,0.657245,0.969201,0.970260,-0.654060,-1.022619,-0.753622,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10995,-1.302684,-1.211023,-0.529174,-0.448645,0.086371,1.932992,-0.123322,0.950496,-1.065213,0.732349,0.004849,1.364850,-1.487429,0.969201,0.970260,-0.654060,-1.022619,-0.753622,6.0
10996,0.737278,1.237062,1.607528,-0.041298,0.359932,-1.533701,1.282788,0.950496,-1.065213,0.390710,0.235730,-0.766985,-0.057646,-1.031777,-1.030651,-0.195977,-1.022619,0.628018,3.0
10997,-0.849359,0.013019,0.539177,0.040172,2.165439,-0.877654,0.755497,-1.052082,-1.065213,0.732349,-1.091837,-0.960788,-1.487429,0.969201,0.970260,-0.195977,0.977881,0.628018,4.0
10998,0.963941,1.237062,-1.597525,-0.774523,-0.132478,-0.263041,-0.299086,-1.052082,-1.065213,-0.634206,1.216974,1.364850,-0.057646,0.969201,0.970260,-0.654060,0.977881,-1.099031,4.0


In [14]:
classifiers = all_estimators(type_filter='regressor')
skip_models = ['GaussianProcessRegressor', 'QuantileRegressor'] 
results = {}

print(f"Тестування різних регресорів з кросвалідацією (5 фолдів)")
print("=" * 80)

CV = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
for name, RegressorClass in classifiers:
    if name in skip_models:
        continue 
    try:
        model = RegressorClass()
        scores = cross_val_score(model, X_train, y_train, cv=CV, scoring='neg_mean_absolute_percentage_error', n_jobs=-1)
        
        mape_scores = -scores 
        mean_mape = mape_scores.mean()
        std_mape = mape_scores.std()

        model.fit(X_train, y_train)
        
        r2 = np.nan
        try:
            y_pred = model.predict(X_train)
            r2 = r2_score(y_train, y_pred)
        except Exception:
            r2 = np.nan
            
        if not np.isnan(mean_mape):
            results[name] = {
                'mean_mape': mean_mape, 
                'std_dev': std_mape,
                'r2_score': r2          
            }
            print(f"{name}: MAPE {mean_mape:.2%} (± {std_mape:.2%}), R2 Score: {r2:.3f}")
        else:
            print(f"{name}: Пропущено через NaN у результаті") 

    except Exception as e:
        continue
       
sorted_results = sorted(results.items(), key=lambda x: x[1]['mean_mape'], reverse=False)

print("\nТоп-10 моделей з найменшою помилкою (MAPE):")
for i, (name, metric) in enumerate(sorted_results[:10], 1):
    print(f"{i:2d}. {name:40s} | MAPE: {metric['mean_mape']:.2%} (± {metric['std_dev']:.2%}) | R2: {metric['r2_score']:.3f}")
    
results_df = pd.DataFrame.from_dict(results, orient='index')

Тестування різних регресорів з кросвалідацією (5 фолдів)
ARDRegression: MAPE 32.31% (± 1.00%), R2 Score: 0.715
AdaBoostRegressor: MAPE 36.64% (± 1.18%), R2 Score: 0.733
BaggingRegressor: MAPE 32.84% (± 0.91%), R2 Score: 0.952
BayesianRidge: MAPE 32.34% (± 1.00%), R2 Score: 0.716
DecisionTreeRegressor: MAPE 40.26% (± 1.62%), R2 Score: 1.000
DummyRegressor: MAPE 60.94% (± 1.47%), R2 Score: 0.000
ElasticNet: MAPE 44.56% (± 1.30%), R2 Score: 0.463
ElasticNetCV: MAPE 32.34% (± 1.00%), R2 Score: 0.716
ExtraTreeRegressor: MAPE 40.86% (± 1.75%), R2 Score: 1.000
ExtraTreesRegressor: MAPE 31.54% (± 0.98%), R2 Score: 1.000
GammaRegressor: MAPE 38.53% (± 1.06%), R2 Score: 0.607
GradientBoostingRegressor: MAPE 30.87% (± 0.97%), R2 Score: 0.782
HistGradientBoostingRegressor: MAPE 31.10% (± 1.03%), R2 Score: 0.839
HuberRegressor: MAPE 32.07% (± 0.98%), R2 Score: 0.715
KNeighborsRegressor: MAPE 38.44% (± 1.15%), R2 Score: 0.779
KernelRidge: MAPE 126.53% (± 0.91%), R2 Score: -2.738
Lars: MAPE 32.33% (±

In [15]:
top5 = sorted_results[:5]
print("\nТОП-5 моделей для ансамблів:")
for name, metric in top5:
    score = metric.get('mean_mape', metric.get('mean_mae', 0)) 
    print(f"{name} | Error: {score:.2%}")

def instantiate_model(model_name):
    for name, RegressorClass in classifiers:
        if name == model_name:
            try:
                return RegressorClass()
            except:
                return None
    return None

top_models = []
for name, metric in top5:
    model_obj = instantiate_model(name)
    if model_obj is not None:
        top_models.append((name, model_obj))

voting_model = VotingRegressor(
    estimators=top_models
)

voting_model.fit(X_train, y_train)
y_pred_vote = voting_model.predict(X_test)

mape_vote = mean_absolute_percentage_error(y_test, y_pred_vote)
print(f"MAPE (Voting): {mape_vote:.2%}")

stacked_model = StackingRegressor(
    estimators=top_models,
    final_estimator=RidgeCV(),
    cv=5  
)

stacked_model.fit(X_train, y_train)
y_pred_stack = stacked_model.predict(X_test)

mape_stack = mean_absolute_percentage_error(y_test, y_pred_stack)
print(f"MAPE (Stacking): {mape_stack:.2%}")


ТОП-5 моделей для ансамблів:
GradientBoostingRegressor | Error: 30.87%
HistGradientBoostingRegressor | Error: 31.10%
SVR | Error: 31.53%
ExtraTreesRegressor | Error: 31.54%
NuSVR | Error: 31.55%
MAPE (Voting): 30.40%
MAPE (Stacking): 30.41%


In [16]:
param_grids = {
    "Ridge": {
        'alpha': [0.01, 0.1, 1.0, 10.0, 100.0],
        'solver': ['auto', 'svd', 'cholesky', 'lsqr', 'sag']
    },
    "Lasso": {
        'alpha': [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0],
        'selection': ['cyclic', 'random']
    },
    "ElasticNet": {
        'alpha': [0.001, 0.01, 0.1, 1.0, 10.0],
        'l1_ratio': [0.1, 0.3, 0.5, 0.7, 0.9],
        'selection': ['cyclic', 'random']
    },
    "SVR": {
        'kernel': ['linear', 'rbf', 'poly'],
        'C': [0.1, 1.0, 10.0, 100.0],
        'epsilon': [0.01, 0.1, 0.2, 0.5],
        'gamma': ['scale', 'auto']
    },
    "NuSVR": {
        'nu': [0.25, 0.5, 0.75],
        'C': [0.1, 1.0, 10.0],
        'kernel': ['rbf', 'poly', 'sigmoid']
    },
    "MLPRegressor": {
        'hidden_layer_sizes': [(50,), (100,), (50, 50), (100, 50)],
        'activation': ['relu', 'tanh'],
        'solver': ['adam', 'lbfgs'], 
        'alpha': [0.0001, 0.001, 0.01],
        'max_iter': [500, 1000]
    },
    "RandomForestRegressor": {
        'n_estimators': [100, 200],
        'max_depth': [10, 20, None],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'max_features': ['sqrt', 'log2', None] 
    },
    "GradientBoostingRegressor": {
        'n_estimators': [100, 200, 300],
        'learning_rate': [0.01, 0.05, 0.1, 0.2],
        'max_depth': [3, 5, 7],
        'loss': ['squared_error', 'absolute_error', 'huber', 'quantile']
    },
    "ExtraTreesRegressor": {
        'n_estimators': [100, 200, 300],
        'max_depth': [10, 20, None],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
    },
    "HistGradientBoostingRegressor": {
        'learning_rate': [0.01, 0.05, 0.1],
        'max_depth': [5, 10, 15, None],
        'max_iter': [100, 200, 300],
        "l2_regularization": [0.0, 1.0, 5.0],
        'loss': ['squared_error', 'absolute_error']
    },
    "AdaBoostRegressor": {
        'n_estimators': [50, 100, 200],
        'learning_rate': [0.01, 0.1, 0.5, 1.0],
        'loss': ['linear', 'square', 'exponential'] 
    },
    "KNeighborsRegressor": {
        'n_neighbors': [3, 5, 7, 9, 11, 15],
        'weights': ['uniform', 'distance'],
        'algorithm': ['auto', 'ball_tree', 'kd_tree'],
        'leaf_size': [20, 30, 40],
        'p': [1, 2]
    },
    "DecisionTreeRegressor": {
        'max_depth': [5, 10, 20, 30, None],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'criterion': ['squared_error', 'friedman_mse', 'absolute_error'], 
        'max_features': ['sqrt', 'log2', None]
    },
    "HuberRegressor": {
        'epsilon': [1.1, 1.35, 1.5, 1.75],
        'alpha': [0.0001, 0.001, 0.01, 0.1]
    },
    "LinearSVR": {
        'C': [0.1, 1.0, 10.0, 100.0],
        'epsilon': [0.01, 0.1, 0.2],
        'loss': ['epsilon_insensitive', 'squared_epsilon_insensitive'],
        'dual': [True, False],
        'max_iter': [1000, 5000]
    },
    "ARDRegressor": {
        'n_iter': [100, 300, 500],
        'alpha_1': [1e-6, 1e-4, 1e-2],
        'lambda_1': [1e-6, 1e-4, 1e-2],
        'tol': [1e-4, 1e-3, 1e-2]
    },
    "LassoCV": {
        'eps': [1e-4, 1e-3],
        'n_alphas': [50, 100, 200],
        'max_iter': [1000, 2000, 3000],
        'selection': ['cyclic', 'random']
    }
}
print("Гіперпараметри підготовлені для моделей регресії:")
for model_name in param_grids.keys():
    print(f" {model_name}  - {len(param_grids[model_name])} параметрів для перебору визначено")

Гіперпараметри підготовлені для моделей регресії:
  - 2: параметрів для перебору визначено
  - 2: параметрів для перебору визначено
  - 3: параметрів для перебору визначено
  - 4: параметрів для перебору визначено
  - 3: параметрів для перебору визначено
  - 5: параметрів для перебору визначено
  - 5: параметрів для перебору визначено
  - 4: параметрів для перебору визначено
  - 4: параметрів для перебору визначено
  - 5: параметрів для перебору визначено
  - 3: параметрів для перебору визначено
  - 5: параметрів для перебору визначено
  - 5: параметрів для перебору визначено
  - 2: параметрів для перебору визначено
  - 5: параметрів для перебору визначено
  - 4: параметрів для перебору визначено
  - 4: параметрів для перебору визначено


In [19]:
top_10_regressors = sorted_results[:10]
CV = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
feature_names = X_train.columns
feature_importance_dict = {}
tuned_results = {}
skip_model = ['SVR', 'NuSVR']
print("Початок підбору гіперпараметрів для Top-10 регресорів")
print("=" * 80)

for name, _ in top_10_regressors:
    if name in skip_model:
        continue
    print(f"\n{'='*80}")
    print(f"Обробка моделі: {name}")
    print(f"{'='*80}")
    
    RegressorClass = dict(classifiers)[name]
    base_model = RegressorClass()
    
    if name in param_grids:
        param_grid = param_grids[name]
        print(f"Підбір гіперпараметрів: {len(param_grid)} параметрів")
        
        try:
            grid_search = GridSearchCV(
                base_model, 
                param_grid, 
                cv=CV, 
                scoring='neg_mean_absolute_percentage_error', 
                n_jobs=-1,
                verbose=1
            )
            grid_search.fit(X_train, y_train)
            best_model = grid_search.best_estimator_
            best_params = grid_search.best_params_
            best_idx = grid_search.best_index_
            mean_score = -grid_search.cv_results_['mean_test_score'][best_idx]
            std_score = grid_search.cv_results_['std_test_score'][best_idx]
        except Exception as e:
            print(f"Помилка GridSearch: {e}")
            print("Використовується модель за замовчуванням")
            best_model = base_model
            best_model.fit(X_train, y_train)
            best_params = "Параметри за замовчуванням"
            scores = cross_val_score(best_model, X_train, y_train, cv=CV, scoring='neg_mean_absolute_percentage_error', n_jobs=-1)
            mean_score = -scores.mean()
            std_score = scores.std()
    else:
        print(f"Гіперпараметри не визначені, використовується модель за замовчуванням")
        best_model = base_model
        best_model.fit(X_train, y_train)
        best_params = "Параметри за замовчуванням"
        scores = cross_val_score(best_model, X_train, y_train, cv=CV, scoring='neg_mean_absolute_percentage_error', n_jobs=-1)
        mean_score = -scores.mean()
        std_score = scores.std()
    
    y_train_pred = best_model.predict(X_train)
    y_test_pred = best_model.predict(X_test)
    
    train_mape = mean_absolute_percentage_error(y_train, y_train_pred)
    test_mape = mean_absolute_percentage_error(y_test, y_test_pred)
    test_r2 = r2_score(y_test, y_test_pred)
    train_r2 = r2_score(y_train, y_train_pred)
    
    tuned_results[name] = {
        'best_params': best_params,
        'cv_mean_mape': mean_score,
        'cv_std_mape': std_score,
        'train_mape': train_mape,
        'test_mape': test_mape,
        'test_r2': test_r2,
        'train_r2': train_r2,
        'model': best_model
    }
    
    print(f"\n{'─'*80}")
    print(f"Результати для {name}:")
    print(f"{'─'*80}")
    print(f"Найкращі параметри: {best_params}")
    print(f"CV MAPE: {mean_score:.2%} (± {std_score:.2%})")
    print(f"Train MAPE: {train_mape:.2%}")
    print(f"Test MAPE:  {test_mape:.2%}")
    print(f"Test R2 Score: {test_r2:.4f}")
    print(f"Train R2 Score: {train_r2:.4f}")
    
    y_pred_full = best_model.predict(X)
    df_2[f'{name}_prediction'] = y_pred_full

    print(f"\n{'─'*80}")
    print(f"Важливість ознак для {name}:")
    print(f"{'─'*80}")
    
    if hasattr(best_model, 'feature_importances_'):
        importances = best_model.feature_importances_
        feature_importance_dict[name] = importances
        sort_idx = np.argsort(importances)[::-1]
        print("Топ-10 найважливіших ознак:")
        for i, idx in enumerate(sort_idx[:10], 1):
            print(f"{i:2d}. {feature_names[idx]:30s}: {importances[idx]:.4f}")
    else:
        print("Обчислення Permutation Importance...")
        perm_importances = permutation_importance(
            best_model, X_test, y_test, 
            scoring='neg_mean_absolute_percentage_error',
            n_repeats=5, 
            random_state=42,
            n_jobs=-1
        )
        importances = perm_importances.importances_mean
        feature_importance_dict[name] = importances
        sort_idx = np.argsort(importances)[::-1]
        print("Топ-10 найважливіших ознак:")
        for i, idx in enumerate(sort_idx[:10], 1):
            print(f"{i:2d}. {feature_names[idx]:30s}: {importances[idx]:.4f}")

print("\n" + "="*80)
print("Підбір гіперпараметрів завершено!")
print("="*80)

Початок підбору гіперпараметрів для Top-10 регресорів

Обробка моделі: GradientBoostingRegressor
Підбір гіперпараметрів: 4 параметрів
Fitting 5 folds for each of 144 candidates, totalling 720 fits

────────────────────────────────────────────────────────────────────────────────
Результати для GradientBoostingRegressor:
────────────────────────────────────────────────────────────────────────────────
Найкращі параметри: {'learning_rate': 0.1, 'loss': 'huber', 'max_depth': 3, 'n_estimators': 100}
CV MAPE: 30.70% (± 0.96%)
Train MAPE: 29.40%
Test MAPE:  30.31%
Test R2 Score: 0.7764

────────────────────────────────────────────────────────────────────────────────
Важливість ознак для GradientBoostingRegressor:
────────────────────────────────────────────────────────────────────────────────
Топ-10 найважливіших ознак:
 1. Stress Level (1-10)           : 0.6741
 2. Sleep Hours                   : 0.1844
 3. Therapy Sessions (per month)  : 0.0868
 4. Caffeine Intake (mg/day)      : 0.0364
 5. 

In [18]:
def suggest_params(trial, model_name):
    if model_name == "SVR":
        kernel = trial.suggest_categorical('kernel', ['rbf', 'poly', 'sigmoid'])
        C = trial.suggest_loguniform('C', 0.1, 100.0) 
        epsilon = trial.suggest_loguniform('epsilon', 0.001, 0.5)
        
        params = {'C': C, 'epsilon': epsilon, 'kernel': kernel}
        
        if kernel == 'poly':
            params['degree'] = trial.suggest_int('degree', 2, 5)
            
        return params
        
    elif model_name in ["RandomForestRegressor", "ExtraTreesRegressor"]:
        n_estimators = trial.suggest_int('n_estimators', 100, 300)
        max_depth = trial.suggest_categorical('max_depth', [10, 20, 30, None])
        min_samples_split = trial.suggest_int('min_samples_split', 2, 10)
        min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 4)
        max_features = trial.suggest_categorical('max_features', ['sqrt', 'log2', None])
        
        return {
            'n_estimators': n_estimators,
            'max_depth': max_depth,
            'min_samples_split': min_samples_split,
            'min_samples_leaf': min_samples_leaf,
            'max_features': max_features
        }
        
    elif model_name == "GradientBoostingRegressor":
        n_estimators = trial.suggest_int('n_estimators', 100, 300)
        learning_rate = trial.suggest_loguniform('learning_rate', 0.01, 0.2)
        max_depth = trial.suggest_int('max_depth', 3, 7)
        min_samples_split = trial.suggest_int('min_samples_split', 2, 10)
        min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 4)
        loss = trial.suggest_categorical('loss', ['squared_error', 'absolute_error', 'huber'])
        
        return {
            'n_estimators': n_estimators,
            'learning_rate': learning_rate,
            'max_depth': max_depth,
            'min_samples_split': min_samples_split,
            'min_samples_leaf': min_samples_leaf,
            'loss': loss
        }
        
    elif model_name == "HistGradientBoostingRegressor":
        learning_rate = trial.suggest_loguniform('learning_rate', 0.01, 0.1)
        max_depth = trial.suggest_categorical('max_depth', [5, 10, 15, None])
        max_iter = trial.suggest_int('max_iter', 100, 300)
        l2_regularization = trial.suggest_loguniform('l2_regularization', 0.001, 5.0)
        
        return {
            'learning_rate': learning_rate,
            'max_depth': max_depth,
            'max_iter': max_iter,
            'l2_regularization': l2_regularization
        }
    elif model_name == "HuberRegressor":
        epsilon = trial.suggest_categorical('epsilon', [1.1, 1.35, 1.5, 1.75])
        alpha = trial.suggest_loguniform('alpha', 0.0001, 0.1) 
        
        return {
            'epsilon': epsilon,
            'alpha': alpha
        }
    
    elif model_name == "LinearSVR":
        C = trial.suggest_loguniform('C', 0.1, 100.0) 
        epsilon = trial.suggest_categorical('epsilon', [0.01, 0.1, 0.2])
        loss = trial.suggest_categorical('loss', ['epsilon_insensitive', 'squared_epsilon_insensitive'])
        dual = trial.suggest_categorical('dual', [True, False])
        max_iter = trial.suggest_int('max_iter', 1000, 5000)
        
        return {
            'C': C,
            'epsilon': epsilon,
            'loss': loss,
            'dual': dual,
            'max_iter': max_iter
        }
        
    elif model_name == "ARDRegressor":
        n_iter = trial.suggest_int('n_iter', 100, 500)
        alpha_1 = trial.suggest_loguniform('alpha_1', 1e-6, 1e-2)
        lambda_1 = trial.suggest_loguniform('lambda_1', 1e-6, 1e-2)
        tol = trial.suggest_categorical('tol', [1e-4, 1e-3, 1e-2])
        
        return {
            'n_iter': n_iter,
            'alpha_1': alpha_1,
            'lambda_1': lambda_1,
            'tol': tol
        }
        
    elif model_name == "LassoCV":
        eps = trial.suggest_categorical('eps', [1e-4, 1e-3])
        n_alphas = trial.suggest_int('n_alphas', 50, 200)
        max_iter = trial.suggest_int('max_iter', 1000, 3000)
        selection = trial.suggest_categorical('selection', ['cyclic', 'random'])
        
        return {
            'eps': eps,
            'n_alphas': n_alphas,
            'max_iter': max_iter,
            'selection': selection
        }
        
    elif model_name == "NuSVR":
        nu = trial.suggest_uniform('nu', 0.25, 0.75)
        C = trial.suggest_loguniform('C', 0.1, 10.0)
        kernel = trial.suggest_categorical('kernel', ['rbf', 'poly', 'sigmoid'])
        
        return {
            'nu': nu,
            'C': C,
            'kernel': kernel
        }
    return {} 
    
regressors_dict = dict(all_estimators(type_filter='regressor'))

def objective(trial, model_name, X_train, y_train, CV, regressors_dict):
    try:
        params = suggest_params(trial, model_name)
    except ValueError:
        return 0.0
    
    if model_name not in regressors_dict:
        return 0.0

    RegressorClass = regressors_dict[model_name]
    try:
        model = RegressorClass(**params, random_state=42)
    except (ValueError, TypeError):
        return 0.0

    try:
        scores = cross_val_score(model, X_train, y_train, cv=CV, scoring='neg_mean_absolute_percentage_error', n_jobs=-1)
        mean_score = -scores.mean()
        if np.isnan(mean_score):
            return 0.0
        return mean_score
        
    except:
        return 0.0
    
top_10_regressors_names = [name for name, _ in sorted_results[:10]] 
tuned_results_optuna = {}
N_TRIALS = 50 

print("Початок підбору гіперпараметрів для регресорів за допомогою Optuna (50 спроб)")
print("=" * 80)

for name in top_10_regressors_names:
    print(f"\n{'='*80}")
    print(f"Обробка моделі: {name}")
    
    study = optuna.create_study(
        direction="minimize", 
        sampler=TPESampler(seed=42),
    )
    
    try:
        study.optimize(
            lambda trial: objective(trial, name, X_train, y_train, CV, regressors_dict),
            n_trials=N_TRIALS, 
            show_progress_bar=False,
            timeout=1800 
        )
    except Exception as e:
        print(f"Помилка в Optuna Study: {e}")
        continue
        
    try:
        best_params = study.best_params
        mean_score = study.best_value 
        if mean_score == 0.0:
             raise ValueError("Усі спроби повернули 0.0. Тюнінг, ймовірно, провалився.")
    except ValueError as e:
        print(f"Помилка при отриманні найкращого результату для {name}: {e}")
        print("Пропускаємо цю модель.")
        continue 
        
    try:
        RegressorClass = regressors_dict[name]
        best_model_for_cv = RegressorClass(**best_params, random_state=42)
        cv_scores_final = cross_val_score(best_model_for_cv, X_train, y_train, cv=CV, scoring='neg_mean_absolute_percentage_error', n_jobs=-1)
        cv_std_score = cv_scores_final.std()
        best_model = RegressorClass(**best_params, random_state=42)
        best_model.fit(X_train, y_train)

        y_train_pred = best_model.predict(X_train)
        y_test_pred = best_model.predict(X_test)
        
        test_r2 = r2_score(y_test, y_test_pred)
        train_r2 = r2_score(y_train, y_train_pred)
        train_mape = mean_absolute_percentage_error(y_train, y_train_pred)
        test_mape = mean_absolute_percentage_error(y_test, y_test_pred)

        tuned_results_optuna[name] = {
            'best_params': best_params,
            'cv_mean_mape': mean_score,
            'cv_std_mape': cv_std_score,
            'train_mape': train_mape,
            'test_mape': test_mape,
            'test_r2': test_r2,
            'train_r2': train_r2,
            'model': best_model
        }        
        
        print(f"\n{'─'*80}")
        print(f"Результати Optuna для {name}:")
        print(f"{'─'*80}")
        print(f"Найкращі параметри: {best_params}")
        print(f"CV MAPE: {mean_score:.2%} (± {cv_std_score:.2%})")
        print(f"Train MAPE: {train_mape:.2%}")
        print(f"Test MAPE:  {test_mape:.2%}")
        print(f"Test R2 Score: {test_r2:.4f}")
        print(f"Train R2 Score: {train_r2:.4f}")

    except Exception as e:
        print(f"Помилка фінального навчання моделі {name}: {e}")
            
print("\n" + "="*80)
print("Optuna тюнінг завершено!")
print("="*80)

Початок підбору гіперпараметрів для регресорів за допомогою Optuna (50 спроб)

Обробка моделі: GradientBoostingRegressor

────────────────────────────────────────────────────────────────────────────────
Результати Optuna для GradientBoostingRegressor:
────────────────────────────────────────────────────────────────────────────────
Найкращі параметри: {'n_estimators': 161, 'learning_rate': 0.042793616429287244, 'max_depth': 4, 'min_samples_split': 9, 'min_samples_leaf': 1, 'loss': 'huber'}
CV MAPE: 30.68% (± 0.98%)
Train MAPE: 28.84%
Test MAPE:  30.26%
Test R2 Score: 0.7807

Обробка моделі: HistGradientBoostingRegressor

────────────────────────────────────────────────────────────────────────────────
Результати Optuna для HistGradientBoostingRegressor:
────────────────────────────────────────────────────────────────────────────────
Найкращі параметри: {'learning_rate': 0.05921785288430964, 'max_depth': 15, 'max_iter': 100, 'l2_regularization': 0.008570395046480528}
CV MAPE: 30.92% (± 0.

In [None]:
tuned_results_voting_reg = {}

def objective_voting_reg(trial, X_train, y_train, CV, top_models):
    weights = []
    for name, _ in top_models:

        weights.append(trial.suggest_float(f'weight_{name}', 0.1, 3.0))
    
    voting_model_tuned = VotingRegressor(
        estimators=top_models,
        weights=weights
    )

    try:
        scores = cross_val_score(
            voting_model_tuned, X_train, y_train, 
            cv=CV, 
            scoring='neg_mean_absolute_percentage_error', 
            n_jobs=-1
        )
        return -scores.mean() 
        
    except Exception as e:
        print(f"Помилка при оцінці моделі: {e}")
        return np.nan

N_TRIALS = 50 

print("\n" + "="*80)
print(f"ПОЧАТОК ТЮНІНГУ: VotingRegressor з Optuna ({N_TRIALS} спроб)")
print("="*80)

voting_study = optuna.create_study(direction="minimize", sampler=optuna.samplers.TPESampler(seed=42))

voting_study.optimize(
    lambda trial: objective_voting_reg(trial, X_train, y_train, CV, top_models),
    n_trials=N_TRIALS, 
    show_progress_bar=False,
    timeout=1800
)

best_voting_params = voting_study.best_params
best_voting_weights = [best_voting_params[f'weight_{name}'] for name, _ in top_models]

best_voting_reg_model = VotingRegressor(
    estimators=top_models,
    weights=best_voting_weights
)
final_scores = cross_val_score(
    best_voting_reg_model, X_train, y_train, 
    cv=CV, 
    scoring='neg_mean_absolute_percentage_error', 
    n_jobs=-1
)
final_scores_mape = -final_scores
std_score = final_scores_mape.std()

best_voting_reg_model.fit(X_train, y_train)

y_train_pred_vote = best_voting_reg_model.predict(X_train)
y_test_pred_vote = best_voting_reg_model.predict(X_test)

test_mape = mean_absolute_percentage_error(y_test, y_test_pred_vote)
train_mape = mean_absolute_percentage_error(y_train, y_train_pred_vote)
test_r2 = r2_score(y_test, y_test_pred_vote)
train_r2 = r2_score(y_train, y_train_pred_vote)

tuned_results_voting_reg['VotingRegressor'] = {
    'best_params': best_voting_params,
    'cv_mean_mape': voting_study.best_value,
    'cv_std_mape': std_score,
    'test_mape': test_mape,
    'train_mape': train_mape,
    'test_r2': test_r2,
    'train_r2': train_r2,
    'model': best_voting_reg_model
}

print(f"\n{'─'*80}")
print("РЕЗУЛЬТАТИ ТЮНІНГУ VotingRegressor:")
print(f"{'─'*80}")
print(f"Найкращі ваги: {best_voting_params}")
print(f"CV MAPE: {voting_study.best_value:.4f} (± {std_score:.2%})")
print(f"Test MAPE: {test_mape:.4f}")
print(f"Train MAPE: {train_mape:.2%}")
print(f"Test R2 Score: {test_r2:.4f}")
print(f"Train R2 Score: {train_r2:.4f}")

In [None]:
tuned_results_stacking_reg = {}

def objective_stacking_reg(trial, X_train, y_train, CV, top_models):
    alpha = trial.suggest_float('final_ridge_alpha', 0.01, 10.0, log=True)
    
    stacking_model_tuned = StackingRegressor(
        estimators=top_models,
        final_estimator=Ridge(alpha=alpha),
        cv=CV,
        n_jobs=-1
    )

    try:
        scores = cross_val_score(
            stacking_model_tuned, X_train, y_train, 
            cv=CV, 
            scoring='neg_mean_absolute_percentage_error', 
            n_jobs=-1
        )
        return -scores.mean() 
        
    except Exception as e:
        print(f"Помилка при оцінці стекингу: {e}")
        return np.nan

N_TRIALS = 50

print("\n" + "="*80)
print(f"ПОЧАТОК ТЮНІНГУ: StackingRegressor з Optuna ({N_TRIALS} спроб)")
print("="*80)

stack_study = optuna.create_study(direction="minimize", sampler=optuna.samplers.TPESampler(seed=42))

stack_study.optimize(
    lambda trial: objective_stacking_reg(trial, X_train, y_train, CV, top_models),
    n_trials=N_TRIALS,
    show_progress_bar=False,
    timeout=1800
)

best_alpha = stack_study.best_params['final_ridge_alpha']
final_est = Ridge(alpha=best_alpha)

best_stacking_reg_model = StackingRegressor(
    estimators=top_models,
    final_estimator=final_est,
    cv=CV,
    n_jobs=-1
)

final_stack_scores = cross_val_score(
    best_stacking_reg_model, X_train, y_train, 
    cv=CV, 
    scoring='neg_mean_absolute_percentage_error', 
    n_jobs=-1
)
final_stack_mape_scores = -final_stack_scores
std_score_stack = final_stack_mape_scores.std()

best_stacking_reg_model.fit(X_train, y_train)

y_train_pred_stack = best_stacking_reg_model.predict(X_train)
y_test_pred_stack = best_stacking_reg_model.predict(X_test)

test_mape_stack = mean_absolute_percentage_error(y_test, y_test_pred_stack)
train_mape_stack = mean_absolute_percentage_error(y_train, y_train_pred_stack)
test_r2_stack = r2_score(y_test, y_test_pred_stack)
train_r2_stack = r2_score(y_train, y_train_pred_stack)

tuned_results_stacking_reg['StackingRegressor'] = {
    'best_params': stack_study.best_params,
    'cv_mean_mape': stack_study.best_value,
    'cv_std_mape': std_score_stack,
    'test_mape': test_mape_stack,
    'train_mape': train_mape_stack,
    'test_r2': test_r2_stack,
    'train_r2': train_r2_stack,
    'model': best_stacking_reg_model
}

print(f"\n{'─'*80}")
print("РЕЗУЛЬТАТИ ТЮНІНГУ StackingRegressor:")
print(f"{'─'*80}")
print(f"Найкращі параметри мета-моделі: {stack_study.best_params}")
print(f"CV MAPE: {stack_study.best_value:.4f} (± {std_score_stack:.2%})")
print(f"Test MAPE: {test_mape_stack:.44f}")
print(f"Train MAPE: {train_mape_stack:.2%}")
print(f"Test R2 Score: {test_r2_stack:.4f}")
print(f"Train R2 Score: {train_r2_stack:.4f}")

In [22]:
results_comparison = pd.DataFrame({
    'Model': list(tuned_results.keys()),
    'CV mape': [tuned_results[m]['cv_mean_mape'] for m in tuned_results],
    'CV Std': [tuned_results[m]['cv_std_mape'] for m in tuned_results],
    'Train MAPE': [tuned_results[m]['train_mape'] for m in tuned_results],
    'Test MAPE': [tuned_results[m]['test_mape'] for m in tuned_results],
    'Test R2': [tuned_results[m]['test_r2'] for m in tuned_results],
    'Train R2': [tuned_results[m]['train_r2'] for m in tuned_results],
    'Overfit (Train-Test)': [tuned_results[m]['train_mape'] - tuned_results[m]['test_mape'] 
                                  for m in tuned_results]
})

results_comparison_optuna = pd.DataFrame({
    'Model': list(tuned_results_optuna.keys()),
    'CV mape': [tuned_results_optuna[m]['cv_mean_mape'] for m in tuned_results_optuna],
    'CV Std': [tuned_results_optuna[m]['cv_std_mape'] for m in tuned_results_optuna],
    'Train MAPE': [tuned_results_optuna[m]['train_mape'] for m in tuned_results_optuna],
    'Test MAPE': [tuned_results_optuna[m]['test_mape'] for m in tuned_results_optuna],
    'Test R2': [tuned_results_optuna[m]['test_r2'] for m in tuned_results_optuna],
    'Train R2': [tuned_results_optuna[m]['train_r2'] for m in tuned_results_optuna],
    'Overfit (Train-Test)': [tuned_results_optuna[m]['train_mape'] - tuned_results_optuna[m]['test_mape'] 
                              for m in tuned_results_optuna]
})

results_comparison_voting = pd.DataFrame({
    'Model': 'VotingRegressor',
    'CV mape': tuned_results_voting_reg['VotingRegressor']['cv_mean_mape'],
    'CV Std': tuned_results_voting_reg['VotingRegressor']['cv_std_mape'],
    'Train MAPE': tuned_results_voting_reg['VotingRegressor']['train_mape'],
    'Test MAPE': tuned_results_voting_reg['VotingRegressor']['test_mape'],
    'Test R2': tuned_results_voting_reg['VotingRegressor']['test_r2'],
    'Train R2': tuned_results_voting_reg['VotingRegressor']['train_r2'],
    'Overfit (Train-Test)': tuned_results_voting_reg['VotingRegressor']['train_mape'] - tuned_results_voting_reg['VotingRegressor']['test_mape']
})

results_comparison_stacking = pd.DataFrame({
    'Model': 'StackingRegressor',
    'CV mape': tuned_results_stacking_reg['StackingRegressor']['cv_mean_mape'],
    'CV Std': tuned_results_stacking_reg['StackingRegressor']['cv_std_mape'],
    'Train MAPE': tuned_results_stacking_reg['StackingRegressor']['train_mape'],
    'Test MAPE': tuned_results_stacking_reg['StackingRegressor']['test_mape'],
    'Test R2': tuned_results_stacking_reg['StackingRegressor']['test_r2'],
    'Train R2': tuned_results_stacking_reg['StackingRegressor']['train_r2'],
    'Ovefit (Train-Test)': tuned_results_stacking_reg['StackingRegressor']['train_mape'] - tuned_results_stacking_reg['StackingRegressor']['test_mape']
})
results_comparison = results_comparison.sort_values('Test MAPE', ascending=False)
results_comparison_optuna = results_comparison_optuna.sort_values('Test MAPE', ascending=False)

print("\n" + "="*80)
print("ПІДСУМКОВЕ ПОРІВНЯННЯ МОДЕЛЕЙ")
print("="*80)
print("GRIDSEARCH")
print(results_comparison.to_string(index=False))
print("\nOPTUNA")
print(results_comparison_optuna.to_string(index=False))
print("\nVOTING REGRESSOR")
print(results_comparison_voting.to_string(index=False))
print("\nSTACKING REGRESSOR")
print(results_comparison_stacking.to_string(index=False))

print("\nПобудова графіків для GridSearch...")
fig, axes = plt.subplots(2, 2, figsize=(16, 12))

models = results_comparison['Model']
x = np.arange(len(models))
width = 0.35

axes[0, 0].bar(x - width/2, results_comparison['Train MAPE'], width, label='Train MAPE', alpha=0.8, color='navy')
axes[0, 0].bar(x + width/2, results_comparison['Test MAPE'], width, label='Test MAPE', alpha=0.8, color='orange')
axes[0, 0].set_ylabel('MAPE (Error Rate)')
axes[0, 0].set_title('MAPE: Train vs Test (Lower is better)')
axes[0, 0].set_xticks(x)
axes[0, 0].set_xticklabels(models, rotation=45, ha='right')
axes[0, 0].legend()
axes[0, 0].grid(True, alpha=0.3)

axes[0, 1].barh(models, results_comparison['Test R2'], alpha=0.8, color='seagreen')
axes[0, 1].set_xlabel('R2 Score')
axes[0, 1].set_title('R2 Score на тестовій вибірці (Higher is better)')
axes[0, 1].grid(True, alpha=0.3)

diff_mape = results_comparison['Overfit (Train-Test)'].abs()
colors = ['red' if x > 0.05 else 'green' for x in diff_mape]
axes[1, 0].barh(models, diff_mape, alpha=0.8, color=colors)
axes[1, 0].set_xlabel('Abs Delta MAPE (Train-Test)')
axes[1, 0].set_title('Оверфітинг: Різниця MAPE (Червоний > 5%)')
axes[1, 0].axvline(x=0.05, color='black', linestyle='--', linewidth=1)
axes[1, 0].grid(True, alpha=0.3)

axes[1, 1].barh(models, results_comparison['CV mape'], 
                xerr=results_comparison['CV Std'], 
                alpha=0.8, color='skyblue', capsize=5)
axes[1, 1].set_xlabel('Mean CV MAPE')
axes[1, 1].set_title('Крос-валідація: Середній MAPE (± std)')
axes[1, 1].grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('regression_results_gridsearch.png', dpi=300)
plt.show()

print("\nПобудова графіків для Optuna...")
fig, axes = plt.subplots(2, 2, figsize=(16, 12))

models_opt = results_comparison_optuna['Model']
x_opt = np.arange(len(models_opt))

axes[0, 0].bar(x_opt - width/2, results_comparison_optuna['Train MAPE'], width, label='Train MAPE', alpha=0.8, color='navy')
axes[0, 0].bar(x_opt + width/2, results_comparison_optuna['Test MAPE'], width, label='Test MAPE', alpha=0.8, color='orange')
axes[0, 0].set_ylabel('MAPE')
axes[0, 0].set_title('Optuna: Train vs Test MAPE')
axes[0, 0].set_xticks(x_opt)
axes[0, 0].set_xticklabels(models_opt, rotation=45, ha='right')
axes[0, 0].legend()
axes[0, 0].grid(True, alpha=0.3)

axes[0, 1].barh(models_opt, results_comparison_optuna['Test R2'], alpha=0.8, color='seagreen')
axes[0, 1].set_xlabel('R2 Score')
axes[0, 1].set_title('Optuna: R2 Score на тесті')
axes[0, 1].grid(True, alpha=0.3)

diff_mape_opt = results_comparison_optuna['Overfit (Train-Test)'].abs()
colors_opt = ['red' if x > 0.05 else 'green' for x in diff_mape_opt]
axes[1, 0].barh(models_opt, diff_mape_opt, alpha=0.8, color=colors_opt)
axes[1, 0].set_xlabel('Abs Delta MAPE')
axes[1, 0].set_title('Optuna: Оверфітинг (Червоний > 5%)')
axes[1, 0].axvline(x=0.05, color='black', linestyle='--', linewidth=1)
axes[1, 0].grid(True, alpha=0.3)

axes[1, 1].barh(models_opt, results_comparison_optuna['CV mape'], 
                xerr=results_comparison_optuna['CV Std'], 
                alpha=0.8, color='skyblue', capsize=5)
axes[1, 1].set_xlabel('Mean CV MAPE')
axes[1, 1].set_title('Optuna: Крос-валідація MAPE (± std)')
axes[1, 1].grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('regression_results_optuna.png', dpi=300)
plt.show()


ПІДСУМКОВЕ ПОРІВНЯННЯ МОДЕЛЕЙ
GRIDSEARCH
                        Model  CV mape   CV Std  Train MAPE  Test MAPE  Test R2  Overfit (Train-Test)
            TheilSenRegressor 0.322552 0.008957    0.321301   0.317336 0.726144              0.003965
                 SGDRegressor 0.322433 0.010456    0.321241   0.316478 0.725095              0.004763
               HuberRegressor 0.320025 0.009621    0.318970   0.315877 0.725660              0.003093
                    LinearSVR 0.318895 0.009176    0.317717   0.314736 0.725594              0.002981
HistGradientBoostingRegressor 0.308987 0.009415    0.238407   0.308863 0.771120             -0.070456
          ExtraTreesRegressor 0.310868 0.009791    0.243914   0.308096 0.783619             -0.064182
        RandomForestRegressor 0.311432 0.009616    0.243799   0.306804 0.780334             -0.063005
    GradientBoostingRegressor 0.307013 0.009609    0.293983   0.303086 0.776388             -0.009103

OPTUNA
                        Model  C

In [None]:
final_combined_results = pd.concat([
        results_comparison_optuna,
        results_comparison_voting,
        results_comparison_stacking
    ], ignore_index=True)
final_combined_results = final_combined_results.sort_values('Test MAPE', ascending=False).round(4).reset_index(drop=True)

print("\n" + "="*80)
print("ОБ'ЄДНАНІ РЕЗУЛЬТАТИ: Індивідуальні Optuna та Ансамблеві моделі")
print("="*80)
print(final_combined_results.to_string(index=False))

In [None]:
plt.figure(figsize=(12, 10))

sort_idx = np.argsort(y_test)
y_test_sorted = y_test.iloc[sort_idx] if hasattr(y_test, 'iloc') else y_test[sort_idx]

print("Побудова графіків регресії для моделей...")
print("="*80)

for name in tuned_results.keys():
    try:
        model = tuned_results[name]['model']
        y_pred = model.predict(X_test)
        y_pred_sorted = y_pred[sort_idx]

        from sklearn.metrics import mean_absolute_percentage_error
        mape = mean_absolute_percentage_error(y_test, y_pred)

        plt.plot(range(len(y_test_sorted)), y_pred_sorted, 
                 label=f"{name} (MAPE = {mape:.3%})", 
                 linewidth=2, alpha=0.7)
        
        print(f"  {name}: MAPE = {mape:.4f}")
        
    except Exception as e:
        print(f"  {name}: Помилка - {e}")
        continue

plt.plot(range(len(y_test_sorted)), y_test_sorted, 
         color='black', linestyle='--', linewidth=3, label='Actual Values', alpha=0.8)

plt.xlabel('Індекс спостереження (відсортований за зростанням цілі)', fontsize=12)
plt.ylabel('Цільова змінна (Value)', fontsize=12)
plt.title('Порівняння прогнозів регресорів проти фактичних значень', fontsize=14)
plt.legend(loc='upper left', fontsize=9)
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.savefig('regression_prediction_comparison.png', dpi=300)
plt.show()

print("="*80)
print("Графіки регресії побудовано!")

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, mean_absolute_percentage_error
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd

best_model_name = results_comparison_optuna.iloc[0]['Model']
best_model_obj = tuned_results_optuna[best_model_name]['model']

print("="*80)
print(f"ДЕТАЛЬНИЙ АНАЛІЗ НАЙКРАЩОЇ РЕГРЕСІЙНОЇ МОДЕЛІ: {best_model_name}")
print("="*80)

y_train_pred = best_model_obj.predict(X_train)
y_test_pred = best_model_obj.predict(X_test)

def print_regression_report(y_true, y_pred, title):
    mae = mean_absolute_error(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_true, y_pred)
    mape = mean_absolute_percentage_error(y_true, y_pred)
    
    print(f"\nЗвіт по метриках: {title}")
    print("-" * 40)
    print(f"R2 Score: {r2:.4f}")
    print(f"MAE:      {mae:.4f}")
    print(f"RMSE:     {rmse:.4f}")
    print(f"MAPE:     {mape:.2%}")

print_regression_report(y_train, y_train_pred, "Тренувальна вибірка")
print_regression_report(y_test, y_test_pred, "Тестова вибірка")

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))

ax1.scatter(y_test, y_test_pred, alpha=0.5, color='teal')
ax1.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--', lw=2)
ax1.set_xlabel('Фактичні значення')
ax1.set_ylabel('Прогнозовані значення')
ax1.set_title(f'Actual vs Predicted - {best_model_name}')
ax1.grid(True, alpha=0.3)

residuals = y_test - y_test_pred
ax2.scatter(y_test_pred, residuals, alpha=0.5, color='darkorange')
ax2.axhline(y=0, color='black', linestyle='--', lw=2)
ax2.set_xlabel('Прогнозовані значення')
ax2.set_ylabel('Залишки (Errors)')
ax2.set_title(f'Residual Plot - {best_model_name}')
ax2.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('regression_diagnostic_plots.png', dpi=300)
plt.show()

print("\n" + "="*80)
print(f"ВАЖЛИВІСТЬ ОЗНАК ДЛЯ {best_model_name}")
print("="*80)

importances = None
if hasattr(best_model_obj, 'feature_importances_'):
    importances = best_model_obj.feature_importances_
elif hasattr(best_model_obj, 'coef_'):
    importances = np.abs(best_model_obj.coef_)

if importances is not None:
    feature_imp_df = pd.DataFrame({
        'Feature': X_train.columns,
        'Importance': importances
    }).sort_values('Importance', ascending=False)
    
    print(feature_imp_df.head(15).to_string(index=False))
    
    top_n = min(20, len(feature_imp_df))
    plt.figure(figsize=(10, 8))
    sns.barplot(x='Importance', y='Feature', data=feature_imp_df.head(top_n), palette='viridis')
    plt.title(f'Top-{top_n} найважливіших ознак - {best_model_name}')
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.savefig('feature_importance_regression.png', dpi=300)
    plt.show()
else:
    print(f"Модель {best_model_name} не підтримує прямий вивід важливості ознак.")

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

def perform_regression_error_analysis(X, y_true, y_pred, dataset_name="Test", filename="plot.png"):
    print("\n" + "="*80)
    print(f"АНАЛІЗ ВЕЛИКИХ ПОМИЛОК РЕГРЕСІЇ ({dataset_name.upper()})")
    print("="*80)

    abs_errors = np.abs(y_true - y_pred)
    ape = abs_errors / np.where(y_true == 0, 1e-10, y_true) 
    
    errors_df = pd.DataFrame({
        'Index': X.index,
        'True_Value': y_true,
        'Predicted_Value': y_pred,
        'Abs_Error': abs_errors,
        'APE_%': ape * 100
    })
    
    full_errors_df = pd.concat([errors_df, X.reset_index(drop=True)], axis=1)
    
    threshold = errors_df['APE_%'].quantile(0.90) 
    outliers_df = full_errors_df[full_errors_df['APE_%'] >= threshold].sort_values('APE_%', ascending=False)

    print(f"\nЗагальний MAPE для {dataset_name}: {ape.mean():.2%}")
    print(f"Поріг для аналізу (топ-10% помилок): {threshold:.2f}%")
    print(f"\nТОП-10 ОБ'ЄКТІВ З НАЙБІЛЬШОЮ ВІДНОСНОЮ ПОМИЛКОЮ ({dataset_name}):")
    print("-" * 80)
    display(outliers_df.head(10))
    
    fig, axes = plt.subplots(1, 2, figsize=(14, 5))

    ax1 = axes[0]
    ax1.hist(errors_df['APE_%'], bins=30, color='skyblue', edgecolor='black', alpha=0.7)
    ax1.axvline(threshold, color='red', linestyle='--', label=f'90-й перцентиль ({threshold:.1f}%)')
    ax1.set_title(f'Розподіл відсоткової помилки (APE) - {dataset_name}')
    ax1.set_xlabel('Помилка (%)')
    ax1.set_ylabel('Кількість спостережень')
    ax1.legend()
    ax1.grid(True, alpha=0.3)

    ax2 = axes[1]
    ax2.scatter(y_true, abs_errors, alpha=0.5, color='coral')
    ax2.set_title(f'Величина помилки vs Фактичне значення ({dataset_name})')
    ax2.set_xlabel('Фактичне значення (y_true)')
    ax2.set_ylabel('Абсолютна помилка')
    ax2.grid(True, alpha=0.3)
    
    plt.tight_layout()
    plt.savefig(filename, dpi=300, bbox_inches='tight')
    print(f"Графік аналізу помилок збережено як: {filename}")
    plt.show()

perform_regression_error_analysis(X_train, y_train, y_train_pred, dataset_name="Train", filename="errors_train_reg.png")
perform_regression_error_analysis(X_test, y_test, y_test_pred, dataset_name="Test", filename="errors_test_reg.png")