In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import logging
import optuna
import warnings
from dython.nominal import associations
from sklearn.utils import all_estimators
from sklearn.model_selection import cross_val_score, GridSearchCV, StratifiedKFold, train_test_split
from sklearn.metrics import roc_auc_score, roc_curve, auc, accuracy_score, classification_report, precision_score, confusion_matrix, ConfusionMatrixDisplay
from sklearn.preprocessing import StandardScaler, LabelEncoder, label_binarize
from sklearn.inspection import permutation_importance
from sklearn.exceptions import ConvergenceWarning
from sklearn.ensemble import VotingClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from optuna.samplers import TPESampler

# Only let Optuna log messages at WARNING level or above.
optuna.logging.set_verbosity(logging.WARNING)
# Silence every Python warning for the rest of the session.
# NOTE(review): this blanket filter also hides deprecation/runtime warnings from
# pandas and scikit-learn, and it already covers the ConvergenceWarning filter below.
warnings.filterwarnings('ignore')
warnings.simplefilter("ignore", category=ConvergenceWarning)

In [3]:
# Name of the raw 1-10 anxiety score column used as the modelling target.
target_column = 'Anxiety Level (1-10)'

# Alternative input kept for reference: 'cleaned_enhanced_anxiety_dataset.csv'
df = pd.read_csv('final_datasets/enhanced_anxiety_dataset.csv')

# Store the target with object dtype.
# NOTE(review): presumably so the association analysis treats the score as a
# nominal variable rather than a numeric one - confirm.
df[target_column] = df[target_column].astype('object')

print(f"Total rows: {df.shape[0]}")
print(f"Total columns: {df.shape[1]}")
print(f"Target column: {target_column}")

Total rows: 11000
Total columns: 19
Target column: Anxiety Level (1-10)


In [4]:
# Preview the first rows to sanity-check how the CSV was parsed.
df.head()

Unnamed: 0,Age,Gender,Occupation,Sleep Hours,Physical Activity (hrs/week),Caffeine Intake (mg/day),Alcohol Consumption (drinks/week),Smoking,Family History of Anxiety,Stress Level (1-10),Heart Rate (bpm),Breathing Rate (breaths/min),Sweating Level (1-5),Dizziness,Medication,Therapy Sessions (per month),Recent Major Life Event,Diet Quality (1-10),Anxiety Level (1-10)
0,29,Female,Artist,6.0,2.7,181,10,Yes,No,10,114,14,4,No,Yes,3,Yes,7,5.0
1,46,Other,Nurse,6.2,5.7,200,8,Yes,Yes,1,62,23,2,Yes,No,2,No,8,3.0
2,64,Male,Other,5.0,3.7,117,4,No,Yes,1,91,28,3,No,No,1,Yes,1,1.0
3,20,Female,Scientist,5.8,2.8,360,6,Yes,No,4,86,17,3,No,No,0,No,1,2.0
4,49,Female,Other,8.2,2.3,247,4,Yes,No,1,98,19,4,Yes,Yes,1,No,3,1.0


In [5]:
# Column dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11000 entries, 0 to 10999
Data columns (total 19 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   Age                                11000 non-null  int64  
 1   Gender                             11000 non-null  object 
 2   Occupation                         11000 non-null  object 
 3   Sleep Hours                        11000 non-null  float64
 4   Physical Activity (hrs/week)       11000 non-null  float64
 5   Caffeine Intake (mg/day)           11000 non-null  int64  
 6   Alcohol Consumption (drinks/week)  11000 non-null  int64  
 7   Smoking                            11000 non-null  object 
 8   Family History of Anxiety          11000 non-null  object 
 9   Stress Level (1-10)                11000 non-null  int64  
 10  Heart Rate (bpm)                   11000 non-null  int64  
 11  Breathing Rate (breaths/min)       11000 non-null  int

In [6]:
# Number of missing values per column.
df.isnull().sum()

Age                                  0
Gender                               0
Occupation                           0
Sleep Hours                          0
Physical Activity (hrs/week)         0
Caffeine Intake (mg/day)             0
Alcohol Consumption (drinks/week)    0
Smoking                              0
Family History of Anxiety            0
Stress Level (1-10)                  0
Heart Rate (bpm)                     0
Breathing Rate (breaths/min)         0
Sweating Level (1-5)                 0
Dizziness                            0
Medication                           0
Therapy Sessions (per month)         0
Recent Major Life Event              0
Diet Quality (1-10)                  0
Anxiety Level (1-10)                 0
dtype: int64

In [7]:
def unic_columns(data):
    """Print the value frequencies of every object-dtype column in ``data``.

    For each such column a header line, the result of ``value_counts()`` and a
    50-character dashed separator are written to stdout. Columns of any other
    dtype are ignored. Returns ``None``.
    """
    separator = "-" * 50
    for column, values in data.select_dtypes(include='object').items():
        print(f"Унікальні значення в колонці: '{column}' : ")
        print(values.value_counts())
        print(separator)

# Show the category frequencies of every object-typed column of the raw frame.
unic_columns(df)

Унікальні значення в колонці: 'Gender' : 
Gender
Female    3730
Male      3657
Other     3613
Name: count, dtype: int64
--------------------------------------------------
Унікальні значення в колонці: 'Occupation' : 
Occupation
Musician      892
Artist        888
Student       878
Nurse         861
Chef          858
Doctor        842
Other         840
Freelancer    838
Engineer      833
Scientist     832
Athlete       822
Lawyer        809
Teacher       807
Name: count, dtype: int64
--------------------------------------------------
Унікальні значення в колонці: 'Smoking' : 
Smoking
Yes    5779
No     5221
Name: count, dtype: int64
--------------------------------------------------
Унікальні значення в колонці: 'Family History of Anxiety' : 
Family History of Anxiety
Yes    5847
No     5153
Name: count, dtype: int64
--------------------------------------------------
Унікальні значення в колонці: 'Dizziness' : 
Dizziness
Yes    5672
No     5328
Name: count, dtype: int64
----------------

In [None]:
def _plot_lower_triangle_corr_matrix(corr_matrix):
    """Draw the below-diagonal part of ``corr_matrix`` as an annotated heatmap.

    The figure is written to 'corr_matrix.png' (300 dpi) and then shown.
    """
    # Mask the upper triangle *and* the diagonal so each pair appears once.
    mask = np.triu(np.ones_like(corr_matrix, dtype=bool))
    plt.figure(figsize=(12, 10))
    sns.heatmap(corr_matrix, mask=mask, annot=True, fmt='.2f', cmap='coolwarm', square=True)
    plt.title("Нижня трикутна кореляційна матриця")
    plt.tight_layout()
    plt.savefig('corr_matrix.png', dpi=300)
    plt.show()


def _get_highly_correlated_pairs(corr_matrix, threshold):
    """Return ``(col_i, col_j, value)`` for below-diagonal cells with ``|value| >= threshold``.

    The list is ordered by decreasing absolute value.
    """
    cols = corr_matrix.columns
    corr_pairs = []
    for i in range(len(cols)):
        for j in range(i):
            corr_value = corr_matrix.iloc[i, j]
            if abs(corr_value) >= threshold:
                corr_pairs.append((cols[i], cols[j], corr_value))
    return sorted(corr_pairs, key=lambda pair: -abs(pair[2]))


def corr_dif(data, threshold=0.85):
    """Compute, plot and report pairwise associations between the columns of ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame passed unchanged to ``associations``. The LAST column is treated
        as the target when the per-feature ranking is printed.
    threshold : float, default 0.85
        Minimum absolute association for a pair to be listed as highly correlated.

    Returns
    -------
    The object returned by ``associations(data)``; its ``'corr'`` entry is the
    association matrix used here.

    Side effects
    ------------
    Saves 'corr_matrix.png', shows the figure and prints two reports to stdout.
    """
    correlation_matrix = associations(data)
    corr = correlation_matrix['corr']

    _plot_lower_triangle_corr_matrix(corr)

    highly_correlated_pairs = _get_highly_correlated_pairs(corr, threshold)

    print("\nВисококорельовані пари ознак:")
    for col1, col2, value in highly_correlated_pairs:
        print(f"{col1} - {col2} --> кореляція {value:.2f}")

    # Association of every column with the target (assumed to be the last column).
    target_correlations = corr.iloc[:, -1]
    target_col = target_correlations.name

    corr_df = target_correlations.to_frame(name='Correlations')
    corr_df['Abs Correlation'] = corr_df['Correlations'].abs()
    sorted_corr_df = (
        corr_df.sort_values(by='Abs Correlation', ascending=False)
        .drop(columns=['Abs Correlation'])
    )
    print(f"\nКоефіцієнт кореляції відносно цільового поля {target_col}")
    # Lift the row limit only for this print; a plain pd.set_option here would
    # leak into every later cell of the notebook.
    with pd.option_context('display.max_rows', None):
        print(sorted_corr_df)

    return correlation_matrix

# Run the association analysis on the raw frame, listing pairs with |association| >= 0.8.
correlation_result = corr_dif(df, 0.8)

In [9]:
# Snapshot of the frame before the target is re-coded and features are encoded;
# the tuning cell later appends one '<model>_prediction' column per model to it.
df_2 = df.copy()

In [10]:
target_original = 'Anxiety Level (1-10)'
def categorize_anxiety(level):
    """Map a raw anxiety score to one of three classes.

    Parameters
    ----------
    level : int, float or str
        Raw score. Strings are parsed as numbers, so both ``"5"`` and ``"5.0"``
        are accepted (``int("5.0")`` would raise ``ValueError``).

    Returns
    -------
    str
        ``'Low'`` for scores <= 4, ``'Medium'`` for scores in (4, 7],
        ``'High'`` otherwise.

    Notes
    -----
    A NaN score fails both comparisons and therefore falls through to ``'High'``.
    """
    if isinstance(level, str):
        level = float(level)
    if level <= 4:
        return 'Low'
    if level <= 7:
        return 'Medium'
    return 'High'

target_col = 'Anxiety Class'

# Replace the raw score with the 3-level class label. The guard makes the cell
# idempotent: re-running it after the raw column was already dropped is a no-op
# instead of a KeyError.
if target_original in df.columns:
    df[target_col] = df[target_original].apply(categorize_anxiety).astype('category')
    df = df.drop(columns=[target_original])
elif target_col not in df.columns:
    raise KeyError(
        f"Neither '{target_original}' nor '{target_col}' is present in df; reload the dataset."
    )

# Feature lists by dtype; the class label itself is never a feature.
numeric_cols = df.select_dtypes(include='number').columns.tolist()
categorical_cols = [
    column
    for column in df.select_dtypes(include=['object', 'category']).columns
    if column != target_col
]

print(f"Числові колонки ({len(numeric_cols)}): {numeric_cols}")
print(f"Категоріальні колонки ({len(categorical_cols)}): {categorical_cols}")
print(f"Цільова змінна: {target_col}")

Числові колонки (11): ['Age', 'Sleep Hours', 'Physical Activity (hrs/week)', 'Caffeine Intake (mg/day)', 'Alcohol Consumption (drinks/week)', 'Stress Level (1-10)', 'Heart Rate (bpm)', 'Breathing Rate (breaths/min)', 'Sweating Level (1-5)', 'Therapy Sessions (per month)', 'Diet Quality (1-10)']
Категоріальні колонки (7): ['Gender', 'Occupation', 'Smoking', 'Family History of Anxiety', 'Dizziness', 'Medication', 'Recent Major Life Event']
Цільова змінна: Anxiety Class


Створюємо 3 датасети — по одному на кожен клас — і в кожному визначаємо аутлаєри (95% дисперсії має бути пояснено) та видаляємо їх за допомогою Isolation Forest. Далі ці три датасети конкатенуємо назад в один. Також потрібно візуалізувати три кластери, що відповідають цим класам.

In [11]:
# Integer-encode every categorical feature, keeping one fitted encoder per column
# in label_encoders so the codes can be mapped back to the original labels.
# NOTE(review): LabelEncoder assigns arbitrary integer codes, which imposes an
# order on nominal features such as 'Occupation'; distance-based and linear models
# will treat that order as meaningful - consider one-hot encoding.
label_encoders = {}
df_encoded = df.copy()

for col in categorical_cols:
    le = LabelEncoder()
    # Values are cast to str before fitting the encoder.
    df_encoded[col] = le.fit_transform(df[col].astype(str))
    label_encoders[col] = le
    print(f"Закодовано {col}: {len(le.classes_)} унікальних значень")

# Encode the class label separately; le_y is reused later to decode predictions.
# df_encoded still contains the un-encoded target column at this point.
le_y = LabelEncoder()
y = le_y.fit_transform(df[target_col])
print(f"\nКласи цільової змінної: {le_y.classes_}")
print(f"Розподіл класів: {np.bincount(y)}")

Закодовано Gender: 3 унікальних значень
Закодовано Occupation: 13 унікальних значень
Закодовано Smoking: 2 унікальних значень
Закодовано Family History of Anxiety: 2 унікальних значень
Закодовано Dizziness: 2 унікальних значень
Закодовано Medication: 2 унікальних значень
Закодовано Recent Major Life Event: 2 унікальних значень

Класи цільової змінної: ['High' 'Low' 'Medium']
Розподіл класів: [1014 7618 2368]


In [12]:
# Standardise the numeric features (the encoded categorical columns are left as is).
# NOTE(review): the scaler is fitted on the full dataset, i.e. before the
# train/test split performed in a later cell, so test-set statistics leak into the
# scaling. Fitting on the training part only (or inside a Pipeline) would avoid that.
scaler = StandardScaler()
df_encoded[numeric_cols] = scaler.fit_transform(df_encoded[numeric_cols])

# Sanity check: the means and standard deviations of the scaled columns are printed.
print("Масштабування числових ознак завершено")
print(f"Середнє значення після масштабування:\n{df_encoded[numeric_cols].mean()}")
print(f"\nСтандартне відхилення після масштабування:\n{df_encoded[numeric_cols].std()}")

Масштабування числових ознак завершено
Середнє значення після масштабування:
Age                                 -1.401707e-16
Sleep Hours                         -9.818409e-17
Physical Activity (hrs/week)        -7.492996e-17
Caffeine Intake (mg/day)             1.724681e-16
Alcohol Consumption (drinks/week)   -1.111030e-16
Stress Level (1-10)                 -3.035955e-17
Heart Rate (bpm)                     1.821573e-16
Breathing Rate (breaths/min)         2.306034e-16
Sweating Level (1-5)                -1.498599e-16
Therapy Sessions (per month)         5.748937e-17
Diet Quality (1-10)                  3.229740e-17
dtype: float64

Стандартне відхилення після масштабування:
Age                                  1.000045
Sleep Hours                          1.000045
Physical Activity (hrs/week)         1.000045
Caffeine Intake (mg/day)             1.000045
Alcohol Consumption (drinks/week)    1.000045
Stress Level (1-10)                  1.000045
Heart Rate (bpm)                     1

In [12]:
# Archived experiments (nothing in this cell is executed): outlier / label-noise
# removal before modelling. Three variants were prototyped here and then disabled:
#
#   1. Per-class IsolationForest (contamination=0.05, random_state=42) on the encoded
#      features; the rows kept for each class were concatenated, shuffled and written
#      to 'final_cleaned.csv'.
#   2. imblearn's EditedNearestNeighbors(n_neighbors=3) resampling of the full encoded
#      dataset, followed by a shuffle.
#   3. A hand-written ENN: sklearn NearestNeighbors queried with n_neighbors + 1 = 4
#      (the first hit is the point itself and is skipped); a row was kept only when
#      its label equalled the most common label among its 3 neighbours. The result was
#      decoded back to class names with le_y, shuffled, written to
#      'anxiety_dataset_enn.csv' and split 80/20 with stratification.
#
# The cell that follows builds X from the unfiltered df_encoded instead.
# NOTE(review): the full commented-out code was replaced by this summary; recover it
# from the notebook history if one of the variants has to be re-run.

In [13]:
# Feature matrix: every encoded/scaled column except the class label.
X = df_encoded.drop(columns=[target_col])
n_classes = len(np.unique(y))
# Decides between binary and one-vs-rest ROC-AUC computation in later cells.
is_multiclass = n_classes > 2

print(f"Кількість класів: {n_classes}")
print(f"Мультикласова класифікація: {is_multiclass}")

# 80/20 hold-out split, stratified so both parts keep the class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"\nРозмір тренувальної вибірки: {X_train.shape}")
print(f"Розмір тестової вибірки: {X_test.shape}")
print(f"Розподіл класів у тренувальній вибірці: {np.bincount(y_train)}")
print(f"Розподіл класів у тестовій вибірці: {np.bincount(y_test)}")

Кількість класів: 3
Мультикласова класифікація: True

Розмір тренувальної вибірки: (8800, 18)
Розмір тестової вибірки: (2200, 18)
Розподіл класів у тренувальній вибірці: [ 811 6095 1894]
Розподіл класів у тестовій вибірці: [ 203 1523  474]


In [None]:
# Screen every scikit-learn classifier with its default hyper-parameters:
# 5-fold cross-validated macro precision on the training split, plus an in-sample
# ROC-AUC for reference.
classifiers = all_estimators(type_filter='classifier')
# NOTE(review): presumably excluded because it is too slow on this dataset - confirm.
skip_models = ['GaussianProcessClassifier']
results = {}

print(f"Тестування різних класифікаторів з кросвалідацією (5 фолдів)")
print("=" * 80)

CV = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for name, ClassifierClass in classifiers:
    if name in skip_models:
        continue
    try:
        model = ClassifierClass()
        scores = cross_val_score(model, X_train, y_train, cv=CV, scoring='precision_macro', n_jobs=-1)
        model.fit(X_train, y_train)

        # Prefer probabilities, fall back to decision scores.
        if hasattr(model, 'predict_proba'):
            train_scores = model.predict_proba(X_train)
        elif hasattr(model, 'decision_function'):
            train_scores = model.decision_function(X_train)
        else:
            train_scores = None

        # ROC-AUC is computed on the TRAINING data, so it is an optimistic,
        # in-sample figure; the ranking below relies on the CV precision only.
        # In the multiclass case roc_auc_score(multi_class='ovr') accepts only
        # probabilities that sum to 1, so a model exposing nothing but
        # decision_function raises here and is reported as skipped below.
        roc_auc = np.nan
        if train_scores is not None:
            if is_multiclass:
                if train_scores.ndim == 2 and train_scores.shape[1] == n_classes:
                    roc_auc = roc_auc_score(y_train, train_scores, multi_class='ovr')
            elif train_scores.ndim == 1:
                roc_auc = roc_auc_score(y_train, train_scores)
            elif train_scores.ndim == 2 and train_scores.shape[1] == 2:
                roc_auc = roc_auc_score(y_train, train_scores[:, 1])

        if not np.isnan(scores.mean()):
            results[name] = {
                'mean_precision': scores.mean(),
                'std_dev': scores.std(),
                'roc_auc': roc_auc,
            }
            print(f"{name}: MEAN PRECISION {scores.mean():.3f}, STD (± {scores.std():.3f}), ROC AUC: {roc_auc:.3f}")
        else:
            print(f"{name}: Пропущено через NaN у результаті")

    except Exception as e:
        # Report why the estimator was dropped instead of skipping it silently
        # (e.g. meta-estimators that need constructor arguments, unsupported input).
        print(f"{name}: Пропущено через помилку ({type(e).__name__}: {e})")
        continue

sorted_results = sorted(results.items(), key=lambda x: x[1]['mean_precision'], reverse=True)

print("\nТоп-10 найбільш точних моделей:")
for i, (name, metric) in enumerate(sorted_results[:10], 1):
    # The ranking metric is macro precision, not accuracy.
    print(f"{i:2d}. {name:40s} | PREC: {metric['mean_precision']:.3f} (± {metric['std_dev']:.3f}) | ROC-AUC: {metric['roc_auc']:.3f}")

results_df = pd.DataFrame.from_dict(results, orient='index')

Тестування різних класифікаторів з кросвалідацією (5 фолдів)


In [None]:
# Build a voting and a stacking ensemble from the five best screened models and
# compare their macro precision on the hold-out split.
top5 = sorted_results[:5]
print("\nТОП-5 моделей для ансамблів:")
for name, metric in top5:
    print(name, "| precision =", metric['mean_precision'])

def instantiate_model(model_name):
    """Return a default-constructed classifier called ``model_name``.

    The name is looked up in the module-level ``classifiers`` list of
    ``(name, class)`` pairs. ``None`` is returned when the name is unknown or
    the class cannot be constructed without arguments.
    """
    classifier_cls = dict(classifiers).get(model_name)
    if classifier_cls is None:
        return None
    try:
        return classifier_cls()
    except Exception:
        # Construction failures are expected for estimators with required
        # arguments; KeyboardInterrupt/SystemExit are deliberately not caught.
        return None

top_models = []
for name, metric in top5:
    model_obj = instantiate_model(name)
    if model_obj is not None:
        top_models.append((name, model_obj))

# Soft voting averages predict_proba outputs, so it only works when every member
# provides them; otherwise fall back to majority (hard) voting instead of crashing.
voting_mode = 'soft' if all(hasattr(est, 'predict_proba') for _, est in top_models) else 'hard'

voting_model = VotingClassifier(
    estimators=top_models,
    voting=voting_mode
)

voting_model.fit(X_train, y_train)
y_pred_vote = voting_model.predict(X_test)

precision_vote = precision_score(y_test, y_pred_vote, average="macro", zero_division=0)
print(f"\nPrecision (Voting, {voting_mode}) = {precision_vote:.3f}")

# Stacking: the base models' outputs feed a logistic-regression meta-learner.
stacked_model = StackingClassifier(
    estimators=top_models,
    final_estimator=LogisticRegression(max_iter=1000, random_state=42)
)

stacked_model.fit(X_train, y_train)
y_pred_stack = stacked_model.predict(X_test)

precision_stack = precision_score(y_test, y_pred_stack, average="macro", zero_division=0)
print(f"Precision (Stacking) = {precision_stack:.3f}")

In [None]:
# Hyper-parameter grids for GridSearchCV, keyed by scikit-learn classifier class
# name. The tuning cell looks a grid up by model name and falls back to the default
# model when a name has no entry here.
# NOTE(review): the grids are plain Cartesian products, so they contain combinations
# scikit-learn rejects (see the notes on individual entries); those fits fail and are
# scored as NaN by GridSearchCV, which wastes time but does not abort the search.
param_grids = {
    "QuadraticDiscriminantAnalysis": { #
        'reg_param': [0.0, 0.1, 0.2, 0.3, 0.5, 0.7],
        'store_covariance': [True, False],
        'tol': [1e-4, 1e-3, 1e-2]
    },
    # NOTE(review): shrinkage is only supported by the 'lsqr' and 'eigen' solvers;
    # 'svd' combined with a non-None shrinkage is an invalid combination.
    "LinearDiscriminantAnalysis": { #
        'solver': ['svd', 'lsqr', 'eigen'],
        'shrinkage': [None, 'auto', 0.1, 0.3, 0.5, 0.7, 0.9],
        'tol': [1e-4, 1e-3],
    },
    # NOTE(review): 'lbfgs' supports only the 'l2' penalty, so lbfgs + 'l1' fits fail.
    "LogisticRegressionCV": {
        'Cs': [10, 20],
        'max_iter': [1000, 2000, 3000],
        'solver': ['lbfgs', 'liblinear', 'saga'],
        'penalty': ['l2', 'l1'],
        'cv': [3, 5]
    },
    "GaussianNB": { #
        'var_smoothing': [1e-9, 1e-8, 1e-7, 1e-6, 1e-5]
    },
    "MLPClassifier": { #
        'hidden_layer_sizes': [(50,), (100,), (50, 50), (100, 50)],
        'activation': ['relu', 'tanh'],
        'solver': ['adam'],
        'alpha': [0.0001, 0.001, 0.01],
        'max_iter': [500, 1000]
    },
    "RandomForestClassifier": { #
        'n_estimators': [100, 200],
        'max_depth': [10, 20, None],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'max_features': ['sqrt', 'log2', None]
    },
    "GradientBoostingClassifier": { #
        'n_estimators': [100, 200, 300],
        'learning_rate': [0.01, 0.05, 0.1, 0.2],
        'max_depth': [3, 5, 7],
    },
    # NOTE(review): 'lbfgs' supports only the 'l2' penalty, so lbfgs + 'l1' fits fail.
    "LogisticRegression": { #
        'C': [0.001, 0.01, 0.1, 1.0, 10.0, 100.0],
        'max_iter': [1000, 2000],
        'solver': ['lbfgs', 'liblinear', 'saga'],
        'penalty': ['l2', 'l1'],
        'class_weight': [None, 'balanced']
    },
    "ExtraTreesClassifier": { #
        'n_estimators': [100, 200, 300],
        'max_depth': [10, 20, None],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
    },
    "CalibratedClassifierCV": {
        'method': ['sigmoid', 'isotonic'],
        'cv': [3, 5]
    },
    "HistGradientBoostingClassifier": { #
        'learning_rate': [0.01, 0.05, 0.1],
        'max_depth': [5, 10, 15, None],
        'max_iter': [100, 200, 300],
        "l2_regularization": [0.0, 1.0, 5.0]
    },
    # NOTE(review): 'SAMME.R' is deprecated/removed in recent scikit-learn releases -
    # confirm against the installed version.
    "AdaBoostClassifier": { #
        'n_estimators': [50, 100, 200, 300],
        'learning_rate': [0.01, 0.1, 0.5, 1.0, 1.5],
        'algorithm': ['SAMME', 'SAMME.R']
    },
    "KNeighborsClassifier": { #
        'n_neighbors': [3, 5, 7, 9, 11, 15],
        'weights': ['uniform', 'distance'],
        'algorithm': ['auto', 'ball_tree', 'kd_tree'],
        'leaf_size': [20, 30, 40, 50],
        'p': [1, 2]
    },
    "DecisionTreeClassifier": { #
        'max_depth': [5, 10, 20, 30, None],
        'min_samples_split': [2, 5, 10, 20],
        'min_samples_leaf': [1, 2, 4, 8],
        'criterion': ['gini', 'entropy'],
        'max_features': ['sqrt', 'log2', None]
    }
}

# The printed count is the number of tuned parameters (dict keys) per model,
# not the number of grid points.
print("Гіперпараметри підготовлені для наступних моделей:")
for model_name in param_grids.keys():
    print(f"  - {model_name}: {len(param_grids[model_name])} параметрів")

In [None]:
# Grid-search the ten best screened models, evaluate each tuned model on the
# hold-out split, store its predictions for the whole dataset in df_2 and report
# feature importances.
top_10_classifiers = sorted_results[:10]
CV = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
feature_names = X_train.columns
feature_importance_dict = {}
tuned_results = {}
# Name -> class lookup, built once instead of on every loop iteration.
classifier_lookup = dict(classifiers)


def _fit_with_defaults(model):
    """Fit ``model`` as is on the training split and cross-validate it.

    Returns ``(fitted_model, params_description, cv_mean, cv_std)`` where the
    scores are 5-fold macro precision on the training split.
    """
    model.fit(X_train, y_train)
    default_scores = cross_val_score(model, X_train, y_train, cv=CV, scoring='precision_macro', n_jobs=-1)
    return model, "Параметри за замовчуванням", default_scores.mean(), default_scores.std()


def _roc_auc_from_scores(model, X_eval, y_eval):
    """ROC-AUC of a fitted ``model`` on ``(X_eval, y_eval)``; NaN when not computable.

    Probabilities are preferred, decision scores are the fallback. For multiclass
    problems the macro one-vs-rest AUC is returned.
    """
    if hasattr(model, 'predict_proba'):
        eval_scores = model.predict_proba(X_eval)
        scores_are_probabilities = True
    elif hasattr(model, 'decision_function'):
        eval_scores = model.decision_function(X_eval)
        scores_are_probabilities = False
    else:
        return np.nan

    if is_multiclass:
        if eval_scores.ndim != 2 or eval_scores.shape[1] != n_classes:
            return np.nan
        if scores_are_probabilities:
            return roc_auc_score(y_eval, eval_scores, multi_class='ovr')
        # roc_auc_score(multi_class='ovr') rejects scores that do not sum to 1, so
        # raw decision scores are evaluated one-vs-rest against binarized labels.
        y_eval_onehot = label_binarize(y_eval, classes=model.classes_)
        return roc_auc_score(y_eval_onehot, eval_scores, average='macro')

    if eval_scores.ndim == 1:
        return roc_auc_score(y_eval, eval_scores)
    if eval_scores.ndim == 2 and eval_scores.shape[1] == 2:
        return roc_auc_score(y_eval, eval_scores[:, 1])
    return np.nan


print("Початок підбору гіперпараметрів для Top-10 моделей")
print("=" * 80)

for name, _ in top_10_classifiers:
    print(f"\n{'='*80}")
    print(f"Обробка моделі: {name}")
    print(f"{'='*80}")

    base_model = classifier_lookup[name]()
    param_grid = param_grids.get(name)

    if param_grid is not None:
        print(f"Підбір гіперпараметрів: {len(param_grid)} параметрів")
        print(f"Параметри: {list(param_grid.keys())}")

        try:
            grid_search = GridSearchCV(
                base_model,
                param_grid,
                cv=CV,
                scoring='precision_macro',
                n_jobs=-1,
                verbose=1
            )
            grid_search.fit(X_train, y_train)
            best_model = grid_search.best_estimator_
            best_params = grid_search.best_params_
            best_idx = grid_search.best_index_
            mean_score = grid_search.cv_results_['mean_test_score'][best_idx]
            std_score = grid_search.cv_results_['std_test_score'][best_idx]
        except Exception as e:
            # A failed search must not stop the loop: fall back to the default model.
            print(f"Помилка GridSearch: {e}")
            print("Використовується модель за замовчуванням")
            best_model, best_params, mean_score, std_score = _fit_with_defaults(base_model)
    else:
        print("Гіперпараметри не визначені, використовується модель за замовчуванням")
        best_model, best_params, mean_score, std_score = _fit_with_defaults(base_model)

    y_train_pred = best_model.predict(X_train)
    y_test_pred = best_model.predict(X_test)

    train_accuracy = accuracy_score(y_train, y_train_pred)
    test_accuracy = accuracy_score(y_test, y_test_pred)

    try:
        roc_auc_test = _roc_auc_from_scores(best_model, X_test, y_test)
    except ValueError as e:
        # An undefined AUC (e.g. a class missing from y_test) is reported as NaN.
        print(f"ROC-AUC не обчислено: {e}")
        roc_auc_test = np.nan

    tuned_results[name] = {
        'best_params': best_params,
        'cv_mean_score': mean_score,
        'cv_std_score': std_score,
        'train_accuracy': train_accuracy,
        'test_accuracy': test_accuracy,
        'roc_auc_test': roc_auc_test,
        'model': best_model
    }

    print(f"\n{'─'*80}")
    print(f"Результати для {name}:")
    print(f"{'─'*80}")
    print(f"Найкращі параметри: {best_params}")
    print(f"CV precision_macro: {mean_score:.4f} (± {std_score:.4f})")
    print(f"Train Accuracy: {train_accuracy:.4f}")
    print(f"Test Accuracy: {test_accuracy:.4f}")
    print(f"Test ROC-AUC: {roc_auc_test:.4f}")

    # Predictions for every row (train and test alike), decoded to class names.
    y_pred_full = best_model.predict(X)
    df_2[f'{name}_prediction'] = le_y.inverse_transform(y_pred_full)

    print(f"\n{'─'*80}")
    print(f"Важливість ознак для {name}:")
    print(f"{'─'*80}")

    # Use the model's own importances when it has them, otherwise permutation
    # importance measured on the hold-out split.
    if hasattr(best_model, 'feature_importances_'):
        importances = best_model.feature_importances_
    else:
        print("Обчислення Permutation Importance...")
        perm_importances = permutation_importance(
            best_model, X_test, y_test,
            n_repeats=10,
            random_state=42,
            n_jobs=-1
        )
        importances = perm_importances.importances_mean

    feature_importance_dict[name] = importances
    sort_idx = np.argsort(importances)[::-1]
    print("Топ-10 найважливіших ознак:")
    for i, idx in enumerate(sort_idx[:10], 1):
        print(f"{i:2d}. {feature_names[idx]:30s}: {importances[idx]:.4f}")

print("\n" + "="*80)
print("Підбір гіперпараметрів завершено!")
print("="*80)

In [None]:
def suggest_params(trial, model_name):
    """Sample one hyperparameter configuration for ``model_name``.

    Args:
        trial: Optuna trial (or any object exposing ``suggest_float``,
            ``suggest_int`` and ``suggest_categorical`` with the same
            signatures).
        model_name: Name of the scikit-learn classifier class.

    Returns:
        dict: Keyword arguments for the classifier constructor. An empty dict
        is returned for model names that have no search space defined here.

    Note:
        The names registered on the trial are not always constructor
        arguments (MLP registers ``hidden_layer_sizes_choice`` but returns
        ``hidden_layer_sizes``), and conditionally fixed values such as
        ``penalty='l2'`` for ``lbfgs`` are never registered on the trial.
        The returned dict, not ``trial.params``, is what the constructor needs.
    """
    if model_name == "QuadraticDiscriminantAnalysis":
        reg_param = trial.suggest_float('reg_param', 1e-4, 0.7, log=True)
        store_covariance = trial.suggest_categorical('store_covariance', [True, False])
        tol = trial.suggest_categorical('tol', [1e-4, 1e-3, 1e-2])

        return {
            'reg_param': reg_param,
            'store_covariance': store_covariance,
            'tol': tol
        }

    elif model_name == "LinearDiscriminantAnalysis":
        solver = trial.suggest_categorical('solver', ['svd', 'lsqr', 'eigen'])
        params = {'solver': solver}
        # Shrinkage and tol are sampled only for the 'lsqr' and 'eigen' solvers.
        if solver in ['lsqr', 'eigen']:
            params['shrinkage'] = trial.suggest_categorical('shrinkage', ['auto', 0.1, 0.3, 0.5, 0.7, 0.9])
            params['tol'] = trial.suggest_categorical('tol', [1e-4, 1e-3])
        return params

    elif model_name == "LogisticRegression":
        solver = trial.suggest_categorical('solver', ['lbfgs', 'liblinear', 'saga'])

        # 'lbfgs' is always paired with l2; the other two solvers sample the penalty.
        if solver == 'lbfgs':
            penalty = 'l2'
        else:
            penalty = trial.suggest_categorical('penalty', ['l1', 'l2'])

        C = trial.suggest_float('C', 0.001, 100.0, log=True)
        max_iter = trial.suggest_int('max_iter', 1000, 3000)
        class_weight = trial.suggest_categorical('class_weight', ['balanced', None])
        return {'solver': solver, 'penalty': penalty, 'C': C, 'max_iter': max_iter, 'class_weight': class_weight}

    elif model_name == "GaussianNB":
        var_smoothing = trial.suggest_float('var_smoothing', 1e-9, 1e-5, log=True)
        return {'var_smoothing': var_smoothing}

    elif model_name == "MLPClassifier":
        # The architecture is sampled as a label and mapped to a tuple afterwards.
        layer_presets = {'small': (50,), 'medium': (100,), 'large': (50, 50)}
        hidden_layer_sizes_choice = trial.suggest_categorical('hidden_layer_sizes_choice', ['small', 'medium', 'large'])
        hidden_layer_sizes = layer_presets[hidden_layer_sizes_choice]

        activation = trial.suggest_categorical('activation', ['relu', 'tanh'])
        alpha = trial.suggest_float('alpha', 0.0001, 0.01, log=True)
        max_iter = trial.suggest_int('max_iter', 500, 1000)
        solver = trial.suggest_categorical('solver', ['adam'])

        return {
            'hidden_layer_sizes': hidden_layer_sizes,
            'activation': activation,
            'alpha': alpha,
            'max_iter': max_iter,
            'solver': solver
        }

    elif model_name in ["RandomForestClassifier", "ExtraTreesClassifier"]:
        n_estimators = trial.suggest_int('n_estimators', 100, 300)
        max_depth = trial.suggest_categorical('max_depth', [10, 20, None])
        min_samples_split = trial.suggest_int('min_samples_split', 2, 10)
        min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 4)
        max_features = trial.suggest_categorical('max_features', ['sqrt', 'log2', None])

        return {
            'n_estimators': n_estimators,
            'max_depth': max_depth,
            'min_samples_split': min_samples_split,
            'min_samples_leaf': min_samples_leaf,
            'max_features': max_features
        }

    elif model_name == "GradientBoostingClassifier":
        n_estimators = trial.suggest_int('n_estimators', 100, 300)
        learning_rate = trial.suggest_float('learning_rate', 0.01, 0.2, log=True)
        max_depth = trial.suggest_int('max_depth', 3, 7)
        min_samples_split = trial.suggest_int('min_samples_split', 2, 10)
        min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 4)

        return {
            'n_estimators': n_estimators,
            'learning_rate': learning_rate,
            'max_depth': max_depth,
            'min_samples_split': min_samples_split,
            'min_samples_leaf': min_samples_leaf
        }

    elif model_name == "HistGradientBoostingClassifier":
        learning_rate = trial.suggest_float('learning_rate', 0.01, 0.1, log=True)
        max_depth = trial.suggest_categorical('max_depth', [5, 10, 15, None])
        max_iter = trial.suggest_int('max_iter', 100, 300)
        l2_regularization = trial.suggest_float('l2_regularization', 0.001, 5.0, log=True)

        return {
            'learning_rate': learning_rate,
            'max_depth': max_depth,
            'max_iter': max_iter,
            'l2_regularization': l2_regularization
        }

    elif model_name == "AdaBoostClassifier":
        n_estimators = trial.suggest_int('n_estimators', 50, 300)
        learning_rate = trial.suggest_float('learning_rate', 0.01, 1.5, log=True)
        # NOTE(review): 'SAMME.R' is believed to be deprecated/removed in recent
        # scikit-learn releases — confirm against the installed version.
        algorithm = trial.suggest_categorical('algorithm', ['SAMME', 'SAMME.R'])

        return {
            'n_estimators': n_estimators,
            'learning_rate': learning_rate,
            'algorithm': algorithm
        }

    elif model_name == "KNeighborsClassifier":
        n_neighbors = trial.suggest_int('n_neighbors', 3, 15)
        weights = trial.suggest_categorical('weights', ['uniform', 'distance'])
        algorithm = trial.suggest_categorical('algorithm', ['auto', 'ball_tree', 'kd_tree'])
        p = trial.suggest_int('p', 1, 2)
        leaf_size = trial.suggest_int('leaf_size', 20, 50)

        return {
            'n_neighbors': n_neighbors,
            'weights': weights,
            'algorithm': algorithm,
            'p': p,
            'leaf_size': leaf_size
        }

    elif model_name == "DecisionTreeClassifier":
        max_depth = trial.suggest_categorical('max_depth', [5, 10, 20, 30, None])
        min_samples_split = trial.suggest_int('min_samples_split', 2, 20)
        min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 8)
        criterion = trial.suggest_categorical('criterion', ['gini', 'entropy'])
        max_features = trial.suggest_categorical('max_features', ['sqrt', 'log2', None])

        return {
            'max_depth': max_depth,
            'min_samples_split': min_samples_split,
            'min_samples_leaf': min_samples_leaf,
            'criterion': criterion,
            'max_features': max_features
        }

    elif model_name == "LogisticRegressionCV":
        solver = trial.suggest_categorical('solver', ['lbfgs', 'liblinear', 'saga'])
        Cs_count = trial.suggest_int('Cs', 5, 30)

        # Same solver/penalty pairing as for plain LogisticRegression.
        if solver == 'lbfgs':
            penalty = 'l2'
        else:
            penalty = trial.suggest_categorical('penalty', ['l1', 'l2'])

        max_iter = trial.suggest_int('max_iter', 1000, 3000)
        class_weight = trial.suggest_categorical('class_weight', ['balanced', None])
        cv_inner = trial.suggest_categorical('cv', [3, 5, 7])

        return {
            'solver': solver,
            'penalty': penalty,
            'Cs': Cs_count,
            'max_iter': max_iter,
            'class_weight': class_weight,
            'cv': cv_inner
        }

    # No search space defined for this model: fall back to constructor defaults.
    return {}
    
# Lookup table built from all_estimators(type_filter='classifier'); the code
# below indexes it by model name to obtain the class to instantiate.
classifiers_dict = dict(all_estimators(type_filter='classifier'))

def objective(trial, model_name, X_train, y_train, CV, classifiers_dict):
    """Optuna objective: mean cross-validated macro precision of one sampled model.

    Args:
        trial: Optuna trial used to sample the hyperparameters.
        model_name: Key into ``classifiers_dict``.
        X_train, y_train: Training data forwarded to ``cross_val_score``.
        CV: Cross-validation splitter (or fold count) forwarded as ``cv``.
        classifiers_dict: Mapping from classifier name to classifier class.

    Returns:
        float: Mean ``precision_macro`` over the folds, or ``0.0`` when the
        configuration cannot be sampled, built or evaluated. Failures stay
        best-effort; the reason for an evaluation failure is stored in the
        trial's ``error`` user attribute.
    """
    try:
        params = suggest_params(trial, model_name)
    except ValueError:
        return 0.0

    if model_name not in classifiers_dict:
        return 0.0

    ClassifierClass = classifiers_dict[model_name]
    try:
        model = ClassifierClass(**params)
        # Seed only estimators that actually have a random_state parameter.
        # Passing it unconditionally raised TypeError for deterministic models,
        # which turned every one of their trials into a silent 0.0.
        if 'random_state' in model.get_params(deep=False):
            model.set_params(random_state=42)
    except (ValueError, TypeError):
        return 0.0

    try:
        scores = cross_val_score(model, X_train, y_train, cv=CV, scoring='precision_macro', n_jobs=-1)
        mean_precision = scores.mean()
        if np.isnan(mean_precision):
            return 0.0
        return mean_precision

    except Exception as e:
        # Keep the study running, but leave a trace of why this trial scored 0.0.
        trial.set_user_attr('error', repr(e))
        return 0.0
    
def _build_estimator(estimator_cls, kwargs, seed=42):
    """Instantiate ``estimator_cls(**kwargs)``, seeding it only if it has ``random_state``."""
    estimator = estimator_cls(**kwargs)
    if 'random_state' in estimator.get_params(deep=False):
        estimator.set_params(random_state=seed)
    return estimator


top_10_classifiers_names = [name for name, _ in sorted_results[:10]]
tuned_results_optuna = {}
N_TRIALS = 50

print(f"Початок підбору гіперпараметрів за допомогою Optuna ({N_TRIALS} спроб)")
print("=" * 80)

for name in top_10_classifiers_names:
    print(f"\n{'='*80}")
    print(f"Обробка моделі: {name}")

    # One independent, seeded study per classifier.
    study = optuna.create_study(
        direction="maximize",
        sampler=TPESampler(seed=42),
    )

    try:
        study.optimize(
            lambda trial: objective(trial, name, X_train, y_train, CV, classifiers_dict),
            n_trials=N_TRIALS,
            show_progress_bar=False,
            timeout=1800
        )
    except Exception as e:
        print(f"Помилка в Optuna Study: {e}")
        continue

    try:
        best_params = study.best_params
        mean_score = study.best_value
        # The objective reports every failure as 0.0, so a best value of 0.0
        # means no trial produced a usable score.
        if mean_score == 0.0:
            raise ValueError("Усі спроби повернули 0.0. Тюнінг, ймовірно, провалився.")
    except ValueError as e:
        print(f"Помилка при отриманні найкращого результату для {name}: {e}")
        print("Пропускаємо цю модель.")
        continue

    try:
        ClassifierClass = classifiers_dict[name]
        # study.best_params holds trial parameter names, which are not always
        # constructor arguments (e.g. 'hidden_layer_sizes_choice' for MLP).
        # Replaying them through suggest_params yields the real constructor kwargs.
        model_kwargs = suggest_params(optuna.trial.FixedTrial(best_params), name)

        # Re-run CV on the best configuration to obtain the fold spread.
        best_model_for_cv = _build_estimator(ClassifierClass, model_kwargs)
        cv_scores_final = cross_val_score(best_model_for_cv, X_train, y_train, cv=CV, scoring='precision_macro', n_jobs=-1)
        cv_std_score = cv_scores_final.std()

        best_model = _build_estimator(ClassifierClass, model_kwargs)
        best_model.fit(X_train, y_train)

        y_train_pred = best_model.predict(X_train)
        y_test_pred = best_model.predict(X_test)
        train_accuracy = accuracy_score(y_train, y_train_pred)
        test_accuracy = accuracy_score(y_test, y_test_pred)

        # ROC-AUC is computed only when probabilities with the expected shape exist.
        roc_auc_test = np.nan
        if hasattr(best_model, 'predict_proba'):
            proba_test = best_model.predict_proba(X_test)
            if is_multiclass and proba_test.ndim == 2 and proba_test.shape[1] == n_classes:
                roc_auc_test = roc_auc_score(y_test, proba_test, multi_class='ovr')
            elif not is_multiclass and proba_test.ndim == 2 and proba_test.shape[1] == 2:
                roc_auc_test = roc_auc_score(y_test, proba_test[:, 1])

        tuned_results_optuna[name] = {
            'best_params': best_params,
            'cv_mean_score': mean_score,
            'cv_std_score': cv_std_score,
            'train_accuracy': train_accuracy,
            'test_accuracy': test_accuracy,
            'roc_auc_test': roc_auc_test,
            'model': best_model
        }

        print(f"\n{'─'*80}")
        print(f"Результати Optuna для {name}:")
        print(f"{'─'*80}")
        print(f"Найкращі параметри: {best_params}")
        print(f"CV precision_macro: {mean_score:.4f}")
        print(f"Train Accuracy: {train_accuracy:.4f}")
        print(f"Test Accuracy: {test_accuracy:.4f}")
        print(f"Test ROC-AUC: {roc_auc_test:.4f}")

    except Exception as e:
        print(f"Помилка фінального навчання моделі {name}: {e}")

print("\n" + "="*80)
print("Optuna тюнінг завершено!")
print("="*80)

In [None]:
tuned_results_voting = {}

def objective_voting(trial, X_train, y_train, CV, top_models):
    """Optuna objective for the per-estimator weights of a soft VotingClassifier.

    Args:
        trial: Optuna trial; one float ``weight_<name>`` in [0.5, 3.0] is
            sampled per estimator.
        X_train, y_train: Training data forwarded to ``cross_val_score``.
        CV: Cross-validation splitter (or fold count) forwarded as ``cv``.
        top_models: Sequence of ``(name, estimator)`` pairs.

    Returns:
        float: Mean ``precision_macro`` over the folds, or NaN if the
        evaluation raised.
    """
    sampled_weights = {
        model_name: trial.suggest_float(f'weight_{model_name}', 0.5, 3.0, log=False)
        for model_name, _ in top_models
    }

    ensemble = VotingClassifier(
        estimators=[(model_name, estimator) for model_name, estimator in top_models],
        voting='soft',
        weights=[sampled_weights[model_name] for model_name, _ in top_models]
    )

    try:
        fold_scores = cross_val_score(ensemble, X_train, y_train, cv=CV, scoring='precision_macro', n_jobs=-1)
    except Exception:
        return np.nan
    return fold_scores.mean()

N_TRIALS = 50

print("\n" + "="*80)
print(f"ПОЧАТОК ТЮНІНГУ: VotingClassifier (soft) з Optuna ({N_TRIALS} спроб)")
print("="*80)

voting_study = optuna.create_study(direction="maximize", sampler=optuna.samplers.TPESampler(seed=42))

try:
    voting_study.optimize(
        lambda trial: objective_voting(trial, X_train, y_train, CV, top_models),
        n_trials=N_TRIALS,
        show_progress_bar=False,
        timeout=1800
    )
except Exception as e:
    print(f"Помилка в Optuna Study: {e}")

best_voting_params = voting_study.best_params
# Look each weight up by estimator name, so the weight list is aligned with
# `top_models` by construction rather than by the dict's iteration order.
best_voting_weights = [best_voting_params[f'weight_{name}'] for name, _ in top_models]

best_voting_model = VotingClassifier(
    estimators=top_models,
    voting='soft',
    weights=best_voting_weights
)

best_voting_model.fit(X_train, y_train)

y_train_pred_vote_tuned = best_voting_model.predict(X_train)
y_pred_vote_tuned = best_voting_model.predict(X_test)
precision_vote_tuned = precision_score(y_test, y_pred_vote_tuned, average="macro", zero_division=0)
train_accuracy_vote_tuned = accuracy_score(y_train, y_train_pred_vote_tuned)
test_accuracy_vote_tuned = accuracy_score(y_test, y_pred_vote_tuned)

# Re-run CV with the selected weights to obtain the fold spread.
scores_final_vote = cross_val_score(best_voting_model, X_train, y_train, cv=CV, scoring='precision_macro', n_jobs=-1)
cv_std_score_vote = scores_final_vote.std()

# ROC-AUC is computed only when the probability matrix has the expected shape.
roc_auc_test_vote = np.nan
if hasattr(best_voting_model, 'predict_proba'):
    proba_test_vote = best_voting_model.predict_proba(X_test)
    if is_multiclass and proba_test_vote.ndim == 2 and proba_test_vote.shape[1] == n_classes:
        roc_auc_test_vote = roc_auc_score(y_test, proba_test_vote, multi_class='ovr')
    elif not is_multiclass and proba_test_vote.ndim == 2 and proba_test_vote.shape[1] == 2:
        roc_auc_test_vote = roc_auc_score(y_test, proba_test_vote[:, 1])

tuned_results_voting['VotingClassifier'] = {
    'best_params': best_voting_params,
    'cv_mean_score': voting_study.best_value,
    'cv_std_score': cv_std_score_vote,
    'train_accuracy': train_accuracy_vote_tuned,
    'test_accuracy': test_accuracy_vote_tuned,
    'roc_auc_test': roc_auc_test_vote,
    'model': best_voting_model,
    'precision (Test, Tuned)': precision_vote_tuned,
}


print(f"\n{'─'*80}")
print("РЕЗУЛЬТАТИ ТЮНІНГУ VotingClassifier:")
print(f"{'─'*80}")
print(f"Найкращі ваги: {best_voting_params}")
print(f"CV precision_macro (Best): {voting_study.best_value:.4f}")
print(f"CV precision_macro (Std): {cv_std_score_vote:.4f}")
print(f"Train Accuracy: {train_accuracy_vote_tuned:.4f}")
print(f"Test Accuracy: {test_accuracy_vote_tuned:.4f}")
print(f"Precision (Test, Tuned) = {precision_vote_tuned:.4f}")
print(f"ROC-AUC Test: {roc_auc_test_vote:.4f}")

In [None]:
tuned_results_stacking = {}

def objective_stacking(trial, X_train, y_train, CV, top_models):
    """Optuna objective for the LogisticRegression meta-learner of a StackingClassifier.

    Args:
        trial: Optuna trial; samples ``solver``, ``penalty`` (only for
            'liblinear'/'saga'), ``C`` and ``max_iter``.
        X_train, y_train: Training data forwarded to ``cross_val_score``.
        CV: Cross-validation splitter (or fold count) forwarded as ``cv``.
        top_models: Sequence of ``(name, estimator)`` base learners.

    Returns:
        float: Mean ``precision_macro`` over the folds; ``0.0`` if the mean is
        NaN or the evaluation raised.
    """
    solver = trial.suggest_categorical('solver', ['lbfgs', 'liblinear', 'saga'])
    # Only 'liblinear' and 'saga' get a sampled penalty; everything else uses l2.
    if solver in ('liblinear', 'saga'):
        penalty = trial.suggest_categorical('penalty', ['l1', 'l2'])
    else:
        penalty = 'l2'

    meta_kwargs = {
        'solver': solver,
        'penalty': penalty,
        'C': trial.suggest_float('C', 0.001, 100.0, log=True),
        'max_iter': trial.suggest_int('max_iter', 1000, 3000),
        'random_state': 42,
        'class_weight': 'balanced',
        'multi_class': 'ovr',
    }

    stack = StackingClassifier(
        estimators=top_models,
        final_estimator=LogisticRegression(**meta_kwargs),
        cv=3,
        n_jobs=-1
    )

    try:
        fold_mean = cross_val_score(stack, X_train, y_train, cv=CV, scoring='precision_macro', n_jobs=-1).mean()
    except Exception:
        return 0.0
    return 0.0 if np.isnan(fold_mean) else fold_mean

print("\n" + "="*80)
print(f"ПОЧАТОК ТЮНІНГУ: StackingClassifier Final Estimator ({N_TRIALS} спроб)")
print("="*80)

stacking_study = optuna.create_study(direction="maximize", sampler=optuna.samplers.TPESampler(seed=42))

try:
    stacking_study.optimize(
        lambda trial: objective_stacking(trial, X_train, y_train, CV, top_models),
        n_trials=N_TRIALS,
        show_progress_bar=False,
        timeout=1800
    )
except Exception as e:
    print(f"Помилка в Optuna Study: {e}")

best_stacking_params = stacking_study.best_params
best_solver = best_stacking_params['solver']

# 'penalty' is recorded only for solvers where it was sampled ('liblinear',
# 'saga'); for 'lbfgs' the objective fixed it to 'l2'.
best_penalty = best_stacking_params.get('penalty', 'l2')

best_final_estimator = LogisticRegression(
    solver=best_solver,
    penalty=best_penalty,
    C=best_stacking_params['C'],
    max_iter=best_stacking_params['max_iter'],
    random_state=42,
    class_weight='balanced',
    multi_class='ovr'
)

# Use the same inner CV as objective_stacking (cv=3), so the refitted model is
# the configuration that produced stacking_study.best_value.
best_stacked_model = StackingClassifier(
    estimators=top_models,
    final_estimator=best_final_estimator,
    cv=3,
    n_jobs=-1
)

best_stacked_model.fit(X_train, y_train)
y_train_pred_stack_tuned = best_stacked_model.predict(X_train)
y_pred_stack_tuned = best_stacked_model.predict(X_test)
precision_stack_tuned = precision_score(y_test, y_pred_stack_tuned, average="macro", zero_division=0)
train_accuracy_stack_tuned = accuracy_score(y_train, y_train_pred_stack_tuned)
test_accuracy_stack_tuned = accuracy_score(y_test, y_pred_stack_tuned)

# Re-run CV on the selected configuration to obtain the fold spread.
scores_final_stack = cross_val_score(best_stacked_model, X_train, y_train, cv=CV, scoring='precision_macro', n_jobs=-1)
cv_std_score_stack = scores_final_stack.std()

# ROC-AUC is computed only when the probability matrix has the expected shape.
roc_auc_test_stack = np.nan
if hasattr(best_stacked_model, 'predict_proba'):
    proba_test_stack = best_stacked_model.predict_proba(X_test)
    if is_multiclass and proba_test_stack.ndim == 2 and proba_test_stack.shape[1] == n_classes:
        roc_auc_test_stack = roc_auc_score(y_test, proba_test_stack, multi_class='ovr')
    elif not is_multiclass and proba_test_stack.ndim == 2 and proba_test_stack.shape[1] == 2:
        roc_auc_test_stack = roc_auc_score(y_test, proba_test_stack[:, 1])

tuned_results_stacking['StackingClassifier'] = {
    'best_params': best_stacking_params,
    'cv_mean_score': stacking_study.best_value,
    'cv_std_score': cv_std_score_stack,
    'train_accuracy': train_accuracy_stack_tuned,
    'test_accuracy': test_accuracy_stack_tuned,
    'roc_auc_test': roc_auc_test_stack,
    'model': best_stacked_model,
    'precision (Stacking, Tuned)': precision_stack_tuned,
}

print(f"\n{'─'*80}")
print("РЕЗУЛЬТАТИ ТЮНІНГУ StackingClassifier:")
print(f"{'─'*80}")
print(f"Найкращі параметри Final Estimator: {best_stacking_params}")
print(f"CV precision_macro (Best): {stacking_study.best_value:.4f}")
print(f"CV precision_macro (Std): {cv_std_score_stack:.4f}")
print(f"Train Accuracy: {train_accuracy_stack_tuned:.4f}")
print(f"Test Accuracy: {test_accuracy_stack_tuned:.4f}")
print(f"Precision (Stacking, Tuned) = {precision_stack_tuned:.4f}")
print(f"ROC-AUC Test: {roc_auc_test_stack:.4f}")

In [None]:
def _individual_results_table(results):
    """Build a one-row-per-model metrics table from a tuned-results dict."""
    model_names = list(results.keys())
    entries = [results[model_name] for model_name in model_names]
    return pd.DataFrame({
        'Model': model_names,
        'CV precision_macro': [entry['cv_mean_score'] for entry in entries],
        'CV Std': [entry['cv_std_score'] for entry in entries],
        'Train Accuracy': [entry['train_accuracy'] for entry in entries],
        'Test Accuracy': [entry['test_accuracy'] for entry in entries],
        'ROC-AUC Test': [entry['roc_auc_test'] for entry in entries],
        'Overfit (Train-Test)': [entry['train_accuracy'] - entry['test_accuracy'] for entry in entries],
    })


def _ensemble_results_table(label, entry, precision_key):
    """Build a single-row metrics table for one ensemble result entry."""
    return pd.DataFrame({
        'Model': label,
        'CV precision_macro': [entry['cv_mean_score']],
        'CV Std': [entry['cv_std_score']],
        'Train Accuracy': [entry['train_accuracy']],
        'Test Accuracy': [entry['test_accuracy']],
        'ROC-AUC Test': [entry['roc_auc_test']],
        'Precision': [entry[precision_key]],
        'Overfit (Train-Test)': [entry['train_accuracy'] - entry['test_accuracy']],
    })


# Per-model tables are ranked by test accuracy, best first.
results_comparison = _individual_results_table(tuned_results).sort_values('Test Accuracy', ascending=False)
results_comparison_optuna = _individual_results_table(tuned_results_optuna).sort_values('Test Accuracy', ascending=False)

results_comparison_voting = _ensemble_results_table(
    'VotingClassifier',
    tuned_results_voting['VotingClassifier'],
    'precision (Test, Tuned)',
)
results_comparison_stacking = _ensemble_results_table(
    'StackingClassifier',
    tuned_results_stacking['StackingClassifier'],
    'precision (Stacking, Tuned)',
)

print("\n" + "="*80)
print("ПІДСУМКОВЕ ПОРІВНЯННЯ МОДЕЛЕЙ")
print("="*80)
for section_title, section_table in (
    ("GRIDSEARCH", results_comparison),
    ("\nOPTUNA", results_comparison_optuna),
    ("\nVOTING CLASSIFIER", results_comparison_voting),
    ("\nSTACKING CLASSIFIER", results_comparison_stacking),
):
    print(section_title)
    print(section_table.to_string(index=False))

# Draw the same 2x2 dashboard for the GridSearch table and then the Optuna table.
for heading, table, output_png in (
    ("\nGRIDSEARCH RESULTS", results_comparison, 'grid_search_results.png'),
    ("\nOPTUNA RESULTS", results_comparison_optuna, 'optuna_results.png'),
):
    print(heading)
    fig, axes = plt.subplots(2, 2, figsize=(16, 12))
    models = table['Model']
    x = np.arange(len(models))
    width = 0.35

    # Top-left: train vs. test accuracy as grouped bars.
    ax1 = axes[0, 0]
    ax1.bar(x - width/2, table['Train Accuracy'], width, label='Train', alpha=0.8)
    ax1.bar(x + width/2, table['Test Accuracy'], width, label='Test', alpha=0.8)
    ax1.set(xlabel='Моделі', ylabel='Accuracy', title='Train vs Test Accuracy')
    ax1.set_xticks(x)
    ax1.set_xticklabels(models, rotation=45, ha='right')
    ax1.legend()
    ax1.grid(True, alpha=0.3)

    # Top-right: test ROC-AUC per model.
    ax2 = axes[0, 1]
    ax2.barh(models, table['ROC-AUC Test'], alpha=0.8, color='coral')
    ax2.set(xlabel='ROC-AUC Score', title='ROC-AUC на тестовій вибірці')
    ax2.grid(True, alpha=0.3)

    # Bottom-left: train-test accuracy gap, red when above the 0.05 line.
    ax3 = axes[1, 0]
    colors = ['red' if gap > 0.05 else 'green' for gap in table['Overfit (Train-Test)']]
    ax3.barh(models, table['Overfit (Train-Test)'], alpha=0.8, color=colors)
    ax3.set(xlabel='Overfit (Train - Test)', title='Оверфітинг моделей (червоний > 0.05)')
    ax3.axvline(x=0.05, color='black', linestyle='--', linewidth=1)
    ax3.grid(True, alpha=0.3)

    # Bottom-right: cross-validated macro precision with std error bars.
    ax4 = axes[1, 1]
    ax4.barh(
        models,
        table['CV precision_macro'],
        xerr=table['CV Std'],
        alpha=0.8,
        color='skyblue',
        capsize=5,
    )
    ax4.set(xlabel='CV precision_macro', title='Cross-Validation precision_macro (± std)')
    ax4.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.savefig(output_png, dpi=300)
    plt.show()

In [None]:
# Stack the Optuna per-model rows with the two ensemble rows, rank by test
# accuracy (best first) and round for display.
final_combined_results = (
    pd.concat(
        [results_comparison_optuna, results_comparison_voting, results_comparison_stacking],
        ignore_index=True,
    )
    .sort_values('Test Accuracy', ascending=False)
    .round(4)
    .reset_index(drop=True)
)

print("\n" + "="*80)
print("ОБ'ЄДНАНІ РЕЗУЛЬТАТИ: Індивідуальні Optuna та Ансамблеві моделі")
print("="*80)
print(final_combined_results.to_string(index=False))

In [None]:
# Imported once here instead of on every fold iteration.
from sklearn.base import clone

plt.figure(figsize=(12, 10))
# Common FPR grid on which every fold's curve is interpolated before averaging.
mean_fpr = np.linspace(0, 1, 100)

print("Побудова ROC-кривих для Top-10 моделей...")
print("="*80)

for name in tuned_results.keys():
    try:
        best_model = tuned_results[name]['model']
        tprs, aucs = [], []

        for fold, (train_index, val_index) in enumerate(CV.split(X_train, y_train), 1):
            X_tr, X_val = X_train.iloc[train_index], X_train.iloc[val_index]
            y_tr, y_val = y_train[train_index], y_train[val_index]

            # Fit an unfitted copy so the stored tuned model is left untouched.
            model_fold = clone(best_model)
            model_fold.fit(X_tr, y_tr)

            if hasattr(model_fold, 'predict_proba'):
                y_scores = model_fold.predict_proba(X_val)
            elif hasattr(model_fold, 'decision_function'):
                y_scores = model_fold.decision_function(X_val)
                if y_scores.ndim == 1:
                    y_scores = np.vstack([1 - y_scores, y_scores]).T
            else:
                continue

            if is_multiclass:
                # Skip folds whose score matrix does not have one column per class.
                if y_scores.shape[1] != n_classes:
                    continue
                # The curve is computed over the flattened one-hot labels and scores.
                y_val_bin = label_binarize(y_val, classes=np.arange(n_classes))
                fpr, tpr, _ = roc_curve(y_val_bin.ravel(), y_scores.ravel())
            else:
                if y_scores.ndim == 2:
                    y_scores = y_scores[:, 1]
                fpr, tpr, _ = roc_curve(y_val, y_scores)

            aucs.append(auc(fpr, tpr))
            interp_tpr = np.interp(mean_fpr, fpr, tpr)
            # Pin the curve to the origin; interpolating at FPR=0 can otherwise
            # pick up a non-zero TPR when several points share FPR=0.
            interp_tpr[0] = 0.0
            tprs.append(interp_tpr)

        if not tprs:
            print(f"  {name}: Неможливо побудувати ROC-криву")
            continue

        mean_tpr = np.mean(tprs, axis=0)
        mean_tpr[-1] = 1.0
        mean_auc = np.mean(aucs)
        std_auc = np.std(aucs)

        plt.plot(mean_fpr, mean_tpr,
                label=f"{name} (AUC = {mean_auc:.3f} ± {std_auc:.3f})",
                linewidth=2)
        print(f"  {name}: AUC = {mean_auc:.3f} ± {std_auc:.3f}")

    except Exception as e:
        print(f"  {name}: Помилка - {e}")
        continue

plt.plot([0, 1], [0, 1], linestyle='--', color='gray',
         label='Random (AUC = 0.5)', linewidth=2)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate', fontsize=12)
plt.ylabel('True Positive Rate', fontsize=12)
plt.title('ROC-криві (Top-10 класифікаторів з оптимальними параметрами)', fontsize=14)
plt.legend(loc='lower right', fontsize=9)
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.savefig('roc_curves.png', dpi=300)
plt.show()

print("="*80)
print("ROC-криві побудовано!")

In [None]:
# results_comparison_optuna is sorted by test accuracy, so row 0 is the best model.
best_model_name = results_comparison_optuna.iloc[0]['Model']
best_model_obj = tuned_results_optuna[best_model_name]['model']

print("="*80)
print(f"ДЕТАЛЬНИЙ АНАЛІЗ НАЙКРАЩОЇ МОДЕЛІ: {best_model_name}")
print("="*80)

y_train_pred = best_model_obj.predict(X_train)
y_test_pred = best_model_obj.predict(X_test)

# classification_report needs string class names; the encoder's classes may be
# numeric, so they are stringified for the report.
class_labels = [str(label) for label in le_y.classes_]

print("\nClassification Report на тренувальній вибірці:")
print("-" * 40)
print(classification_report(y_train, y_train_pred, target_names=class_labels))

print("\nClassification Report на тестовій вибірці:")
print("-"*80)
print(classification_report(y_test, y_test_pred, target_names=class_labels))


fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))

cm_train = confusion_matrix(y_train, y_train_pred)
disp_train = ConfusionMatrixDisplay(confusion_matrix=cm_train, display_labels=le_y.classes_)
disp_train.plot(ax=ax1, cmap='Greens', values_format='d')
ax1.set_title(f'Confusion Matrix (Train) - {best_model_name}')

cm_test = confusion_matrix(y_test, y_test_pred)
disp_test = ConfusionMatrixDisplay(confusion_matrix=cm_test, display_labels=le_y.classes_)
disp_test.plot(ax=ax2, cmap='Blues', values_format='d')
ax2.set_title(f'Confusion Matrix (Test) - {best_model_name}')

plt.tight_layout()
plt.savefig('confusion_matrix.png', dpi=300)
plt.show()

print("\n" + "="*80)
print(f"ВАЖЛИВІСТЬ ОЗНАК ДЛЯ {best_model_name}")
print("="*80)

# Take the importances from the model being analysed. feature_importance_dict
# was filled from the GridSearch-tuned models, which can differ from this
# Optuna-tuned one. Models without built-in importances fall back to
# permutation importance on the test split, as in the GridSearch cell.
if hasattr(best_model_obj, 'feature_importances_'):
    importances = best_model_obj.feature_importances_
else:
    importances = permutation_importance(
        best_model_obj, X_test, y_test,
        n_repeats=10,
        random_state=42,
        n_jobs=-1
    ).importances_mean

feature_imp_df = pd.DataFrame({
    'Feature': feature_names,
    'Importance': importances
}).sort_values('Importance', ascending=False)

print(feature_imp_df.to_string(index=False))

top_n = min(20, len(feature_imp_df))
plt.figure(figsize=(10, 8))
plt.barh(range(top_n), feature_imp_df['Importance'].head(top_n), alpha=0.8)
plt.yticks(range(top_n), feature_imp_df['Feature'].head(top_n))
plt.xlabel('Важливість')
plt.title(f'Top-{top_n} найважливіших ознак - {best_model_name}')
# Largest importance at the top of the chart.
plt.gca().invert_yaxis()
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.savefig('feature_importance.png', dpi=300)
plt.show()

In [None]:
def perform_error_analysis(X, y_true, y_pred, model, le, dataset_name="Test", filename="plot.png"):
    """Report and plot the misclassified rows of one dataset split.

    Args:
        X: Feature DataFrame; rows are selected positionally with ``iloc``.
        y_true: Encoded true labels, positionally aligned with ``X``.
        y_pred: Encoded predicted labels, positionally aligned with ``X``.
        model: Fitted model; if it has ``predict_proba``, the maximum class
            probability of each misclassified row is added as ``Confidence``.
        le: Label encoder used to map encoded labels back via
            ``inverse_transform``.
        dataset_name: Split name used in printed headings and plot titles.
        filename: Path the two-panel error bar chart is saved to.

    Returns:
        None. Prints a summary, displays the first ten error rows and
        shows/saves the chart; only prints a message when there are no errors.
    """
    print("\n" + "="*80)
    print(f"АНАЛІЗ ПОМИЛКОВИХ ПЕРЕДБАЧЕНЬ ({dataset_name.upper()})")
    print("="*80)

    # Work on plain arrays so the indexing below is always positional.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    wrong_indices = np.where(y_true != y_pred)[0]
    num_errors = len(wrong_indices)

    if num_errors == 0:
        print(f"Помилок у наборі {dataset_name} не виявлено!")
        return

    X_errors = X.iloc[wrong_indices].copy()
    errors_df = pd.DataFrame({
        'Index': X_errors.index,
        'True_Class': le.inverse_transform(y_true[wrong_indices]),
        'Predicted_Class': le.inverse_transform(y_pred[wrong_indices])
    })

    if hasattr(model, 'predict_proba'):
        proba = model.predict_proba(X_errors)
        errors_df['Confidence'] = proba.max(axis=1)

    # Give the label frame the same index as the feature rows. concat(axis=1)
    # aligns on index, so a fresh 0..k-1 index would pair labels with the wrong
    # rows and pad the result with NaN.
    errors_df.index = X_errors.index
    full_errors_df = pd.concat([errors_df, X_errors], axis=1)

    print(f"\nКількість помилок: {num_errors} з {len(y_true)} ({num_errors/len(y_true)*100:.2f}%)")
    print(f"\nПЕРШІ 10 РЯДКІВ ІЗ ПОМИЛКОВИМИ ЗНАЧЕННЯМИ ({dataset_name}):")
    print("-" * 80)
    display(full_errors_df.head(10))

    fig, axes = plt.subplots(1, 2, figsize=(14, 5))

    # Left panel: error count per true class.
    ax1 = axes[0]
    true_class_errors = full_errors_df['True_Class'].value_counts()
    ax1.bar(true_class_errors.index, true_class_errors.values, alpha=0.8, color='coral')
    ax1.set_title(f'Які класи модель пропускає ({dataset_name})')
    ax1.grid(True, alpha=0.3)

    # Right panel: error count per predicted class.
    ax2 = axes[1]
    pred_class_errors = full_errors_df['Predicted_Class'].value_counts()
    ax2.bar(pred_class_errors.index, pred_class_errors.values, alpha=0.8, color='skyblue')
    ax2.set_title(f'У які класи модель помилково відносить ({dataset_name})')
    ax2.grid(True, alpha=0.3)

    plt.tight_layout()

    plt.savefig(filename, dpi=300, bbox_inches='tight')
    print(f"Графік збережено як: {filename}")

    plt.show()

# Run the error analysis for the train split and then the test split.
for split_label, X_split, y_split, y_split_pred in (
    ("Train", X_train, y_train, y_train_pred),
    ("Test", X_test, y_test, y_test_pred),
):
    perform_error_analysis(
        X_split,
        y_split,
        y_split_pred,
        best_model_obj,
        le_y,
        dataset_name=split_label,
        filename=f"errors_{split_label.lower()}_analysis.png",
    )

In [None]:
def deep_error_reason_analysis(X, y_true, y_pred, model, dataset_name="Test"):
    """Print and plot diagnostics contrasting misclassified rows with the whole set.

    Two views are produced for the rows where ``y_true != y_pred``:

    1. If ``model`` exposes ``predict_proba``: overlaid histograms of the
       maximum class probability for correct and for wrong predictions,
       saved as ``confidence_distribution_{dataset_name}.png``.
    2. A table and a horizontal bar chart of the relative shift of each
       feature's mean inside the misclassified rows versus the mean over
       all of ``X``, saved as ``feature_anomaly_{dataset_name}.png``.

    Args:
        X: pandas DataFrame of features; rows are matched to the labels by
            position (``.iloc``), not by index label.
        y_true: true labels, one per row of ``X``.
        y_pred: predicted labels, one per row of ``X``.
        model: estimator used for the optional confidence histogram.
        dataset_name: label used in printed headers, titles and file names.

    Returns:
        None. When there are no misclassified rows, a message is printed
        and nothing is plotted or saved.
    """
    print("\n" + "="*80)
    print(f"ГЛИБОКИЙ АНАЛІЗ ПРИЧИН ПОМИЛОК ({dataset_name.upper()})")
    print("="*80)

    # Compare by position: rows are selected with .iloc below, so index
    # labels on pandas inputs must not take part in the comparison.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    wrong_idx = np.where(y_true != y_pred)[0]
    correct_idx = np.where(y_true == y_pred)[0]

    if len(wrong_idx) == 0:
        print(f"Немає помилок для аналізу в наборі {dataset_name}.")
        return

    if hasattr(model, 'predict_proba'):
        all_probs = model.predict_proba(X)
        # Confidence = probability assigned to the predicted (arg-max) class.
        confidences = all_probs.max(axis=1)

        plt.figure(figsize=(10, 5))
        plt.hist(confidences[correct_idx], bins=30, alpha=0.5, label='Правильні передбачення', color='green')
        plt.hist(confidences[wrong_idx], bins=30, alpha=0.5, label='Помилкові передбачення', color='red')
        plt.axvline(confidences[wrong_idx].mean(), color='darkred', linestyle='--', label='Сер. впевненість помилок')
        plt.title(f'Розподіл впевненості моделі ({dataset_name})')
        plt.xlabel('Впевненість (max probability)')
        plt.ylabel('Кількість прикладів')
        plt.legend()
        plt.grid(True, alpha=0.2)
        plt.savefig(f'confidence_distribution_{dataset_name}.png', dpi=300, bbox_inches='tight')
        plt.show()

    # numeric_only keeps the analysis working when X still carries
    # non-numeric columns (pandas 2.x raises on them otherwise).
    mean_errors = X.iloc[wrong_idx].mean(numeric_only=True)
    mean_total = X.mean(numeric_only=True)

    # Relative shift in percent. The denominator is the magnitude of the
    # overall mean so that the sign always follows (mean_errors - mean_total):
    # positive means the feature is higher in the misclassified rows, even
    # for features whose overall mean is negative. The epsilon only guards
    # against division by zero; features whose overall mean is close to zero
    # still yield very large percentages, so read them together with the
    # two mean columns.
    feature_diff = ((mean_errors - mean_total) / (mean_total.abs() + 1e-9)) * 100

    diff_df = pd.DataFrame({
        'Mean_in_Errors': mean_errors,
        'Mean_Overall': mean_total,
        'Difference_%': feature_diff
    }).sort_values(by='Difference_%', key=abs, ascending=False)

    print(f"\nТоп ознак, що найбільше відрізняються в помилкових прогнозах ({dataset_name}):")
    print("Це вказує на те, які фактори найчастіше корелюють із помилкою.")
    print("-" * 80)
    display(diff_df.head(15))

    plt.figure(figsize=(10, 6))
    top_diffs = diff_df.head(10)
    # Red: feature is higher in errors than overall; blue: lower or equal.
    colors = ['red' if x > 0 else 'blue' for x in top_diffs['Difference_%']]
    plt.barh(top_diffs.index, top_diffs['Difference_%'], color=colors, alpha=0.7)
    plt.xlabel('Відхилення від середнього (%)')
    plt.title(f'Аномалії в ознаках помилкових прогнозів ({dataset_name})')
    plt.grid(axis='x', alpha=0.3)
    # Largest absolute deviation on top.
    plt.gca().invert_yaxis()
    # bbox_inches='tight' keeps long feature names on the y axis inside the image.
    plt.savefig(f'feature_anomaly_{dataset_name}.png', dpi=300, bbox_inches='tight')
    plt.show()

# Same diagnostics for both splits, with the same model object, so the
# Train and Test figures can be compared side by side.
deep_error_reason_analysis(
    X_train, y_train, y_train_pred, best_model_obj,
    dataset_name="Train",
)
deep_error_reason_analysis(
    X_test, y_test, y_test_pred, best_model_obj,
    dataset_name="Test",
)

In [None]:
import shap

def _shap_values_per_class(shap_values):
    """Return multi-output SHAP values in the list-of-matrices layout.

    Recent SHAP releases (0.45+) return one ndarray shaped
    (n_samples, n_features, n_outputs) for multi-output models, while older
    releases return a list with one (n_samples, n_features) matrix per
    output. A 3-D array is split into that list; anything else is returned
    unchanged.
    """
    if isinstance(shap_values, np.ndarray) and shap_values.ndim == 3:
        return [shap_values[:, :, k] for k in range(shap_values.shape[2])]
    return shap_values


def perform_shap_analysis(model, X_train, X_test, feature_names):
    """Explain ``model`` with SHAP and save a summary and a waterfall plot.

    ``shap.TreeExplainer`` is tried first on all of ``X_test``. If it raises,
    the function falls back to ``shap.KernelExplainer`` on
    ``model.predict_proba`` with a 100-row background sample of ``X_train``
    and explains only the first 100 rows of ``X_test``.

    Args:
        model: model to explain; needs ``predict_proba`` for the fallback.
        X_train: pandas DataFrame used as background data in the fallback.
        X_test: pandas DataFrame whose rows are explained.
        feature_names: feature labels passed to both SHAP plots.

    Returns:
        None. Figures are written to ``shap_summary_plot.png`` and
        ``shap_waterfall_plot.png`` and shown inline.
    """
    print("\n" + "="*80)
    print("SHAP ANALYSIS (INTERPRETABILITY)")
    print("="*80)

    # Rows that actually get explained; the summary plot must receive exactly
    # these rows or SHAP rejects the mismatching shapes.
    X_explained = X_test
    try:
        explainer = shap.TreeExplainer(model)
        shap_values = explainer.shap_values(X_explained)
    except Exception as exc:
        # Not a bare except: Ctrl-C / kernel interrupt must still stop the cell.
        print(f"[INFO] TreeExplainer не підтримує цю модель ({type(exc).__name__}); використовується KernelExplainer.")
        X_explained = X_test.iloc[:100]
        explainer = shap.KernelExplainer(model.predict_proba, shap.sample(X_train, 100))
        shap_values = explainer.shap_values(X_explained)

    # One code path for both SHAP return layouts (list vs. 3-D array).
    shap_values = _shap_values_per_class(shap_values)

    print("\n[INFO] Побудова Summary Plot (Загальний вплив ознак)...")
    plt.figure(figsize=(10, 8))
    shap.summary_plot(shap_values, X_explained, feature_names=feature_names, show=False)
    plt.title("SHAP Summary Plot - Загальний внесок ознак")
    plt.tight_layout()
    plt.savefig('shap_summary_plot.png', dpi=300)
    plt.show()

    sample_idx = 0
    print(f"\n[INFO] Побудова Waterfall Plot для конкретного прикладу (Index: {X_test.index[sample_idx]})...")
    class_idx = 0

    plt.figure(figsize=(12, 4))
    # The waterfall plot needs a scalar base value: pick the entry for the
    # chosen class when there is one per class, otherwise the single value.
    base_values = np.ravel(explainer.expected_value)
    expected_value = base_values[class_idx] if base_values.size > 1 else base_values[0]
    if isinstance(shap_values, list):
        current_shap_values = shap_values[class_idx][sample_idx]
    else:
        current_shap_values = shap_values[sample_idx]

    shap.plots._waterfall.waterfall_legacy(expected_value, current_shap_values, feature_names=feature_names, show=False)
    plt.title(f"Логіка прогнозу для класу {class_idx} (Рядок {sample_idx})")
    plt.tight_layout()
    plt.savefig('shap_waterfall_plot.png', dpi=300)
    plt.show()

# Explain the selected model; arguments are passed by keyword so the role of
# each object is visible at the call site.
perform_shap_analysis(
    model=best_model_obj,
    X_train=X_train,
    X_test=X_test,
    feature_names=feature_names,
)