In [1]:
import os
import pandas as pd
import optuna
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score, roc_curve, precision_recall_curve
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
import matplotlib.pyplot as plt
from catboost import CatBoostClassifier
import pickle
from sklearn.inspection import permutation_importance
import joblib
from joblib import Parallel, delayed
from tqdm import tqdm

In [2]:
marketing_df = pd.read_csv('../3. data_after_txt_proc/bank_marketing_data_after_txt_proc.csv')
personal_df = pd.read_csv('../3. data_after_txt_proc/Bank_Personal_Loan_Modelling_after_txt_proc.csv')

In [3]:
class ModelPipeline:
    def __init__(self, data, target_column, test_size=0.2, random_state=42, save_dir='results'):
        """
        Инициализация класса.
        """
        self.data = data
        self.target_column = target_column
        self.test_size = test_size
        self.random_state = random_state
        self.save_dir = save_dir

        self.X = self.data.drop(columns=[self.target_column])
        self.y = self.data[self.target_column]
        self.feature_names = self.X.columns.tolist()

        # Разделение на обучающие и тестовые данные
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            self.X, self.y, test_size=self.test_size, random_state=self.random_state
        )

        # Стандартизация данных
        self.scaler = StandardScaler()
        self.X_train_scaled = self.scaler.fit_transform(self.X_train)
        self.X_test_scaled = self.scaler.transform(self.X_test)

        # Словарь для хранения моделей
        self.models = {
            "Logistic Regression": LogisticRegression(max_iter=1000, random_state=self.random_state),
            "SVM": SVC(kernel='rbf', C=1, gamma='scale', random_state=self.random_state),
            "Random Forest": RandomForestClassifier(n_estimators=1000, random_state=self.random_state),
            "Boosting": None  # Будет заполняться через подбор гиперпараметров с Optuna
        }
        '''self.models = {
            "Boosting": None  # Будет заполняться через подбор гиперпараметров с Optuna
        }'''

    def save_metrics_and_plots(self, metrics, model_name, importances):
        """
        Сохраняет метрики и графики важности признаков в отдельную директорию.
        """
        # Создание директории для сохранения результатов, если она не существует
        dataset_dir = os.path.join(self.save_dir, f"{self.target_column}")
        if not os.path.exists(dataset_dir):
            os.makedirs(dataset_dir)
    
        # Создание директории для модели
        model_dir = os.path.join(dataset_dir, model_name)
        if not os.path.exists(model_dir):
            os.makedirs(model_dir)
    
        # Сохранение метрик в текстовый файл
        metrics_file = os.path.join(model_dir, f"{model_name}_metrics.txt")
        with open(metrics_file, "w") as f:
            for metric, value in metrics.items():
                f.write(f"{metric}: {value:.4f}\n")
    
        # Сохранение графика и списка важности признаков, если она есть
        if importances is not None:
            feature_importance_file = os.path.join(model_dir, f"{model_name}_feature_importance.txt")
            with open(feature_importance_file, "w") as f:
                sorted_indices = importances.argsort()[::-1]
                for idx in sorted_indices:
                    f.write(f"{self.feature_names[idx]}: {importances[idx]:.4f}\n")
    
            # Сохранение графика важности признаков
            plt.figure(figsize=(10, 6))
            sorted_importances = importances[sorted_indices]
            sorted_features = [self.feature_names[i] for i in sorted_indices]
            plt.barh(sorted_features, sorted_importances, color='skyblue')
            plt.title(f"Feature Importance ({model_name})")
            plt.xlabel("Importance")
            plt.ylabel("Features")
            plt.tight_layout()
    
            plot_file = os.path.join(model_dir, f"{model_name}_feature_importance.png")
            plt.savefig(plot_file)
            plt.close()


    def calculate_metrics(self, y_pred, y_pred_proba):
        """
        Рассчитывает метрики модели.
        """
        metrics = {
            'accuracy': accuracy_score(self.y_test, y_pred),
            'f1': f1_score(self.y_test, y_pred),
            'precision': precision_score(self.y_test, y_pred),
            'recall': recall_score(self.y_test, y_pred),
        }
        if y_pred_proba is not None:  # Проверяем доступность вероятностей
            metrics['roc_auc'] = roc_auc_score(self.y_test, y_pred_proba)
        return metrics

    def train_and_evaluate_model(self, model_name, model):
        """
        Обучает модель и вычисляет её метрики.
        """
        print(f"Training {model_name}...")
        model.fit(self.X_train_scaled, self.y_train)
        print(f"Model {model_name} trained. Evaluating...")
    
        y_pred = model.predict(self.X_test_scaled)
        y_pred_proba = None
        if hasattr(model, "predict_proba"):
            print(f"Using predict_proba for {model_name}...")
            y_pred_proba = model.predict_proba(self.X_test_scaled)[:, 1]
        elif hasattr(model, "decision_function"):
            print(f"Using decision_function for {model_name}...")
            y_pred_proba = model.decision_function(self.X_test_scaled)
    
        model_metrics = self.calculate_metrics(y_pred, y_pred_proba)
        print(f"Metrics calculated for {model_name}. Saving results...")
    
        importances = None
        if hasattr(model, 'feature_importances_'):
            print(f"Extracting feature importance for {model_name}...")
            importances = model.feature_importances_
        elif hasattr(model, 'get_feature_importance'):
            importances = model.get_feature_importance()
        else:
            print(f"Calculating permutation importance for {model_name}...")
            perm_importance = permutation_importance(
                model, self.X_test_scaled, self.y_test, n_repeats=10, random_state=self.random_state
            )
            importances = perm_importance.importances_mean
    
        self.save_metrics_and_plots(model_metrics, model_name, importances)
        print(f"Results saved for {model_name}.")
        return model_metrics



    def objective_logreg(self, trial):
        """
        Определяет гиперпараметры для Logistic Regression через Optuna.
        """
        C = trial.suggest_float('C', 1e-5, 1e2, log=True)
        model = LogisticRegression(max_iter=2000, C=C, random_state=self.random_state)

        score = cross_val_score(model, self.X_train_scaled, self.y_train, cv=5, scoring='f1').mean()
        return score

    def objective_svm(self, trial):
        """
        Определяет гиперпараметры для SVM через Optuna.
        """
        C = trial.suggest_float('C', 1e-5, 1e2, log=True)
        kernel = trial.suggest_categorical('kernel', ['linear', 'poly', 'rbf', 'sigmoid'])
        gamma = None
        degree = None
        if kernel in ['rbf', 'poly', 'sigmoid']:
            gamma = trial.suggest_float('gamma', 1e-5, 1e1, log=True)
        if kernel == 'poly':
            degree = trial.suggest_int('degree', 2, 5)

        model_m = Pipeline([
            ('scaler', StandardScaler()),
            ('clf', SVC(
                C=C,
                kernel=kernel,
                gamma=gamma if kernel in ['rbf', 'poly', 'sigmoid'] else 'scale',
                degree=degree if kernel == 'poly' else 3,
                random_state=42,
            ))
        ])

        score = cross_val_score(model_m, self.X_train_scaled, self.y_train, cv=5, scoring='f1').mean()
        return score

    def objective_rf(self, trial):
        """
        Определяет гиперпараметры для Random Forest через Optuna.
        """
        n_estimators = trial.suggest_int('n_estimators', 50, 1500, 25)
        max_depth = trial.suggest_int('max_depth', 2, 20, 2)
        min_samples_split = trial.suggest_int('min_samples_split', 2, 10)
        min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 10)
        max_features = trial.suggest_categorical('max_features', ['sqrt', 'log2', None])

        model = RandomForestClassifier(
            n_estimators=n_estimators,
            max_depth=max_depth,
            min_samples_split=min_samples_split,
            min_samples_leaf=min_samples_leaf,
            max_features=max_features,
            random_state=42
        )
        score = cross_val_score(model, self.X_train_scaled, self.y_train, cv=5, scoring='f1').mean()
        return score

    def objective_boosting(self, trial):
        """
        Определяет гиперпараметры для Boosting через Optuna.
        """
        learning_rate = trial.suggest_float('learning_rate', 1e-3, 1e-1, log=True)
        depth = trial.suggest_int('depth', 4, 12)
        iterations = trial.suggest_int('iterations', 100, 2000, step=25)

        model = CatBoostClassifier(
            task_type='GPU',  # Для использования GPU
            iterations=iterations,
            depth=depth,
            learning_rate=learning_rate,
            loss_function='Logloss',
            random_seed=self.random_state,
            logging_level='Silent'
        )

        score = cross_val_score(model, self.X_train_scaled, self.y_train, cv=5, scoring='f1').mean()
        return score
    
    def initialize_directories(self, model_name):
        """
        Создаёт структуру директорий для сохранения результатов.
        """
        # Создание базовой директории
        dataset_dir = os.path.join(self.save_dir, f"{self.target_column}")
        if not os.path.exists(dataset_dir):
            os.makedirs(dataset_dir)
    
        # Директория для текущей модели
        model_dir = os.path.join(dataset_dir, model_name)
        if not os.path.exists(model_dir):
            os.makedirs(model_dir)
    
        return model_dir
    def save_model(self, model, model_name):
        """
        Сохраняет обученную модель в соответствующую директорию.
        """
        model_dir = self.initialize_directories(model_name)
        model_path = os.path.join(model_dir, f"{model_name}_model.pkl")
    
        with open(model_path, 'wb') as f:
            pickle.dump(model, f)
        print(f"Model {model_name} saved to {model_path}.")
    def load_model(self, model_name):
        """
        Загружает модель из соответствующей директории.
        """
        model_dir = self.initialize_directories(model_name)
        model_path = os.path.join(model_dir, f"{model_name}_model.pkl")
    
        if os.path.exists(model_path):
            with open(model_path, 'rb') as f:
                model = pickle.load(f)
            print(f"Model {model_name} loaded from {model_path}.")
            return model
        else:
            print(f"No saved model found for {model_name} in {model_dir}.")
            return None

    def optimize_model(self, model_name):
        """
        Запускает Optuna для поиска лучших гиперпараметров.
        """
        print(f"Starting optimization for {model_name}...")
        def objective(trial):
            if model_name == "Logistic Regression":
                return self.objective_logreg(trial)
            elif model_name == "SVM":
                return self.objective_svm(trial)
            elif model_name == "Random Forest":
                return self.objective_rf(trial)
            elif model_name == "Boosting":
                return self.objective_boosting(trial)
    
        # Убираем joblib.parallel_backend для предотвращения проблем
        study = optuna.create_study(direction='maximize')
        study.optimize(objective, n_trials=50)  # Без параллелизации
    
        best_params = study.best_params
        print(f"Optimization completed for {model_name}. Best parameters: {best_params}")
    
        # Создание модели с лучшими параметрами
        if model_name == "Logistic Regression":
            return LogisticRegression(max_iter=1000, C=best_params['C'], random_state=self.random_state)
        elif model_name == "SVM":
            return SVC(
                C=best_params['C'],
                kernel=best_params['kernel'],
                gamma=best_params.get('gamma', 'scale'),
                random_state=self.random_state
            )
        elif model_name == "Random Forest":
            return RandomForestClassifier(
                n_estimators=best_params['n_estimators'],
                max_depth=best_params['max_depth'],
                min_samples_split=best_params['min_samples_split'],
                min_samples_leaf=best_params['min_samples_leaf'],
                random_state=self.random_state
            )
        elif model_name == "Boosting":
            return CatBoostClassifier(
                task_type='GPU',
                iterations=best_params['iterations'],
                depth=best_params['depth'],
                learning_rate=best_params['learning_rate'],
                loss_function='Logloss',
                random_seed=self.random_state,
                logging_level='Silent'
            )





    def run_pipeline(self):
        """
        Полный запуск конвейера: оптимизация гиперпараметров, обучение и оценка моделей.
        """
        def process_model(model_name):
            print(f"\nProcessing {model_name}...")
            optimized_model = self.optimize_model(model_name)
            print(f"Optimization completed for {model_name}. Training...")
            metrics = self.train_and_evaluate_model(model_name, optimized_model)
            self.save_model(optimized_model, model_name)
            print(f"Finished processing {model_name}. Metrics: {metrics}")

        print("Starting the pipeline...")
        for model_name in tqdm(self.models.keys(), desc="Models"):
            process_model(model_name)
        print("Pipeline completed.")


    def save_metric_plots(self, model_name, y_true, y_pred_proba):
        """
        Сохраняет графики, такие как ROC-кривая и Precision-Recall.
        """
        model_dir = self.initialize_directories(model_name)
    
        # ROC-кривая
        fpr, tpr, _ = roc_curve(y_true, y_pred_proba)
        plt.figure(figsize=(8, 6))
        plt.plot(fpr, tpr, label="ROC Curve")
        plt.title(f"ROC Curve for {model_name}")
        plt.xlabel("False Positive Rate")
        plt.ylabel("True Positive Rate")
        plt.legend()
        plt.tight_layout()
        roc_path = os.path.join(model_dir, "roc_curve.png")
        plt.savefig(roc_path)
        plt.close()
    
        # Precision-Recall кривая
        precision, recall, _ = precision_recall_curve(y_true, y_pred_proba)
        plt.figure(figsize=(8, 6))
        plt.plot(recall, precision, label="Precision-Recall Curve")
        plt.title(f"Precision-Recall Curve for {model_name}")
        plt.xlabel("Recall")
        plt.ylabel("Precision")
        plt.legend()
        plt.tight_layout()
        pr_path = os.path.join(model_dir, "precision_recall_curve.png")
        plt.savefig(pr_path)
        plt.close()
    
        print(f"Metric plots saved for {model_name} in {model_dir}.")


In [5]:
%%time
marketing_pipeline = ModelPipeline(data=marketing_df, target_column='deposit', test_size=0.2, random_state=42, save_dir='../pipelines/marketing_df')
marketing_pipeline.run_pipeline()

Starting the pipeline...


Models:   0%|          | 0/4 [00:00<?, ?it/s][I 2024-12-06 23:09:00,771] A new study created in memory with name: no-name-fe687835-fcde-4889-953b-8ca6794606c3
[W 2024-12-06 23:09:00,879] Trial 0 failed with parameters: {'C': 4.736980677163974} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "C:\Users\mitay\AppData\Local\anaconda3\envs\course_work\lib\genericpath.py", line 19, in exists
    os.stat(path)
FileNotFoundError: [WinError 3] Системе не удается найти указанный путь: '/dev/shm'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "C:\Users\mitay\AppData\Local\anaconda3\envs\course_work\lib\site-packages\optuna\study\_optimize.py", line 197, in _run_trial
    value_or_values = func(trial)
  File "C:\Users\mitay\AppData\Local\Temp\ipykernel_15012\603090270.py", line 261, in objective
    return self.objective_logreg(trial)
  File "C:\Users\mitay\AppData\Local\Temp\ipykernel_1


Processing Logistic Regression...
Starting optimization for Logistic Regression...





KeyboardInterrupt: 

In [4]:
%%time
personal_pipeline = ModelPipeline(data=personal_df, target_column='Personal Loan', test_size=0.2, random_state=42, save_dir='../pipelines/personal_df')
personal_pipeline.run_pipeline()

Starting the pipeline...


Models:   0%|          | 0/4 [00:00<?, ?it/s][I 2024-12-06 21:06:04,939] A new study created in memory with name: no-name-8ceb33e4-e98d-4e55-95ae-e45f0d41be44



Processing Logistic Regression...
Starting optimization for Logistic Regression...


[I 2024-12-06 21:06:09,448] Trial 0 finished with value: 0.0 and parameters: {'C': 1.1119181665445307e-05}. Best is trial 0 with value: 0.0.
[I 2024-12-06 21:06:13,645] Trial 1 finished with value: 0.6930760222601825 and parameters: {'C': 3.1056973831422336}. Best is trial 1 with value: 0.6930760222601825.
[I 2024-12-06 21:06:17,901] Trial 2 finished with value: 0.6909844868271116 and parameters: {'C': 1.1317839218616637}. Best is trial 1 with value: 0.6930760222601825.
[I 2024-12-06 21:06:22,094] Trial 3 finished with value: 0.6929507139841785 and parameters: {'C': 20.935887697602965}. Best is trial 1 with value: 0.6930760222601825.
[I 2024-12-06 21:06:26,264] Trial 4 finished with value: 0.0 and parameters: {'C': 0.00037568290870803705}. Best is trial 1 with value: 0.6930760222601825.
[I 2024-12-06 21:06:30,413] Trial 5 finished with value: 0.6929507139841785 and parameters: {'C': 18.254798852375977}. Best is trial 1 with value: 0.6930760222601825.
[I 2024-12-06 21:06:34,564] Trial 6

Optimization completed for Logistic Regression. Best parameters: {'C': 0.37879989038981304}
Optimization completed for Logistic Regression. Training...
Training Logistic Regression...
Model Logistic Regression trained. Evaluating...
Using predict_proba for Logistic Regression...
Metrics calculated for Logistic Regression. Saving results...
Calculating permutation importance for Logistic Regression...


Models:  25%|██▌       | 1/4 [03:32<10:37, 212.60s/it][I 2024-12-06 21:09:37,539] A new study created in memory with name: no-name-045f40c8-61c0-4ffc-9ebf-10ec7a1f8e54


Results saved for Logistic Regression.
Model Logistic Regression saved to ../pipelines/personal_df\Personal Loan\Logistic Regression\Logistic Regression_model.pkl.
Finished processing Logistic Regression. Metrics: {'accuracy': 0.955, 'f1': 0.7593582887700534, 'precision': 0.8658536585365854, 'recall': 0.6761904761904762, 'roc_auc': 0.9679382814578346}

Processing SVM...
Starting optimization for SVM...


[I 2024-12-06 21:09:42,060] Trial 0 finished with value: 0.6531474714824445 and parameters: {'C': 6.94691774394952e-05, 'kernel': 'poly', 'gamma': 6.394637601163642, 'degree': 2}. Best is trial 0 with value: 0.6531474714824445.
[I 2024-12-06 21:09:46,858] Trial 1 finished with value: 0.0 and parameters: {'C': 1.25283245298357e-05, 'kernel': 'sigmoid', 'gamma': 0.0007589851772779918}. Best is trial 0 with value: 0.6531474714824445.
[I 2024-12-06 21:09:51,700] Trial 2 finished with value: 0.0 and parameters: {'C': 0.0004480199248253714, 'kernel': 'rbf', 'gamma': 9.829445779912104e-05}. Best is trial 0 with value: 0.6531474714824445.
[I 2024-12-06 21:09:56,382] Trial 3 finished with value: 0.6881323173405285 and parameters: {'C': 4.559700835344471, 'kernel': 'linear'}. Best is trial 3 with value: 0.6881323173405285.
[I 2024-12-06 21:10:00,592] Trial 4 finished with value: 0.0 and parameters: {'C': 0.0009103402507249163, 'kernel': 'poly', 'gamma': 0.017075552083526426, 'degree': 2}. Best i

Optimization completed for SVM. Best parameters: {'C': 14.255349278148298, 'kernel': 'rbf', 'gamma': 0.03248729569247875}
Optimization completed for SVM. Training...
Training SVM...
Model SVM trained. Evaluating...
Using decision_function for SVM...
Metrics calculated for SVM. Saving results...
Calculating permutation importance for SVM...


Models:  50%|█████     | 2/4 [07:11<07:12, 216.33s/it][I 2024-12-06 21:13:16,474] A new study created in memory with name: no-name-2fcb6c86-9578-469e-b413-993fdf5c25c2


Results saved for SVM.
Model SVM saved to ../pipelines/personal_df\Personal Loan\SVM\SVM_model.pkl.
Finished processing SVM. Metrics: {'accuracy': 0.987, 'f1': 0.9353233830845771, 'precision': 0.9791666666666666, 'recall': 0.8952380952380953, 'roc_auc': 0.9949773876030858}

Processing Random Forest...
Starting optimization for Random Forest...


  n_estimators = trial.suggest_int('n_estimators', 50, 1500, 25)
  max_depth = trial.suggest_int('max_depth', 2, 20, 2)
[I 2024-12-06 21:13:22,947] Trial 0 finished with value: 0.902421803548564 and parameters: {'n_estimators': 625, 'max_depth': 14, 'min_samples_split': 8, 'min_samples_leaf': 7, 'max_features': None}. Best is trial 0 with value: 0.902421803548564.
  n_estimators = trial.suggest_int('n_estimators', 50, 1500, 25)
  max_depth = trial.suggest_int('max_depth', 2, 20, 2)
[I 2024-12-06 21:13:30,479] Trial 1 finished with value: 0.9076982810329655 and parameters: {'n_estimators': 1350, 'max_depth': 8, 'min_samples_split': 4, 'min_samples_leaf': 2, 'max_features': 'log2'}. Best is trial 1 with value: 0.9076982810329655.
  n_estimators = trial.suggest_int('n_estimators', 50, 1500, 25)
  max_depth = trial.suggest_int('max_depth', 2, 20, 2)
[I 2024-12-06 21:13:37,279] Trial 2 finished with value: 0.9011249443793645 and parameters: {'n_estimators': 650, 'max_depth': 20, 'min_sample

Optimization completed for Random Forest. Best parameters: {'n_estimators': 525, 'max_depth': 12, 'min_samples_split': 6, 'min_samples_leaf': 1, 'max_features': 'sqrt'}
Optimization completed for Random Forest. Training...
Training Random Forest...
Model Random Forest trained. Evaluating...
Using predict_proba for Random Forest...
Metrics calculated for Random Forest. Saving results...
Extracting feature importance for Random Forest...


Models:  75%|███████▌  | 3/4 [15:26<05:43, 343.36s/it][I 2024-12-06 21:21:31,015] A new study created in memory with name: no-name-7c09f732-f5fb-49bb-8a41-1bc4d3e447a3


Results saved for Random Forest.
Model Random Forest saved to ../pipelines/personal_df\Personal Loan\Random Forest\Random Forest_model.pkl.
Finished processing Random Forest. Metrics: {'accuracy': 0.989, 'f1': 0.9458128078817734, 'precision': 0.9795918367346939, 'recall': 0.9142857142857143, 'roc_auc': 0.9987017823889333}

Processing Boosting...
Starting optimization for Boosting...
Optimization running without parallelization for Boosting...


[I 2024-12-06 21:22:45,472] Trial 0 finished with value: 0.8887882166885145 and parameters: {'learning_rate': 0.0010147374618939041, 'depth': 8, 'iterations': 275}. Best is trial 0 with value: 0.8887882166885145.
[I 2024-12-06 21:37:03,139] Trial 1 finished with value: 0.9171221788658471 and parameters: {'learning_rate': 0.01362383936923281, 'depth': 9, 'iterations': 1850}. Best is trial 1 with value: 0.9171221788658471.
[I 2024-12-06 21:38:04,839] Trial 2 finished with value: 0.9193710780903501 and parameters: {'learning_rate': 0.008350769026173054, 'depth': 6, 'iterations': 525}. Best is trial 2 with value: 0.9193710780903501.
[I 2024-12-06 21:38:13,681] Trial 3 finished with value: 0.9037719231420661 and parameters: {'learning_rate': 0.00586675150046469, 'depth': 5, 'iterations': 350}. Best is trial 2 with value: 0.9193710780903501.
[I 2024-12-06 21:41:18,249] Trial 4 finished with value: 0.9047823729301495 and parameters: {'learning_rate': 0.002430438068786175, 'depth': 4, 'iterati