# <center><font color = '#DF9166' size = 20 center> **Modeling**</font></center>



## <font color = '#DF9166' size=6>**Table of content**<font/><a class = 'anchor' id = 'introduction'/>

1. [**Import Libraries**](#import)
2. [**Modeling**](#modeling)


## <font color = '#DF9166' size=6>**Import Libraries**<font/><a class = 'anchor' id = 'import'/>


In [None]:
import os
import sys
import pandas as pd

In [None]:
import warnings

warnings.filterwarnings("ignore")

In [None]:
pd.set_option("display.max_colwidth", None)
pd.set_option("display.max_columns", None)

In [None]:
# # sys.path.append(os.path.abspath(os.path.pardir))
# from scripts.eda.transaction_analysis import TransactionAnalysis

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## <font color = '#DF9166' size=6>**Modeling**<font/><a class = 'anchor' id = 'modeling'/>

In [8]:
import numpy as np
import pandas as pd
import pickle
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

class ModelTrainer:
    def __init__(self, x_path, y_path, test_size=0.2, random_state=42):
        """
        Initializes the ModelTrainer class by loading the data and splitting it into training and test sets.
        """
        self.x = np.load(x_path)
        self.y = np.load(y_path)
        self.test_size = test_size
        self.random_state = random_state
        self.models = {
            "Logistic Regression": LogisticRegression(),
            "Decision Tree": DecisionTreeClassifier(),
            "Random Forest": RandomForestClassifier(),
            "Gradient Boosting": GradientBoostingClassifier()
        }
        self.x_train, self.x_test, self.y_train, self.y_test = train_test_split(
            self.x, self.y, test_size=self.test_size, random_state=self.random_state
        )
        self.trained_models = {}
        self.best_model = None

    def train_models(self):
        """Trains each model on the training dataset."""
        for name, model in self.models.items():
            model.fit(self.x_train, self.y_train)
            self.trained_models[name] = model
            print(f"Trained {name} model successfully.")

    def hyperparameter_tuning(self, model_name, param_grid, search_type='grid', n_iter=10):
        """
        Performs hyperparameter tuning using Grid Search or Random Search.
        """
        if model_name not in self.models:
            raise ValueError("Model not found. Choose from: " + ", ".join(self.models.keys()))

        model = self.models[model_name]

        if search_type == 'grid':
            search = GridSearchCV(model, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
        elif search_type == 'random':
            search = RandomizedSearchCV(model, param_grid, n_iter=n_iter, cv=5, scoring='accuracy', n_jobs=-1, random_state=self.random_state)
        else:
            raise ValueError("Invalid search type. Use 'grid' or 'random'.")

        search.fit(self.x_train, self.y_train)
        self.trained_models[model_name] = search.best_estimator_
        self.best_model = search.best_estimator_
        print(f"Best parameters for {model_name}: {search.best_params_}")

    def train_best_model(self):
        """Trains the best model obtained from hyperparameter tuning on the full training dataset."""
        if self.best_model is None:
            print("No best model found. Perform hyperparameter tuning first.")
            return

        self.best_model.fit(self.x_train, self.y_train)
        print("Best model trained successfully.")

    def evaluate_models(self):
        """Evaluates all trained models using the test dataset and prints performance metrics."""
        results = []
        for name, model in self.trained_models.items():
            y_pred = model.predict(self.x_test)
            y_prob = model.predict_proba(self.x_test)[:, 1] if hasattr(model, "predict_proba") else None

            accuracy = accuracy_score(self.y_test, y_pred)
            precision = precision_score(self.y_test, y_pred, average='binary', zero_division=1)
            recall = recall_score(self.y_test, y_pred, average='binary', zero_division=1)
            f1 = f1_score(self.y_test, y_pred, average='binary')
            roc_auc = roc_auc_score(self.y_test, y_prob) if y_prob is not None else 'N/A'

            results.append({
                "Model": name,
                "Accuracy": accuracy,
                "Precision": precision,
                "Recall": recall,
                "F1 Score": f1,
                "ROC-AUC": roc_auc
            })

            print(f"\n{name} Performance:")
            print(f"Accuracy: {accuracy:.4f}")
            print(f"Precision: {precision:.4f}")
            print(f"Recall: {recall:.4f}")
            print(f"F1 Score: {f1:.4f}")
            print(f"ROC-AUC: {roc_auc}")

        return pd.DataFrame(results)

    def save_best_model(self, filename="best_model.pkl"):
        """Serializes the best model and saves it to a file."""
        if self.best_model is None:
            print("No best model found. Perform hyperparameter tuning first.")
            return

        with open(filename, "wb") as file:
            pickle.dump(self.best_model, file)
        print(f"Best model saved as {filename}.")

# Example usage:
# trainer = ModelTrainer("x.npy", "y.npy")
# trainer.train_models()
# param_grid = {'n_estimators': [50, 100, 200], 'max_depth': [3, 5, 10]}
# trainer.hyperparameter_tuning("Random Forest", param_grid, search_type='grid')
# trainer.train_best_model()
# trainer.evaluate_models()


In [9]:
# Initialize  class
trainer = ModelTrainer("/content/drive/MyDrive/10 acadamy/W6 Challenge/data/processed/X_features.npy", "/content/drive/MyDrive/10 acadamy/W6 Challenge/data/processed/y_labels.npy")

In [10]:
param_grid = {'n_estimators': [50, 100, 200], 'max_depth': [3, 5, 10]}
trainer.hyperparameter_tuning("Random Forest", param_grid, search_type='grid')

Best parameters for Random Forest: {'max_depth': 10, 'n_estimators': 200}


In [12]:
trainer.train_best_model()

Best model trained successfully.


In [13]:
trainer.evaluate_models()


Random Forest Performance:
Accuracy: 0.9981
Precision: 1.0000
Recall: 0.0000
F1 Score: 0.0000
ROC-AUC: 0.9971148755185515


Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score,ROC-AUC
0,Random Forest,0.998118,1.0,0.0,0.0,0.997115


In [14]:
trainer.save_best_model('/content/drive/MyDrive/10 acadamy/W6 Challenge/checkpoints/best_model.pkl')

Best model saved as /content/drive/MyDrive/10 acadamy/W6 Challenge/checkpoints/best_model.pkl.
