<a href="https://colab.research.google.com/github/HugoPfeffer/rfb-poc/blob/main/model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install pandas
!pip install numpy
!pip install scikit-learn



In [2]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import classification_report, precision_recall_fscore_support
import warnings
warnings.filterwarnings('ignore')

In [3]:
class FraudDetectionModel:
    def __init__(self):
        self.model = None
        self.best_params = None

    def load_data(self):
        """Load training and test datasets"""
        try:
            train_data = pd.read_csv('datasets/train_data.csv')
            test_data = pd.read_csv('datasets/test_data.csv')

            # Assuming the target variable is named 'fraud' or 'target'
            # Modify these according to your actual column names
            X_train = train_data.drop('target', axis=1)
            y_train = train_data['target']
            X_test = test_data.drop('target', axis=1)
            y_test = test_data['target']

            return X_train, X_test, y_train, y_test
        except Exception as e:
            print(f"Error loading data: {str(e)}")
            return None

    def tune_hyperparameters(self, X_train, y_train):
        """Perform hyperparameter tuning using RandomizedSearchCV"""
        # Define the parameter grid
        param_grid = {
            'n_estimators': [100, 200, 300, 400, 500],
            'max_depth': [10, 20, 30, 40, 50, None],
            'min_samples_split': [2, 5, 10],
            'min_samples_leaf': [1, 2, 4],
            'max_features': ['auto', 'sqrt'],
            'class_weight': ['balanced', 'balanced_subsample', None]
        }

        # Initialize Random Forest
        rf = RandomForestClassifier(random_state=42)

        # Perform RandomizedSearchCV
        random_search = RandomizedSearchCV(
            estimator=rf,
            param_distributions=param_grid,
            n_iter=20,
            cv=3,
            scoring='f1',
            n_jobs=-1,
            random_state=42
        )

        # Fit the random search
        random_search.fit(X_train, y_train)
        self.best_params = random_search.best_params_
        return random_search.best_estimator_

    def train_model(self, X_train, y_train):
        """Train the Random Forest model with best parameters"""
        print("Training model with best parameters:", self.best_params)
        self.model = RandomForestClassifier(**self.best_params, random_state=42)
        self.model.fit(X_train, y_train)

    def evaluate_model(self, X_test, y_test):
        """Evaluate the model performance"""
        if self.model is None:
            print("Model not trained yet!")
            return

        # Make predictions
        y_pred = self.model.predict(X_test)

        # Calculate metrics
        precision, recall, f1, _ = precision_recall_fscore_support(y_test, y_pred, average='binary')

        # Print detailed classification report
        print("\nClassification Report:")
        print(classification_report(y_test, y_pred))

        # Print summary metrics
        print("\nSummary Metrics:")
        print(f"Precision: {precision:.4f}")
        print(f"Recall: {recall:.4f}")
        print(f"F1-Score: {f1:.4f}")

        return precision, recall, f1


In [4]:
def main():
    # Initialize model
    fraud_model = FraudDetectionModel()

    # Load data
    print("Loading data...")
    data = fraud_model.load_data()
    if data is None:
        return
    X_train, X_test, y_train, y_test = data

    # Tune hyperparameters
    print("\nTuning hyperparameters...")
    best_model = fraud_model.tune_hyperparameters(X_train, y_train)

    # Train model
    print("\nTraining model...")
    fraud_model.train_model(X_train, y_train)

    # Evaluate model
    print("\nEvaluating model...")
    fraud_model.evaluate_model(X_test, y_test)

In [5]:
if __name__ == "__main__":
    main()

Loading data...
Error loading data: "['target'] not found in axis"
