# AI-Based Transaction Failure Prediction - Model Training

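This notebook walks through training machine learning models to predict transaction failures, using the project's own modules: `src.model.data_processor` for loading, preprocessing, splitting, and scaling the data, and `src.model.model_trainer` for training, evaluating, and saving the models.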

In [None]:
import os
import sys


def find_project_root(start, marker=os.path.join('src', 'model')):
    """Return the nearest directory at or above `start` that contains `marker`.

    The notebook imports from the `src.model` package, so the project root is
    whichever directory holds `src/model`. Searching upward makes the import
    work whether the kernel was started in the project root itself or in a
    sub-folder of it.

    Args:
        start: Directory to begin the search from.
        marker: Relative path whose presence identifies the project root.

    Returns:
        Absolute path of the first directory (walking upward from `start`)
        that contains `marker`. If no such directory exists, the parent of
        `start` is returned, which is the value this cell used previously.
    """
    start = os.path.abspath(start)
    current = start
    while True:
        if os.path.isdir(os.path.join(current, marker)):
            return current
        parent = os.path.dirname(current)
        if parent == current:
            # Reached the filesystem root without finding the marker.
            return os.path.dirname(start)
        current = parent


project_root = find_project_root(os.getcwd())
# Guard the insert so re-running the cell does not add duplicate entries.
if project_root not in sys.path:
    sys.path.insert(0, project_root)

In [None]:
import pandas as pd
import numpy as np

# Also make the current working directory importable, in case the kernel was
# started in the project root itself. A separate name is used so that
# `project_root` (set in the previous cell) is not overwritten with a
# different directory, and the insert is skipped when the entry is already
# present so re-running this cell does not pile up duplicate sys.path entries.
# Relies on `os` and `sys` imported in the previous cell.
working_dir = os.path.abspath(os.getcwd())
if working_dir not in sys.path:
    sys.path.insert(0, working_dir)

# Import our custom modules
from src.model.data_processor import (
    load_transaction_data,
    preprocess_data,
    prepare_features_target,
    split_data,
    scale_features,
)
from src.model.model_trainer import ModelTrainer

ModuleNotFoundError: No module named 'src'

## Generate Sample Data

Since we don't have real transaction data yet, we'll generate some sample data that mimics the structure of financial transactions.

In [None]:
# Build a reproducible synthetic transaction table.
# The global NumPy seed is fixed, so the ORDER of the random draws below
# determines every generated value; do not reorder the columns.
np.random.seed(42)
n_samples = 10000

merchant_categories = ['grocery', 'gas', 'retail', 'restaurant', 'online']
transaction_types = ['debit', 'credit']

data = dict(
    transaction_id=range(1, n_samples + 1),
    transaction_amount=np.random.lognormal(mean=3, sigma=1.5, size=n_samples),
    account_balance=np.random.lognormal(mean=4, sigma=1, size=n_samples),
    time_of_day=np.random.randint(0, 24, size=n_samples),
    day_of_week=np.random.randint(0, 7, size=n_samples),
    merchant_category=np.random.choice(merchant_categories, size=n_samples),
    transaction_type=np.random.choice(transaction_types, size=n_samples),
    location_risk_score=np.random.uniform(0, 1, size=n_samples),
    historical_failure_rate=np.random.beta(2, 8, size=n_samples),
)

# Rule-based failure probability: a 10% base rate plus a bump for each
# risk flag that is raised (boolean masks act as 0/1 multipliers).
is_large_amount = data['transaction_amount'] > 1000
is_risky_location = data['location_risk_score'] > 0.8
is_failure_prone = data['historical_failure_rate'] > 0.5
failure_prob = (
    0.1
    + 0.2 * is_large_amount
    + 0.1 * is_risky_location
    + 0.1 * is_failure_prone
)

# One Bernoulli draw per transaction using its own probability
data['transaction_failure'] = np.random.binomial(1, failure_prob, size=n_samples)

# Assemble the frame and report headline numbers
df = pd.DataFrame(data)
observed_failure_rate = df['transaction_failure'].mean()
print(f"Generated {len(df)} sample transactions")
print(f"Failure rate: {observed_failure_rate:.2%}")
df.head()

## Save Sample Data

Save the generated sample data to a CSV file for use in the training process.

In [None]:
# Make sure the output folder is present before writing into it
# (relative path, so it resolves against the current working directory)
data_dir = 'data'
os.makedirs(data_dir, exist_ok=True)

# Persist the generated frame as CSV, leaving out the row index
sample_data_path = os.path.join(data_dir, 'sample_transaction_data.csv')
df.to_csv(sample_data_path, index=False)
print(f"Sample data saved to {sample_data_path}")

## Load and Preprocess Data

Use our data processor module to load and preprocess the data.

In [None]:
# Read the CSV written above back in through the project's loader
df_loaded = load_transaction_data(sample_data_path)
n_loaded = len(df_loaded)
print(f"Loaded {n_loaded} transactions")

# Run the project's preprocessing step. It returns a pair: the processed
# frame and a second object bound here as `label_encoders`.
preprocessing_result = preprocess_data(df_loaded)
df_processed, label_encoders = preprocessing_result
print(f"Processed data shape: {df_processed.shape}")

## Prepare Features and Target

Separate the features from the target variable.

In [None]:
# Name of the label column created during sample generation
target_column = 'transaction_failure'

# Separate the model inputs (X) from the label (y)
X, y = prepare_features_target(df_processed, target_column)
print(f"Features shape: {X.shape}")
print(f"Target shape: {y.shape}")

## Split Data

Split the data into training and testing sets.

In [None]:
# Partition into train/test using the project helper with its default settings
train_test_parts = split_data(X, y)
X_train, X_test, y_train, y_test = train_test_parts

# Report how many rows landed in each partition
n_train_rows = X_train.shape[0]
n_test_rows = X_test.shape[0]
print(f"Training set: {n_train_rows} samples")
print(f"Testing set: {n_test_rows} samples")

## Scale Features

Scale the features for better model performance.

In [None]:
# Scale features
# scale_features returns three values: the scaled training features, the
# scaled test features, and a third object bound here as `scaler`.
# NOTE(review): presumably the scaler is fitted on X_train only and then
# applied to X_test — confirm in src/model/data_processor.py.
X_train_scaled, X_test_scaled, scaler = scale_features(X_train, X_test)
print("Features scaled successfully")

## Train Models

Train multiple models using our model trainer module.

In [None]:
# Initialize and train models
# ModelTrainer is constructed with its default arguments, then fitted on the
# scaled training features and the training labels.
# NOTE(review): which models get trained is decided inside ModelTrainer; the
# notebook summary lists Logistic Regression, Random Forest and Naive Bayes —
# confirm against src/model/model_trainer.py.
trainer = ModelTrainer()
trainer.train_models(X_train_scaled, y_train)
print("Models trained successfully")

## Evaluate Models

Evaluate the performance of all trained models.

In [None]:
# Score the trained models on the held-out test set; the trainer hands back
# the name of the model it considers best.
best_model_name = trainer.evaluate_models(X_test_scaled, y_test)
print(f"Best model based on F1 score: {best_model_name}")

# Per-model report layout: (line prefix, key in that model's metrics dict)
report_rows = (
    ("  Accuracy:  ", 'accuracy'),
    ("  Precision: ", 'precision'),
    ("  Recall:    ", 'recall'),
    ("  F1 Score:  ", 'f1_score'),
)

# Print one block of metrics per model, each value to four decimals
scores = trainer.get_model_scores()
for model_name, metrics in scores.items():
    print(f"\n{model_name}:")
    for prefix, metric_key in report_rows:
        print(f"{prefix}{metrics[metric_key]:.4f}")

## Save the Best Model

Save the best performing model for later use.

In [None]:
# Make sure the output folder is present before writing into it
# (relative path, so it resolves against the current working directory)
models_dir = 'models'
os.makedirs(models_dir, exist_ok=True)

# Ask the trainer to write the selected model to disk
model_path = os.path.join(models_dir, 'best_transaction_model.pkl')
trainer.save_model(best_model_name, model_path)
print(f"Best model saved to {model_path}")

## Summary

We have successfully completed the model training process:
1. Generated sample transaction data
2. Loaded and preprocessed the data, then split it into training and test sets and scaled the features
3. Trained multiple ML models (Logistic Regression, Random Forest, Naive Bayes)
4. Evaluated the models based on accuracy, precision, recall, and F1 score
5. Saved the best performing model