# Transaction Simulator Demo

This notebook demonstrates the transaction simulator that generates synthetic banking transaction data with various failure scenarios.

In [1]:
import os
import sys

# Treat the parent of the current working directory as the project root.
# NOTE(review): this assumes the notebook is launched from a direct
# sub-directory of the project (e.g. notebooks/) — confirm.
project_root = os.path.split(os.getcwd())[0]

# Prepend the root only when it is absent, so re-running the cell adds no duplicates.
if sys.path.count(project_root) == 0:
    sys.path.insert(0, project_root)

In [None]:
import os
import sys
from datetime import datetime

import numpy as np
import pandas as pd

# Put the current working directory first on the import path.
# NOTE: this rebinds `project_root` to the working directory itself, whereas the
# first setup cell pointed it at the parent directory; both entries stay on sys.path.
project_root = os.path.abspath(os.getcwd())
# Insert only when it is not already the first entry, so re-running this cell
# does not stack duplicate entries at the front of sys.path.
if sys.path[:1] != [project_root]:
    sys.path.insert(0, project_root)

# Import our simulator module
from src.simulator.transaction_simulator import TransactionSimulator

## Initialize the Transaction Simulator

Create an instance of `TransactionSimulator`, passing its configuration explicitly so that every setting used in this demo is visible.

In [None]:
# Build the simulator used throughout the notebook; every setting is passed
# explicitly so the configuration is visible right here.
simulator = TransactionSimulator(
    failure_rate=0.15,  # i.e. 15%
    high_amount_threshold=1000.0,  # original note: transactions above $1000 are high-risk
    high_risk_locations=['high_risk_location_1', 'high_risk_location_2'],
    # One numeric value per failure type (the values sum to 1.0)
    failure_types=dict(
        timeout=0.4,
        network_error=0.2,
        insufficient_funds=0.2,
        duplicate_transaction=0.1,
        routing_error=0.1,
    ),
)

# Echo the configuration back from the instance, one line per setting
print(
    "Transaction Simulator initialized with:",
    f"- Failure rate: {simulator.failure_rate}",
    f"- High amount threshold: ${simulator.high_amount_threshold}",
    f"- High risk locations: {simulator.high_risk_locations}",
    f"- Failure types: {simulator.failure_types}",
    sep="\n",
)

## Generate Sample Transactions

Generate a small set of transactions to understand the data structure.

In [None]:
# Date range passed to the simulator for every generated transaction
start_date = datetime(2023, 1, 1)
end_date = datetime(2023, 12, 31)

# Ten single-transaction calls, collected into a list of records
sample_transactions = [
    simulator.generate_transaction(start_date, end_date) for _ in range(10)
]

# Wrap the records in a DataFrame; as the cell's last expression it is
# rendered as a table below the printed count.
sample_df = pd.DataFrame(sample_transactions)
print(f"Generated {len(sample_df)} sample transactions")
sample_df

## Generate a Larger Dataset

Generate a larger dataset of synthetic transactions for model training.

In [None]:
# Announce the (potentially slow) bulk generation step
print("Generating 5000 synthetic transactions...")

# output_file below points inside data/, so make sure that directory exists
# (presumably the simulator writes the CSV there — confirm against its code).
os.makedirs('data', exist_ok=True)

# Reuses the start_date / end_date range defined for the sample transactions
large_df = simulator.generate_transactions(
    count=5000,
    start_date=start_date,
    end_date=end_date,
    output_file='data/synthetic_transactions.csv',
)

# Row count plus the mean of the 'transaction_failure' column shown as a percentage
print(
    f"Generated {len(large_df)} transactions",
    f"Failure rate: {large_df['transaction_failure'].mean():.2%}",
    sep="\n",
)

## Analyze the Generated Data

Explore the characteristics of the generated transaction data.

In [None]:
# Size and schema of the generated frame
print("Dataset shape:", large_df.shape)
print("\nColumn names:", [*large_df.columns])

# Frequency table of the outcome flag
print("\nTransaction failure distribution:", large_df['transaction_failure'].value_counts(), sep="\n")

# Frequency table of the failure category
print("\nFailure type distribution:", large_df['failure_type'].value_counts(), sep="\n")

# Mean amount overall and per outcome, using boolean masks on the flag column
print("\nAverage transaction amount:")
tx_amounts = large_df['transaction_amount']
is_failed = large_df['transaction_failure'] == 1
is_successful = large_df['transaction_failure'] == 0
print(f"  All transactions: ${tx_amounts.mean():.2f}")
print(f"  Failed transactions: ${tx_amounts[is_failed].mean():.2f}")
print(f"  Successful transactions: ${tx_amounts[is_successful].mean():.2f}")

## Visualize the Data

Create some basic visualizations to understand the data better.

In [None]:
import matplotlib.pyplot as plt

# 2x2 grid: outcome share, failure types, amount distribution, amounts by outcome
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
fig.suptitle('Analysis of Generated Transaction Data', fontsize=16)

# Plot 1: Transaction failure distribution
# value_counts() orders rows by descending count, not by flag value, so the
# wedge labels are derived from the index rather than assuming that success
# always comes first. This also keeps the label list the same length as the
# data when only one outcome is present.
failure_counts = large_df['transaction_failure'].value_counts()
outcome_labels = ['Failure' if flag else 'Success' for flag in failure_counts.index]
axes[0, 0].pie(failure_counts.values, labels=outcome_labels, autopct='%1.1f%%', startangle=90)
axes[0, 0].set_title('Transaction Success/Failure Distribution')

# Plot 2: Failure type distribution (failed transactions only)
failure_type_counts = large_df[large_df['transaction_failure'] == 1]['failure_type'].value_counts()
axes[0, 1].bar(failure_type_counts.index, failure_type_counts.values)
axes[0, 1].set_title('Distribution of Failure Types')
axes[0, 1].tick_params(axis='x', rotation=45)

# Plot 3: Transaction amount distribution, with the mean marked by a dashed line
axes[1, 0].hist(large_df['transaction_amount'], bins=50, alpha=0.7, label='All Transactions')
axes[1, 0].axvline(large_df['transaction_amount'].mean(), color='r', linestyle='--', label='Mean')
axes[1, 0].set_title('Distribution of Transaction Amounts')
axes[1, 0].set_xlabel('Amount ($)', fontsize=12)
axes[1, 0].set_ylabel('Frequency', fontsize=12)
axes[1, 0].legend()

# Plot 4: Amount vs failure — both outcome groups drawn on shared bins
success_amounts = large_df[large_df['transaction_failure'] == 0]['transaction_amount']
failure_amounts = large_df[large_df['transaction_failure'] == 1]['transaction_amount']
axes[1, 1].hist([success_amounts, failure_amounts], bins=30, label=['Success', 'Failure'], alpha=0.7)
axes[1, 1].set_title('Transaction Amount Distribution by Outcome')
axes[1, 1].set_xlabel('Amount ($)', fontsize=12)
axes[1, 1].set_ylabel('Frequency', fontsize=12)
axes[1, 1].legend()

# Avoid overlapping titles/labels between panels, then render
plt.tight_layout()
plt.show()

## Combine with Sample Data from Stage 1

Combine the simulated data with the sample data generated in Stage 1 to create a larger dataset. If the Stage 1 file is not found, the simulated data alone is saved as the combined dataset.

In [None]:
# Optional Stage 1 input and the destination for the merged data
sample_data_path = 'data/sample_transaction_data.csv'
combined_output_path = 'data/combined_transaction_data.csv'

# Only the read is guarded, so a FileNotFoundError here can only mean that the
# Stage 1 file is missing. The loaded frame gets its own name so the
# `sample_df` preview built earlier in the notebook is not overwritten.
try:
    stage1_df = pd.read_csv(sample_data_path)
except FileNotFoundError:
    stage1_df = None
    print(f"Sample data file {sample_data_path} not found, using only simulated data")

if stage1_df is not None:
    print(f"Loaded sample data with {len(stage1_df)} transactions")
    # Stack the Stage 1 rows and the simulated rows under a fresh 0..n-1 index
    combined_df = pd.concat([stage1_df, large_df], ignore_index=True)
    print(f"Combined dataset has {len(combined_df)} transactions")
    saved_label = "Combined"
else:
    # Fallback: the output file holds the simulated data only
    combined_df = large_df
    saved_label = "Simulated"

# Single save step shared by both cases
combined_df.to_csv(combined_output_path, index=False)
print(f"{saved_label} dataset saved to {combined_output_path}")

# The combined failure rate is reported only when Stage 1 data was merged in
if stage1_df is not None:
    print(f"Combined failure rate: {combined_df['transaction_failure'].mean():.2%}")

## Summary

We have successfully:
1. Created a transaction simulator that generates realistic banking transactions
2. Generated a dataset of 5000 synthetic transactions
3. Analyzed the characteristics of the generated data
4. Combined it with the existing Stage 1 sample data, when available, to create a larger dataset

The simulator creates transactions with various failure types (timeout, network error, etc.) and considers risk factors like transaction amount and location.