In [None]:
from faker import Faker
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from sdv.tabular import GaussianCopula

In [None]:
# Initialize Faker. Faker ships no Kenyan locale, so 'en_US' is used as a
# stand-in and the Kenya-specific fields are assembled by hand further down.
fake = Faker('en_US')

# Seed BOTH random sources. NumPy draws the amounts, locations and transaction
# types; Faker draws the timestamps, phone numbers and device IDs. Seeding only
# NumPy would leave every Faker-generated column different on each run.
Faker.seed(42)
np.random.seed(42)  # For reproducibility

In [None]:
# Lookup values sampled when building each transaction row.

# Towns a transaction can originate from.
kenyan_locations = [
    'Nairobi',
    'Mombasa',
    'Kisumu',
    'Eldoret',
    'Nakuru',
]

# Kinds of mobile-money operation a row can represent.
transaction_types = [
    'Send Money',
    'Pay Bill',
    'Withdraw',
    'Buy Goods',
]


In [None]:
# Generate the base (non-fraud) transactions, one list per column.
n_transactions = 1000

data = {
    'Timestamp': [fake.date_time_this_year() for _ in range(n_transactions)],
    # Kenyan phone format: "2547" + 8 digits. random_number(digits=8) may return
    # a shorter integer, so zero-pad to keep every ID exactly 12 characters.
    # `fake.unique` guarantees the drawn integers never repeat.
    'Sender_ID': [
        f"2547{fake.unique.random_number(digits=8):08d}" for _ in range(n_transactions)
    ],
    'Receiver_ID': [
        f"2547{fake.unique.random_number(digits=8):08d}" for _ in range(n_transactions)
    ],
    # KES 50 to 10,000 inclusive (randint's upper bound is exclusive, hence 10001).
    'Amount': np.random.randint(50, 10001, size=n_transactions).tolist(),
    'Location': np.random.choice(kenyan_locations, size=n_transactions).tolist(),
    'Device_ID': [fake.uuid4() for _ in range(n_transactions)],
    'Transaction_Type': np.random.choice(transaction_types, size=n_transactions).tolist(),
}

In [None]:
# Build the transaction table from the column dict and put it in chronological
# order. The index is rebuilt afterwards so row labels run 0..n-1 in sorted order.
df = (
    pd.DataFrame(data)
    .sort_values(by='Timestamp')
    .reset_index(drop=True)
)


In [None]:
# Quick sanity check: print a heading followed by the first five rows.
print("Initial Dataset Preview:", df.head(n=5), sep="\n")


In [None]:
# Bias 30% of transactions to month-end (salary disbursement spike).
# The end bound carries an explicit time: a bare '2025-03-30' means midnight at
# the START of the 30th, which would leave that day out of the window.
# NOTE(review): the window is pinned to March 2025 while the base timestamps come
# from "this year" at run time - confirm the two are meant to line up.
month_end = pd.date_range(start='2025-03-28', end='2025-03-30 23:59', freq='min')

# Pick the rows at random instead of taking the first N rows of the sorted frame,
# which would move only the earliest transactions of the year. A fixed
# random_state selects the same rows every time this cell is re-run.
month_end_rows = df.sample(frac=0.30, random_state=42).index

# One vectorised draw (with replacement) of a minute inside the window per row.
df.loc[month_end_rows, 'Timestamp'] = np.random.choice(month_end, size=len(month_end_rows))


In [None]:
# Increase 'Send Money' frequency to mimic M-Pesa usage.
# Half of the rows are forced to 'Send Money'. They are chosen at random rather
# than by position, so the transaction type is not tied to where a row sits in
# the frame. The fixed random_state makes re-running this cell pick the same rows.
send_money_rows = df.sample(frac=0.50, random_state=7).index
df.loc[send_money_rows, 'Transaction_Type'] = 'Send Money'

In [None]:
# Inject fraudulent transactions: one sender firing a burst of small transfers.
fraud_sender = "254799999999"
n_fraud = 10

# Consecutive transfers starting 2025-03-15 10:00, spaced 1 minute apart.
fraud_start = datetime(2025, 3, 15, 10, 0)
fraud_times = [fraud_start + timedelta(minutes=i) for i in range(n_fraud)]

fraud_data = {
    'Timestamp': fraud_times,
    'Sender_ID': [fraud_sender] * n_fraud,
    # Zero-pad so every receiver keeps the 12-character "2547" + 8 digits shape;
    # random_number(digits=8) may return an integer with fewer digits.
    'Receiver_ID': [
        f"2547{fake.unique.random_number(digits=8):08d}" for _ in range(n_fraud)
    ],
    'Amount': [np.random.randint(50, 200) for _ in range(n_fraud)],  # KES 50 to 199
    'Location': ['Nairobi'] * n_fraud,
    # A single UUID repeated: every transfer in the burst shares one device.
    'Device_ID': [fake.uuid4()] * n_fraud,
    'Transaction_Type': ['Send Money'] * n_fraud,
}
fraud_df = pd.DataFrame(fraud_data)


In [None]:
# Combine base and fraud data.
# Rows already carrying the fraud sender are dropped first, so re-running this
# cell replaces the injected burst instead of appending a second copy of it.
base_rows = df[df['Sender_ID'] != fraud_sender]
df = pd.concat([base_rows, fraud_df], ignore_index=True)

# Restore chronological order so the injected rows are not simply the tail of the
# saved file; the stable sort keeps the existing order among equal timestamps.
df = df.sort_values('Timestamp', kind='stable').reset_index(drop=True)


In [None]:
# Label each row: 1 when it was sent by the injected fraud sender, otherwise 0.
df['Fraud_Label'] = (df['Sender_ID'] == fraud_sender).astype('int64')



In [None]:
# Write the labelled base dataset to CSV (without the index column), then report
# the row count and the number of rows flagged as fraud.
base_csv_path = 'mobile_money_transactions.csv'
df.to_csv(base_csv_path, index=False)
print(f"Base dataset saved as '{base_csv_path}'")

total_rows = len(df)
fraud_rows = df['Fraud_Label'].sum()
print(f"Total transactions: {total_rows}, Fraudulent: {fraud_rows}")


In [None]:
# Fit SDV's GaussianCopula on the labelled base frame, draw a larger synthetic
# table from it, save that table to CSV and print its first rows.
# NOTE(review): the ID columns are passed to the model unchanged - confirm that
# is the intended treatment for them.
print("Generating synthetic data with SDV...")

n_synthetic_rows = 5000  # Generate 5,000 rows
synthetic_csv_path = 'synthetic_mobile_money.csv'

model = GaussianCopula()
model.fit(df)
synthetic_data = model.sample(n_synthetic_rows)

synthetic_data.to_csv(synthetic_csv_path, index=False)
print(f"Synthetic dataset saved as '{synthetic_csv_path}'")
print("Synthetic Dataset Preview:", synthetic_data.head(), sep="\n")