In [1]:
from faker import Faker
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

In [2]:
# Initialize Faker with a Kenyan locale approximation
fake = Faker('en_US')  # No specific Kenyan locale, so we tweak manually
np.random.seed(42)  # For reproducibility

In [3]:
# Define Kenyan locations and transaction types
kenyan_locations = ['Nairobi', 'Mombasa', 'Kisumu', 'Eldoret', 'Nakuru']
transaction_types = ['Send Money', 'Pay Bill', 'Withdraw', 'Buy Goods']


In [4]:
# Generate 1,000 transactions
data = {
    'Timestamp': [fake.date_time_this_year() for _ in range(1000)],
    'Sender_ID': [f"2547{fake.unique.random_number(digits=8)}" for _ in range(1000)],  # Kenyan phone format
    'Receiver_ID': [f"2547{fake.unique.random_number(digits=8)}" for _ in range(1000)],
    'Amount': [np.random.randint(50, 10000) for _ in range(1000)],  # KES 50 to 10,000
    'Location': [np.random.choice(kenyan_locations) for _ in range(1000)],
    'Device_ID': [fake.uuid4() for _ in range(1000)],
    'Transaction_Type': [np.random.choice(transaction_types) for _ in range(1000)],
}

In [5]:
# Create DataFrame
df = pd.DataFrame(data)

# Sort by timestamp for realism
df = df.sort_values('Timestamp').reset_index(drop=True)


In [6]:
# Preview initial data
print("Initial Dataset Preview:")
print(df.head())


Initial Dataset Preview:
            Timestamp     Sender_ID   Receiver_ID  Amount Location  \
0 2025-01-01 00:08:02   25471645902  254725507820    7106   Kisumu   
1 2025-01-01 01:49:27  254761195883  254719710033    6470  Nairobi   
2 2025-01-01 01:53:24  254746310147  254773367924    7031   Kisumu   
3 2025-01-01 04:22:48  254736124581   25471972739    1116   Kisumu   
4 2025-01-01 06:34:10  254726227469  254772252369    4827   Kisumu   

                              Device_ID Transaction_Type  
0  92ba5452-ae93-4a42-bc22-1ae069f5af76       Send Money  
1  469c96ef-9390-45bb-8ac6-ad448d44982c         Withdraw  
2  47b1cdaf-5cf2-4339-9e29-70f086628510         Withdraw  
3  879efc8a-5474-4712-b37b-7514e167dbe5         Pay Bill  
4  1e5a1400-f5da-4593-a620-66015e5f445c         Withdraw  


In [7]:
# Bias 30% of transactions to month-end (salary disbursement spike)
month_end = pd.date_range(start='2025-03-28', end='2025-03-30', freq='min')
df.loc[:300, 'Timestamp'] = [np.random.choice(month_end) for _ in range(301)]  # 301 to match slice size


In [8]:
# Increase 'Send Money' frequency to mimic M-Pesa usage
df.loc[:500, 'Transaction_Type'] = 'Send Money'

In [9]:
# Inject fraudulent transactions (rapid small transfers)
fraud_sender = "254799999999"
fraud_times = [datetime(2025, 3, 15, 10, i) for i in range(0, 10)]  # 10 mins apart
fraud_data = {
    'Timestamp': fraud_times,
    'Sender_ID': [fraud_sender] * 10,
    'Receiver_ID': [f"2547{fake.unique.random_number(digits=8)}" for _ in range(10)],
    'Amount': [np.random.randint(50, 200) for _ in range(10)],
    'Location': ['Nairobi'] * 10,
    'Device_ID': [fake.uuid4()] * 10,
    'Transaction_Type': ['Send Money'] * 10,
}
fraud_df = pd.DataFrame(fraud_data)


In [None]:
# Combine base and fraud data
df = pd.concat([df, fraud_df], ignore_index=True)


In [10]:
# Add fraud labels
df['Fraud_Label'] = 0
df.loc[df['Sender_ID'] == fraud_sender, 'Fraud_Label'] = 1



In [11]:
# Save the base dataset
df.to_csv('mobile_money_transactions.csv', index=False)
print("Base dataset saved as 'mobile_money_transactions.csv'")
print(f"Total transactions: {len(df)}, Fraudulent: {df['Fraud_Label'].sum()}")


Base dataset saved as 'mobile_money_transactions.csv'
Total transactions: 1000, Fraudulent: 0
