In [7]:
import random
from pathlib import Path

import numpy as np
import pandas as pd
from faker import Faker
from tqdm import tqdm

# One seed for every random source the generator uses, so the random draws
# are repeatable after Restart Kernel -> Run All.
SEED = 42

fake = Faker()
Faker.seed(SEED)
np.random.seed(SEED)
# generate_record() draws most of its values from the stdlib `random` module,
# which is a separate generator from Faker's and NumPy's; without this call
# the dataset changes on every run even though the other two are seeded.
random.seed(SEED)

# Dataset sizing: the full population is generated once, then UNSEEN_SIZE rows
# are held out and the remainder forms the main dataset.
TOTAL_RECORDS = 200_200
UNSEEN_SIZE = 200
MAIN_DATASET_SIZE = TOTAL_RECORDS - UNSEEN_SIZE

# Target share of fraudulent records; the exact count is truncated to an int.
FRAUD_RATE = 0.0375
TOTAL_FRAUDS = int(TOTAL_RECORDS * FRAUD_RATE)
# Positions (0-based record indices) labelled as fraud, sampled without
# replacement and stored in a set for O(1) membership checks.
fraud_indices = set(np.random.choice(TOTAL_RECORDS, TOTAL_FRAUDS, replace=False))


def _or_missing(make_value, missing_rate):
    """Return ``make_value()`` unless a uniform draw marks the field missing.

    ``random.random()`` is drawn first; ``make_value`` is only invoked when the
    draw is greater than ``missing_rate``. Otherwise ``None`` is returned.
    """
    if random.random() > missing_rate:
        return make_value()
    return None


def _binary_flag(is_fraud, fraud_weights, legit_weights):
    """Draw 1 or 0 via ``random.choices`` using the weights for the record's class."""
    weights = fraud_weights if is_fraud else legit_weights
    return random.choices([1, 0], weights=weights)[0]


def generate_record(index):
    """Build one synthetic transaction as a flat dict.

    The fraud label is 1 when ``index`` is a member of the module-level
    ``fraud_indices`` set and 0 otherwise. The label changes the distributions
    used for ``vpn_usage``, ``proxy_usage``, ``is_new_device``,
    ``ip_risk_score`` and ``amount``; all other fields are drawn the same way
    for both classes.

    Args:
        index: Zero-based record position; ``transaction_id`` is ``index + 1``.

    Returns:
        dict: One dataset row. ``latitude``, ``longitude``, ``device_id`` and
        ``avg_spend_30d`` are ``None`` when their missing-value draw fires.
    """
    is_fraud = int(index in fraud_indices)

    # --- Location and network identity -------------------------------------
    country = fake.country_code()
    city = fake.city()
    # Latitude and longitude are dropped independently, each 10% of the time.
    latitude = _or_missing(lambda: round(fake.latitude(), 6), 0.1)
    longitude = _or_missing(lambda: round(fake.longitude(), 6), 0.1)
    ip = fake.ipv4_public()

    # --- Fraud-dependent network signals -----------------------------------
    vpn_usage = _binary_flag(is_fraud, [0.6, 0.4], [0.1, 0.9])
    proxy_usage = _binary_flag(is_fraud, [0.4, 0.6], [0.05, 0.95])
    risk_mean = 85 if is_fraud else 30
    # Clipped to [0, 100], then truncated to an integer score.
    ip_risk_score = int(np.clip(np.random.normal(risk_mean, 10), 0, 100))

    # --- Transaction details -----------------------------------------------
    amount_scale = 500 if is_fraud else 50
    amount = round(np.random.exponential(amount_scale), 2)
    currency = random.choice(["USD", "EUR", "GBP", "JPY"])
    transaction_type = random.choice(["PURCHASE", "TRANSFER", "WITHDRAWAL"])

    # --- Remaining fields, drawn in the same order as the output columns ---
    timestamp = fake.date_time_this_year()
    timezone = fake.timezone()
    user_id = fake.random_int(min=1, max=50_000)
    device_id = _or_missing(lambda: fake.uuid4(), 0.05)
    device_type = random.choice(["Mobile", "Desktop", "Tablet"])
    operating_system = random.choice(["iOS", "Android", "Windows", "macOS", "Linux"])
    major = random.randint(1, 6)
    minor = random.randint(0, 9)
    patch = random.randint(0, 9)
    app_version = f"{major}.{minor}.{patch}"
    user_agent = fake.user_agent()
    avg_spend_30d = _or_missing(lambda: round(np.random.normal(200, 75), 2), 0.05)
    transactions_last_7d = np.random.poisson(5)
    time_since_last_login = round(random.uniform(0.1, 168), 1)
    login_attempts_last_24h = random.randint(0, 5)
    is_new_device = _binary_flag(is_fraud, [0.6, 0.4], [0.1, 0.9])
    asn = random.randint(1000, 99999)
    isp = fake.company()

    return {
        "transaction_id": index + 1,
        "timestamp": timestamp,
        "amount": amount,
        "currency": currency,
        "transaction_type": transaction_type,
        "country": country,
        "city": city,
        "latitude": latitude,
        "longitude": longitude,
        "ip_address": ip,
        "timezone": timezone,
        "user_id": user_id,
        "device_id": device_id,
        "device_type": device_type,
        "operating_system": operating_system,
        "app_version": app_version,
        "user_agent": user_agent,
        "avg_spend_30d": avg_spend_30d,
        "transactions_last_7d": transactions_last_7d,
        "time_since_last_login": time_since_last_login,
        "login_attempts_last_24h": login_attempts_last_24h,
        "is_new_device": is_new_device,
        "vpn_usage": vpn_usage,
        "proxy_usage": proxy_usage,
        "ip_risk_score": ip_risk_score,
        "ASN": asn,
        "ISP": isp,
        "is_fraud": is_fraud,
    }

In [10]:
from sklearn.model_selection import train_test_split

# Generate every synthetic transaction in one pass; tqdm reports progress.
records = [generate_record(i) for i in tqdm(range(TOTAL_RECORDS))]

df_full = pd.DataFrame(records)

# Stratified split to preserve fraud ratio in both datasets
df_main, df_unseen = train_test_split(
    df_full, test_size=UNSEEN_SIZE, stratify=df_full["is_fraud"], random_state=42
)

# Fail fast on a mis-sized split before anything is written to disk.
assert len(df_main) == MAIN_DATASET_SIZE, "main dataset has unexpected size"
assert len(df_unseen) == UNSEEN_SIZE, "unseen dataset has unexpected size"

# Confirm fraud ratio in both datasets
print(f"Main dataset fraud ratio: {df_main['is_fraud'].mean():.4f}")
print(f"Unseen dataset fraud ratio: {df_unseen['is_fraud'].mean():.4f}")

# Save both datasets. The output directory is created first because to_csv
# raises OSError when the target directory does not exist.
output_dir = Path("data")
output_dir.mkdir(parents=True, exist_ok=True)
main_path = output_dir / "geolocation_fraud_dataset.csv"
unseen_path = output_dir / "geolocation_fraud_data_unseen.csv"
df_main.to_csv(main_path, index=False)
df_unseen.to_csv(unseen_path, index=False)

# Report the real row counts rather than hard-coded numbers so the summary
# stays correct if TOTAL_RECORDS or UNSEEN_SIZE change.
print("Data generation complete:")
print(f"- {main_path.name} → {len(df_main):,} rows")
print(f"- {unseen_path.name} → {len(df_unseen):,} unseen rows")

100%|██████████| 200200/200200 [00:47<00:00, 4226.29it/s]


Main dataset fraud ratio: 0.0375
Unseen dataset fraud ratio: 0.0350
Data generation complete:
- geolocation_fraud_dataset.csv → 200,000 rows
- geolocation_fraud_data_unseen.csv → 200 unseen rows
