In [2]:
!pip install faker

Collecting faker
  Downloading faker-37.0.2-py3-none-any.whl.metadata (15 kB)
Downloading faker-37.0.2-py3-none-any.whl (1.9 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.9 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m1.9/1.9 MB[0m [31m99.6 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m45.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faker
Successfully installed faker-37.0.2


In [3]:
from faker import Faker
import pandas as pd
import numpy as np
from tqdm import tqdm
import random

# One seed for every random source the generator draws from. generate_record
# uses Faker, NumPy's global RNG *and* the stdlib `random` module, so all three
# must be seeded for a re-run to reproduce the same dataset.
SEED = 42

fake = Faker()
Faker.seed(SEED)
np.random.seed(SEED)
random.seed(SEED)

# Dataset sizing: the generated records are later split into a main dataset
# and a small "unseen" hold-out of UNSEEN_SIZE rows.
TOTAL_RECORDS = 200_200
UNSEEN_SIZE = 200
MAIN_DATASET_SIZE = TOTAL_RECORDS - UNSEEN_SIZE

# Fraction of all generated records that are labelled as fraud.
FRAUD_RATE = 0.0375
TOTAL_FRAUDS = int(TOTAL_RECORDS * FRAUD_RATE)
# Indices of the fraudulent records, sampled without replacement; stored as a
# set so the per-record membership test in generate_record is O(1).
fraud_indices = set(np.random.choice(TOTAL_RECORDS, TOTAL_FRAUDS, replace=False))


def generate_record(index):
    """Build one synthetic transaction record as a flat dict.

    The record is labelled as fraud when ``index`` is in the module-level
    ``fraud_indices`` set. Fraud and non-fraud records draw some features
    (VPN/proxy usage, IP risk score, amount, new-device flag) from different
    but overlapping distributions, and a share of frauds is deliberately made
    to look like normal traffic.

    Args:
        index: Zero-based position of the record; ``transaction_id`` is
            ``index + 1``.

    Returns:
        dict: Column name -> value for one dataset row, including the
        ``is_fraud`` label (0 or 1). ``latitude``, ``longitude``,
        ``device_id`` and ``avg_spend_30d`` may be ``None`` to simulate
        missing data.
    """
    is_fraud = 1 if index in fraud_indices else 0

    # Simulate noisy or overlapping feature distributions
    country = fake.country_code() if random.random() > 0.03 else 'XX'  # missing country ~3%
    city = fake.city()
    # Latitude and longitude are dropped independently, each ~20% of the time.
    latitude = round(fake.latitude(), 6) if random.random() > 0.2 else None
    longitude = round(fake.longitude(), 6) if random.random() > 0.2 else None
    ip = fake.ipv4_public()

    # Reduce separation between fraud and non-fraud
    vpn_usage = random.choices([1, 0], weights=[0.45 if is_fraud else 0.15, 0.55 if is_fraud else 0.85])[0]
    proxy_usage = random.choices([1, 0], weights=[0.3 if is_fraud else 0.05, 0.7 if is_fraud else 0.95])[0]
    ip_risk_score = np.clip(
        np.random.normal(75 if is_fraud else 40, 20), 0, 100
    )

    # Simulate some frauds that look like normal transactions (~20% of frauds)
    if is_fraud and random.random() < 0.2:
        vpn_usage = 0
        proxy_usage = 0
        # Clip exactly like the main path: an unclipped normal(30, 10) can
        # fall below 0 and would leak out-of-range risk scores into the data.
        ip_risk_score = np.clip(np.random.normal(30, 10), 0, 100)

    # Fraudulent amounts are drawn with a larger exponential scale (300 vs 100).
    amount = round(
        np.random.exponential(100 if not is_fraud else 300), 2
    )
    currency = random.choice(["USD", "EUR", "GBP", "JPY"])
    transaction_type = random.choice(["PURCHASE", "TRANSFER", "WITHDRAWAL"])

    # NOTE: the dict values are evaluated top to bottom, so reordering keys
    # changes the order of RNG draws and therefore the seeded output.
    return {
        "transaction_id": index + 1,
        "timestamp": fake.date_time_this_year(),
        "amount": amount,
        "currency": currency,
        "transaction_type": transaction_type,
        "country": country,
        "city": city,
        "latitude": latitude,
        "longitude": longitude,
        "ip_address": ip,
        "timezone": fake.timezone(),
        "user_id": fake.random_int(min=1, max=50_000),
        "device_id": fake.uuid4() if random.random() > 0.1 else None,  # missing ~10%
        "device_type": random.choice(["Mobile", "Desktop", "Tablet"]),
        "operating_system": random.choice(["iOS", "Android", "Windows", "macOS", "Linux"]),
        "app_version": f"{random.randint(1,6)}.{random.randint(0,9)}.{random.randint(0,9)}",
        "user_agent": fake.user_agent(),
        "avg_spend_30d": round(np.random.normal(200, 100), 2) if random.random() > 0.15 else None,  # missing ~15%
        "transactions_last_7d": np.random.poisson(6),
        "time_since_last_login": round(random.uniform(0.1, 168), 1),
        "login_attempts_last_24h": random.randint(0, 6),
        "is_new_device": random.choices([1, 0], weights=[0.5 if is_fraud else 0.15, 0.5 if is_fraud else 0.85])[0],
        "vpn_usage": vpn_usage,
        "proxy_usage": proxy_usage,
        "ip_risk_score": int(ip_risk_score),  # truncated to a whole number
        "ASN": random.randint(1000, 99999),
        "ISP": fake.company(),
        "is_fraud": is_fraud,
    }


In [5]:
# Colab-only: mount Google Drive at /content/drive. The save step later in
# this notebook writes its CSVs under "/content/drive/My Drive/...", so this
# cell has to run first.
import google.colab as colab
colab.drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
from pathlib import Path

from sklearn.model_selection import train_test_split

# Destination folder on the mounted Drive. For a local run, point OUTPUT_DIR
# at a local folder (e.g. Path("data")) instead.
OUTPUT_DIR = Path("/content/drive/My Drive/FYP_model_assets")
MAIN_CSV = OUTPUT_DIR / "geolocation_fraud_dataset.csv"
UNSEEN_CSV = OUTPUT_DIR / "geolocation_fraud_data_unseen.csv"

# Create the output folder *before* the slow generation loop so a missing
# folder cannot waste the run at save time. `parents` is deliberately left
# False: if Drive is not mounted this raises instead of silently creating a
# local directory tree and writing the data to the wrong place.
OUTPUT_DIR.mkdir(exist_ok=True)

# Build all rows first and construct the DataFrame once.
records = [generate_record(i) for i in tqdm(range(TOTAL_RECORDS))]
df_full = pd.DataFrame(records)

# Stratified split to preserve fraud ratio in both datasets
df_main, df_unseen = train_test_split(
    df_full, test_size=UNSEEN_SIZE, stratify=df_full["is_fraud"], random_state=42
)

# Sanity-check the split sizes against the configured constants.
assert len(df_main) == MAIN_DATASET_SIZE, "unexpected main dataset size"
assert len(df_unseen) == UNSEEN_SIZE, "unexpected unseen dataset size"

# Confirm fraud ratio in both datasets
print(f"Main dataset fraud ratio: {df_main['is_fraud'].mean():.4f}")
print(f"Unseen dataset fraud ratio: {df_unseen['is_fraud'].mean():.4f}")

# Save both datasets
df_main.to_csv(MAIN_CSV, index=False)
df_unseen.to_csv(UNSEEN_CSV, index=False)

# Report the actual row counts rather than hard-coded numbers, so the summary
# stays truthful if TOTAL_RECORDS or UNSEEN_SIZE change.
print("Data generation complete:")
print(f"- {MAIN_CSV.name} → {len(df_main):,} rows")
print(f"- {UNSEEN_CSV.name} → {len(df_unseen):,} unseen rows")

100%|██████████| 200200/200200 [02:03<00:00, 1624.69it/s]


Main dataset fraud ratio: 0.0375
Unseen dataset fraud ratio: 0.0350
Data generation complete:
- geolocation_fraud_dataset.csv → 200,000 rows
- geolocation_fraud_data_unseen.csv → 200 unseen rows
