In [1]:
import pandas as pd
import numpy as np
from faker import Faker
import random


In [2]:
# Seed every random source this notebook draws from (stdlib `random`,
# NumPy's legacy global RNG, and Faker's shared generator) so that a
# Restart & Run All produces the same synthetic rows each time.
# NOTE: dates are generated relative to 'today' further down, so date values
# still shift with the day the notebook is run.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
Faker.seed(SEED)

# Single Faker instance reused for dates and zip codes.
fake = Faker()

In [3]:

# Identifier pools shared by the generators below.
# Customers: C00001 .. C10000 (zero-padded to five digits).
customer_ids = [f'C{n:05d}' for n in range(1, 10001)]
# Stores: S001 .. S050 (zero-padded to three digits).
store_ids = [f'S{n:03d}' for n in range(1, 51)]
# Product categories a transaction can be assigned to.
categories = [
    'Electronics',
    'Clothing',
    'Home',
    'Toys',
    'Beauty',
]

In [4]:
# Build 50,000 synthetic transaction rows with deliberately messy fields:
# mixed date formats, missing amounts, and occasional quantity outliers.
# The order of the random draws is kept stable so a seeded run is repeatable.
transactions = []
for seq in range(50000):
    customer = random.choice(customer_ids)
    transaction_id = f'T{100000 + seq}'
    purchase_date = fake.date_between(start_date='-2y', end_date='today')

    # About half the dates are written MM/DD/YYYY, the rest YYYY-MM-DD.
    if random.random() > 0.5:
        date_text = purchase_date.strftime('%m/%d/%Y')
    else:
        date_text = purchase_date.strftime('%Y-%m-%d')

    store_code = random.choice(store_ids)
    product_category = random.choice(categories)

    # Spend is drawn from N(100, 50), rounded to 2 decimals, floored at 5;
    # about 5% of rows get NaN instead to simulate missing values.
    spend = round(np.random.normal(100, 50), 2)
    if random.random() < 0.05:
        spend = np.nan
    else:
        spend = max(5, spend)

    # Typical quantity is 1-4 (upper bound exclusive); about 2% of rows are
    # replaced by an outlier in the range 10-99.
    units = np.random.randint(1, 5)
    if random.random() < 0.02:
        units = np.random.randint(10, 100)

    discount_value = random.choice([0, 5, 10, 15, 20])

    transactions.append([
        customer,
        transaction_id,
        date_text,
        store_code,
        product_category,
        spend,
        units,
        discount_value,
    ])

In [5]:
# Column names, in the same order as the values appended to each row of
# `transactions`.
transaction_columns = [
    'CustomerID',
    'TransactionID',
    'Date',
    'StoreID',
    'ProductCategory',
    'AmountSpent',
    'Quantity',
    'DiscountApplied',
]
df_transactions = pd.DataFrame(transactions, columns=transaction_columns)

In [6]:
# Gender labels with intentionally inconsistent spellings and casing, so the
# generated data contains several variants for the same value.
genders = [
    'M', 'Male', 'male',
    'F', 'Female', 'female',
]
# Income brackets a customer can be assigned to.
income_levels = [
    'Low',
    'Medium',
    'High',
]

In [7]:
# One demographic record per customer ID.
# The order of the random draws is kept stable so a seeded run is repeatable.
demographics = []
for customer_id in customer_ids:
    customer_age = np.random.randint(18, 70)  # upper bound exclusive: 18-69
    gender_label = random.choice(genders)

    # About 10% of customers get a missing (NaN) income level.
    if random.random() > 0.1:
        income_bracket = random.choice(income_levels)
    else:
        income_bracket = np.nan

    loyalty_flag = random.choice(['Y', 'N'])
    postal_code = fake.zipcode()

    demographics.append([
        customer_id,
        customer_age,
        gender_label,
        income_bracket,
        loyalty_flag,
        postal_code,
    ])

In [8]:
# Column names, in the same order as the values appended to each row of
# `demographics`.
demographic_columns = [
    'CustomerID',
    'Age',
    'Gender',
    'IncomeLevel',
    'LoyaltyMember',
    'ZipCode',
]
df_demographics = pd.DataFrame(demographics, columns=demographic_columns)

In [9]:
# Region labels a store can be assigned to.
regions = [
    'East',
    'West',
    'Midwest',
    'South',
]
# Area classifications a store can be assigned to.
urban_rural = [
    'Urban',
    'Suburban',
    'Rural',
]

In [10]:
# One record per store ID.
# The order of the random draws is kept stable so a seeded run is repeatable.
stores = []
for sid in store_ids:
    store_region = random.choice(regions)

    # Baseline size is drawn from N(20000, 5000); about 5% of stores are
    # redrawn from N(100000, 10000) to represent very large stores.
    raw_size = np.random.normal(20000, 5000)
    if random.random() < 0.05:
        raw_size = np.random.normal(100000, 10000)

    locality = random.choice(urban_rural)

    # Truncate to an integer and take the absolute value so the stored size
    # is never negative.
    store_size = abs(int(raw_size))
    stores.append([sid, store_region, store_size, locality])

In [11]:
# Column names, in the same order as the values appended to each row of
# `stores`.
store_columns = ['StoreID', 'Region', 'StoreSize', 'UrbanRural']
df_stores = pd.DataFrame(stores, columns=store_columns)

In [14]:
# Write each generated table to a CSV file in the current working directory,
# omitting the DataFrame index column.
csv_exports = [
    (df_transactions, 'mock_transactions.csv'),
    (df_demographics, 'mock_demographics.csv'),
    (df_stores, 'mock_stores.csv'),
]
for frame, csv_name in csv_exports:
    frame.to_csv(csv_name, index=False)

In [13]:
# Confirmation message only: it is printed unconditionally and does not check
# that the CSV files actually exist.
completion_message = "Datasets created and saved as CSV files."
print(completion_message)

Datasets created and saved as CSV files.
