# Data Generation
---


#### Script Configs

In [24]:
# Imports
import pandas as pd
import numpy as np
from faker import Faker
import random
from datetime import timedelta

# Initialize Faker
fake = Faker()

# Set the number of rows and unique users
num_rows = 100000
num_unique_users = 15735

# Pandas dispay configs
pd.set_option('display.max_columns', None)

In [19]:
# Define lists for transaction states, types, and other categoricals
transaction_states = ['Success', 'Failed', 'Pending']
transaction_types = ['Purchase', 'Refund', 'Withdrawal']
tender_types = ['Credit Card', 'Debit Card', 'PayPal', 'Bank Transfer']
card_types = ['visa', 'mastercard', 'amex']
merchant_names = ['Amazon', 'eBay', 'Walmart', 'Best Buy', 'Target']
product_categories = {
    'Electronics': ['Smartphone', 'Laptop', 'Headphones', 'Camera'],
    'Clothing': ['T-shirt', 'Jeans', 'Jacket', 'Dress'],
    'Books': ['Novel', 'Science Fiction', 'History', 'Biography'],
    'Furniture': ['Chair', 'Table', 'Sofa', 'Bed'],
    'Toys': ['Doll', 'Action Figure', 'Puzzle', 'Board Game']
}
order_types = ['Standard', 'Express', 'Same-day']
shipping_types = ['Ground', 'Air', 'Sea']
email_domains = ['gmail.com', 'yahoo.com', 'outlook.com']

# Create a list of countries and their correct country codes
countries = {'United States of America': 'USA', 'Canada': 'CA', 'United Kingdom': 'GB', 'Germany': 'DE', 'France': 'FR', 'South Africa': 'ZAR'}

# Generate unique merchant details
merchants = {}
for name in merchant_names:
    merchant_id = fake.unique.uuid4()
    merchants[merchant_id] = name

# Generate unique user details
users = {}
for _ in range(num_unique_users):
    user_id = fake.unique.uuid4()
    user_name = fake.name()
    user_country = random.choice(list(countries.keys()))  # Assign country first to a variable
    users[user_id] = {
        "user_id": user_id,
        "user_name": user_name,
        "user_msisdn": fake.phone_number(),
        "user_email": user_name.lower().replace(' ', '.') + '@' + random.choice(email_domains),
        "user_dob": fake.date_of_birth(minimum_age=16, maximum_age=80),
        "user_signup_date": fake.date_this_decade(),
        "user_onboarding_type": random.choice(['Online', 'In-person', 'Referral']),
        "user_country": user_country,
        "user_country_code": countries[user_country]  # Use the variable
    }

# Create transactions and shipping dates
transaction_dates = [fake.date_this_decade() for _ in range(num_rows)]
shipping_dates = [date + timedelta(days=random.choice([0, 1])) for date in transaction_dates]

# Select users and merchants for transactions
selected_user_ids = np.random.choice(list(users.keys()), num_rows, replace=True)
selected_merchant_ids = np.random.choice(list(merchants.keys()), num_rows, replace=True)

# List generation for product details
product_details = [random.choice(list(product_categories.items())) for _ in range(num_rows)]
product_names = [details[1][random.randint(0, len(details[1])-1)] for details in product_details]
product_categories_list = [details[0] for details in product_details]

In [20]:
# Complete DataFrame creation code
data = {
    "transaction_id": [fake.unique.uuid4() for _ in range(num_rows)],
    "amount": np.round(np.random.uniform(5.0, 1000.0, num_rows), 2),
    "transaction_state": [random.choice(transaction_states) for _ in range(num_rows)],
    "transaction_type": [random.choice(transaction_types) for _ in range(num_rows)],
    "tender_type": [random.choice(tender_types) for _ in range(num_rows)],
    "transaction_date": transaction_dates,
    "shipping_date": shipping_dates,
    "merchant_name": [merchants[mid] for mid in selected_merchant_ids],
    "merchant_id": selected_merchant_ids,
    "card_type": [random.choice(card_types) for _ in range(num_rows)],
    "card_number_masked": [fake.credit_card_number(card_type=random.choice(card_types))[:-4] + "****" for _ in range(num_rows)],
    "user_id": selected_user_ids,
    "user_name": [users[uid]["user_name"] for uid in selected_user_ids],
    "user_msisdn": [users[uid]["user_msisdn"] for uid in selected_user_ids],
    "user_email": [users[uid]["user_email"] for uid in selected_user_ids],
    "user_dob": [users[uid]["user_dob"] for uid in selected_user_ids],
    "user_signup_date": [users[uid]["user_signup_date"] for uid in selected_user_ids],
    "user_onboarding_type": [users[uid]["user_onboarding_type"] for uid in selected_user_ids],
    "user_country": [users[uid]["user_country"] for uid in selected_user_ids],
    "user_country_code": [users[uid]["user_country_code"] for uid in selected_user_ids],
    "product_name": product_names,
    "product_id": [fake.unique.uuid4() for _ in range(num_rows)],
    "product_category": product_categories_list,
    "order_id": [fake.unique.uuid4() for _ in range(num_rows)],
    "order_type": [random.choice(order_types) for _ in range(num_rows)],
    "shipping_type": [random.choice(shipping_types) for _ in range(num_rows)],
}

In [21]:
# Create a Pandas Dataframe from the generated data
df = pd.DataFrame(data)

In [26]:
# Data Preview
df.head()

Unnamed: 0,transaction_id,amount,transaction_state,transaction_type,tender_type,transaction_date,shipping_date,merchant_name,merchant_id,card_type,card_number_masked,user_id,user_name,user_msisdn,user_email,user_dob,user_signup_date,user_onboarding_type,user_country,user_country_code,product_name,product_id,product_category,order_id,order_type,shipping_type
0,5cf906ae-d4a8-4936-845d-25dff56534af,916.01,Success,Refund,Credit Card,2020-08-11,2020-08-12,Amazon,61809b00-2503-4fc3-8f5b-c5f55434a828,amex,487682427955****,2683eb4e-3688-4389-845f-60fb0e3655b0,Rebecca Clayton,668-463-4486,rebecca.clayton@outlook.com,1977-01-12,2020-06-08,In-person,South Africa,ZAR,Puzzle,8bcf82d2-5d69-4efb-90c6-0b83a5c5332d,Toys,2228318c-2d86-429b-9e48-9632c08b86dc,Same-day,Ground
1,d42558e3-33c6-4e6f-b49d-cd817a9547ae,456.03,Failed,Purchase,Credit Card,2024-04-02,2024-04-02,Target,876ed3ac-e8bd-47b3-8aa6-2324e5e5d23e,mastercard,227301635981****,30de8170-b05c-4d74-8460-e193a3090291,Donna Juarez,395-204-8179x6364,donna.juarez@gmail.com,1983-01-11,2020-01-20,In-person,France,FR,Smartphone,88fa63fd-cb43-4132-83dd-0245be2ba3a1,Electronics,24445566-f8a0-45bd-8aa0-81081a2d384c,Express,Sea
2,1e5f4581-78d2-4c18-88f3-e5051659580c,302.71,Pending,Refund,Bank Transfer,2021-07-19,2021-07-20,Amazon,61809b00-2503-4fc3-8f5b-c5f55434a828,amex,37104008466****,00702f4a-74e7-441d-b4fa-8a6b3812ec54,Vanessa Lopez,+1-386-914-4013,vanessa.lopez@gmail.com,1964-07-03,2023-04-26,Online,South Africa,ZAR,Smartphone,97f2fc9a-0e07-4f8f-9165-c0937f33fcd2,Electronics,1844eb36-6e6a-404d-b6d5-91bda2855640,Express,Sea
3,4c087d0b-5861-4350-be1c-33d78cc14bbc,929.13,Success,Withdrawal,Bank Transfer,2022-03-12,2022-03-12,Amazon,61809b00-2503-4fc3-8f5b-c5f55434a828,visa,225624848037****,903c4e8f-9f04-44ff-87d4-a2d2eb25da54,Steven Ortiz,408-239-4207x202,steven.ortiz@outlook.com,1976-06-07,2021-09-18,Referral,United Kingdom,GB,Novel,20016a20-d78b-4a9a-9f17-32d38848b690,Books,e99d0c33-fd19-4e14-ad5d-14fae7949bfd,Express,Sea
4,610b097d-f1b7-4487-920a-f7d48a4531cf,576.2,Pending,Purchase,Debit Card,2022-09-16,2022-09-17,Target,876ed3ac-e8bd-47b3-8aa6-2324e5e5d23e,amex,229220622931****,7ebfab6c-3f73-45f8-870e-3ef03ac3efdf,Adam Conway,977.895.4755,adam.conway@gmail.com,1964-06-02,2022-07-04,In-person,Canada,CA,Chair,3f889995-ac51-4a8d-bcdc-d06e61ddd8c4,Furniture,604018bb-c0da-4c23-ada4-e051a4e9c6ec,Same-day,Ground


### Quick Checks

In [8]:
# Data Types Preview
df.dtypes

transaction_id           object
amount                  float64
transaction_state        object
transaction_type         object
tender_type              object
transaction_date         object
shipping_date            object
merchant_name            object
merchant_id              object
card_type                object
card_number_masked       object
user_id                  object
user_name                object
user_msisdn              object
user_email               object
user_dob                 object
user_signup_date         object
user_onboarding_type     object
user_country             object
user_country_code        object
product_name             object
product_id               object
product_category         object
order_id                 object
order_type               object
shipping_type            object
dtype: object

In [10]:
# Type Casting for checks
cols = ['transaction_date', 'user_dob', 'user_signup_date', 'shipping_date']
for col in cols: 
    df[col] = pd.to_datetime(df[col])

In [14]:
# Date Checks
print("Transaction Dates")
print(df['transaction_date'].min())
print(df['transaction_date'].max())
print('\nUser DOBs')
print(df['user_dob'].min())
print(df['user_dob'].max())
print('\nUser Signup Dates')
print(df['user_signup_date'].min())
print(df['user_signup_date'].max())
print('\nShipping Dates')
print(df['shipping_date'].min())
print(df['shipping_date'].max())

Transaction Dates
2020-01-01 00:00:00
2024-08-23 00:00:00

User DOBs
1943-08-25 00:00:00
2008-08-22 00:00:00

User Signup Dates
2020-01-01 00:00:00
2024-08-23 00:00:00

Shipping Dates
2020-01-01 00:00:00
2024-08-24 00:00:00


In [15]:
print(len(df))
print(df['transaction_id'].nunique())
print(df['user_id'].nunique())

100000
100000
15711


### Save data

In [None]:
# Save the DataFrame to a CSV file
df.to_csv("data/ecom_data.csv", index=False)