# Data Generation
---


#### Script Configs

In [2]:
# Imports
import pandas as pd
import numpy as np
from faker import Faker
import random
from datetime import timedelta

# Initialize Faker
fake = Faker()

# Set the number of rows and unique users
num_rows = 100000
num_unique_users = 15735

# Pandas dispay configs
pd.set_option('display.max_columns', None)

In [3]:
# Define lists for transaction states, types, and other categoricals
transaction_states = ['Success', 'Failed', 'Pending']
transaction_types = ['Purchase', 'Refund', 'Withdrawal']
tender_types = ['Credit Card', 'Debit Card', 'PayPal', 'Bank Transfer']
card_types = ['visa', 'mastercard', 'amex']
merchant_names = ['Amazon', 'eBay', 'Walmart', 'Best Buy', 'Target']
product_categories = {
    'Electronics': ['Smartphone', 'Laptop', 'Headphones', 'Camera'],
    'Clothing': ['T-shirt', 'Jeans', 'Jacket', 'Dress'],
    'Books': ['Novel', 'Science Fiction', 'History', 'Biography'],
    'Furniture': ['Chair', 'Table', 'Sofa', 'Bed'],
    'Toys': ['Doll', 'Action Figure', 'Puzzle', 'Board Game']
}
order_types = ['Standard', 'Express', 'Same-day']
shipping_types = ['Ground', 'Air', 'Sea']
email_domains = ['gmail.com', 'yahoo.com', 'outlook.com']

# Create a list of countries and their correct country codes
countries = {'United States of America': 'USA', 'Canada': 'CA', 'United Kingdom': 'GB', 'Germany': 'DE', 'France': 'FR', 'South Africa': 'ZAR'}

# Generate unique merchant details
merchants = {}
for name in merchant_names:
    merchant_id = fake.unique.uuid4()
    merchants[merchant_id] = name

# Generate unique user details
users = {}
for _ in range(num_unique_users):
    user_id = fake.unique.uuid4()
    user_name = fake.name()
    user_country = random.choice(list(countries.keys()))  # Assign country first to a variable
    users[user_id] = {
        "user_id": user_id,
        "user_name": user_name,
        "user_msisdn": fake.phone_number(),
        "user_email": user_name.lower().replace(' ', '.') + '@' + random.choice(email_domains),
        "user_dob": fake.date_of_birth(minimum_age=16, maximum_age=80),
        "user_signup_date": fake.date_this_decade(),
        "user_onboarding_type": random.choice(['Online', 'In-person', 'Referral']),
        "user_country": user_country,
        "user_country_code": countries[user_country]  # Use the variable
    }

# Create transactions and shipping dates
transaction_dates = [fake.date_this_decade() for _ in range(num_rows)]
shipping_dates = [date + timedelta(days=random.choice([0, 1])) for date in transaction_dates]

# Select users and merchants for transactions
selected_user_ids = np.random.choice(list(users.keys()), num_rows, replace=True)
selected_merchant_ids = np.random.choice(list(merchants.keys()), num_rows, replace=True)

# List generation for product details
product_details = [random.choice(list(product_categories.items())) for _ in range(num_rows)]
product_names = [details[1][random.randint(0, len(details[1])-1)] for details in product_details]
product_categories_list = [details[0] for details in product_details]

In [4]:
# Complete DataFrame creation code
data = {
    "transaction_id": [fake.unique.uuid4() for _ in range(num_rows)],
    "amount": np.round(np.random.uniform(5.0, 1000.0, num_rows), 2),
    "transaction_state": [random.choice(transaction_states) for _ in range(num_rows)],
    "transaction_type": [random.choice(transaction_types) for _ in range(num_rows)],
    "tender_type": [random.choice(tender_types) for _ in range(num_rows)],
    "transaction_date": transaction_dates,
    "shipping_date": shipping_dates,
    "merchant_name": [merchants[mid] for mid in selected_merchant_ids],
    "merchant_id": selected_merchant_ids,
    "card_type": [random.choice(card_types) for _ in range(num_rows)],
    "card_number_masked": [fake.credit_card_number(card_type=random.choice(card_types))[:-4] + "****" for _ in range(num_rows)],
    "user_id": selected_user_ids,
    "user_name": [users[uid]["user_name"] for uid in selected_user_ids],
    "user_msisdn": [users[uid]["user_msisdn"] for uid in selected_user_ids],
    "user_email": [users[uid]["user_email"] for uid in selected_user_ids],
    "user_dob": [users[uid]["user_dob"] for uid in selected_user_ids],
    "user_signup_date": [users[uid]["user_signup_date"] for uid in selected_user_ids],
    "user_onboarding_type": [users[uid]["user_onboarding_type"] for uid in selected_user_ids],
    "user_country": [users[uid]["user_country"] for uid in selected_user_ids],
    "user_country_code": [users[uid]["user_country_code"] for uid in selected_user_ids],
    "product_name": product_names,
    "product_id": [fake.unique.uuid4() for _ in range(num_rows)],
    "product_category": product_categories_list,
    "order_id": [fake.unique.uuid4() for _ in range(num_rows)],
    "order_type": [random.choice(order_types) for _ in range(num_rows)],
    "shipping_type": [random.choice(shipping_types) for _ in range(num_rows)],
}

In [5]:
# Create a Pandas Dataframe from the generated data
df = pd.DataFrame(data)

In [6]:
# Data Preview
df.head()

Unnamed: 0,transaction_id,amount,transaction_state,transaction_type,tender_type,transaction_date,shipping_date,merchant_name,merchant_id,card_type,card_number_masked,user_id,user_name,user_msisdn,user_email,user_dob,user_signup_date,user_onboarding_type,user_country,user_country_code,product_name,product_id,product_category,order_id,order_type,shipping_type
0,0319a628-94ed-47c7-984e-f43045965ece,875.75,Failed,Withdrawal,Debit Card,2020-07-06,2020-07-06,Target,da97efc4-2c1c-475c-b7b0-f99349a3df9b,mastercard,37304285925****,604e97e1-f383-4b37-914c-a661aa9b4a6d,Brandi Andrews,791.452.9504x1483,brandi.andrews@yahoo.com,2000-08-15,2020-01-23,Online,Germany,DE,Science Fiction,ee3814cd-3bec-4c31-b1ad-f89c6987a606,Books,9ab28724-522a-4069-a738-60197beff74f,Standard,Ground
1,b1bb0e6c-a33d-4340-9975-5b57e65b24b2,436.08,Success,Withdrawal,Debit Card,2020-11-29,2020-11-30,Best Buy,58833bd9-9e5e-4ccf-b743-2cd6a26b089a,visa,223727844361****,ef95cb42-3aed-4854-a49e-0fa5f27a9669,Mariah Harris,+1-629-713-9891x308,mariah.harris@gmail.com,1969-05-06,2024-03-05,Referral,South Africa,ZAR,Puzzle,b183cb57-fa9c-4934-920c-d596d4a61a50,Toys,da20ae0c-9756-423f-bb2b-dc863f9e0d75,Express,Ground
2,843298c9-97df-4392-8816-606806d7c8be,371.74,Pending,Refund,Bank Transfer,2020-04-16,2020-04-17,eBay,b93cb87c-0860-47cf-9fb4-88e057679853,mastercard,266636321913****,6a585de5-1ba4-4f88-8bed-4dcb567522d3,Wendy Lynn,469.730.0609x0183,wendy.lynn@gmail.com,1989-07-26,2020-07-14,Referral,Germany,DE,Jeans,e587babf-8c38-495f-8820-4a558a0d2679,Clothing,3ee6ebae-9405-486a-ac62-59e8c058cd89,Standard,Sea
3,d7de6b6f-9285-45c2-834d-e6b31358a964,658.61,Failed,Refund,Bank Transfer,2022-11-09,2022-11-09,Amazon,1ee16bfa-4a03-4232-983c-49c635880592,visa,266615474622****,81013d7a-7d44-46ea-aefd-3f4d7a3a1800,Kaitlin Hernandez,(622)562-1805x73056,kaitlin.hernandez@outlook.com,1955-01-25,2020-02-18,Referral,South Africa,ZAR,Table,68e81f09-d887-45c0-9a17-f5b72a700027,Furniture,2c113106-cb35-47e3-8303-5c9d3131bf59,Standard,Ground
4,9a340686-0e82-4450-9fae-22376a6f6b16,146.92,Success,Refund,PayPal,2023-11-01,2023-11-02,Amazon,1ee16bfa-4a03-4232-983c-49c635880592,visa,37109477291****,61fda781-a89c-497c-bf24-8edeed56b540,Tammy Lowe,(820)470-4921x90580,tammy.lowe@yahoo.com,1947-01-09,2021-09-09,Referral,France,FR,Camera,9d1528f2-43a8-4690-952e-bf3ee98acf10,Electronics,4b505419-de46-4e75-b555-2d79719a44ed,Express,Air


### Quick Checks

In [7]:
# Data Types Preview
df.dtypes

transaction_id           object
amount                  float64
transaction_state        object
transaction_type         object
tender_type              object
transaction_date         object
shipping_date            object
merchant_name            object
merchant_id              object
card_type                object
card_number_masked       object
user_id                  object
user_name                object
user_msisdn              object
user_email               object
user_dob                 object
user_signup_date         object
user_onboarding_type     object
user_country             object
user_country_code        object
product_name             object
product_id               object
product_category         object
order_id                 object
order_type               object
shipping_type            object
dtype: object

In [8]:
# Type Casting for checks
cols = ['transaction_date', 'user_dob', 'user_signup_date', 'shipping_date']
for col in cols: 
    df[col] = pd.to_datetime(df[col])

In [9]:
# Date Checks
print("Transaction Dates")
print(df['transaction_date'].min())
print(df['transaction_date'].max())
print('\nUser DOBs')
print(df['user_dob'].min())
print(df['user_dob'].max())
print('\nUser Signup Dates')
print(df['user_signup_date'].min())
print(df['user_signup_date'].max())
print('\nShipping Dates')
print(df['shipping_date'].min())
print(df['shipping_date'].max())

Transaction Dates
2020-01-01 00:00:00
2024-08-23 00:00:00

User DOBs
1943-08-25 00:00:00
2008-08-23 00:00:00

User Signup Dates
2020-01-01 00:00:00
2024-08-23 00:00:00

Shipping Dates
2020-01-01 00:00:00
2024-08-24 00:00:00


In [10]:
print(len(df))
print(df['transaction_id'].nunique())
print(df['user_id'].nunique())

100000
100000
15712


### Save data

In [11]:
# Save the DataFrame to a CSV file
df.to_csv("data/ecom_data.csv", index=False)