Generating ecommerce data

In [1]:
# generate_ecommerce_data.py

import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import random
import os

In [3]:
# Seed both random number sources (Python's `random` and NumPy's legacy global
# generator) with one shared value so repeated runs draw the same numbers

SEED = 42
random.seed(SEED)
np.random.seed(SEED)

In [4]:
# Dataset dimensions and the calendar window that invoice dates are drawn from

n_customers = 2_000      # number of distinct customers
n_records = 20_000       # number of purchase rows to generate
start_date = datetime(year=2020, month=1, day=1)
end_date = datetime(year=2023, month=12, day=31)
date_range_days = (end_date - start_date).days  # whole days from start_date to end_date

In [5]:
# Countries a customer can be assigned to.
# Order matters: the probability weights passed to np.random.choice when
# countries are assigned are matched to this list by position.

countries = [
    "United Kingdom",
    "France",
    "Germany",
    "Spain",
    "Netherlands",
    "Belgium",
    "USA",
    "Canada",
    "Australia",
]

In [6]:

# Product catalogue as (stock-code prefix, human-readable description) pairs

products = [
    ("PEN", "Pen, blue ink"),
    ("NOTE", "Notebook A4"),
    ("MUG", "Coffee Mug"),
    ("BAG", "Reusable Bag"),
    ("TSHIRT", "Company T-Shirt"),
    ("USB", "USB Flash Drive"),
    ("BOOK", "Technical Book"),
    ("HEADPH", "Headphones"),
    ("PHONEC", "Phone Charger"),
    ("LAMP", "Desk Lamp"),
]

In [7]:
# Build the list of customer IDs: "C1000", "C1001", ... (one per customer)

customer_ids = [f"C{number}" for number in range(1000, 1000 + n_customers)]

In [8]:
# Draw one country per customer. Each weight is the probability of picking the
# country at the same position in `countries`.
country_weights = [0.45, 0.08, 0.07, 0.06, 0.05, 0.04, 0.10, 0.08, 0.07]
cust_country = np.random.choice(countries, size=n_customers, p=country_weights)

In [9]:
# Customer lookup table: one row per customer ID together with its country
customers_df = pd.DataFrame(
    {
        "CustomerID": customer_ids,
        "Country": cust_country,
    }
)

In [11]:
# Generate transactional ecommerce records

# Build the CustomerID -> Country mapping once, before the loop. Filtering
# customers_df inside the loop would rescan every customer row for each record.
country_by_customer = customers_df.set_index("CustomerID")["Country"].to_dict()

records = []

for i in range(n_records):
    customer = random.choice(customer_ids)       # Pick a random customer
    invoice_no = f"INV{100000 + i}"              # Sequential, therefore unique, invoice number

    # Choose a random product and get its details
    prod_code, prod_desc = random.choice(products)

    # Quantity based on Poisson distribution (skewed toward small integers)
    qty = np.random.poisson(lam=2)
    if qty == 0:
        qty = 1   # Avoid zero quantity

    # Unit price: magnitude of a normal draw, shifted by 1 so it is never below 1.00
    unit_price = round(float(abs(np.random.normal(loc=20, scale=15))) + 1, 2)

    # Random invoice timestamp within the date range.
    # random.randint includes both endpoints, so the seconds offset must stop at
    # 86399 (23:59:59). An offset of 86400 is a full extra day and, on the last
    # day of the range, would yield a timestamp after end_date's calendar day.
    invoice_date = start_date + timedelta(
        days=random.randint(0, date_range_days),
        seconds=random.randint(0, 86399)
    )

    # Create a stock code using product prefix + random 3-digit number
    stock_code = prod_code + str(random.randint(100, 999))

    # Introduce small % of returns (negative quantity)
    if random.random() < 0.01:
        qty = -qty

    # Append record to list
    records.append({
        "InvoiceNo": invoice_no,
        "StockCode": stock_code,
        "Description": prod_desc,
        "Quantity": qty,
        "InvoiceDate": invoice_date.strftime("%Y-%m-%d %H:%M:%S"),
        "UnitPrice": unit_price,
        "CustomerID": customer,
        "Country": country_by_customer[customer]
    })

In [12]:
# Assemble the generated row dictionaries into one DataFrame and preview it
df = pd.DataFrame(data=records)
df.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,INV100000,BAG489,Reusable Bag,1,2020-03-14 18:36:18,20.48,C2130,Belgium
1,INV100001,PEN540,"Pen, blue ink",2,2020-08-05 05:35:07,4.87,C1977,United Kingdom
2,INV100002,HEADPH716,Headphones,1,2020-02-07 16:02:31,5.12,C2281,Netherlands
3,INV100003,MUG365,Coffee Mug,5,2022-12-04 06:32:29,21.43,C1566,United Kingdom
4,INV100004,NOTE797,Notebook A4,1,2021-09-23 09:59:02,33.72,C1494,France


In [13]:
# Add the line total: quantity times unit price, rounded to 2 decimals
# (rows with a negative quantity, i.e. returns, get a negative total)
line_totals = df["Quantity"].mul(df["UnitPrice"])
df["TotalPrice"] = line_totals.round(2)
df.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,TotalPrice
0,INV100000,BAG489,Reusable Bag,1,2020-03-14 18:36:18,20.48,C2130,Belgium,20.48
1,INV100001,PEN540,"Pen, blue ink",2,2020-08-05 05:35:07,4.87,C1977,United Kingdom,9.74
2,INV100002,HEADPH716,Headphones,1,2020-02-07 16:02:31,5.12,C2281,Netherlands,5.12
3,INV100003,MUG365,Coffee Mug,5,2022-12-04 06:32:29,21.43,C1566,United Kingdom,107.15
4,INV100004,NOTE797,Notebook A4,1,2021-09-23 09:59:02,33.72,C1494,France,33.72


In [14]:
# Save the generated dataset as CSV.
# The target directory is defined once. It defaults to the original location
# and can be redirected by setting the ECOMMERCE_OUTPUT_DIR environment
# variable, so the output need not land under one user's home directory.
# An unset or empty variable falls back to the default.

output_dir = os.environ.get("ECOMMERCE_OUTPUT_DIR") or "C:/Users/muzna/anaconda_projects"
os.makedirs(output_dir, exist_ok=True)

# Joined with "/" rather than os.path.join so the default path string (and the
# printed message) stays exactly as before on every platform
csv_path = f"{output_dir}/ecommerce_sample.csv"
df.to_csv(csv_path, index=False)

print("Saved:", csv_path)


Saved: C:/Users/muzna/anaconda_projects/ecommerce_sample.csv
