# Dummy Shopping Data Generator

In [None]:
import random
import unicodedata

import numpy as np
import pandas as pd

from faker import Faker

In [None]:
# Shared Faker instance for the whole notebook, configured with the nl_NL locale.
faker = Faker(locale="nl_NL")

In [None]:
def clip(value, min_value, max_value):
    """Constrain ``value`` to the closed interval [min_value, max_value].

    The upper bound is applied first and the lower bound last, so if the
    bounds are inverted (``min_value > max_value``) the result is
    ``min_value``.
    """
    return max(min(value, max_value), min_value)
        
def normalize(value):
    """Return an ASCII-only version of the string ``value``.

    The text is first decomposed with Unicode NFKD, which splits accented
    letters into a base letter plus combining marks. Every code point that
    is still not ASCII afterwards is dropped.
    """
    decomposed = unicodedata.normalize('NFKD', value)
    ascii_bytes = decomposed.encode('ascii', 'ignore')
    return ascii_bytes.decode()

def fake_mail(name):
    """Build a fake e-mail address from a person's name.

    The local part is the lower-cased name with spaces replaced by dots and
    then reduced to ASCII by ``normalize``. The domain comes from a short
    list of fixed domains when ``random.randint(0, 10)`` is below 8 (8 of
    the 11 equally likely draws); otherwise ``faker.domain_name()`` supplies
    a random one.
    """
    common_domains = [
        "gmail.com", "hotmail.com", "outlook.com",
        "kpn.nl", "ziggo.nl",
    ]
    use_common_domain = random.randint(0, 10) < 8
    domain = random.choice(common_domains) if use_common_domain else faker.domain_name()

    local_part = normalize(name.lower().replace(" ", "."))
    return f"{local_part}@{domain}"

## Customers

In [None]:
# Customer settings
n = 250                       # number of customer records to generate
birthday = ("-80y", "-18y")   # (start, end) arguments unpacked into faker.date_between for "birthdate"
created = ("-5y", "-1y")      # (start, end) arguments unpacked into faker.date_between for "created_on"

In [None]:
# Build one record per customer, then turn the records into a DataFrame.
customer_rows = []

for _ in range(n):
    # Keep only the part of the generated name before the first hyphen.
    name = faker.unique.name().partition("-")[0]
    # The address is split on newlines; its first three parts are used as
    # street address, postcode and city respectively.
    address_lines = faker.unique.address().split('\n')

    customer_id = faker.unique.bothify(text='CST-##########')
    email = fake_mail(name)
    birthdate = faker.date_between(*birthday)
    street = address_lines[0]
    postcode = address_lines[1]
    city = address_lines[2]
    created_on = faker.date_between(*created)

    customer_rows.append({
        "customer_id": customer_id,
        "name": name,
        "e-mail": email,
        "birthdate": birthdate,
        "address": street,
        "postcode": postcode,
        "city": city,
        "created_on": created_on,
    })

customers = pd.DataFrame(customer_rows)

In [None]:
# Preview five randomly selected customer rows.
customers.sample(n=5)

In [None]:
# Write the customer table to customers.csv, leaving out the DataFrame index.
customers.to_csv(path_or_buf="customers.csv", index=False)

## Products

In [None]:
# Product specs, one tuple per product, in the same order as `columns` below:
# name, package size, package unit, subcategory, category, price.
# The same product name may appear more than once with a different package.
products = pd.DataFrame(
    data=[
        # Subcategory: Zuivel
        ("Margarine", 500, "gram", "Zuivel", "Levensmiddelen", 0.99),
        ("Halfvolle melk", 1, "liter", "Zuivel", "Levensmiddelen", 1.19),
        ("Halfvolle melk", 1.5, "liter", "Zuivel", "Levensmiddelen", 1.99),
        ("Volle melk", 1, "liter", "Zuivel", "Levensmiddelen", 1.29),
        ("Magere Yoghurt", 1, "liter", "Zuivel", "Levensmiddelen", 1.19),
        ("Volle Yoghurt", 1, "liter", "Zuivel", "Levensmiddelen", 1.29),

        # Subcategory: Graanproducten
        ("Volkoren spaghetti", 500, "gram", "Graanproducten", "Levensmiddelen", 2.09),
        ("Volkoren brood", 1, "heel", "Graanproducten", "Levensmiddelen", 1.29),
        ("Volkoren brood", 1, "half", "Graanproducten", "Levensmiddelen", 0.79),
        ("Wit brood", 1, "heel", "Graanproducten", "Levensmiddelen", 0.99),
        ("Wit brood", 1, "half", "Graanproducten", "Levensmiddelen", 0.69),

        # Subcategories: Fruit and Groenten
        ("Witte druiven", 500, "gram", "Fruit", "Levensmiddelen", 2.79),
        ("Appels", 8, "stuks", "Fruit", "Levensmiddelen", 2.79),
        ("Peren", 4, "stuks", "Fruit", "Levensmiddelen", 2.99),
        ("Komkommer", 1, "stuks", "Groenten", "Levensmiddelen", 1.39),
        ("Tomaten", 500, "gram", "Groenten", "Levensmiddelen", 2.49),
        ("Sla krop", 1, "stuks", "Groenten", "Levensmiddelen", 1.59),
        ("Sla gesneden", 200, "gram", "Groenten", "Levensmiddelen", 1.19),

        # Subcategory: Alcoholische dranken
        ("Bier", 1, "krat", "Alcoholische dranken", "Levensmiddelen", 14.17),
        ("Rode wijn", 1, "liter", "Alcoholische dranken", "Levensmiddelen", 6.99),
        ("Witte wijn", 1, "liter", "Alcoholische dranken", "Levensmiddelen", 5.29),

        # Category: Persoonlijke verzorging
        ("Shampoo", 0.3, "liter", "Haarproducten", "Persoonlijke verzorging", 6.49),
        ("Handgel", 0.3, "liter", "Hygiene producten", "Persoonlijke verzorging", 2.99),
        ("Tandpasta", 0.075, "liter", "Hygiene producten", "Persoonlijke verzorging", 2.75),
    ],
    columns=["product", "package", "package_unit", "subcategory", "category", "price"],
)
# Generate one unique identifier per product row from the 'PRD-##########'
# pattern and attach the identifiers as the `product_id` column.
product_ids = [
    faker.unique.bothify(text='PRD-##########')
    for _ in products.index
]
products = products.assign(product_id=product_ids)


In [None]:
# Preview five randomly selected product rows.
products.sample(n=5)

In [None]:
# Column order for the exported file: the identifier first, then the
# remaining product fields.
order = [
    'product_id',
    'product',
    'package',
    'package_unit',
    'subcategory',
    'category',
    'price',
]
# Write only the listed columns, in that order, without the DataFrame index.
products.to_csv("products.csv", columns=order, index=False)

## Transactions

In [None]:
# Generator settings
date_range = ("2023-01-01", "2023-01-31")   # (start, end) arguments unpacked into pd.date_range
daily_transactions = (80, 120)              # (min, max) bounds for the number of transactions per day
transaction_items = (1, 7)                  # (min, max) bounds used to clip the quantity of each transaction line

In [None]:
# Generate the transaction lines. Every day in `date_range` gets a random
# number of transactions; each transaction belongs to one randomly chosen
# customer and consists of 1 to 5 lines, each for one randomly chosen product.
transactions = []

days = pd.date_range(*date_range)

# Draw the number of transactions per day with BOTH bounds inclusive, in line
# with the other (min, max) settings. np.random.randint excludes its upper
# bound, hence the + 1 so that the configured maximum can actually occur.
min_daily, max_daily = daily_transactions
counts = np.random.randint(min_daily, max_daily + 1, size=len(days))

for day, count in zip(days, counts):

    for _ in range(count):

        # Look the id up by column name so the result does not depend on the
        # column order of the customers table.
        customer_id = customers["customer_id"].sample(1).iloc[0]
        transaction_id = faker.unique.bothify(text='TX-##########')
        lines = random.randint(1, 5)

        # Line numbers start at 1 and are embedded in the line id below.
        for line_nr in range(1, lines + 1):
            product = products.sample(1)
            product_id = product["product_id"].iloc[0]
            product_price = product["price"].iloc[0]

            # Log-normally distributed quantity, rounded to a whole number and
            # then clipped into the configured (min, max) bounds.
            quantity = round(random.lognormvariate(0.4, 0.8))
            quantity = clip(quantity, *transaction_items)

            transaction = {
                "transaction_id": transaction_id,
                "line_id": f"{transaction_id}-{line_nr:04d}",
                "customer_id": customer_id,
                "product_id": product_id,
                "transaction_date": day,
                "quantity": quantity,
                "price": product_price,
                # Rounded to two decimals to avoid float noise in the line total.
                "total": round(quantity * product_price, 2),
            }

            transactions.append(transaction)

# One row per transaction line.
transactions = pd.DataFrame(transactions)

In [None]:
# Preview five randomly selected transaction lines.
transactions.sample(n=5)

In [None]:
# Show the dtype pandas assigned to each column of the transactions table.
transactions.dtypes

In [None]:
# Write the transaction lines to transactions.csv, leaving out the DataFrame index.
transactions.to_csv(path_or_buf="transactions.csv", index=False)