# Dummy Shopping Data Generator

In [1]:
import sqlite3
import random
import unicodedata

import numpy as np
import pandas as pd

from faker import Faker

In [2]:
# Set up
# Connect to the SQLite database file that receives the generated data and
# grab a cursor for executing statements against it.
db = sqlite3.connect("sales_data.db")
cursor = db.cursor()
# Fake-data provider configured with the "nl_NL" locale.
faker = Faker("nl_NL")

In [3]:
def clip(value, min_value, max_value):
    """Return *value* limited to the range [min_value, max_value].

    The upper bound is applied first and the lower bound second, so when the
    bounds are inverted (min_value > max_value) the lower bound wins.
    """
    if max_value < value:
        value = max_value
    if min_value > value:
        value = min_value
    return value

def normalize(value):
    """Return *value* reduced to plain ASCII.

    The text is decomposed with NFKD so accented letters split into a base
    letter plus combining marks; every non-ASCII code point is then dropped.
    """
    decomposed = unicodedata.normalize('NFKD', value)
    ascii_bytes = decomposed.encode('ascii', 'ignore')
    return ascii_bytes.decode()

def fake_mail(name):
    """Build a fake e-mail address for the person called *name*.

    The local part is the lower-cased name with spaces turned into dots and
    reduced to ASCII. The domain is usually one of a handful of common
    providers (8 out of the 11 possible draws); otherwise a random domain is
    taken from the fake-data provider.
    """
    common_domains = [
        "gmail.com", "hotmail.com", "outlook.com",
        "kpn.nl", "ziggo.nl",
    ]

    use_common_domain = random.randint(0, 10) < 8
    domain = random.choice(common_domains) if use_common_domain else faker.domain_name()

    local_part = normalize(".".join(name.lower().split(" ")))
    return local_part + "@" + domain

## Create structure

In [None]:
# Start from a clean slate: remove any tables left over from a previous run.
for table_name in ("Klanten", "Producten", "Transacties"):
    cursor.execute(f"DROP TABLE IF EXISTS {table_name};")

In [None]:
# Customers (Klanten): one row per customer, keyed by KlantId.
create_klanten_sql = """
    CREATE TABLE Klanten (
        KlantId TEXT PRIMARY KEY,
        Naam TEXT,
        Email TEXT,
        Geboortedatum DATE,
        Adres TEXT,
        Postcode TEXT,
        Stad TEXT,
        Aangemaakt DATE
    );
"""

# Products (Producten): one row per product/packaging variant, keyed by ProductId.
create_producten_sql = """
    CREATE TABLE Producten (
        ProductId TEXT PRIMARY KEY,
        Naam TEXT,
        Verpakking REAL,
        VerpakkingEenheid TEXT,
        Categorie TEXT,
        Subcategorie TEXT,
        Prijs REAL
    );
"""

# Transactions (Transacties): one row per transaction line, keyed by the
# transaction id together with the line number.
create_transacties_sql = """
    CREATE TABLE Transacties (
        TransactieId TEXT,
        KlantId TEXT,
        ProductId TEXT,
        DatumTijd DATETIME,
        RegelNummer INTEGER,
        Aantal INTEGER,
        Prijs REAL,

        PRIMARY KEY(TransactieId, RegelNummer)
    );
"""

for create_sql in (create_klanten_sql, create_producten_sql, create_transacties_sql):
    cursor.execute(create_sql)

## Customers

In [6]:
# Customer settings
# Number of customers to generate.
n = 250
# (start, end) offsets relative to today, passed to the fake date generator.
birthday = ("-80y", "-18y")
created = ("-5y", "-1y")

In [7]:
def _fake_customer():
    """Return one fake customer record as a dict keyed by Klanten column name."""
    # Keep only the part of the name before the first hyphen.
    full_name = faker.unique.name().split("-", 1)[0]
    # The generated address is multi-line: the first three lines are used as
    # street address, postcode and city respectively.
    address_lines = faker.unique.address().split('\n')

    return {
        "KlantId": faker.unique.bothify(text='CST-#####'),
        "Naam": full_name,
        "Email": fake_mail(full_name),
        "Geboortedatum": faker.date_between(*birthday),
        "Adres": address_lines[0],
        "Postcode": address_lines[1],
        "Stad": address_lines[2],
        "Aangemaakt": faker.date_between(*created),
    }


# Generate `n` customers and collect them in a DataFrame.
customers = pd.DataFrame([_fake_customer() for _ in range(n)])

In [None]:
# Eyeball a handful of randomly chosen customers.
customers.sample(n=5)

In [None]:
# Insert all generated customers into the Klanten table.
#
# The two date columns hold `datetime.date` objects. Convert them to ISO-8601
# text ("YYYY-MM-DD") explicitly instead of relying on sqlite3's implicit date
# adapter, which is deprecated since Python 3.12. The stored values are the
# same as before, because the implicit adapter also wrote the ISO date string.
date_columns = ("Geboortedatum", "Aangemaakt")
customer_records = [
    {
        column: (str(value) if column in date_columns else value)
        for column, value in record.items()
    }
    for record in customers.to_dict(orient="records")
]

cursor.executemany(
    """
    INSERT INTO Klanten
    (KlantId, Naam, Email, Geboortedatum, Adres, Postcode, Stad, Aangemaakt)
    VALUES(:KlantId, :Naam, :Email, :Geboortedatum, :Adres, :Postcode, :Stad, :Aangemaakt);
    """,
    customer_records,
)

## Products

In [10]:
# Product specs: name, packaging, package_unit, subcategory, category, price
# (the order of each tuple matches the `columns` list below).
products = pd.DataFrame(
    data=[

        ("Margarine", 500, "gram", "Zuivel", "Levensmiddelen", 0.99),
        ("Halfvolle melk", 1, "liter", "Zuivel", "Levensmiddelen", 1.19),
        ("Halfvolle melk", 1.5, "liter", "Zuivel", "Levensmiddelen", 1.99),
        ("Volle melk", 1, "liter", "Zuivel", "Levensmiddelen", 1.29),
        ("Magere Yoghurt", 1, "liter", "Zuivel", "Levensmiddelen", 1.19),
        ("Volle Yoghurt", 1, "liter", "Zuivel", "Levensmiddelen", 1.29),

        ("Volkoren spaghetti", 500, "gram", "Graanproducten", "Levensmiddelen", 2.09),
        ("Volkoren brood", 1, "heel", "Graanproducten", "Levensmiddelen", 1.29),
        ("Volkoren brood", 1, "half", "Graanproducten", "Levensmiddelen", 0.79),
        ("Wit brood", 1, "heel", "Graanproducten", "Levensmiddelen", 0.99),
        ("Wit brood", 1, "half", "Graanproducten", "Levensmiddelen", 0.69),

        ("Witte druiven", 500, "gram", "Fruit", "Levensmiddelen", 2.79),
        ("Appels", 8, "stuks", "Fruit", "Levensmiddelen", 2.79),
        ("Peren", 4, "stuks", "Fruit", "Levensmiddelen", 2.99),
        ("Komkommer", 1, "stuks", "Groenten", "Levensmiddelen", 1.39),
        ("Tomaten", 500, "gram", "Groenten", "Levensmiddelen", 2.49),
        ("Sla krop", 1, "stuks", "Groenten", "Levensmiddelen", 1.59),
        ("Sla gesneden", 200, "gram", "Groenten", "Levensmiddelen", 1.19),

        ("Bier", 1, "krat", "Alcoholische dranken", "Levensmiddelen", 14.17),
        ("Rode wijn", 1, "liter", "Alcoholische dranken", "Levensmiddelen", 6.99),
        ("Witte wijn", 1, "liter", "Alcoholische dranken", "Levensmiddelen", 5.29),

        ("Shampoo", 300, "milliliter", "Haarproducten", "Persoonlijke verzorging", 6.49),
        ("Handgel", 100, "milliliter", "Hygiene producten", "Persoonlijke verzorging", 2.99),
        ("Tandpasta", 75, "milliliter", "Hygiene producten", "Persoonlijke verzorging", 2.75),

    ],
    columns=[
        "Naam", "Verpakking", "VerpakkingEenheid", "Subcategorie", "Categorie", "Prijs"
    ]
)
# Give every product a unique id of the form PRD-12345.
product_ids = [faker.unique.bothify(text='PRD-#####') for _ in range(len(products))]
products = products.assign(ProductId=product_ids)


In [None]:
# Eyeball a handful of randomly chosen products.
products.sample(n=5)

In [None]:
# Bulk-insert the product catalogue; each named placeholder is filled from the
# matching key of a record dictionary.
insert_product_sql = """
    INSERT INTO Producten
    (ProductId, Naam, Verpakking, VerpakkingEenheid, Categorie, Subcategorie, Prijs)
    VALUES(:ProductId, :Naam, :Verpakking, :VerpakkingEenheid, :Categorie, :Subcategorie, :Prijs);
    """
product_records = products.to_dict(orient="records")

cursor.executemany(insert_product_sql, product_records)

## Transactions

In [13]:
# Generator settings
# First and last calendar day for which transactions are generated.
date_range = ("2024-01-01", "2024-12-31")
# (low, high) pair from which the number of transactions per day is drawn.
daily_transactions = (10, 20)
# (low, high) pair of item counts.
transaction_items = (1, 7)

In [14]:
# Generate the transaction lines: for every day in `date_range` a random number
# of transactions, each made by a random customer and consisting of one to five
# lines, each line holding a random product and quantity.
transactions = []

days = pd.date_range(*date_range)

# Draw the number of transactions per day with both configured bounds
# inclusive. `np.random.randint` excludes its upper bound, hence the + 1.
# (Truncating a uniform float draw would never reach the upper bound.)
min_daily, max_daily = daily_transactions
counts = np.random.randint(min_daily, max_daily + 1, size=len(days))

for day, count in zip(days, counts):

    for _ in range(count):

        # Select the id by column label rather than by column position.
        customer_id = customers["KlantId"].sample(1).iloc[0]
        transaction_id = faker.unique.bothify(text='TX-######')
        lines = random.randint(1, 5)

        # All lines of a transaction share one timestamp between 09:00:00 and
        # 17:59:59 on the given day.
        seconds_into_day = random.randrange(9 * 3600, 18 * 3600)
        timestamp = day + pd.Timedelta(seconds=seconds_into_day)

        for line_nr in range(1, lines + 1):
            product = products.sample(1)
            product_id = product["ProductId"].iloc[0]
            product_price = product["Prijs"].iloc[0]

            # Rounding the log-normal draw can give 0; clip to the configured
            # range so every line sells at least one item.
            quantity = round(random.lognormvariate(0.3, 0.4))
            quantity = clip(quantity, *transaction_items)

            transactions.append({
                "TransactieId": transaction_id,
                "KlantId": customer_id,
                "ProductId": product_id,
                "DatumTijd": timestamp,
                "RegelNummer": line_nr,
                "Aantal": quantity,
                "Prijs": product_price,
            })

transactions = pd.DataFrame(transactions)

In [15]:
# pd.Series(np.random.lognormal(0.5, 0.5, 1000)).plot.hist(bins=50)

In [16]:
# Turn the timestamp column into "YYYY-MM-DD HH:MM:SS" text.
timestamp_format = "%Y-%m-%d %H:%M:%S"
formatted_timestamps = transactions["DatumTijd"].dt.strftime(timestamp_format)
transactions["DatumTijd"] = formatted_timestamps

In [None]:
# Eyeball a handful of randomly chosen transaction lines.
transactions.sample(n=5)

In [None]:
# Inspect the column dtypes of the transactions frame before inserting it.
transactions.dtypes

In [None]:
# Bulk-insert every transaction line; each named placeholder is filled from
# the matching key of a record dictionary.
insert_transaction_sql = """
    INSERT INTO Transacties
    (TransactieId, KlantId, ProductId, DatumTijd, RegelNummer, Aantal, Prijs)
    VALUES(:TransactieId, :KlantId, :ProductId, :DatumTijd, :RegelNummer, :Aantal, :Prijs);
    """
transaction_records = transactions.to_dict(orient="records")

cursor.executemany(insert_transaction_sql, transaction_records)

In [20]:
# Commit the pending transaction first, then close the connection; the commit
# must come before the close.
db.commit()
db.close()