In [8]:
import pandas as pd
from numpy.random import lognormal
import numpy as np

In [9]:
def truncated_lognormal(mean, sigma, low, high, size, rng=None):
    """Draw log-normal samples restricted to the closed interval [low, high].

    Rejection sampling: every value that lands outside the interval is redrawn
    until all entries lie inside it.

    Parameters
    ----------
    mean : float
        Mean of the underlying normal distribution (NumPy's ``lognormal``
        convention), i.e. ``exp(mean)`` is the median of the untruncated draw.
    sigma : float
        Standard deviation of the underlying normal distribution.
    low, high : float
        Inclusive bounds of the accepted range.
    size : int or tuple of ints
        Shape of the returned array.
    rng : numpy.random.Generator, optional
        Source of randomness. When omitted, the module-level ``lognormal``
        (NumPy's global random state) is used, exactly as before.

    Returns
    -------
    numpy.ndarray
        Array of shape ``size`` whose entries all satisfy ``low <= x <= high``.

    Raises
    ------
    ValueError
        If ``low > high`` or ``high <= 0``. Log-normal values are strictly
        positive, so such an interval can never be hit and the redraw loop
        would otherwise never terminate.
    """
    if low > high or high <= 0:
        raise ValueError(
            f"no log-normal value can fall in [{low}, {high}]; "
            "need low <= high and high > 0"
        )

    # Use the caller's generator when given, otherwise the global-state function.
    draw = lognormal if rng is None else rng.lognormal

    arr = draw(mean, sigma, size)
    mask = (arr < low) | (arr > high)
    while mask.any():
        # Redraw only the rejected positions; accepted values stay untouched.
        arr[mask] = draw(mean, sigma, mask.sum())
        mask = (arr < low) | (arr > high)
    return arr

In [10]:
# Read the semicolon-separated source file; numbers in it use a decimal comma.
df = pd.read_csv(
    './data/fra_cleaned.csv',
    sep=';',
    quotechar='"',
    decimal=',',
    encoding='ISO-8859-1',
)
# Shift the row labels by one so they start at 1 instead of 0.
df.index += 1

# Round-trip every text column through latin-1 bytes and decode those bytes as
# UTF-8, silently dropping whatever cannot be converted.
# NOTE(review): presumably this repairs UTF-8 text that was mis-decoded by the
# ISO-8859-1 read above -- confirm against the raw file.
for text_col in df.select_dtypes('object').columns:
    as_bytes = df[text_col].str.encode('latin-1', 'ignore')
    df[text_col] = as_bytes.str.decode('utf-8', 'ignore')

# Keep only the first 11 columns of the file.
df = df[df.columns[:11].tolist()]

# Synthetic price: a truncated log-normal draw in [50, 2500], rounded to the
# nearest integer and then multiplied by 100.
mean = np.log(20)
sigma = 0.8
price_draws = truncated_lognormal(mean, sigma, 50, 2500, len(df))
df['price'] = [round(draw) * 100 for draw in price_draws]

# Normalise the headers: lower-case, spaces replaced by underscores.
df.columns = df.columns.map(lambda label: label.lower().replace(' ', '_'))

# Non-numeric years become 0 so the column can be stored as integers.
year_numeric = pd.to_numeric(df['year'], errors='coerce')
df['year'] = year_numeric.fillna(0).astype(int)
df.head()

Unnamed: 0,url,perfume,brand,country,gender,rating_value,rating_count,year,top,middle,base,price
1,https://www.fragrantica.com/perfume/xerjoff/ac...,accento-overdose-pride-edition,xerjoff,Italy,unisex,1.42,201,2022,"fruity notes, aldehydes, green notes","bulgarian rose, egyptian jasmine, lily-of-the-...","eucalyptus, pine",5100
2,https://www.fragrantica.com/perfume/jean-paul-...,classique-pride-2024,jean-paul-gaultier,France,women,1.86,70,2024,"yuzu, citruses","orange blossom, neroli","musk, blonde woods",10300
3,https://www.fragrantica.com/perfume/jean-paul-...,classique-pride-2023,jean-paul-gaultier,France,unisex,1.91,285,2023,"blood orange, yuzu","neroli, orange blossom","musk, white woods",6600
4,https://www.fragrantica.com/perfume/bruno-bana...,pride-edition-man,bruno-banani,Germany,men,1.92,59,2019,"guarana, grapefruit, red apple","walnut, lavender, guava","vetiver, benzoin, amber",7000
5,https://www.fragrantica.com/perfume/jean-paul-...,le-male-pride-collector,jean-paul-gaultier,France,men,1.93,632,2020,"mint, lavender, cardamom, artemisia, bergamot","caraway, cinnamon, orange blossom","vanilla, sandalwood, amber, cedar, tonka bean",8200


In [11]:
import pandas as pd
import random

# Lookup table with one row per distinct country value; ids are numbered from 1
# in order of first appearance and the country name becomes the index.
country = (
    pd.DataFrame(df['country'].unique(), columns=['country'])
    .assign(country_id=lambda frame: frame.index + 1)
    .set_index('country')
)

# One row per brand (first occurrence wins), carrying the id of the country
# seen on that first row; brand ids are numbered from 1 and become the index.
brands = (
    df[['brand', 'country']]
    .assign(country_id=lambda frame: frame['country'].map(country['country_id']))
    .drop_duplicates(subset='brand', keep='first')
    .reset_index(drop=True)
    .assign(brand_id=lambda frame: frame.index + 1)
    .set_index('brand_id')
)

# Plain name -> id dictionaries used to translate the product rows.
country_to_id = country['country_id'].to_dict()
brand_to_id = dict(zip(brands['brand'], brands.index))

df['country_id'] = df['country'].map(country_to_id)
df['brand_id'] = df['brand'].map(brand_to_id)

# brands.csv holds only `brand` and `country_id`: the name of the country is
# dropped and, because index=False, brand_id itself is not written.
brands = brands.drop(columns=['country'])
brands.to_csv('./data/brands.csv', sep=';', index=False, encoding='ISO-8859-1')
# countries.csv keeps its index, i.e. the country name next to country_id.
country.to_csv('./data/countries.csv', sep=';', index=True, encoding='ISO-8859-1')

# Products refer to their brand by id only; the current row labels are copied
# into a `product_id` column which then becomes the index.
df = (
    df.drop(columns=['country', 'brand', 'country_id'])
    .assign(product_id=lambda frame: frame.index)
    .set_index('product_id')
)

# Random stock level between 1 and 100 (inclusive) for every product.
stock_levels = [random.randint(1, 100) for _ in range(len(df))]
df['quantity'] = pd.Series(stock_levels, index=df.index).astype(int)

# index=False: product_id is not part of the exported file.
df.to_csv('./data/products.csv', sep=';', index=False, encoding='ISO-8859-1', decimal='.', float_format='%.2f')


In [12]:
from faker import Faker
import pandas as pd

# Hungarian-locale fake-data generator, seeded with 42 at class level.
fake = Faker('hu_HU')
Faker.seed(42)

# Build 10,000 candidate user rows. The generator calls are made in a fixed
# order (profile, city, street address, password) for every row.
records = []
for _ in range(10_000):
    prof = {
        **fake.profile(fields=['username', 'name', 'sex', 'mail', 'birthdate']),
        'country': 'Hungary',
        'city': fake.city(),
        'street': fake.street_address(),
        'password': fake.password(length=12, special_chars=False, digits=True, upper_case=True, lower_case=True),
    }
    records.append(prof)

# Usernames must be unique: keep the first row of every duplicate group and
# renumber the remaining rows from 0.
df_users = (
    pd.DataFrame(records)
    .drop_duplicates(subset='username', keep='first')
    .reset_index(drop=True)
)

# No encoding argument is passed here, so pandas' default encoding is used.
df_users.to_csv('./data/users.csv', sep=';', index=False)


In [13]:
import random

# Give every user a 1-based id derived from its row position.
df_users['user_id'] = df_users.index + 1

NUM_CARTS = 7500
# Draw with replacement from the list of user IDs, so a single user may end up
# owning several carts while others own none.
picked_users = random.choices(df_users['user_id'].tolist(), k=NUM_CARTS)

# One row per cart: sequential id starting at 1, the owning user and a
# coin-flip `shipped` flag.
carts = pd.DataFrame({
    'cart_id': range(1, NUM_CARTS+1),
    'user_id': picked_users,
    'shipped': [random.choice([True, False]) for _ in range(NUM_CARTS)]
})

# The catalog has to enumerate *products*; it was previously built from
# df_users.index, which tied the set of purchasable products to the number of
# users. Entries are 0-based row positions of the product frame `df`: the
# cart-content cell adds 1 to each drawn entry, yielding ids 1..len(df).
product_catalog = list(range(len(df)))


In [14]:
import pandas as pd
import numpy as np
import random

# Tunable parameters of the synthetic cart contents.
avg_items = 3.97  # Poisson mean for the number of distinct products per cart
zipf_s = 1.07     # exponent of the rank-based popularity weights
max_items = 20    # upper clip for the number of distinct products per cart
max_qty = 5       # upper clip for the quantity of a single cart line
geom_p = 0.8      # success probability of the geometric quantity draw

# Popularity weights proportional to 1 / rank**zipf_s, normalised to sum to 1;
# the i-th catalog entry gets rank i + 1, so earlier entries are more likely.
N = len(product_catalog)
ranks = np.arange(1, N+1)
zipf_weights = 1 / np.power(ranks, zipf_s)
zipf_weights /= zipf_weights.sum()

# Sampling without replacement cannot return more items than the catalog
# holds, so the per-cart item count is capped by the catalog size as well.
items_cap = min(max_items, N)

records = []
for cart_id in carts['cart_id']:
    # Number of distinct products in this cart: Poisson draw, at least 1.
    n_items = np.random.poisson(lam=avg_items)
    n_items = int(np.clip(n_items, 1, items_cap))
    chosen = np.random.choice(
        product_catalog,
        size=n_items,
        replace=False,
        p=zipf_weights
    )
    for pid in chosen:
        # Quantity per line: geometric draw clipped to [1, max_qty].
        qty = np.random.geometric(p=geom_p)
        qty = int(np.clip(qty, 1, max_qty))
        records.append({
            'cart_id': cart_id,
            # catalog entries are shifted by one to form the product id
            'product_id': pid + 1,
            'quantity': qty
        })

df_cart_content = pd.DataFrame(records)
df_cart_content.to_csv(
    './data/cart_content.csv',
    sep=';',
    index=False,
    encoding='ISO-8859-1'
)

# carts.csv is written without the cart_id column. The column is dropped on a
# temporary copy instead of in place, so `carts` keeps its ids and this cell
# can be executed again without failing on the loop above. (The former
# `cart_id + 1` right before the drop had no effect on the output and is gone.)
carts.drop(columns=['cart_id']).to_csv(
    './data/carts.csv',
    sep=';',
    index=False,
    encoding='ISO-8859-1'
)
