In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
from data_simulation import DataGenerator

# Key into `scenarios` selecting which configuration is simulated.
SCENARIO = 0

# Magnitude shorthands (thousand / million / billion) used in the table below.
K = 1_000
M = 1_000_000
B = 1_000_000_000

# Scenario table. Each row is
# (minutes, advertisements per minute, shop visits per minute, customer database size)
# and is expanded into a dict keyed by scenario number (0..5).
scenarios = {
    number: {
        "minutes": minutes,
        "advertisements_per_minute": advertisements,
        "shop_visits_per_minute": shop_visits,
        "customer_database_size": customers,
    }
    for number, (minutes, advertisements, shop_visits, customers) in enumerate(
        [
            (20 * K, 50, 50, 10 * K),
            (100 * K, 10, 10, 100 * K),
            (500 * K, 20, 20, 1 * M),
            (1 * M, 100, 100, 10 * M),
            (5 * M, 200, 200, 100 * M),
            (10 * M, 1 * K, 1 * K, 1 * B),
        ]
    )
}

# Look the chosen scenario up once and feed its settings to the generator.
_selected = scenarios[SCENARIO]
data_generator = DataGenerator(
    number_of_minutes=_selected["minutes"],
    advertisements_per_minute=_selected["advertisements_per_minute"],
    shop_visits_per_minute=_selected["shop_visits_per_minute"],
    customer_database_size=_selected["customer_database_size"],
)

# Populates the data attributes on `data_generator` that the export cell reads
# (customer_database, store_visits, advertisements, revenue) -- presumably;
# confirm against data_simulation.DataGenerator.
data_generator.generate_and_set_data()

# Save as Parquet files


In [None]:
from pathlib import Path
import pyarrow as pa
import os
import numpy as np
import pyarrow.parquet as pq


# Build one pyarrow Table per generated dataset. Each Table column is taken
# from the matching column index of the corresponding `data_generator` attribute.
customer_database_table = pa.table(
    dict(
        customer_id=data_generator.customer_database[:, 0],
        brand_happiness=data_generator.customer_database[:, 1],
    )
)
store_visits_table = pa.table(
    dict(
        datetime=data_generator.store_visits[:, 0],
        customer_id=data_generator.store_visits[:, 1],
    )
)
advertisements_table = pa.table(
    dict(
        datetime=data_generator.advertisements[:, 0],
        channel=data_generator.advertisements[:, 1],
        spend=data_generator.advertisements[:, 2],
    )
)
revenue_table = pa.table(
    dict(
        datetime=data_generator.revenue[:, 0],
        revenue=data_generator.revenue[:, 1],
    )
)

# Output directory: <current working directory>/data_files/scenario_<SCENARIO>,
# created (with parents) if it does not exist yet.
save_location = Path.cwd() / "data_files" / f"scenario_{SCENARIO}"
save_location.mkdir(parents=True, exist_ok=True)

# Write each pyarrow Table to its own Parquet file inside `save_location`.
for _table, _filename in (
    (customer_database_table, "customer_database.parquet"),
    (store_visits_table, "store_visits.parquet"),
    (advertisements_table, "advertisements.parquet"),
    (revenue_table, "revenue.parquet"),
):
    pq.write_table(_table, save_location / _filename)