In [0]:
from pyspark.sql import Row
import uuid
import random
from datetime import datetime, timedelta

# Parameters
num_users = 100
num_events = 10000  # total number of events to generate

# Allowed values for the categorical columns
event_names = ["screen_view", "click", "scroll"]
event_paths = ["/login", "/home", "/product", "/cart", "/purchase"]
event_devices = ["mobile", "desktop"]

# Read the clock once: every event is offset from the same reference instant
# instead of re-reading the clock on each of the num_events iterations.
reference_time = datetime.now()

# Generate the synthetic events
data = []
for _ in range(num_events):
    # Random offset back from the reference instant:
    # 0-2 whole days plus 0-23 hours and 0-59 minutes (i.e. just under 3 days).
    time_offset = timedelta(
        days=random.randint(0, 2),
        hours=random.randint(0, 23),
        minutes=random.randint(0, 59),
    )
    event_time_dt = reference_time - time_offset

    # event_time and event_date are formatted from the same datetime, so the
    # date column always agrees with the date part of the timestamp string.
    data.append(Row(
        event_id=str(uuid.uuid4()),
        user_id=f"U{random.randint(1, num_users):03d}",
        event_time=event_time_dt.strftime('%Y-%m-%d %H:%M:%S'),
        event_name=random.choice(event_names),
        event_path=random.choice(event_paths),
        event_device=random.choice(event_devices),
        event_date=event_time_dt.strftime('%Y-%m-%d'),
    ))

# Build the DataFrame from the generated rows (event_time is still a string here)
df = spark.createDataFrame(data)

# Show a sample without truncating column values
df.show(10, truncate=False)


+------------------------------------+-------+-------------------+-----------+----------+------------+----------+
|event_id                            |user_id|event_time         |event_name |event_path|event_device|event_date|
+------------------------------------+-------+-------------------+-----------+----------+------------+----------+
|f8b7a313-f50c-47f4-8c4a-a5b6a155f5ef|U010   |2025-05-09 18:22:10|screen_view|/home     |mobile      |2025-05-09|
|a3c65f62-f859-4290-b49c-62861e80a2cb|U073   |2025-05-09 00:35:10|screen_view|/purchase |desktop     |2025-05-09|
|4059b557-c62b-40ce-98b0-ceab049b15e4|U010   |2025-05-09 02:02:10|click      |/product  |mobile      |2025-05-09|
|9857a036-f3c4-47c6-97c3-e528496619f5|U027   |2025-05-10 03:34:10|scroll     |/login    |mobile      |2025-05-10|
|3f65c9ab-3286-4b49-9d15-8a0a7bb093f1|U020   |2025-05-11 12:59:10|scroll     |/cart     |desktop     |2025-05-11|
|35917623-5da1-42c8-9e89-2ea54e1007c8|U061   |2025-05-10 02:06:10|scroll     |/login    

In [0]:
from pyspark.sql.functions import col

# Convert the string column 'event_time' to Spark's TIMESTAMP type,
# replacing the column in place under the same name.
event_time_as_timestamp = col("event_time").cast("timestamp")
df = df.withColumn("event_time", event_time_as_timestamp)

df.show()

+--------------------+-------+-------------------+-----------+----------+------------+----------+
|            event_id|user_id|         event_time| event_name|event_path|event_device|event_date|
+--------------------+-------+-------------------+-----------+----------+------------+----------+
|f8b7a313-f50c-47f...|   U010|2025-05-09 18:22:10|screen_view|     /home|      mobile|2025-05-09|
|a3c65f62-f859-429...|   U073|2025-05-09 00:35:10|screen_view| /purchase|     desktop|2025-05-09|
|4059b557-c62b-40c...|   U010|2025-05-09 02:02:10|      click|  /product|      mobile|2025-05-09|
|9857a036-f3c4-47c...|   U027|2025-05-10 03:34:10|     scroll|    /login|      mobile|2025-05-10|
|3f65c9ab-3286-4b4...|   U020|2025-05-11 12:59:10|     scroll|     /cart|     desktop|2025-05-11|
|35917623-5da1-42c...|   U061|2025-05-10 02:06:10|     scroll|    /login|      mobile|2025-05-10|
|17cb09d4-98f5-48b...|   U017|2025-05-09 08:27:10|      click|     /home|      mobile|2025-05-09|
|f3d3a271-79ab-485..

In [0]:
# Append the generated events to the "business_events" table.
#
# insertInto() resolves columns by POSITION and ignores the DataFrame's column
# names, so a table whose column order differs from the DataFrame's would
# silently receive misaligned data. Reordering the DataFrame to the table's own
# column order first makes the write name-safe; if the DataFrame lacks a column
# the table has, the select() fails loudly instead of writing bad rows.
business_events_table = "db_experiments.business_events"
business_events_columns = spark.table(business_events_table).columns
df.select(*business_events_columns).write.mode("append").insertInto(business_events_table)

In [0]:
from pyspark.sql import Row
import uuid
import random
from datetime import datetime, timedelta
from pyspark.sql import SparkSession

# Reuse the active Spark session, or create one if none exists yet
spark = SparkSession.builder.getOrCreate()

# Parameters
num_users = 100
test_codes = ["T001", "T002"]
variations = ["A", "B"]
events_per_user_test = 2  # number of events per (user, test_code) combination

# Step 1: pick exactly one variation for every (user, test_code) pair, so all
# events generated below for that pair carry the same user_variation.
user_test_variations = [
    (f"U{user_num:03d}", test_code, random.choice(variations))
    for user_num in range(1, num_users + 1)
    for test_code in test_codes
]

# Read the clock once: every event date is offset from the same reference
# instant instead of re-reading the clock inside the nested loop.
reference_time = datetime.now()

# Step 2: generate the events, reusing the variation assigned in step 1
data = []
for user_id, test_code, variation in user_test_variations:
    for _ in range(events_per_user_test):
        # Event date is today or one of the two previous days
        event_date_dt = reference_time - timedelta(days=random.randint(0, 2))

        data.append(Row(
            event_id=str(uuid.uuid4()),
            user_id=user_id,
            test_code=test_code,
            user_variation=variation,
            event_date=event_date_dt.strftime('%Y-%m-%d'),
        ))

# Build the DataFrame from the generated rows
df_events = spark.createDataFrame(data)

# Spot-check one user: each test_code should show a single, consistent variation
df_events.filter(df_events.user_id == "U001").show()

+--------------------+-------+---------+--------------+----------+
|            event_id|user_id|test_code|user_variation|event_date|
+--------------------+-------+---------+--------------+----------+
|bf9d94f7-7150-402...|   U001|     T001|             B|2025-05-09|
|9f164ae6-15bb-4de...|   U001|     T001|             B|2025-05-11|
|513b1183-bad6-405...|   U001|     T002|             A|2025-05-09|
|494b07d0-53d2-4a9...|   U001|     T002|             A|2025-05-10|
+--------------------+-------+---------+--------------+----------+



In [0]:
# Append the generated events to the "platform_events" table.
#
# insertInto() resolves columns by POSITION and ignores the DataFrame's column
# names, so reorder the DataFrame to the table's own column order before
# writing. A column missing from the DataFrame makes the select() fail loudly
# instead of silently writing misaligned data.
platform_events_table = "db_experiments.platform_events"
platform_events_columns = spark.table(platform_events_table).columns
df_events.select(*platform_events_columns).write.mode("append").insertInto(platform_events_table)
