# 01 – Generate Synthetic Data (Fabric Lakehouse)

**Objetivo:** Generar un dataset sintético y persistirlo en formato Parquet en `lakehouse_sim/`, una carpeta local que hace las veces de Lakehouse (OneLake).

**Salida:** `lakehouse_sim/Files/raw/ops_daily.parquet` (ruta relativa al directorio de trabajo del notebook)

In [1]:

from pathlib import Path
from datetime import datetime, timedelta

# Sub-project root: the current working directory, made absolute.
# NOTE(review): this is only the notebook's folder if the kernel was started there.
PROJ_ROOT = Path.cwd().resolve()

# Raw landing zone inside the local lakehouse tree; created on first run and
# left untouched on later runs.
RAW_DIR = PROJ_ROOT.joinpath("lakehouse_sim", "Files", "raw")
RAW_DIR.mkdir(parents=True, exist_ok=True)

# Destination file for the generated dataset (displayed as the cell output).
RAW_PATH = RAW_DIR.joinpath("ops_daily.parquet")
RAW_PATH

PosixPath('/home/aedron/projects/dev-pro-portfolio/projects/ops-stability-analytics/lakehouse_sim/Files/raw/ops_daily.parquet')

In [2]:
import numpy as np
import pandas as pd

# --- Simulation parameters ---------------------------------------------------
N_AGENTS = 60  # number of simulated agents
DAYS = 90      # length of the daily window, run date included
SEED = 42      # seed for the NumPy generator created below

rng = np.random.default_rng(SEED)

agents = [f"AG{i:03d}" for i in range(1, N_AGENTS + 1)]
# Round-robin over six teams; i starts at 1, so the first agent lands in T2.
teams = [f"T{(i % 6) + 1}" for i in range(1, N_AGENTS + 1)]

# The window is anchored to the run date: the SEED fixes the random draws, but
# the `date` column shifts with the day the notebook is executed.
end = datetime.today().date()
start = end - timedelta(days=DAYS - 1)
dates = pd.date_range(start=start, end=end, freq="D")


def simulate_agent_days(rng, agent_id, team_id, day_labels):
    """Simulate one agent's daily metrics over the given days.

    Three per-agent parameters are drawn first (baseline productive hours,
    baseline cases closed, relative day-to-day variability); then, for every
    day, one productive-hours value and one cases-closed value are drawn
    around those baselines.

    Parameters
    ----------
    rng : object exposing ``normal(loc, scale)`` and ``uniform(low, high)``
        Random source, e.g. a ``numpy.random.Generator``.
    agent_id, team_id : str
        Identifiers copied verbatim into every returned row.
    day_labels : iterable of str
        One label per simulated day, copied verbatim into the rows.

    Returns
    -------
    list of tuple
        ``(day_label, agent_id, team_id, productive_hours, cases_closed)``
        with hours rounded to two decimals and cases truncated to ``int``.
    """
    # Baselines are clamped at zero: a negative draw would otherwise be reused
    # as a negative standard deviation below and make `normal` raise ValueError.
    base_prod = max(0.0, rng.normal(6.0, 1.0))
    base_cases = max(0.0, rng.normal(18, 4))
    stability = rng.uniform(0.05, 0.25)

    agent_rows = []
    for label in day_labels:
        # Draw order (hours, then cases) is kept so a given seed yields the
        # same sequence of values as before.
        hrs = max(0.0, rng.normal(base_prod, base_prod * stability))
        cases = max(0.0, rng.normal(base_cases, base_cases * stability))
        agent_rows.append((label, agent_id, team_id, round(hrs, 2), int(cases)))
    return agent_rows


# ISO date strings are identical for every agent, so build them once.
date_labels = [d.date().isoformat() for d in dates]

rows = []
for agent_id, team_id in zip(agents, teams):
    rows.extend(simulate_agent_days(rng, agent_id, team_id, date_labels))

df = pd.DataFrame(rows, columns=["date", "agent_id", "team_id", "productive_hours", "cases_closed"])
df.head()


Unnamed: 0,date,agent_id,team_id,productive_hours,cases_closed
0,2025-08-08,AG001,T2,7.62,7
1,2025-08-09,AG001,T2,4.48,14
2,2025-08-10,AG001,T2,5.86,13
3,2025-08-11,AG001,T2,5.11,16
4,2025-08-12,AG001,T2,7.39,14


In [3]:
# Persist the generated frame to the raw zone; index=False leaves the
# DataFrame index out of the Parquet file.
df.to_parquet(path=RAW_PATH, index=False)

# Confirmation line: row count with thousands separators and the destination path.
print(f"✅ Generated {len(df):,} rows → {RAW_PATH}")

✅ Generated 5,400 rows → /home/aedron/projects/dev-pro-portfolio/projects/ops-stability-analytics/lakehouse_sim/Files/raw/ops_daily.parquet
