In [0]:
from pyspark.sql.functions import (
    year, weekofyear, count, expr, to_date, 
    monotonically_increasing_id, date_add, lit
)
import pandas as pd
import numpy as np
import statsmodels.api as sm

# --- 1. LOAD AND AGGREGATE DATA ---
# Build one row per ISO week (Monday-based) with the number of notified cases.
#
# The week bucket is derived with date_trunc('week', ...), which returns the
# Monday of the week each date falls in. Combining the calendar year() with
# the ISO weekofyear() instead would mislabel dates around New Year: e.g.
# 2021-01-01 belongs to ISO week 53 of 2020, so "2021 + week 53" would land
# at the end of 2021.
df = spark.table("workspace.default.complete_dataset")
df_weekly = (
    df.withColumn(
        "Semana",
        expr("to_date(date_trunc('week', fecha_notificacion))"),
    )
    # Rows with a missing/unparseable notification date cannot be assigned
    # to a week; keeping them would yield a null "Semana" bucket.
    .where("Semana IS NOT NULL")
    .groupBy("Semana")
    .agg(count("*").alias("y"))
    # Keep the "year"/"week" columns as a consistent ISO pair: the ISO year
    # of a week is the calendar year of its Thursday (Monday + 3 days).
    .withColumn("year", year(date_add("Semana", 3)))
    .withColumn("week", weekofyear("Semana"))
    .select("year", "week", "y", "Semana")
    .orderBy("Semana")
)

# Collect the (small) weekly aggregate to the driver for modelling in pandas.
pdf = df_weekly.toPandas()

# --- 2. FEATURE ENGINEERING ---
# Ensure "Semana" is datetime64 so the .dt accessor and date arithmetic work.
pdf["Semana"] = pd.to_datetime(pdf["Semana"])

# Linear trend index: whole weeks elapsed since the first observed week.
# A positional counter (0, 1, 2, ...) would silently compress the time axis
# whenever a week is absent from pdf (e.g. a week with no notifications),
# biasing the fitted trend; deriving t from the dates keeps gaps as gaps.
pdf["t"] = (pdf["Semana"] - pdf["Semana"].min()).dt.days // 7

# Annual seasonality encoded as one sine/cosine pair over the ISO week number.
pdf["week_of_year"] = pdf["Semana"].dt.isocalendar().week.astype(float)
pdf["sin_week"] = np.sin(2 * np.pi * pdf["week_of_year"] / 52)
pdf["cos_week"] = np.cos(2 * np.pi * pdf["week_of_year"] / 52)

# Design matrix (intercept + trend + seasonality) and response (weekly count).
X = pdf[["t", "sin_week", "cos_week"]]
X = sm.add_constant(X)
y = pdf["y"]

# --- 3. TRAIN MODEL ---
# Ordinary least squares of weekly counts on the design matrix X
# (intercept, trend and seasonal terms); `model` holds the fitted results.
ols_spec = sm.OLS(endog=y, exog=X)
model = ols_spec.fit()

# --- 4. PREPARE FUTURE DATA ---
# Number of weekly steps to forecast beyond the last observed week.
FORECAST_HORIZON_WEEKS = 12

last_date = pdf["Semana"].max()
last_t = pdf["t"].max()

# Future rows continue the weekly date grid and the trend index from the last
# observation, then get the same seasonal features used for training.
future_dates = [
    last_date + pd.Timedelta(weeks=step + 1)
    for step in range(FORECAST_HORIZON_WEEKS)
]
future_pdf = pd.DataFrame(future_dates, columns=["Semana"])
future_pdf["t"] = last_t + np.arange(1, FORECAST_HORIZON_WEEKS + 1)
future_pdf["week_of_year"] = future_pdf["Semana"].dt.isocalendar().week.astype(float)
future_pdf["sin_week"] = np.sin(2 * np.pi * future_pdf["week_of_year"] / 52)
future_pdf["cos_week"] = np.cos(2 * np.pi * future_pdf["week_of_year"] / 52)

# has_constant="add" forces the intercept column to be present. With the
# default ("skip"), add_constant omits it whenever some column happens to be
# constant (e.g. a one-row horizon), and the matrix would no longer line up
# with the fitted model's parameters.
future_X = sm.add_constant(
    future_pdf[["t", "sin_week", "cos_week"]], has_constant="add"
)

# --- 5. PREDICT FUTURE VALUES ---
# Evaluate the fitted model on the future design matrix, store the forecast
# next to its week, and show only the week / forecast columns.
forecast_values = model.predict(future_X)
future_pdf["Casos Previstos"] = forecast_values

forecast_view = future_pdf[["Semana", "Casos Previstos"]]
display(forecast_view)