In [14]:
import os, zipfile, numpy as np, pandas as pd
from pathlib import Path


In [None]:
# ============================================================== #
# 1) PARÂMETROS
# ============================================================== #
zip_dir  = Path("/mnt/c/Users/msses/Desktop/ETF")          # onde estão os .zip
out_eq_dir = zip_dir / "weekly_log_returns"                # saída dos ETFs
out_rf_dir = zip_dir / "DTB1"                              # mesma pasta do CSV RF
rf_csv_path = out_rf_dir / "DTB4WK.csv"                    # arquivo já baixado

tmp_extract_dir = Path("extracted_files")                  # pasta temporária Linux
start_monday = pd.Timestamp("2018-01-01")
end_date     = pd.Timestamp("2025-03-31")
days_in_year = 252

# ============================================================== #
# 2) FUNÇÕES AUXILIARES
# ============================================================== #
def monday_of(series: pd.Series) -> pd.Series:
    """Recebe uma Series de datas e retorna a segunda-feira da semana correspondente."""
    return series - pd.to_timedelta(series.dt.weekday, unit="d")


def split_by_four_weeks(df: pd.DataFrame, date_col: str, out_dir: Path, base_name: str):
    """
    Salva blocos consecutivos de 4 semanas em CSVs.
    `base_name` é 'weekly_log_returns' ou 'rf_weekly_log_returns'.
    """
    df = df.sort_values(date_col)
    weeks = df[date_col].drop_duplicates().sort_values().to_list()
    for i in range(0, len(weeks), 4):
        block_weeks = weeks[i : i + 4]
        if len(block_weeks) < 4:
            break  # ignora bloco incompleto no fim da série
        first_monday = block_weeks[0]
        first_str = first_monday.strftime("%Y_%m_%d")
        filename = f"{base_name}_{first_str}.csv"
        df_block = df[df[date_col].isin(block_weeks)]
        (out_dir / filename).write_text(df_block.to_csv(index=False, float_format="%.10f"))

# ============================================================== #
# 3) PROCESSAR COTAÇÕES (ZIP → TXT → log_ret)
# ============================================================== #
out_eq_dir.mkdir(exist_ok=True, parents=True)
tmp_extract_dir.mkdir(exist_ok=True, parents=True)
df_eq_list = []

for file in os.listdir(zip_dir):
    if file.lower().endswith(".zip"):
        with zipfile.ZipFile(zip_dir / file) as z:
            z.extractall(tmp_extract_dir)

for txt in tmp_extract_dir.glob("*.txt"):
    df = pd.read_csv(
        txt,
        header=None,
        names=["date", "open", "high", "low", "close", "volume"],
        parse_dates=["date"],
    )
    ticker = txt.stem.split("_")[0]
    df["ticker"] = ticker
    df_eq_list.append(df)
    txt.unlink()  # remove o .txt

if df_eq_list:
    full_df = pd.concat(df_eq_list, ignore_index=True)
    # apenas datas >= 1-jan-2018
    full_df = full_df[full_df["date"] >= start_monday]

    # segunda de cada semana
    full_df["week_start"] = monday_of(full_df["date"])

    # log-retorno: close de 2ª a 6ª
    weekly_ret = (
        full_df.sort_values(["ticker", "date"])
        .groupby(["ticker", "week_start"])
        .agg(first_price=("close", "first"), last_price=("close", "last"))
        .dropna()
        .reset_index()
    )
    weekly_ret["log_return"] = np.log(weekly_ret["last_price"] / weekly_ret["first_price"])

    # só semanas completas dentro [2018-01-01, 2025-03-28]
    weekly_ret = weekly_ret[
        (weekly_ret["week_start"] >= start_monday)
        & (weekly_ret["week_start"] <= pd.Timestamp("2025-03-28"))
    ]

    # salvar em blocos de 4 semanas
    split_by_four_weeks(
        weekly_ret,
        "week_start",
        out_eq_dir,
        base_name="weekly_log_returns",
    )

# ============================================================== #
# 4) PROCESSAR RISK-FREE (DTB4WK % a.a. → log_ret semanal)
# ============================================================== #
out_rf_dir.mkdir(exist_ok=True, parents=True)
rf = (
    pd.read_csv(rf_csv_path, parse_dates=["observation_date"])
    .rename(columns={"observation_date": "date", "DTB4WK": "annual_rate_pct"})
    .dropna(subset=["annual_rate_pct"])
)
# diário → log-retorno diário
rf["rf_log_daily"] = np.log1p(rf["annual_rate_pct"] / 100) / days_in_year
rf = rf[(rf["date"] >= start_monday) & (rf["date"] <= end_date)]

# segunda-feira da semana
rf["week_start"] = monday_of(rf["date"])

# soma dos logs disponíveis na semana
rf_weekly = (
    rf.groupby("week_start")["rf_log_daily"]
    .sum()
    .reset_index()
    .rename(columns={"rf_log_daily": "log_return"})
)

# manter apenas semanas que coincidam com as das cotações
valid_weeks = weekly_ret["week_start"].unique()
rf_weekly = rf_weekly[rf_weekly["week_start"].isin(valid_weeks)]

split_by_four_weeks(
    rf_weekly,
    "week_start",
    out_rf_dir,
    base_name="rf_weekly_log_returns",
)

print("✅ Arquivos semanais gerados em blocos de 4 semanas para cotações e risk-free.")

AttributeError: 'Series' object has no attribute 'weekday'