
# 🌐 Arboviroses — LSTM (GitHub-only, Keras `.keras`, scalers slug, push-to-GitHub)
Notebook completo para rodar no **Google Colab**:
- Lê **meteorologia** e **casos** direto do **GitHub** (usa `GITHUB_TOKEN` se for privado).
- Agrega semanalmente (W-SUN), faz *merge*, cria *lags* e sequências por município.
- Treina LSTM, avalia e **salva**: `model_lstm.keras` (nativo) + `model_lstm.h5` (compat), `metadata.json`, `scalers/` com nomes **slugificados**.
- Gera **ZIP** com todos os artefatos e **(opcional)** envia para o GitHub via API.


## 1) Dependências (stack estável) — execute e aguarde reiniciar

In [None]:

# ✅ Stable stack for Colab: TF 2.19 + Keras 3, no conflicts.
# Every package is pinned to an exact version and force-reinstalled.
!pip -q install --upgrade pip
!pip -q install -U --force-reinstall --no-warn-conflicts \
  "tensorflow==2.19.0" "keras==3.10.0" \
  "numpy==1.26.4" "pandas==2.2.2" \
  "scikit-learn==1.6.1" "joblib==1.3.2" \
  "matplotlib==3.8.4" "protobuf==5.29.5" "requests==2.32.3"

# Send signal 9 to the current interpreter process right after installing.
# The section heading tells the user to wait for the runtime to restart.
import os; os.kill(os.getpid(), 9)

#!pip -q install --upgrade pip
#!pip -q install -U --force-reinstall --no-warn-conflicts   "tensorflow==2.19.0" "keras==3.10.0"   "numpy==1.26.4" "pandas==2.2.2"   "scikit-learn==1.6.1" "joblib==1.3.2"   "matplotlib==3.8.4" "protobuf==5.29.5" "requests==2.32.3"
#import os; os.kill(os.getpid(), 9)


In [None]:
import numpy, pandas, sklearn, tensorflow as tf, keras

# Print "<label> <version>" for each package, in the same order as before.
for _label, _module in (
    ("numpy", numpy),
    ("pandas", pandas),
    ("sklearn", sklearn),
    ("tensorflow", tf),
    ("keras", keras),
):
    print(_label, _module.__version__)


## 2) Imports e utilitários

In [None]:

import os, io, base64, json, re, unicodedata, joblib, numpy as np, pandas as pd, matplotlib.pyplot as plt, requests
from pathlib import Path
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from tensorflow import keras
from tensorflow.keras import layers

# Seed NumPy and Keras with the same fixed value.
SEED = 42
np.random.seed(SEED)
keras.utils.set_random_seed(SEED)

# Folder that receives every file the notebook saves; created if missing.
ARTIFACTS_DIR = "/content/arboviroses_artifacts"
Path(ARTIFACTS_DIR).mkdir(parents=True, exist_ok=True)

def slugify(s: str) -> str:
    """Turn *s* into an ASCII, file-name-friendly token.

    The value is converted with ``str()``, NFKD-normalised, and every
    character that cannot be encoded as ASCII is dropped. Leading/trailing
    whitespace is removed, each remaining whitespace run becomes a single
    underscore, and anything outside ``[A-Za-z0-9_]`` is deleted.
    """
    ascii_text = (
        unicodedata.normalize("NFKD", str(s))
        .encode("ascii", "ignore")
        .decode("ascii")
    )
    underscored = re.sub(r"\s+", "_", ascii_text.strip())
    return re.sub(r"[^A-Za-z0-9_]", "", underscored)


print("OK: imports, seed e diretórios prontos.")


## 3) Configuração do GitHub (token + caminhos)

In [None]:

# Best-effort lookup of the "GITHUB_TOKEN" secret through google.colab.userdata.
# Any failure (including the import itself) leaves the token as None, and the
# rest of the notebook then takes its token-less code paths.
try:
    from google.colab import userdata
    GITHUB_TOKEN = userdata.get("GITHUB_TOKEN")
except Exception:
    GITHUB_TOKEN = None

# Repository coordinates used by every GitHub read and write in this notebook.
OWNER   = "JaniceSilva"
REPO    = "arboviroses-platform"
BRANCH  = "master"

# Repository-relative paths of the input CSV files.
MET_PATH_DIAM = "backend/data/diamantina.csv"
MET_PATH_TEOF = "backend/data/teofilo_otoni.csv"
CASES_PATH    = "backend/data/sinan_arboviroses_data.csv"

print("Config GitHub OK. OWNER/REPO/BRANCH:", OWNER, REPO, BRANCH)


### 3.1) Funções para ler arquivos do GitHub (API com token ou RAW)

In [None]:

def github_api_get_file(owner, repo, path, ref="master", token=None):
    """Download one file through the GitHub "contents" REST endpoint.

    The JSON form of the endpoint is tried first; it carries small files
    inline as base64. When the file is not delivered inline (empty
    ``content``, a non-base64 ``encoding``, or a 403 on the JSON request),
    the same URL is requested again with the raw media type and the response
    body is returned as-is.

    Args:
        owner: Repository owner (user or organisation).
        repo: Repository name.
        path: Repository-relative path of the file.
        ref: Branch, tag or commit to read from.
        token: Optional access token, sent as ``Authorization: token ...``.

    Returns:
        The file content as ``bytes``.

    Raises:
        requests.HTTPError: If GitHub answers with an error status.
        RuntimeError: If the JSON answer does not describe a single file
            (for example when ``path`` is a directory).
    """
    url = f"https://api.github.com/repos/{owner}/{repo}/contents/{path}"
    headers = {"Accept": "application/vnd.github+json"}
    if token:
        headers["Authorization"] = f"token {token}"
    # `params` lets requests escape the ref (e.g. branch names containing "/").
    query = {"ref": ref}

    r = requests.get(url, headers=headers, params=query, timeout=60)
    if r.ok:
        data = r.json()
        if not (isinstance(data, dict) and "content" in data):
            raise RuntimeError(f"Resposta inesperada da API do GitHub: {str(data)[:200]}")
        inline = data.get("encoding") == "base64" and (data["content"] or not data.get("size"))
        if inline:
            return base64.b64decode(data["content"])
    elif r.status_code != 403:
        # 403 may mean "too large for the JSON form"; anything else is final.
        r.raise_for_status()

    # Content was not delivered inline: ask for the raw bytes instead.
    raw_headers = dict(headers, Accept="application/vnd.github.raw+json")
    r_raw = requests.get(url, headers=raw_headers, params=query, timeout=60)
    r_raw.raise_for_status()
    return r_raw.content

def github_raw_url(owner, repo, path, ref="master"):
    """Return the raw.githubusercontent.com URL for *path* at *ref*."""
    return "https://raw.githubusercontent.com/{}/{}/{}/{}".format(owner, repo, ref, path)

def read_csv_github(owner, repo, path, ref="master", token=None, **pd_kwargs):
    """Read a CSV stored on GitHub into a DataFrame.

    With a token, the file is fetched through ``github_api_get_file`` and
    parsed from memory. If that fetch or parse raises, a warning is printed
    and the raw URL is tried instead. Without a token the raw URL is used
    directly. Extra keyword arguments are forwarded to ``pd.read_csv``.
    """
    if token:
        try:
            payload = github_api_get_file(owner, repo, path, ref, token)
            return pd.read_csv(io.BytesIO(payload), **pd_kwargs)
        except Exception as e:
            print("Aviso: falhou via API; tentando RAW…", e)
    return pd.read_csv(github_raw_url(owner, repo, path, ref), **pd_kwargs)


## 4) Carregar **meteorologia** do GitHub

In [None]:

# Maps the column headers of the meteorology CSVs to the short names used in
# the rest of the notebook. Each "°C" header is listed twice: once as-is and
# once as "Â°C", which is how a UTF-8 degree sign reads when the file is
# decoded as Latin-1 (the loader below reads with encoding="latin1").
COLMAP_MET = {
    "Data Medicao": "date",
    "PRECIPITACAO TOTAL, DIARIO (AUT)(mm)": "chuva",
    "PRESSAO ATMOSFERICA MEDIA DIARIA (AUT)(mB)": "pressao_media",
    "TEMPERATURA DO PONTO DE ORVALHO MEDIA DIARIA (AUT)(°C)": "ponto_orvalho_media",
    "TEMPERATURA DO PONTO DE ORVALHO MEDIA DIARIA (AUT)(Â°C)": "ponto_orvalho_media",
    "TEMPERATURA MAXIMA, DIARIA (AUT)(°C)": "temp_max",
    "TEMPERATURA MAXIMA, DIARIA (AUT)(Â°C)": "temp_max",
    "TEMPERATURA MEDIA, DIARIA (AUT)(°C)": "temp_media",
    "TEMPERATURA MEDIA, DIARIA (AUT)(Â°C)": "temp_media",
    "TEMPERATURA MINIMA, DIARIA (AUT)(°C)": "temp_min",
    "TEMPERATURA MINIMA, DIARIA (AUT)(Â°C)": "temp_min",
    "UMIDADE RELATIVA DO AR, MEDIA DIARIA (AUT)(%)": "umi_media",
    "UMIDADE RELATIVA DO AR, MINIMA DIARIA (AUT)(%)": "umi_min",
    "VENTO, RAJADA MAXIMA DIARIA (AUT)(m/s)": "vento_rajada_max",
    "VENTO, VELOCIDADE MEDIA DIARIA (AUT)(m/s)": "vento_vel_media",
}
# Cell values treated as missing; passed as `na_values` when reading the CSVs.
NA_VALS = ["", " ", "-", "NA", "N/A", None]

def load_meteo_from_github(owner, repo, path, ref, token, municipio_nome):
    """Load one meteorology CSV from GitHub and return a cleaned daily frame.

    The file is read as ``;``-separated, comma-decimal, Latin-1 text. Columns
    whose name starts with "Unnamed" are dropped, headers are stripped and
    renamed through ``COLMAP_MET``, ``date`` is parsed as ``%Y-%m-%d``
    (unparseable values become NaT), and the mapped measurement columns are
    coerced to numeric. A constant ``municipio`` column is added.

    Returns:
        DataFrame sorted by ``date``, without NaT dates, with a fresh index.

    Raises:
        ValueError: If no column maps to ``date``.
    """
    raw = read_csv_github(
        owner, repo, path, ref, token,
        sep=";", decimal=",", encoding="latin1", engine="python", na_values=NA_VALS,
    )
    named = ~raw.columns.str.match(r"^Unnamed")
    df = raw.loc[:, named]
    df.columns = [name.strip() for name in df.columns]

    # Repair the mis-decoded degree sign before looking headers up in the map.
    mojibake = {name: name.replace("(Â°C)", "(°C)") for name in df.columns if "(Â°C)" in name}
    if mojibake:
        df = df.rename(columns=mojibake)
    df = df.rename(columns={name: COLMAP_MET[name] for name in df.columns if name in COLMAP_MET})

    if "date" not in df.columns:
        raise ValueError("Coluna de data não encontrada (esperado 'Data Medicao').")
    df["date"] = pd.to_datetime(df["date"], errors="coerce", format="%Y-%m-%d")

    # Unique mapped names, minus the date column, that exist in this file.
    measurement_cols = [
        short for short in dict.fromkeys(COLMAP_MET.values())
        if short != "date" and short in df.columns
    ]
    for short in measurement_cols:
        df[short] = pd.to_numeric(df[short], errors="coerce")

    df["municipio"] = municipio_nome
    ordered = df.sort_values("date")
    return ordered.dropna(subset=["date"]).reset_index(drop=True)

# An empty token is normalised to None before it is passed on.
_met_token = GITHUB_TOKEN or None
met_diam = load_meteo_from_github(OWNER, REPO, MET_PATH_DIAM, BRANCH, _met_token, "Diamantina")
met_teof = load_meteo_from_github(OWNER, REPO, MET_PATH_TEOF, BRANCH, _met_token, "Teófilo Otoni")

# Stack both municipalities into one daily table.
met_all_daily = pd.concat([met_diam, met_teof], ignore_index=True)
print("Meteorologia (linhas):", met_all_daily.shape[0])
display(met_all_daily.head())


## 5) Carregar **casos** do GitHub

In [None]:

import unicodedata

# Read the case file, letting pandas sniff the separator.
df_cases = read_csv_github(
    OWNER, REPO, CASES_PATH, BRANCH, (GITHUB_TOKEN or None),
    encoding="utf-8", sep=None, engine="python",
)
df_cases.columns = [c.strip().lower() for c in df_cases.columns]

# Alternative header spellings -> canonical name; applied only when the
# canonical column is not already present.
_header_aliases = {
    "municipality": "municipio",
    "state": "uf",
    "outo date": "date",
    "outro date": "date",
    "data": "date",
}
for k, v in _header_aliases.items():
    if k in df_cases.columns and v not in df_cases.columns:
        df_cases = df_cases.rename(columns={k: v})

if "date" not in df_cases.columns:
    raise ValueError(f"Coluna 'date' não encontrada. Colunas: {list(df_cases.columns)}")
df_cases["date"] = pd.to_datetime(df_cases["date"], errors="coerce")

# Per-disease counts: create missing columns as 0, coerce the rest to int.
_disease_cols = ["dengue_cases", "zika_cases", "chikungunya_cases", "febre_amarela_cases"]
for col in _disease_cols:
    if col not in df_cases.columns:
        df_cases[col] = 0
    df_cases[col] = pd.to_numeric(df_cases[col], errors="coerce").fillna(0).astype(int)

if "total_cases" not in df_cases.columns:
    df_cases["total_cases"] = df_cases[_disease_cols].sum(axis=1)
def strip_accents(s):
    """Return ``str(s)`` without combining marks; missing values pass through.

    The text is NFKD-decomposed and every combining character is removed.
    Values for which ``pd.isna`` is true are returned unchanged.
    """
    if pd.isna(s):
        return s
    decomposed = unicodedata.normalize("NFKD", str(s))
    return "".join(filter(lambda ch: not unicodedata.combining(ch), decomposed))
# Canonical municipality names keyed by their accent-free, lower-case form.
# The values are exactly the names given to the meteorology frames. The weekly
# merge joins on "municipio", so any other spelling in the case file
# (e.g. "TEOFILO OTONI") would silently fail to match and become zero cases.
_MUNI_CANONICAL = {"diamantina": "Diamantina", "teofilo otoni": "Teófilo Otoni"}

df_cases["muni_norm"] = df_cases["municipio"].map(strip_accents).str.strip().str.lower()
df_cases = df_cases[df_cases["muni_norm"].isin(list(_MUNI_CANONICAL))].copy()
# Replace whatever spelling the file used with the canonical one.
df_cases["municipio"] = df_cases["muni_norm"].map(_MUNI_CANONICAL)
df_cases = df_cases.drop(columns=["muni_norm"])
print("Casos (linhas):", df_cases.shape[0]); display(df_cases.head())


## 6) Agregação semanal (W-SUN)

In [None]:

# Weekly aggregation per meteorology column: rainfall ("chuva") is summed,
# every other measurement is averaged; "date" and "municipio" are excluded.
agg_map = {
    col: ("sum" if col == "chuva" else "mean")
    for col in met_all_daily.columns
    if col not in ("date", "municipio")
}

def to_weekly(df_daily):
    """Resample a daily frame to W-SUN weeks, one municipality at a time.

    Each municipality's rows are indexed by ``date``, resampled with the
    module-level ``agg_map``, tagged with the municipality name, and returned
    with the week label in a ``date_week`` column. The per-municipality
    results are concatenated in group order.
    """
    def _weekly_for(muni, daily):
        weekly = daily.set_index("date").sort_index().resample("W-SUN").agg(agg_map)
        weekly["municipio"] = muni
        return weekly.reset_index().rename(columns={"date": "date_week"})

    frames = [
        _weekly_for(muni, g)
        for muni, g in df_daily.groupby("municipio", as_index=False)
    ]
    return pd.concat(frames, ignore_index=True)

met_weekly = to_weekly(met_all_daily)

# Weekly case totals per municipality: every count column is summed.
_weekly_case_cols = [
    "dengue_cases", "zika_cases", "chikungunya_cases", "febre_amarela_cases", "total_cases",
]
_cases_by_muni = df_cases.set_index("date").groupby("municipio")
casos_weekly = (
    _cases_by_muni.resample("W-SUN")
    .agg({name: "sum" for name in _weekly_case_cols})
    .reset_index()
    .rename(columns={"date": "date_week"})
)

print("met_weekly:", met_weekly.shape, "| casos_weekly:", casos_weekly.shape)
display(met_weekly.head(), casos_weekly.head())


## 7) Merge, features e lags

In [None]:

# Left-join weekly cases onto weekly meteorology: every meteorology week is
# kept, and weeks without a matching case row get NaN counts (zero-filled below).
df_semana = pd.merge(
    met_weekly,
    casos_weekly[["date_week","municipio","total_cases","dengue_cases","zika_cases","chikungunya_cases"]],
    on=["date_week","municipio"],
    how="left"
)
# NOTE(review): the column selection in the merge above already requires these
# columns, so the two fallbacks below look unreachable — confirm.
for col in ["dengue_cases","zika_cases","chikungunya_cases"]:
    if col not in df_semana.columns: df_semana[col] = 0
if "total_cases" not in df_semana.columns:
    df_semana["total_cases"] = df_semana[["dengue_cases","zika_cases","chikungunya_cases"]].sum(axis=1)
df_semana["date_week"] = pd.to_datetime(df_semana["date_week"], errors="coerce")
# Missing or non-numeric counts become 0, then everything is cast to int.
for col in ["total_cases","dengue_cases","zika_cases","chikungunya_cases"]:
    df_semana[col] = pd.to_numeric(df_semana[col], errors="coerce").fillna(0).astype(int)
# Sort by municipality and week so the per-group shift below follows time order.
df_semana = df_semana.sort_values(["municipio","date_week"]).reset_index(drop=True)
# Candidate meteorology features; only those present in the data are used.
base_feats = ["temp_min","temp_media","temp_max","umi_media","umi_min","chuva","pressao_media","vento_rajada_max","vento_vel_media","ponto_orvalho_media"]
features = [c for c in base_feats if c in df_semana.columns]
alvo = "total_cases"
# Add the target shifted by 1 and 2 weeks (within each municipality) as features.
for lag in [1, 2]:
    df_semana[f"{alvo}_lag{lag}"] = df_semana.groupby("municipio")[alvo].shift(lag)
    features.append(f"{alvo}_lag{lag}")
# Drop rows with any missing feature or target (this includes the first rows of
# each municipality, whose lag values are NaN).
df = df_semana.dropna(subset=features + [alvo]).copy()
df = df.rename(columns={"date_week":"date"}).reset_index(drop=True)
print("Municipios:", df["municipio"].unique().tolist())
print("Período:", df["date"].min(), "→", df["date"].max())
print("N por município:\n", df.groupby("municipio").size())
print("Features:", features); print("Alvo:", alvo)


## 8) Geração de sequências

In [None]:

WINDOW = 8
HORIZON = 1


def make_sequences(group_df, feature_cols, target_col, window=8, horizon=1):
    """Cut a frame into sliding windows with their targets and target dates.

    For every start row ``i`` that leaves room for a full window plus the
    horizon, the sample holds rows ``i .. i+window-1`` of *feature_cols*;
    its target and date come from row ``i + window + horizon - 1``.

    Returns:
        Tuple ``(X, y, dates)`` of NumPy arrays; all three are empty when
        the frame is too short for a single sample.
    """
    feats = group_df[feature_cols].values
    target = group_df[target_col].values
    stamps = group_df["date"].values

    starts = range(len(group_df) - window - horizon + 1)
    target_rows = [start + window + horizon - 1 for start in starts]

    X_list = [feats[start:start + window] for start in starts]
    y_list = [target[row] for row in target_rows]
    dates = [stamps[row] for row in target_rows]
    return np.array(X_list), np.array(y_list), np.array(dates)
# Per-municipality accumulators, plus the scaler fitted for each municipality.
X_all, y_all, dt_all, muni_all = [], [], [], []; scalers_by_muni = {}
for muni, g in df.groupby("municipio"):
    g = g.sort_values("date").reset_index(drop=True)
    # Skip municipalities that cannot fill even one window + horizon.
    if g.shape[0] < WINDOW + HORIZON:
        print(f"[PULANDO] {muni}: poucos dados.");
        continue
    # The scaler is fitted on all rows of this municipality (the train/val/test
    # split happens later); only the feature columns are scaled, not the target.
    scaler = StandardScaler().fit(g[features].values); scalers_by_muni[muni] = scaler
    g_scaled = g.copy(); g_scaled[features] = scaler.transform(g[features].values)
    X, y, dts = make_sequences(g_scaled, features, alvo, window=WINDOW, horizon=HORIZON)
    if X.size == 0:
        print(f"[PULANDO] {muni}: janela/horizonte grandes.");
        continue
    # Keep the municipality label aligned with every sample.
    X_all.append(X); y_all.append(y); dt_all.append(dts); muni_all.append(np.array([muni]*len(y)))
if not X_all: raise ValueError("Nenhuma sequência criada.")
# Stack all municipalities into single arrays, in group order.
X = np.concatenate(X_all, axis=0); y = np.concatenate(y_all, axis=0)
dates_seq = np.concatenate(dt_all, axis=0); muni_seq = np.concatenate(muni_all, axis=0)
print("Shapes → X:", X.shape, "| y:", y.shape)


## 9) Split treino/val/teste

In [None]:

# Chronological split with the same proportions as before
# (80 % train, then the remaining 20 % halved into validation and test).
#
# The samples are overlapping sliding windows: neighbouring samples share all
# but one time step. A shuffled split therefore puts near-copies of the test
# windows into the training set and inflates the test metrics. Ordering the
# samples by target date and splitting without shuffling keeps validation and
# test strictly later in time than training.
_chrono_order = np.argsort(dates_seq, kind="stable")
X_train, X_tmp, y_train, y_tmp, muni_train, muni_tmp = train_test_split(
    X[_chrono_order], y[_chrono_order], muni_seq[_chrono_order],
    test_size=0.2, shuffle=False
)
X_val, X_test, y_val, y_test, muni_val, muni_test = train_test_split(
    X_tmp, y_tmp, muni_tmp, test_size=0.5, shuffle=False
)
X_train.shape, X_val.shape, X_test.shape


## 10) Modelo LSTM

In [None]:

# Input dimensions are taken from the training tensor: axis 1 is the number of
# time steps per sample, axis 2 the number of features per step.
n_timesteps = X_train.shape[1]; n_features = X_train.shape[2]
model = keras.Sequential([
    layers.Input(shape=(n_timesteps, n_features)),
    # First LSTM returns the whole sequence so the second LSTM can consume it.
    layers.LSTM(64, return_sequences=True),
    layers.Dropout(0.2),
    layers.LSTM(32),
    layers.Dense(16, activation="relu"),
    # Single linear output unit: one predicted value per sample.
    layers.Dense(1)
])
# Trained on mean squared error; mean absolute error is tracked as a metric.
model.compile(optimizer=keras.optimizers.Adam(1e-3), loss="mse", metrics=["mae"])
model.summary()


## 11) Treino

In [None]:

# Stop when val_loss has not improved for 10 epochs and keep the best weights.
_early_stop = keras.callbacks.EarlyStopping(
    monitor="val_loss", patience=10, restore_best_weights=True
)
callbacks = [_early_stop]
hist = model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=200,
    batch_size=64,
    callbacks=callbacks,
    verbose=1,
)

# Training and validation loss per epoch.
plt.figure()
for _key, _legend in (("loss", "train"), ("val_loss", "val")):
    plt.plot(hist.history[_key], label=_legend)
plt.legend()
plt.title("Curva de treino (MSE)")
plt.show()


## 12) Avaliação

In [None]:

# evaluate() returns the loss followed by the compiled metric (MAE).
_test_scores = model.evaluate(X_test, y_test, verbose=0)
test_loss, test_mae = _test_scores
print(f"Teste — MSE: {test_loss:.3f} | MAE: {test_mae:.3f}")


## 13) Salvar artefatos (.keras + .h5), metadata, scalers (slug), ZIP

In [None]:

model_path_keras = os.path.join(ARTIFACTS_DIR, "model_lstm.keras")
model_path_h5    = os.path.join(ARTIFACTS_DIR, "model_lstm.h5")
savedmodel_dir   = os.path.join(ARTIFACTS_DIR, "model_lstm_savedmodel")
ok_keras = False
try:
    model.save(model_path_keras); ok_keras = True; print("✅ .keras salvo:", model_path_keras)
except Exception as e:
    print("⚠️ .keras falhou:", e); print("➡️ usando SavedModel…"); model.save(savedmodel_dir); print("✅ SavedModel salvo:", savedmodel_dir)
try:
    model.save(model_path_h5); print("✅ .h5 salvo:", model_path_h5)
except Exception as e:
    print("⚠️ .h5 falhou:", e)
metadata = {
    "window": int(WINDOW), "horizon": int(HORIZON), "features": list(features), "target": "total_cases",
    "framework": "keras", "tensorflow_version": keras.__version__,
    "artifacts": {"keras": model_path_keras if ok_keras else None, "h5": model_path_h5, "savedmodel_dir": savedmodel_dir if not ok_keras else None}
}
with open(os.path.join(ARTIFACTS_DIR, "metadata.json"), "w") as f: json.dump(metadata, f, indent=2, default=str)
scalers_dir = os.path.join(ARTIFACTS_DIR, "scalers"); os.makedirs(scalers_dir, exist_ok=True)
for muni, g in df.groupby("municipio"):
    sc = StandardScaler().fit(g[features].values); out_name = f"scaler_{slugify(muni)}.joblib"; joblib.dump(sc, os.path.join(scalers_dir, out_name))
from google.colab import files
!cd {ARTIFACTS_DIR} && zip -r artifacts_bundle.zip * -q
print("Conteúdo de artifacts:", os.listdir(ARTIFACTS_DIR))
files.download(os.path.join(ARTIFACTS_DIR, "artifacts_bundle.zip"))


## 14) Função de inferência por município (scaler slugificado)

In [None]:

def predict_next_for_muni(model, df_all, muni, last_window_df, features, artifacts_dir=ARTIFACTS_DIR, window=8):
    """Predict one value for *muni* from its most recent window of rows.

    The municipality's scaler is loaded from
    ``<artifacts_dir>/scalers/scaler_<slug>.joblib``, applied to the
    *features* columns of *last_window_df* (sorted by ``date``), and the
    scaled rows are fed to ``model.predict`` as a single-sample batch.
    ``df_all`` and ``window`` are accepted but not used by the body.

    Returns:
        The first predicted value as a ``float``.

    Raises:
        FileNotFoundError: If the scaler file does not exist.
    """
    import joblib, numpy as np
    scaler_file = os.path.join(artifacts_dir, "scalers", f"scaler_{slugify(muni)}.joblib")
    if not os.path.exists(scaler_file):
        raise FileNotFoundError(f"Scaler não encontrado: {scaler_file}")
    muni_scaler = joblib.load(scaler_file)

    ordered = last_window_df.sort_values("date").copy()
    scaled = muni_scaler.transform(ordered[features].values)
    batch = scaled[np.newaxis, ...]  # add the leading batch axis
    prediction = model.predict(batch, verbose=0)
    return float(prediction.ravel()[0])

# One forecast per municipality that has at least WINDOW rows.
for muni in df["municipio"].unique().tolist():
    g = df[df["municipio"] == muni].sort_values("date")
    if g.shape[0] < WINDOW:
        continue
    _forecast = predict_next_for_muni(model, df, muni, g.tail(WINDOW), features, window=WINDOW)
    print(muni, "→ previsão próxima semana:", round(_forecast, 2))


## 15) (Opcional) Enviar artefatos para o GitHub (backend/artifacts/)

In [None]:

# Upload the saved artifacts to the repository; skipped entirely without a token.
if GITHUB_TOKEN:
    import base64, requests
    # Repository folder that receives the artifacts.
    DEST_DIR = "backend/artifacts"
    def github_upsert(owner, repo, path, content_bytes, message, branch="master", token=None):
        """PUT *content_bytes* to *path* on *branch* via the GitHub contents API.

        A GET is issued first; when it answers 200, the returned ``sha`` is
        included in the PUT payload (presumably what the API needs to update
        an existing file — confirm). Raises ``requests.HTTPError`` if the PUT
        fails and returns the decoded JSON response otherwise.
        """
        url = f"https://api.github.com/repos/{owner}/{repo}/contents/{path}"
        headers = {"Accept":"application/vnd.github+json"}
        if token: headers["Authorization"] = f"token {token}"
        # fetch the SHA if the file already exists
        sha = None
        r0 = requests.get(url+f"?ref={branch}", headers=headers, timeout=60)
        if r0.status_code == 200:
            try: sha = r0.json().get("sha")
            except Exception: sha = None
        data = {"message": message, "content": base64.b64encode(content_bytes).decode("utf-8"), "branch": branch}
        if sha: data["sha"] = sha
        r = requests.put(url, headers=headers, data=json.dumps(data), timeout=60); r.raise_for_status(); return r.json()
    # main artifacts (only those that exist on disk are sent)
    for fname in ["model_lstm.keras", "model_lstm.h5", "metadata.json"]:
        fpath = os.path.join(ARTIFACTS_DIR, fname)
        if os.path.exists(fpath):
            with open(fpath, "rb") as f:
                github_upsert(OWNER, REPO, f"{DEST_DIR}/{fname}", f.read(), message=f"Add {fname} (LSTM artifacts)", branch=BRANCH, token=GITHUB_TOKEN)
            print("Enviado:", fname)
    # scalers: every .joblib file in the scalers folder, in sorted order
    scdir = os.path.join(ARTIFACTS_DIR, "scalers")
    for fname in sorted(os.listdir(scdir)):
        if fname.endswith(".joblib"):
            fpath = os.path.join(scdir, fname)
            with open(fpath, "rb") as f:
                github_upsert(OWNER, REPO, f"{DEST_DIR}/scalers/{fname}", f.read(), message=f"Add scaler {fname}", branch=BRANCH, token=GITHUB_TOKEN)
            print("Enviado scaler:", fname)
else:
    print("GITHUB_TOKEN não definido; pulando push para GitHub.")
