In [None]:
# importar a biblioteca usada para trabalhar com vetores e matrizes
import numpy as np

# importar a biblioteca usada para trabalhar com dataframes (dados em formato de tabela) e análise de dados
import pandas as pd

# importar as bibliotecas usadas para geração de gráficos
import seaborn as sns
import matplotlib.pyplot as plt

# importar as bibliotecas necessárias para o modelo de previsão
from statsmodels.tsa.statespace.sarimax import SARIMAX
from ipywidgets import Dropdown, Button, Output, HBox, VBox, Checkbox

print('Bibliotecas carregadas com sucesso!')

Bibliotecas carregadas com sucesso!


In [None]:
# 1) Kaggle credentials
# =========================================================
# SECURITY: the API token must never be hardcoded in the notebook. Credentials
# are resolved in this order:
#   1. the KAGGLE_USERNAME / KAGGLE_KEY environment variables;
#   2. an already existing ~/.kaggle/kaggle.json;
#   3. an interactive prompt (the key is read with getpass, so it is not echoed
#      and does not end up in the saved notebook).
# Any token that was ever committed in this cell must be treated as leaked and
# regenerated in the Kaggle account settings.

import os, json, zipfile, pathlib, glob, sys
from getpass import getpass

pd.set_option('display.max_columns', 120)
pd.set_option('display.width', 120)

kaggle_dir = pathlib.Path.home().joinpath(".kaggle")
kaggle_dir.mkdir(exist_ok=True)
cred_path = kaggle_dir.joinpath("kaggle.json")

kaggle_json = {
    "username": os.environ.get("KAGGLE_USERNAME", "").strip(),
    "key": os.environ.get("KAGGLE_KEY", "").strip(),
}

if not all(kaggle_json.values()) and cred_path.exists():
    # Reuse the stored token instead of overwriting it.
    with open(cred_path) as f:
        kaggle_json = json.load(f)
    print("kaggle.json já existe em:", cred_path)
else:
    if not all(kaggle_json.values()):
        kaggle_json = {
            "username": input("Kaggle username: ").strip(),
            "key": getpass("Kaggle API key: ").strip(),
        }
    assert all(kaggle_json.values()), "Credenciais do Kaggle ausentes (defina KAGGLE_USERNAME e KAGGLE_KEY)."

    with open(cred_path, "w") as f:
        json.dump(kaggle_json, f)
    # The Kaggle client refuses/warns about credential files readable by others.
    os.chmod(cred_path, 0o600)

    print("kaggle.json criado em:", cred_path)

# =========================================================
# 2) Instalar Kaggle API e baixar o dataset do Uber
# =========================================================
try:
    import kaggle  # noqa
except:
    !pip install -q kaggle
    import kaggle  # noqa

DATASET = "yashdevladdha/uber-ride-analytics-dashboard"
DOWNLOAD_DIR = "/content" if "google.colab" in sys.modules else str(pathlib.Path.cwd())
ZIP_NAME = DATASET.split("/")[-1] + ".zip"
ZIP_PATH = os.path.join(DOWNLOAD_DIR, ZIP_NAME)
EXTRACT_DIR = os.path.join(DOWNLOAD_DIR, "data_uber")

# Baixar zip (se já existir, reaproveita)
if not os.path.exists(ZIP_PATH):
    !kaggle datasets download -d $DATASET -p $DOWNLOAD_DIR -q
else:
    print("Zip já existe em:", ZIP_PATH)

# Extrair
os.makedirs(EXTRACT_DIR, exist_ok=True)
with zipfile.ZipFile(ZIP_PATH, "r") as z:
    z.extractall(EXTRACT_DIR)

print("Extraído em:", EXTRACT_DIR)
csv_files = glob.glob(os.path.join(EXTRACT_DIR, "**", "*.csv"), recursive=True)
print("CSVs encontrados:", [os.path.basename(f) for f in csv_files])
assert csv_files, "Nenhum CSV encontrado no dataset; verifique a extração."

kaggle.json criado em: /root/.kaggle/kaggle.json
Dataset URL: https://www.kaggle.com/datasets/yashdevladdha/uber-ride-analytics-dashboard
License(s): CC-BY-SA-4.0
Extraído em: /content/data_uber
CSVs encontrados: ['ncr_ride_bookings.csv']


In [None]:
# 3) Carregar e mostrar uma amostra do conjunto de dados

# =========================================================
priority_keywords = ["ride", "uber", "trip", "data", "request", "pickup"]
def score_file(name: str) -> int:
    """Count how many priority keywords occur in ``name`` (case-insensitive).

    Each keyword contributes at most 1, regardless of how often it appears.
    """
    lowered = name.lower()
    hits = [keyword for keyword in priority_keywords if keyword in lowered]
    return len(hits)

# Rank the extracted CSVs by keyword score, best first. The sort is stable, so
# files with the same score keep the order in which they were found.
csv_scored = list(csv_files)
csv_scored.sort(key=lambda csv_path: -score_file(os.path.basename(csv_path)))
main_csv_path = csv_scored[0]
print("CSV selecionado:", os.path.basename(main_csv_path))

# Load the selected file and show its size plus the first 50 rows.
df_raw = pd.read_csv(main_csv_path)
print("Dados brutos:", df_raw.shape)
display(df_raw.head(50))

CSV selecionado: ncr_ride_bookings.csv
Dados brutos: (150000, 21)


Unnamed: 0,Date,Time,Booking ID,Booking Status,Customer ID,Vehicle Type,Pickup Location,Drop Location,Avg VTAT,Avg CTAT,Cancelled Rides by Customer,Reason for cancelling by Customer,Cancelled Rides by Driver,Driver Cancellation Reason,Incomplete Rides,Incomplete Rides Reason,Booking Value,Ride Distance,Driver Ratings,Customer Rating,Payment Method
0,2024-03-23,12:29:38,"""CNR5884300""",No Driver Found,"""CID1982111""",eBike,Palam Vihar,Jhilmil,,,,,,,,,,,,,
1,2024-11-29,18:01:39,"""CNR1326809""",Incomplete,"""CID4604802""",Go Sedan,Shastri Nagar,Gurgaon Sector 56,4.9,14.0,,,,,1.0,Vehicle Breakdown,237.0,5.73,,,UPI
2,2024-08-23,08:56:10,"""CNR8494506""",Completed,"""CID9202816""",Auto,Khandsa,Malviya Nagar,13.4,25.8,,,,,,,627.0,13.58,4.9,4.9,Debit Card
3,2024-10-21,17:17:25,"""CNR8906825""",Completed,"""CID2610914""",Premier Sedan,Central Secretariat,Inderlok,13.1,28.5,,,,,,,416.0,34.02,4.6,5.0,UPI
4,2024-09-16,22:08:00,"""CNR1950162""",Completed,"""CID9933542""",Bike,Ghitorni Village,Khan Market,5.3,19.6,,,,,,,737.0,48.21,4.1,4.3,UPI
5,2024-02-06,09:44:56,"""CNR4096693""",Completed,"""CID4670564""",Auto,AIIMS,Narsinghpur,5.1,18.1,,,,,,,316.0,4.85,4.1,4.6,UPI
6,2024-06-17,15:45:58,"""CNR2002539""",Completed,"""CID6800553""",Go Mini,Vaishali,Punjabi Bagh,7.1,20.4,,,,,,,640.0,41.24,4.0,4.1,UPI
7,2024-03-19,17:37:37,"""CNR6568000""",Completed,"""CID8610436""",Auto,Mayur Vihar,Cyber Hub,12.1,16.5,,,,,,,136.0,6.56,4.4,4.2,UPI
8,2024-09-14,12:49:09,"""CNR4510807""",No Driver Found,"""CID7873618""",Go Sedan,Noida Sector 62,Noida Sector 18,,,,,,,,,,,,,
9,2024-12-16,19:06:48,"""CNR7721892""",Incomplete,"""CID5214275""",Auto,Rohini,Adarsh Nagar,6.1,26.0,,,,,1.0,Other Issue,135.0,10.36,,,Cash


In [None]:
# 4) Tratamento dos dados
# =========================================================
def drop_duplicates_inconsistent(df: pd.DataFrame, cfg: dict = None) -> pd.DataFrame:
    """Return ``df`` without fully duplicated rows, keeping the first occurrence.

    ``cfg`` is accepted for a uniform pipeline-step signature and is not used.
    """
    deduplicated = df.drop_duplicates(keep="first")
    return deduplicated

def drop_irrelevant(df: pd.DataFrame, cfg: dict = None) -> pd.DataFrame:
    """Drop identifier-like columns (``id``/``uuid`` tokens) and ``Unnamed`` index columns.

    The column name is split into word tokens (on non-alphanumeric characters
    and camelCase boundaries) and a column is dropped only when a whole token is
    ``id`` or ``uuid``. A plain substring test would also discard unrelated
    columns such as "Ride Distance" or "Incomplete Rides", because "ride"
    contains "id".

    Parameters
    ----------
    df : pd.DataFrame
        Input frame; it is not modified.
    cfg : dict, optional
        Unused; kept for a uniform pipeline-step signature.

    Returns
    -------
    pd.DataFrame
        A new frame without the identifier-like columns.
    """
    import re  # local import so the function is self-contained in the notebook

    def _is_identifier_like(column) -> bool:
        name = str(column)
        # "bookingId" -> "booking Id" so camelCase identifiers are tokenised too.
        spaced = re.sub(r"(?<=[a-z0-9])(?=[A-Z])", " ", name)
        tokens = [tok for tok in re.split(r"[^0-9A-Za-z]+", spaced.lower()) if tok]
        has_id_token = any(tok in ("id", "uuid") for tok in tokens)
        # pandas names index columns read back from CSV "Unnamed: 0", "Unnamed: 1", ...
        return has_id_token or "unnamed" in name.lower()

    cols_drop = [c for c in df.columns if _is_identifier_like(c)]
    return df.drop(columns=cols_drop, errors="ignore")

def handle_missing(df: pd.DataFrame, cfg: dict = None) -> pd.DataFrame:
    """Fill missing values column by column on a copy of ``df``.

    Numeric columns receive their median; every other column receives the
    literal "UNKNOWN". ``cfg`` is unused.

    NOTE: this function is defined again further down in the notebook; the
    definition executed last is the one in effect.
    """
    filled = df.copy()
    for col_name in list(filled.columns):
        column = filled[col_name]
        if pd.api.types.is_numeric_dtype(column):
            replacement = column.median()
        else:
            replacement = "UNKNOWN"
        filled[col_name] = column.fillna(replacement)
    return filled

def remove_outliers(df: pd.DataFrame, cfg: dict = None) -> pd.DataFrame:
    """Keep only rows whose ``distance`` / ``price`` fall inside fixed inclusive bounds.

    A rule is applied only when its column exists, so a frame without
    ``distance`` and ``price`` columns is returned as an unchanged copy.
    ``cfg`` is unused.
    """
    # Common examples for ride data; adjust to your own rules.
    bounds = (("distance", 0, 150), ("price", 0, 1000))

    kept = df.copy()
    for column, lower, upper in bounds:
        if column in kept.columns:
            kept = kept[kept[column].between(lower, upper)]
    return kept

def run_cleaning_pipeline(df: pd.DataFrame, cfg: dict = None) -> pd.DataFrame:
    """Run the cleaning steps in order on a copy of ``df`` and return the result.

    Order: duplicate removal, irrelevant-column removal, missing-value filling,
    outlier removal. ``cfg`` is forwarded to every step.

    NOTE: this function is defined again further down in the notebook; the
    definition executed last is the one in effect.
    """
    cleaned = df.copy()
    steps = (drop_duplicates_inconsistent, drop_irrelevant, handle_missing, remove_outliers)
    for step in steps:
        cleaned = step(cleaned, cfg)
    return cleaned

# =========================================================
# 5) Criar série temporal de demanda por hora

df = df_raw.copy()
DATE_COL = "Date"
TIME_COL = "Time"

# 1) sanitize the strings (strip whitespace and quote characters)
for c in [DATE_COL, TIME_COL]:
    if c in df.columns:
        df[c] = (df[c]
                 .astype(str)
                 .str.strip()
                 .str.replace('"', '', regex=False)
                 .str.replace("'", '', regex=False))

# 2) build the combined "date time" column
assert DATE_COL in df.columns and TIME_COL in df.columns, "Colunas 'Date' e/ou 'Time' não encontradas."
df["datetime_str"] = df[DATE_COL] + " " + df[TIME_COL]

# 3) first attempt: strict 24h format
dt = pd.to_datetime(df["datetime_str"], format="%Y-%m-%d %H:%M:%S", errors="coerce")

# 3b) fallback: try AM/PM for the rows that are still NaT
mask_nat = dt.isna()
if mask_nat.any():
    dt2 = pd.to_datetime(df.loc[mask_nat, "datetime_str"], format="%Y-%m-%d %I:%M:%S %p", errors="coerce")
    dt.loc[mask_nat] = dt2

# 3c) last fallback: let pandas infer the format.
# `infer_datetime_format` is deprecated in pandas 2.x and therefore no longer passed.
mask_nat = dt.isna()
if mask_nat.any():
    dt3 = pd.to_datetime(df.loc[mask_nat, "datetime_str"], errors="coerce")
    dt.loc[mask_nat] = dt3

df["datetime"] = dt
print("Registros sem datetime após parse:", df["datetime"].isna().sum())

# 4) aplicar SEU pipeline, mas sem tocar na coluna datetime
def handle_missing(df_in: pd.DataFrame, cfg: dict = None, datetime_col: str = "datetime") -> pd.DataFrame:
    """Fill missing values on a copy of ``df_in`` while leaving ``datetime_col`` untouched.

    Numeric columns receive their median; every other column (except
    ``datetime_col``) receives the literal "UNKNOWN". ``cfg`` is unused.
    """
    result = df_in.copy()
    fillable = [name for name in result.columns if name != datetime_col]  # never touch the datetime column
    for name in fillable:
        series = result[name]
        filler = series.median() if pd.api.types.is_numeric_dtype(series) else "UNKNOWN"
        result[name] = series.fillna(filler)
    return result

def run_cleaning_pipeline(df_in: pd.DataFrame, cfg: dict = None) -> pd.DataFrame:
    """Apply the cleaning steps in order, keeping the ``datetime`` column intact.

    Order: duplicate removal, irrelevant-column removal, missing-value filling
    (skipping "datetime"), outlier removal. ``cfg`` is forwarded to every step.
    """
    deduplicated = drop_duplicates_inconsistent(df_in, cfg)
    relevant = drop_irrelevant(deduplicated, cfg)
    filled = handle_missing(relevant, cfg, datetime_col="datetime")
    return remove_outliers(filled, cfg)

df_clean = run_cleaning_pipeline(df)

# 5) drop rows without a parsed datetime and sort chronologically
df_clean = df_clean.dropna(subset=["datetime"]).sort_values("datetime")

# 6) hourly demand series: number of bookings per hour.
# The lowercase "h" alias is used (uppercase "H" is deprecated in recent pandas
# and the rest of the notebook already uses "h").
ts_hourly = (
    df_clean
      .set_index(pd.DatetimeIndex(df_clean["datetime"]))
      .resample("h")
      .size()
      .rename("rides")
      .to_frame()
)

# sanity checks
print("Distribuição por hora:")
print(ts_hourly["rides"].sort_values(ascending=False).head(15))
print("Horas únicas no índice:", ts_hourly.index.hour.unique().tolist())
display(ts_hourly.head())

Registros sem datetime após parse: 0
Distribuição por hora:
datetime
2024-09-16 18:00:00    51
2024-12-08 19:00:00    50
2024-11-22 18:00:00    50
2024-09-25 18:00:00    50
2024-06-16 18:00:00    50
2024-03-12 18:00:00    49
2024-05-14 19:00:00    49
2024-08-14 18:00:00    49
2024-06-28 17:00:00    48
2024-07-26 18:00:00    48
2024-02-19 18:00:00    47
2024-05-06 18:00:00    47
2024-02-18 18:00:00    47
2024-05-30 18:00:00    47
2024-04-10 18:00:00    46
Name: rides, dtype: int64
Horas únicas no índice: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23]


  .resample("H")


Unnamed: 0_level_0,rides
datetime,Unnamed: 1_level_1
2024-01-01 00:00:00,1
2024-01-01 01:00:00,5
2024-01-01 02:00:00,4
2024-01-01 03:00:00,4
2024-01-01 04:00:00,4


In [None]:
##Criação do modelo

# 1) garantir coluna de região
REGION_CANDIDATES = [
    "Pickup Location","Pickup Area","Pickup Zone","Pickup City","Pickup",
    "Start Location","Source","Source Zone","Source City",
    "From","Origin","Region","Area","Zone","Location","City"
]
def infer_region_column(df):
    """Pick the column that most likely holds the pickup region.

    Resolution order:
      1. exact (case-insensitive) match against ``REGION_CANDIDATES``, honouring
         the priority order of that list rather than the column order of ``df``;
      2. substring match against a keyword list, again in keyword priority order
         (so a "pickup…" column wins over a generic "…location" column).

    Returns the matching column label as it appears in ``df.columns``, or
    ``None`` when nothing matches.
    """
    columns = list(df.columns)

    # Lower-cased name -> original label (first occurrence wins).
    by_lower = {}
    for col in columns:
        by_lower.setdefault(str(col).strip().lower(), col)

    for candidate in REGION_CANDIDATES:
        key = candidate.lower()
        if key in by_lower:
            return by_lower[key]

    keywords = ["pickup","start","source","origin","from","region","area","zone","location","city"]
    for keyword in keywords:
        for col in columns:
            if keyword in str(col).lower():
                return col
    return None

region_col = infer_region_column(df_clean)
assert region_col is not None, "Defina manualmente a coluna de região (ex.: region_col = 'Pickup Location')"

# Normalised region label: quote characters removed, trimmed, upper-cased.
_region_unquoted = df_clean[region_col].astype(str).str.replace(r'["\']', '', regex=True)
df_clean["region"] = _region_unquoted.str.strip().str.upper()

# Discard rows whose region label ended up empty.
_has_region = df_clean["region"].str.len() > 0
df_clean = df_clean.loc[_has_region].copy()

# 2) painel horário × região
# Ride counts per (hour, region) in long format ...
_rides_long = (
    df_clean
      .set_index(pd.DatetimeIndex(df_clean["datetime"]))
      .groupby([pd.Grouper(freq="h"), "region"])
      .size()
      .rename("rides")
      .reset_index()
)
# ... pivoted to one column per region on a gap-free hourly index (missing = 0).
_rides_wide = _rides_long.pivot(index="datetime", columns="region", values="rides")
ts_by_region = _rides_wide.sort_index().asfreq("h").fillna(0).astype(int)

def next_dow_start(last_ts: pd.Timestamp, target_dow: int) -> pd.Timestamp:
    """Return the first 00:00 of weekday ``target_dow`` (0=Mon..6=Sun) strictly after ``last_ts``.

    NOTE: this function is defined more than once in the notebook; the
    definition executed last is the one in effect.
    """
    start = (last_ts + pd.Timedelta(hours=1)).normalize()
    da = (target_dow - start.dayofweek) % 7
    # da == 0 means `start` already is a midnight of the target weekday. It is a
    # valid answer only if it lies after last_ts (e.g. last_ts is 23:00 of the
    # previous day); otherwise that midnight is in the past and we jump a week.
    if da == 0 and start <= last_ts:
        da = 7
    return start + pd.Timedelta(days=da)

# ---- perfil HURDLE por (dow,hour): p(y>0) e média condicional μ = E[y | y>0]
def make_hurdle_profile(y: pd.Series, alpha: float = 1.0):
    """Build the hurdle profile per (day-of-week, hour) slot.

    ``y`` is put on an hourly grid with gaps filled by 0. For each slot:
      * ``p``  = (positive hours + alpha) / (observed hours + 2*alpha)
      * ``mu`` = mean of the positive values (0 when the slot has none)

    Returns ``(p, mu)``: two float DataFrames with rows 0..6 (day of week) and
    columns 0..23 (hour).

    NOTE: this function is defined again further down in the notebook; the
    definition executed last is the one in effect.
    """
    hourly = y.asfreq('h').fillna(0).astype(float)

    keyed = hourly.to_frame('y')
    keyed['dow'] = keyed.index.dayofweek
    keyed['hour'] = keyed.index.hour
    slots = keyed.groupby(['dow', 'hour'])

    def _count_positive(values):
        return (values > 0).sum()

    def _sum_positive(values):
        return values[values > 0].sum()

    n_obs = slots.size().unstack('hour').reindex(index=range(7), columns=range(24)).fillna(0)
    n_pos = slots['y'].apply(_count_positive).unstack('hour').reindex_like(n_obs).fillna(0)
    pos_total = slots['y'].apply(_sum_positive).unstack('hour').reindex_like(n_obs).fillna(0)

    # Laplace-smoothed probability of at least one ride in the slot.
    prob = (n_pos + alpha) / (n_obs + 2*alpha)

    # Conditional mean; slots without positive observations stay at 0.
    cond_mean = (pos_total / n_pos.replace(0, np.nan)).fillna(0.0)

    return prob.astype(float), cond_mean.astype(float)

# ---- avaliação no teste + previsão 24h para DOW escolhido
def eval_and_forecast_hurdle(y: pd.Series, test_days=14, dow_future=0,
                             eval_only_selected_dow=False, alpha=1.0, prob_thresh=0.5):
    """Evaluate the hurdle profile on the last ``test_days`` days and forecast 24h.

    The series is put on an hourly grid (gaps = 0) and split at
    ``max(index) - test_days``; the profile is fitted on the earlier part.
    Expected count per hour is ``p * mu``; an hour is flagged as an event when
    ``p >= prob_thresh``. With ``eval_only_selected_dow`` the test set is
    restricted to weekday ``dow_future``.

    Returns a dict with ``metrics_on_test``, ``totals_on_test``, ``eval_df``,
    ``forecast_next_24h`` and ``forecast_prob_24h``.

    NOTE: this function is defined again further down in the notebook; the
    definition executed last is the one in effect.
    """
    hourly = y.asfreq('h').fillna(0).astype(float)
    split_at = hourly.index.max() - pd.Timedelta(days=test_days)
    train = hourly.loc[:split_at]
    test = hourly.loc[split_at + pd.Timedelta(hours=1):]

    # hurdle profile fitted on the training window only
    p, mu = make_hurdle_profile(train, alpha=alpha)

    # optionally evaluate only the selected weekday
    if eval_only_selected_dow:
        test = test[test.index.dayofweek == dow_future]

    # per-timestamp event probability and expected count (E[y] = p × μ)
    slots = [(ts.dayofweek, ts.hour) for ts in test.index]
    prob_vals = [p.loc[d, h] for d, h in slots]
    pred_vals = [p.loc[d, h] * mu.loc[d, h] for d, h in slots]

    eval_df = pd.concat(
        [
            test.rename('real'),
            pd.Series(pred_vals, index=test.index, name='previsto'),
            pd.Series(prob_vals, index=test.index, name='prob_evento'),
        ],
        axis=1,
    )

    # point-wise errors
    eval_df['erro'] = eval_df['real'] - eval_df['previsto']
    eval_df['abs_erro'] = eval_df['erro'].abs()
    smape_denominator = np.abs(eval_df['real']) + np.abs(eval_df['previsto']) + 1e-9
    eval_df['sMAPE_%'] = 2 * eval_df['abs_erro'] / smape_denominator * 100

    # count metrics
    mae = float(eval_df['abs_erro'].mean())
    rmse = float(np.sqrt((eval_df['erro'] ** 2).mean()))
    smape = float(eval_df['sMAPE_%'].mean())

    # event metrics ("was there at least one ride?")
    happened = eval_df['real'] > 0
    flagged = eval_df['prob_evento'] >= prob_thresh
    tp = int((flagged & happened).sum())
    fp = int((flagged & ~happened).sum())
    fn = int((~flagged & happened).sum())
    precision = tp / max(tp + fp, 1)
    recall = tp / max(tp + fn, 1)
    f1 = 2 * precision * recall / max(precision + recall, 1e-9)

    metrics = {'MAE': mae, 'RMSE': rmse, 'sMAPE_%': smape,
               'Precision_evt': precision, 'Recall_evt': recall, 'F1_evt': f1}

    real_sum = eval_df['real'].sum()
    pred_sum = eval_df['previsto'].sum()
    totals = {'soma_real': float(real_sum),
              'soma_prevista': float(pred_sum),
              'dif_total': float(pred_sum - real_sum),
              'soma_abs_erro': float(eval_df['abs_erro'].sum())}

    # 24h forecast for the chosen weekday
    first_hour = next_dow_start(hourly.index[-1], dow_future)
    future_index = pd.date_range(first_hour, periods=24, freq='h')
    day_hours = range(24)
    fc = pd.Series([p.loc[dow_future, h] * mu.loc[dow_future, h] for h in day_hours],
                   index=future_index, name='forecast_24h')
    fc_prob = pd.Series([p.loc[dow_future, h] for h in day_hours],
                        index=future_index, name='prob_evento_24h')

    return {'metrics_on_test': metrics, 'totals_on_test': totals,
            'eval_df': eval_df, 'forecast_next_24h': fc, 'forecast_prob_24h': fc_prob}


# 3) regiões elegíveis
MIN_TOTAL = 200         # minimum total rides for a region to be selectable; adjust if needed
valid_regions = [
    region for region, total in ts_by_region.sum().items() if total >= MIN_TOTAL
]
if len(valid_regions) == 0:   # fallback when the filter is too strict
    valid_regions = ts_by_region.columns.tolist()

print("ts_by_region:", ts_by_region.shape, "| regiões válidas:", len(valid_regions))

ts_by_region: (8760, 176) | regiões válidas: 176


In [None]:
#Criação dos perfis para cada dia para a previsão dow

# ---- perfil médio por dia-da-semana (0=Seg..6=Dom) e hora (0..23)
def make_dow_hour_profile(y: pd.Series) -> pd.DataFrame:
    """Mean of ``y`` per (day-of-week, hour) slot.

    Returns a DataFrame with rows 0..6 (0=Mon..6=Sun) and columns 0..23; slots
    with no observation are filled with 0.0. ``y`` must have a DatetimeIndex.
    """
    table = y.to_frame('rides').assign(
        dow=lambda frame: frame.index.dayofweek,
        hour=lambda frame: frame.index.hour,
    )
    slot_means = table.groupby(['dow', 'hour'])['rides'].mean()
    grid = slot_means.unstack('hour').reindex(index=range(7), columns=range(24))
    return grid.fillna(0.0)

# ---- próxima meia-noite do DOW desejado
def next_dow_start(last_ts: pd.Timestamp, target_dow: int) -> pd.Timestamp:
    """Return the first 00:00 of weekday ``target_dow`` (0=Mon..6=Sun) strictly after ``last_ts``.

    NOTE: this function is defined more than once in the notebook; the
    definition executed last is the one in effect.
    """
    start = (last_ts + pd.Timedelta(hours=1)).normalize()
    da = (target_dow - start.dayofweek) % 7
    # A zero offset is acceptable only when `start` is really in the future
    # (last_ts was the 23:00 slot of the previous day). If that midnight is not
    # after last_ts, the next occurrence is one week later.
    if da == 0 and start <= last_ts:
        da = 7
    return start + pd.Timedelta(days=da)

# ---- avaliação do perfil no período de teste + forecast 24h para um DOW
def eval_and_forecast_profile(y: pd.Series, test_days=14, dow_future=0):
    """Evaluate the mean (dow, hour) profile on the last ``test_days`` days and forecast 24h.

    The series is put on an hourly grid (gaps = 0) and split at
    ``max(index) - test_days``; the profile is fitted on the earlier part and
    used as the point forecast for every test timestamp.

    Returns a dict with ``metrics_on_test`` (MAE, RMSE, sMAPE_%),
    ``totals_on_test``, ``eval_df`` and ``forecast_next_24h``.
    """
    hourly = y.asfreq('h').fillna(0.0).astype(float)

    split_at = hourly.index.max() - pd.Timedelta(days=test_days)
    train = hourly.loc[:split_at]
    test = hourly.loc[split_at + pd.Timedelta(hours=1):]

    profile = make_dow_hour_profile(train)

    # point forecast for each test timestamp, looked up by (dow, hour)
    predicted = pd.Series(
        [profile.loc[ts.dayofweek, ts.hour] for ts in test.index],
        index=test.index,
        name='previsto',
    )

    # point-wise errors
    eval_df = pd.concat([test.rename('real'), predicted], axis=1)
    eval_df['erro'] = eval_df['real'] - eval_df['previsto']
    eval_df['abs_erro'] = eval_df['erro'].abs()
    smape_denominator = np.abs(eval_df['real']) + np.abs(eval_df['previsto']) + 1e-9
    eval_df['sMAPE_%'] = 2 * eval_df['abs_erro'] / smape_denominator * 100

    metrics = {
        'MAE': float(eval_df['abs_erro'].mean()),
        'RMSE': float(np.sqrt((eval_df['erro'] ** 2).mean())),
        'sMAPE_%': float(eval_df['sMAPE_%'].mean()),
    }

    real_sum = eval_df['real'].sum()
    pred_sum = eval_df['previsto'].sum()
    totals = {
        'soma_real': float(real_sum),
        'soma_prevista': float(pred_sum),
        'dif_total': float(pred_sum - real_sum),
        'soma_abs_erro': float(eval_df['abs_erro'].sum()),
    }

    # 24h forecast for the chosen weekday
    first_hour = next_dow_start(hourly.index[-1], dow_future)
    future_index = pd.date_range(first_hour, periods=24, freq='h')
    fc = pd.Series([profile.loc[dow_future, h] for h in range(24)],
                   index=future_index, name='forecast_24h')

    return {'metrics_on_test': metrics,
            'totals_on_test': totals,
            'eval_df': eval_df,
            'forecast_next_24h': fc}

In [None]:
# ========= HURDLE core (definições) =========

def next_dow_start(last_ts: pd.Timestamp, target_dow: int) -> pd.Timestamp:
    """Return the first 00:00 of weekday ``target_dow`` (0=Mon..6=Sun) strictly after ``last_ts``."""
    start = (last_ts + pd.Timedelta(hours=1)).normalize()
    da = (target_dow - start.dayofweek) % 7
    # When da == 0, `start` is a midnight of the target weekday. Keep it if it
    # is after last_ts (last_ts was 23:00 of the day before); otherwise that
    # midnight has already passed and the next one is a week away.
    if da == 0 and start <= last_ts:
        da = 7
    return start + pd.Timedelta(days=da)

def make_hurdle_profile(y: pd.Series, alpha: float = 1.0):
    """Hurdle profile per (dow, hour): Laplace-smoothed P(y>0) and conditional mean E[y | y>0].

    ``y`` is put on an hourly grid with gaps filled by 0. Returns ``(p, mu)``,
    two float DataFrames with rows 0..6 (day of week) and columns 0..23 (hour);
    ``mu`` is 0 for slots without any positive observation.
    """
    series = y.asfreq('h').fillna(0).astype(float)

    table = series.to_frame('y')
    table['dow'] = table.index.dayofweek
    table['hour'] = table.index.hour
    by_slot = table.groupby(['dow', 'hour'])

    # observed hours per slot, on the full 7 x 24 grid
    n_obs = by_slot.size().unstack('hour').reindex(index=range(7), columns=range(24)).fillna(0)

    def _on_grid(per_slot):
        """Reshape a per-slot Series to the same 7 x 24 grid as ``n_obs``."""
        return per_slot.unstack('hour').reindex_like(n_obs).fillna(0)

    n_pos = _on_grid(by_slot['y'].apply(lambda vals: (vals > 0).sum()))
    pos_sum = _on_grid(by_slot['y'].apply(lambda vals: vals[vals > 0].sum()))

    # Laplace-smoothed probability
    p = (n_pos + alpha) / (n_obs + 2*alpha)
    # conditional mean (0 when the slot has no positives)
    mu = (pos_sum / n_pos.replace(0, np.nan)).fillna(0.0)

    return p.astype(float), mu.astype(float)

def eval_and_forecast_hurdle(y: pd.Series, test_days=14, dow_future=0,
                             eval_only_selected_dow=False, alpha=1.0, prob_thresh=0.5):
    """Evaluate on the test window (optionally only weekday ``dow_future``) and forecast 24h for it.

    The hourly series (gaps = 0) is split at ``max(index) - test_days``; the
    hurdle profile is fitted on the earlier part. Expected count is ``p * mu``
    and an hour counts as a predicted event when ``p >= prob_thresh``.

    Returns a dict with ``metrics_on_test``, ``totals_on_test``, ``eval_df``,
    ``forecast_next_24h`` and ``forecast_prob_24h``.
    """
    series = y.asfreq('h').fillna(0).astype(float)
    cut = series.index.max() - pd.Timedelta(days=test_days)
    fit_part = series.loc[:cut]
    test_part = series.loc[cut + pd.Timedelta(hours=1):]

    p, mu = make_hurdle_profile(fit_part, alpha=alpha)

    if eval_only_selected_dow:
        test_part = test_part[test_part.index.dayofweek == dow_future]

    def _expected(dow, hour):
        """Expected count for one slot: P(y>0) times the conditional mean."""
        return p.loc[dow, hour] * mu.loc[dow, hour]

    expected_te = pd.Series([_expected(ts.dayofweek, ts.hour) for ts in test_part.index],
                            index=test_part.index, name='previsto')
    prob_te = pd.Series([p.loc[ts.dayofweek, ts.hour] for ts in test_part.index],
                        index=test_part.index, name='prob_evento')

    eval_df = pd.concat([test_part.rename('real'), expected_te, prob_te], axis=1)
    eval_df['erro'] = eval_df['real'] - eval_df['previsto']
    eval_df['abs_erro'] = eval_df['erro'].abs()
    scale = np.abs(eval_df['real']) + np.abs(eval_df['previsto']) + 1e-9
    eval_df['sMAPE_%'] = 2 * eval_df['abs_erro'] / scale * 100

    # event classification: actual vs. predicted "at least one ride"
    actual_evt = eval_df['real'] > 0
    predicted_evt = eval_df['prob_evento'] >= prob_thresh
    tp = int((predicted_evt & actual_evt).sum())
    fp = int((predicted_evt & ~actual_evt).sum())
    fn = int((~predicted_evt & actual_evt).sum())
    precision = tp / max(tp + fp, 1)
    recall = tp / max(tp + fn, 1)
    f1 = 2 * precision * recall / max(precision + recall, 1e-9)

    metrics = {
        'MAE': float(eval_df['abs_erro'].mean()),
        'RMSE': float(np.sqrt((eval_df['erro'] ** 2).mean())),
        'sMAPE_%': float(eval_df['sMAPE_%'].mean()),
        'Precision_evt': precision,
        'Recall_evt': recall,
        'F1_evt': f1,
    }

    total_real = eval_df['real'].sum()
    total_pred = eval_df['previsto'].sum()
    totals = {
        'soma_real': float(total_real),
        'soma_prevista': float(total_pred),
        'dif_total': float(total_pred - total_real),
        'soma_abs_erro': float(eval_df['abs_erro'].sum()),
    }

    # 24 hourly slots starting at the next midnight of the chosen weekday
    horizon = pd.date_range(next_dow_start(series.index[-1], dow_future), periods=24, freq='h')
    fc = pd.Series([_expected(dow_future, h) for h in range(24)],
                   index=horizon, name='forecast_24h')
    fc_prob = pd.Series([p.loc[dow_future, h] for h in range(24)],
                        index=horizon, name='prob_evento_24h')

    return {'metrics_on_test': metrics, 'totals_on_test': totals,
            'eval_df': eval_df, 'forecast_next_24h': fc, 'forecast_prob_24h': fc_prob}

In [None]:
# Botão para prever a probabilidade de corridas para determinada região em determinado dia da semana

# (label, pandas dayofweek number) pairs: Monday = 0 ... Sunday = 6
dow_map = [
    (label, number)
    for number, label in enumerate(['Seg', 'Ter', 'Qua', 'Qui', 'Sex', 'Sáb', 'Dom'])
]

# Input controls, action button and the output area the callback renders into.
dd_region = Dropdown(description='Região:', options=valid_regions)
dd_dow    = Dropdown(description='Dia:', options=dow_map)
chk_dow   = Checkbox(description='Avaliar só este dia', value=False)
btn       = Button(description='Prever (Hurdle)', button_style='primary')
out       = Output()

def on_click(_):
    """Button callback: evaluate and forecast the selected region/weekday and plot the results.

    Reads the current widget values, runs ``eval_and_forecast_hurdle`` on the
    region's hourly series and renders metrics plus three figures into ``out``.
    The same event threshold is used for the metrics and for the reference
    lines drawn on the probability plots.
    """
    # Single source of truth for the event threshold (adjust here).
    prob_thresh = 0.4

    out.clear_output(wait=True)
    with out:
        region = dd_region.value
        dow = dd_dow.value
        y = ts_by_region[region].astype(float)

        res = eval_and_forecast_hurdle(
            y, test_days=14, dow_future=dow,
            eval_only_selected_dow=chk_dow.value,
            alpha=1.0, prob_thresh=prob_thresh
        )

        print("Métricas (teste):", res['metrics_on_test'])
        print("Totais (teste):", res['totals_on_test'])

        # Actual vs. predicted counts on the test window
        plt.figure(figsize=(12,3.4))
        plt.plot(res['eval_df'].index, res['eval_df']['real'], label='Real (teste)')
        plt.plot(res['eval_df'].index, res['eval_df']['previsto'], label='Previsto (Hurdle)')
        plt.legend(); plt.title(f"{region} — avaliação (Hurdle)"); plt.tight_layout(); plt.show()

        # Probability of at least one ride (when only one weekday is evaluated,
        # the x axis contains just that weekday)
        plt.figure(figsize=(12,2.8))
        plt.plot(res['eval_df'].index, res['eval_df']['prob_evento'], label='P(y>0)')
        plt.axhline(prob_thresh, ls='--', alpha=0.6, label=f'limiar {prob_thresh}')
        plt.legend(); plt.title('Probabilidade de ocorrência'); plt.tight_layout(); plt.show()

        # 24h forecast: expected count (top) and event probability (bottom)
        fc, p24 = res['forecast_next_24h'], res['forecast_prob_24h']
        fig, ax = plt.subplots(2,1, figsize=(12,6), sharex=True)
        ax[0].plot(fc.index, fc.values); ax[0].set_title(f"{region} — previsão 24h ({dd_dow.label}) [E[y]=p×μ]")
        ax[1].plot(p24.index, p24.values); ax[1].axhline(prob_thresh, ls='--', alpha=0.6); ax[1].set_title('Probabilidade de haver corrida (24h)')
        plt.tight_layout(); plt.show()

# Register the callback, then show the control row followed by the output area.
btn.on_click(on_click)
_controls_row = HBox([dd_region, dd_dow, chk_dow, btn])
display(_controls_row, out)

HBox(children=(Dropdown(description='Região:', options=('ADARSH NAGAR', 'AIIMS', 'AKSHARDHAM', 'AMBIENCE MALL'…

Output()