# Première itération avec Poisson

année, localité, nb restaurants

In [2]:
# ============================================================
# Per-locality trend (Poisson + CAGR) computed from a CSV
# ============================================================
# Dependencies: pandas, numpy, statsmodels
# pip install pandas numpy statsmodels

import pandas as pd
import numpy as np
import statsmodels.api as sm

# ---------- Parameters ----------
CSV_PATH = "fake_swiss_restaurants.csv"  # <-- replace if needed
YEAR_MIN, YEAR_MAX = 2011, 2021          # bounds of the year filter applied below
EXCLUDE_YEAR = []                         # e.g. [2020] to ignore the COVID year

# ---------- Loading ----------
df = pd.read_csv(CSV_PATH)

# Normalise column names (trim whitespace, lower-case) so that headers such as
# " Year " still match the expected schema.
df.columns = [c.strip().lower() for c in df.columns]
expected_cols = {"year", "locality", "num_restaurants"}
missing = expected_cols - set(df.columns)
if missing:
    # User-facing message kept in French; the accented characters were
    # mis-encoded in the original literal and are restored here.
    raise ValueError(f"Colonnes manquantes dans le CSV: {missing}. "
                     f"Colonnes trouvées: {list(df.columns)}")

# Keep only the years inside [YEAR_MIN, YEAR_MAX] that are not explicitly excluded.
mask = (df["year"].between(YEAR_MIN, YEAR_MAX)) & (~df["year"].isin(EXCLUDE_YEAR))
df = df.loc[mask].copy()

# Safety net: drop rows with a negative count. A Poisson model needs counts
# >= 0, so zeros are kept; rows with a missing count are dropped as well
# because the comparison is False for NaN.
df = df[df["num_restaurants"] >= 0].copy()

def _observed_cagr_pct(first_val: float, last_val: float, n_years: int) -> float:
    """Return the compound annual growth rate in percent, or NaN when undefined."""
    if not (n_years > 0 and first_val > 0):
        return np.nan
    return ((last_val / first_val) ** (1 / n_years) - 1.0) * 100.0


def _deviance_explained(res) -> float:
    """Return the share of null deviance explained by a fitted GLM, clamped at 0."""
    # Deliberately best-effort: if the deviances cannot be computed, the fit
    # quality is reported as 0.0 rather than aborting the whole table.
    try:
        dev = res.deviance
        null_dev = res.null_deviance
        return max(0.0, 1.0 - (dev / null_dev)) if null_dev > 0 else 0.0
    except Exception:
        return 0.0


def poisson_trend_for_group(g: pd.DataFrame) -> pd.Series:
    """
    Compute trend indicators for the rows of a single locality.

    Parameters
    ----------
    g : pd.DataFrame
        Rows of one locality; must contain the columns "year", "locality"
        and "num_restaurants".

    Returns
    -------
    pd.Series
        - locality: value of the first row's "locality"
        - years_covered: "<first year>-<last year>"
        - n_obs: number of rows in the group
        - annual_growth_pct_poisson: (exp(beta_year) - 1) * 100 from a Poisson
          GLM of the count on the centred year
        - cagr_pct: CAGR in percent between the first and last observed year
          (NaN if the span is 0 years or the first count is not > 0)
        - p_value_year: p-value of the year coefficient (HC3 covariance)
        - deviance_explained: 1 - deviance / null deviance, clamped at 0
        - trend_score: annual_growth_pct_poisson * deviance_explained

    When the group has fewer than two distinct years no slope can be
    estimated, so the model is not fitted and the four model-based
    indicators are NaN.
    """
    g = g.sort_values("year")
    years = g["year"]
    counts = g["num_restaurants"]

    # Simple CAGR observed between the first and the last year.
    first_year = int(years.iloc[0])
    last_year = int(years.iloc[-1])
    cagr_pct = _observed_cagr_pct(
        float(counts.iloc[0]),
        float(counts.iloc[-1]),
        last_year - first_year,
    )

    if years.nunique() < 2:
        # The centred year would be all zeros: the design matrix is rank
        # deficient, so any coefficient/p-value would be meaningless.
        annual_growth_pct = np.nan
        pval_year = np.nan
        dev_explained = np.nan
        trend_score = np.nan
    else:
        # Centre the year to reduce the correlation between slope and intercept.
        year_centered = years - years.mean()
        X = sm.add_constant(year_centered)

        # Poisson GLM with HC3 heteroskedasticity-robust standard errors.
        res = sm.GLM(counts, X, family=sm.families.Poisson()).fit(cov_type="HC3")

        # The centred regressor keeps the Series name "year".
        beta_year = res.params["year"]
        pval_year = res.pvalues["year"]

        # For +1 year the expected count is multiplied by exp(beta_year).
        annual_growth_pct = (np.exp(beta_year) - 1.0) * 100.0

        dev_explained = _deviance_explained(res)

        # Magnitude weighted by goodness of fit: positive for growth,
        # negative for decline.
        trend_score = annual_growth_pct * dev_explained

    return pd.Series({
        "locality": g["locality"].iloc[0],
        "years_covered": f"{first_year}-{last_year}",
        "n_obs": len(g),
        "annual_growth_pct_poisson": annual_growth_pct,
        "cagr_pct": cagr_pct,
        "p_value_year": pval_year,
        "deviance_explained": dev_explained,
        "trend_score": trend_score
    })

# Compute the indicators for every locality.
# The groups are iterated explicitly instead of using ``groupby(...).apply``:
# newer pandas versions deprecate letting ``apply`` see the grouping column,
# and poisson_trend_for_group needs "locality" to be present in each group.
trend_rows = [
    poisson_trend_for_group(group).to_dict()
    for _, group in df.groupby("locality")
]
trend_table = pd.DataFrame(trend_rows).reset_index(drop=True)

# Sort by descending score (higher = more dynamic).
trend_table_sorted = trend_table.sort_values("trend_score", ascending=False)

# ``display`` only exists inside IPython/Jupyter; fall back to ``print`` so the
# code also runs as a plain script.
try:
    show = display
except NameError:
    show = print

# Display and save.
pd.set_option("display.float_format", lambda x: f"{x:,.4f}")
show(trend_table_sorted)

OUT_CSV = "locality_trends_poisson.csv"
trend_table_sorted.to_csv(OUT_CSV, index=False)
# The accented characters and symbols of the messages below were mis-encoded
# in the original literals and are restored here.
print(f"📄 Résultats exportés → {OUT_CSV}")

# --------- (Optional) a few useful views ----------
# Localities whose year trend is significant at the 10% level.
signif_10 = trend_table_sorted[trend_table_sorted["p_value_year"] < 0.10]
print(f"\nLocalités avec tendance significative (p<0.10): {len(signif_10)}")
show(signif_10[["locality","annual_growth_pct_poisson","cagr_pct","p_value_year","trend_score"]].head(20))


  .apply(poisson_trend_for_group)


Unnamed: 0,locality,years_covered,n_obs,annual_growth_pct_poisson,cagr_pct,p_value_year,deviance_explained,trend_score
13,neuchatel,2011-2021,11,1.431,1.8863,0.003,0.5273,0.7546
15,st_gallen,2011-2021,11,0.5735,0.9163,0.0817,0.2819,0.1617
3,chur,2011-2021,11,0.5681,0.9447,0.1086,0.2539,0.1443
8,lausanne,2011-2021,11,0.4988,0.8534,0.1538,0.2091,0.1043
5,fribourg,2011-2021,11,0.448,0.6767,0.178,0.1981,0.0887
17,winterthur,2011-2021,11,0.4541,0.789,0.1948,0.1773,0.0805
6,geneva,2011-2021,11,0.3429,0.7124,0.3093,0.1225,0.042
19,zurich,2011-2021,11,0.3373,0.7033,0.3136,0.1245,0.042
10,lugano,2011-2021,11,0.3171,0.6134,0.3373,0.1075,0.0341
1,bern,2011-2021,11,0.278,0.5632,0.3893,0.086,0.0239


📄 Résultats exportés → locality_trends_poisson.csv

Localités avec tendance significative (p<0.10): 3


Unnamed: 0,locality,annual_growth_pct_poisson,cagr_pct,p_value_year,trend_score
13,neuchatel,1.431,1.8863,0.003,0.7546
15,st_gallen,0.5735,0.9163,0.0817,0.1617
11,montreux,-2.0358,-2.5895,0.0748,-0.5356
