
# ✅ Corrigé — Mini‑projet : Classifieur Bayésien Naïf (Bernoulli)

**Cours :** Mathématiques pour l'informatique — FSGA / Université Quisqueya  
**Enseignant :** Geovany Batista Polo LAGUERRE — S1 — 2025–2026

Ce notebook fournit une **solution de référence** : comptages, lissage de Laplace, prédictions, comparaison `alpha=0` vs `alpha=1`, et une visualisation finale.


## Données jouet (recréation du CSV)

In [None]:

import csv, pandas as pd
from pathlib import Path
# Toy training set: one (id, pas_cher, anglais, achat) record per example.
csv_path = Path("/mnt/data/train_pantalon.csv")
columns = ["id", "pas_cher", "anglais", "achat"]
records = [
    (1, 1, 0, "OUI"),
    (2, 0, 1, "NON"),
    (3, 0, 1, "NON"),
    (4, 0, 1, "NON"),
    (5, 1, 0, "NON"),
    (6, 1, 1, "OUI"),
    (7, 1, 0, "OUI"),
    (8, 1, 0, "OUI"),
]
rows = [dict(zip(columns, record)) for record in records]

# Write the CSV (creating the target directory if needed), then read it back
# with pandas so the cell displays the resulting table.
csv_path.parent.mkdir(parents=True, exist_ok=True)
with open(csv_path, "w", newline="", encoding="utf-8") as f:
    writer = csv.DictWriter(f, fieldnames=columns)
    writer.writeheader()
    writer.writerows(rows)
df = pd.read_csv(csv_path)
df


## Implémentation (avec lissage de Laplace)

In [None]:

import csv
from collections import Counter, defaultdict
from math import log, exp
import pandas as pd

class NaiveBayesBernoulli:
    """Naive Bayes classifier over binary (0/1) features with Laplace smoothing.

    Each example is a dict mapping feature name -> value; a feature missing
    from an example is treated as 0. Training only stores counts; smoothed
    probabilities are derived from them on demand.

    Parameters:
        alpha: additive smoothing strength (>= 0). ``alpha=0`` disables
            smoothing, in which case conditional probabilities may be
            exactly 0.

    Raises:
        ValueError: if ``alpha`` is negative.
    """

    def __init__(self, alpha=1.0):
        self.alpha = float(alpha)
        if self.alpha < 0:
            raise ValueError("alpha must be non-negative")
        self.classes_ = []       # sorted distinct labels seen by fit
        self.features_ = []      # sorted union of feature names seen by fit
        self.class_counts_ = Counter()              # label -> number of examples
        self.feature_counts_ = defaultdict(Counter)  # label -> {(feature, value): count}
        self.n_ = 0              # number of training examples

    def fit(self, X_list, y_list):
        """Count class and (feature, value) occurrences; return ``self``.

        Any previous counts are discarded.

        Raises:
            ValueError: if ``X_list`` and ``y_list`` have different lengths.
        """
        # zip() would silently drop the surplus items while n_ kept the full
        # label count, skewing every prior; fail loudly instead.
        if len(X_list) != len(y_list):
            raise ValueError("X_list and y_list must have the same length")
        self.n_ = len(y_list)
        self.classes_ = sorted(set(y_list))
        feat = set()
        for X in X_list:
            feat |= set(X.keys())
        self.features_ = sorted(feat)
        self.class_counts_.clear()
        self.feature_counts_.clear()
        for X, y in zip(X_list, y_list):
            self.class_counts_[y] += 1
            for f in self.features_:
                v = int(X.get(f, 0))
                self.feature_counts_[y][(f, v)] += 1
        return self

    @staticmethod
    def _log_or_neg_inf(p):
        """Return ``log(p)``, or ``-inf`` when ``p`` is not strictly positive."""
        # math.log raises ValueError on 0, which happens as soon as an
        # unsmoothed (alpha=0) conditional probability is zero.
        return log(p) if p > 0 else float("-inf")

    def _p_class(self, c):
        """Return the empirical prior of class ``c`` (its share of the training set)."""
        return self.class_counts_[c] / self.n_

    def _p_feat_given_class(self, feat, val, c):
        """Return the smoothed estimate of P(feat = val | c).

        ``val == 1`` selects the "1" count; any other value selects the "0"
        count. The denominator adds ``2 * alpha`` because a feature has two
        possible values.
        """
        c1 = self.feature_counts_[c][(feat, 1)]
        c0 = self.feature_counts_[c][(feat, 0)]
        tot = c1 + c0
        num = (c1 + self.alpha) if val == 1 else (c0 + self.alpha)
        den = tot + 2 * self.alpha
        return num / den

    def predict_proba(self, X):
        """Return ``{class: posterior probability}`` for one example ``X``.

        Scores are accumulated in log space and normalised after subtracting
        the maximum. A class whose likelihood is exactly 0 gets probability
        0. If every class has likelihood 0, the class priors are returned.

        Raises:
            ValueError: if the model has not been fitted on any example.
        """
        if not self.classes_:
            raise ValueError("fit must be called with data before predicting")
        scores = {}
        for c in self.classes_:
            s = self._log_or_neg_inf(self._p_class(c))
            for f in self.features_:
                v = int(X.get(f, 0))
                s += self._log_or_neg_inf(self._p_feat_given_class(f, v, c))
            scores[c] = s
        m = max(scores.values())
        if m == float("-inf"):
            # No class can explain X: (-inf) - (-inf) would yield NaN below,
            # so fall back to the priors instead.
            return {c: self._p_class(c) for c in self.classes_}
        exps = {c: exp(v - m) for c, v in scores.items()}
        Z = sum(exps.values())
        return {c: exps[c] / Z for c in self.classes_}

    def predict(self, X):
        """Return the most probable class for ``X``.

        On a tie the class that comes first in ``self.classes_`` wins.
        """
        proba = self.predict_proba(X)
        return max(proba, key=proba.get)

def load_csv_binary(path, feature_names=("pas_cher","anglais"), label_name="achat"):
    """Load feature dicts and labels from a CSV file with a header row.

    Parameters:
        path: location of the CSV file (read as UTF-8).
        feature_names: columns to keep as features; each cell is converted
            with ``int()`` (values are not checked to be 0 or 1).
        label_name: column holding the class label; surrounding whitespace
            is stripped. Defaults to ``"achat"``.

    Returns:
        ``(X_list, y_list)``: a list of ``{feature: int}`` dicts and the
        list of labels, both in file order.

    Raises:
        KeyError: if a requested column is absent from the file.
        ValueError: if a feature cell cannot be converted to ``int``.
    """
    X_list, y_list = [], []
    with open(path, newline="", encoding="utf-8") as f:
        for row in csv.DictReader(f):
            X_list.append({feat: int(row[feat]) for feat in feature_names})
            y_list.append(row[label_name].strip())
    return X_list, y_list

def pretty_counts_df(nb: NaiveBayesBernoulli):
    """Summarise a fitted model as a counts table plus its class priors.

    Returns a ``(DataFrame, dict)`` pair. The frame has one row per
    (class, feature) with the raw counts of 1s and 0s, their total, and the
    smoothed conditional probabilities rounded to 3 decimals. The dict maps
    each class to its unrounded prior.
    """
    smoothing = nb.alpha
    records = []
    for label in nb.classes_:
        for feature in nb.features_:
            per_class = nb.feature_counts_[label]
            ones = per_class[(feature, 1)]
            zeros = per_class[(feature, 0)]
            total = ones + zeros
            # Two possible values per feature, hence 2 * alpha in the denominator.
            smoothed_ones = (ones + smoothing) / (total + 2*smoothing)
            smoothed_zeros = (zeros + smoothing) / (total + 2*smoothing)
            record = {
                "classe": label,
                "feature": feature,
                "#1": ones,
                "#0": zeros,
                "total": total,
                "p(1|classe)": round(smoothed_ones, 3),
                "p(0|classe)": round(smoothed_zeros, 3),
            }
            records.append(record)
    priors = {}
    for label in nb.classes_:
        priors[label] = nb.class_counts_[label] / nb.n_
    return pd.DataFrame(records), priors


## Entraînement et comptages (alpha = 1.0)

In [None]:

# Load the toy CSV, fit a smoothed model (alpha = 1), then show the class
# priors and the per-class feature counts.
X_train, y_train = load_csv_binary(
    "/mnt/data/train_pantalon.csv",
    feature_names=("pas_cher","anglais"),
)
nb1 = NaiveBayesBernoulli(alpha=1.0)
nb1.fit(X_train, y_train)
df_counts, priors = pretty_counts_df(nb1)
rounded_priors = {label: round(p, 3) for label, p in priors.items()}
print("Priors (p(c)) :", rounded_priors)
df_counts


## Prédictions — comparaison alpha=1.0 vs alpha=0.0

In [None]:

import pandas as pd

# Every combination of the two binary features, in the order
# (1,1), (1,0), (0,1), (0,0).
tests = [
    {"pas_cher": pas_cher, "anglais": anglais}
    for pas_cher in (1, 0)
    for anglais in (1, 0)
]

def run_table(alpha):
    """Fit a model with the given ``alpha`` and tabulate its output on ``tests``.

    Reads the module-level ``X_train``, ``y_train`` and ``tests``. Returns a
    DataFrame with one row per test example: the alpha used, the two feature
    values, both class probabilities rounded to 3 decimals, and the
    predicted label.
    """
    model = NaiveBayesBernoulli(alpha=alpha)
    model.fit(X_train, y_train)

    def describe(example):
        # Build the table row for a single test example.
        posterior = model.predict_proba(example)
        label = model.predict(example)
        return {
            "alpha": alpha,
            "pas_cher": example["pas_cher"],
            "anglais": example["anglais"],
            "p(OUI)": round(posterior.get("OUI",0.0), 3),
            "p(NON)": round(posterior.get("NON",0.0), 3),
            "pred": label,
        }

    return pd.DataFrame([describe(example) for example in tests])

# Stack the smoothed (alpha = 1) table above the unsmoothed (alpha = 0) one.
tab = pd.concat(
    [run_table(alpha) for alpha in (1.0, 0.0)],
    ignore_index=True,
)
tab



### Commentaire rapide
- Le prior est équilibré ici **p(OUI)=p(NON)=0.5** (4/8 chacun).  
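- Avec lissage `alpha=1` : les probabilités conditionnelles sont **tirées vers 0.5**, ce qui évite les probabilités nulles (stabilisation). Exemple : sans lissage, p(pas_cher=0 | OUI) = 0/4 = 0 ; avec `alpha=1`, elle vaut (0+1)/(4+2) ≈ 0.167.  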
- Pour `x = (pas_cher=1, anglais=1)`, la prédiction est **OUI** dans les deux cas, mais la probabilité lissée est moins extrême : p(OUI) ≈ 0.556 avec `alpha=1` contre ≈ 0.571 avec `alpha=0`.


## Visualisation : carte des scores p(OUI | pas_cher, anglais) (alpha=1)

In [None]:

import matplotlib.pyplot as plt
import numpy as np

# Heat map of p(OUI | pas_cher, anglais) for the smoothed model (alpha = 1).
nb = NaiveBayesBernoulli(alpha=1.0)
nb.fit(X_train, y_train)

# Enumerate the four feature combinations with pas_cher as the outer index,
# so that reshaping to 2x2 puts pas_cher on rows and anglais on columns.
grid = []
scores = []
for a in [0, 1]:
    for b in [0, 1]:
        grid.append((a, b))
        scores.append(nb.predict_proba({"pas_cher":a,"anglais":b})["OUI"])
heat = np.array(scores).reshape(2,2)

fig, ax = plt.subplots(figsize=(4,3))
im = ax.imshow(heat, vmin=0, vmax=1, origin="lower")
ax.set_xticks([0,1])
ax.set_xticklabels(["anglais=0","anglais=1"])
ax.set_yticks([0,1])
ax.set_yticklabels(["pas_cher=0","pas_cher=1"])
# Annotate each cell with its probability; x is the column (anglais) and
# y is the row (pas_cher).
for (a, b), score in zip(grid, scores):
    ax.text(b, a, f"{score:.2f}", ha="center", va="center", color="w")
ax.set_title("p(OUI | pas_cher, anglais) — alpha=1")
fig.colorbar(im, ax=ax, fraction=0.046, pad=0.04)
plt.tight_layout()
plt.show()
