<a href="https://colab.research.google.com/github/Jalalbaim/AI-Powered-X-Ray-Image-Analysis-for-Continuous-Tomography-System-Monitoring/blob/main/Lab1_streaming.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Lab 1: Data Stream Processing
@author: MJ.Baim

---

## River Framework

In [None]:
!pip install river

Collecting river
  Downloading river-0.22.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.0 kB)
Collecting pandas<3.0.0,>=2.2.3 (from river)
  Downloading pandas-2.3.3-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (91 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m91.2/91.2 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
Downloading river-0.22.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m46.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pandas-2.3.3-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (12.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.4/12.4 MB[0m [31m140.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pandas, river
  Attempting uninstall: pandas
    Found existing installation: pandas 2.2.2
    Uninstalling pandas-2.2.2:
      Successfully u

In [None]:
import time
from river import datasets, preprocessing, linear_model, metrics, optim, forest, drift, tree

def mean_ms(total_s, n):
    """Return the average duration per item, in milliseconds.

    Args:
        total_s: Total elapsed time in seconds.
        n: Number of items processed; values below 1 are treated as 1 so the
            division never fails on an empty run.

    Returns:
        ``total_s`` divided by the (clamped) item count, scaled to milliseconds.
    """
    item_count = max(n, 1)
    seconds_per_item = total_s / item_count
    return seconds_per_item * 1_000

def print_header(title):
    """Print ``title`` framed by two rules of '=' as wide as the title.

    A blank line is emitted first so the banner is visually separated from
    whatever was printed before it.
    """
    rule = "=" * len(title)
    print(f"\n{rule}", title, rule, sep="\n")

# -----------------------------
# ELEC2 — Classification with drift
# Models:
#   1) Baseline = StandardScaler + LogisticRegression, rebuilt when an external ADWIN flags drift
#   2) ARF  = Adaptive Random Forest (no external reset)
#   3) HAT  = Hoeffding Adaptive Tree (no external reset)
#   4) EFDT = Extremely Fast Decision Tree (no external reset)
# Metrics: Acc, F1, Kappa, Balanced Acc, time/element (ms), ADWIN flags/resets
# -----------------------------
def run_elec2(max_n=None, report_every=20000):
    """Compare four online classifiers on the Elec2 stream (test-then-train).

    For every sample each model first predicts, its metrics are updated, and
    only then does it learn from the sample. A per-model ADWIN detector is fed
    the 0/1 error stream; for the baseline a detection rebuilds the model from
    scratch, for the other models it is only counted.

    Args:
        max_n: Maximum number of samples to process; ``None`` consumes the
            whole stream.
        report_every: Record a learning-curve checkpoint every this many
            samples. A falsy value (``0``/``None``) disables checkpoints.

    Returns:
        None. The final scores and the checkpoint table are printed.
    """
    ds = datasets.Elec2()

    def new_base():
        # Fresh scaler + logistic regression; also used to reset after drift.
        return preprocessing.StandardScaler() | linear_model.LogisticRegression(
            optimizer=optim.SGD(0.01), l2=1e-3
        )

    # Fixed processing/reporting order: baseline, ARF, HAT, EFDT.
    order = ("base", "arf", "hat", "efdt")
    models = {
        "base": new_base(),
        "arf": forest.ARFClassifier(n_models=10, seed=42),
        "hat": tree.HoeffdingAdaptiveTreeClassifier(seed=42),
        "efdt": tree.ExtremelyFastDecisionTreeClassifier(),
    }

    # One independent metric family per model: (accuracy, F1, kappa, balanced accuracy).
    scores = {
        name: (
            metrics.Accuracy(),
            metrics.F1(),
            metrics.CohenKappa(),
            metrics.BalancedAccuracy(),
        )
        for name in order
    }

    # Cumulative seconds spent in predict + metric update + learn, per model.
    elapsed = dict.fromkeys(order, 0.0)

    # Drift detectors fed with each model's 0/1 error stream.
    detectors = {name: drift.ADWIN() for name in order}

    # Detection counters: real resets for the baseline, plain flags for the others.
    drift_counts = dict.fromkeys(order, 0)

    # Learning-curve checkpoints.
    checkpoints = []
    n = 0

    for x, y in ds:
        if (max_n is not None) and (n >= max_n):
            break
        n += 1

        for name in order:
            model = models[name]

            t0 = time.perf_counter()
            y_pred = model.predict_one(x)
            # A model that has not learned anything yet may return None. Scoring
            # that None would register it as an extra class in the metrics'
            # confusion matrix (the recorded run showed BalAcc near 2/3 of Acc
            # for the tree models), so such samples are left out of the metrics.
            if y_pred is not None:
                for metric in scores[name]:
                    metric.update(y_true=y, y_pred=y_pred)
            model.learn_one(x, y)
            elapsed[name] += time.perf_counter() - t0

            # A missing prediction still counts as an error for the detector.
            detectors[name].update(0 if (y_pred == y) else 1)
            if detectors[name].drift_detected:
                drift_counts[name] += 1
                if name == "base":
                    models[name] = new_base()  # real reset (baseline only)

        # Checkpoints
        if report_every and (n % report_every == 0):
            row = [n]
            for name in order:
                row.extend(metric.get() for metric in scores[name])
            row.extend(drift_counts[name] for name in order)
            checkpoints.append(tuple(row))

    # -------------------- Final report --------------------
    print_header("ELEC2 — Final")
    print(f"samples: {n}")

    report_rows = (
        ("base", "[Baseline: Logistic + ADWIN reset]", "resets"),
        ("arf", "[ARF: Adaptive Random Forest]", "ADWIN flags"),
        ("hat", "[HAT: Hoeffding Adaptive Tree]", "ADWIN flags"),
        ("efdt", "[EFDT: Extremely Fast Decision Tree]", "ADWIN flags"),
    )
    for name, label, count_label in report_rows:
        acc, f1, kap, bal = scores[name]
        print("\n" + label)
        print(f"Acc={acc.get():.4f} | F1={f1.get():.4f} | Kappa={kap.get():.4f} | "
              f"BalAcc={bal.get():.4f} | time/elem={mean_ms(elapsed[name], n):.3f} ms | "
              f"{count_label}={drift_counts[name]}")

    if checkpoints:
        print("\nLearning curve checkpoints:")
        print("N\tAcc_b\tF1_b\tKap_b\tBal_b\tAcc_ARF\tF1_ARF\tKap_ARF\tBal_ARF\tAcc_HAT\tF1_HAT\tKap_HAT\tBal_HAT\tAcc_EFDT\tF1_EFDT\tKap_EFDT\tBal_EFDT\tResets_b\tFlags_ARF\tFlags_HAT\tFlags_EFDT")
        n_scores = 4 * len(order)  # four metrics per model precede the counters
        for row in checkpoints:
            score_cells = "\t".join(f"{v:.4f}" for v in row[1:1 + n_scores])
            count_cells = "\t\t".join(str(c) for c in row[1 + n_scores:])
            print(f"{row[0]}\t{score_cells}\t{count_cells}")

# Entry point: run the ELEC2 comparison over the whole stream (no max_n cap),
# recording a learning-curve checkpoint every 20,000 samples.
if __name__ == "__main__":
    run_elec2(report_every=20000)


Downloading https://maxhalford.github.io/files/datasets/electricity.zip (697.72 KiB)
Uncompressing into /root/river_data/Elec2

ELEC2 — Final
samples: 45312

[Baseline: Logistic + ADWIN reset]
Acc=0.8229 | F1=0.7796 | Kappa=0.6323 | BalAcc=0.8117 | time/elem=0.086 ms | resets=32

[ARF: Adaptive Random Forest]
Acc=0.8884 | F1=0.8668 | Kappa=0.7709 | BalAcc=0.5893 | time/elem=1.496 ms | ADWIN flags=9

[HAT: Hoeffding Adaptive Tree]
Acc=0.8359 | F1=0.8028 | Kappa=0.6624 | BalAcc=0.5530 | time/elem=0.229 ms | ADWIN flags=49

[EFDT: Extremely Fast Decision Tree]
Acc=0.8286 | F1=0.7977 | Kappa=0.6490 | BalAcc=0.5495 | time/elem=0.492 ms | ADWIN flags=41

Learning curve checkpoints:
N	Acc_b	F1_b	Kap_b	Bal_b	Acc_ARF	F1_ARF	Kap_ARF	Bal_ARF	Acc_HAT	F1_HAT	Kap_HAT	Bal_HAT	Acc_EFDT	F1_EFDT	Kap_EFDT	Bal_EFDT	Resets_b	Flags_ARF	Flags_HAT	Flags_EFDT
20000	0.8435	0.8131	0.6789	0.8362	0.8990	0.8836	0.7944	0.5977	0.8635	0.8435	0.7224	0.5740	0.8583	0.8359	0.7113	0.5697	11		3		18		18
40000	0.8209	0.7739	0

In [None]:
import time
from river import datasets, preprocessing, linear_model, naive_bayes, tree, metrics, optim

# PHISHING — 4 supervised classifiers (no drift-adaptive models)
def run_phishing_supervised_v2(max_n=None, threshold=0.5, report_every=20_000):
    """Evaluate four online classifiers on the Phishing stream (test-then-train).

    For each sample every model produces a score for the positive class, the
    score is thresholded into a hard label, the metrics are updated, and the
    model then learns from the sample.

    Args:
        max_n: Maximum number of samples to process; ``None`` consumes the
            whole stream.
        threshold: Scores greater than or equal to this value are labelled 1.
        report_every: Record a checkpoint every this many samples. A falsy
            value disables checkpoints.

    Returns:
        None. Final metrics and any checkpoints are printed.
    """
    stream = datasets.Phishing()  # y in {0,1} (1 = phishing)

    models = {
        "PassiveAggressive+Std": preprocessing.StandardScaler() | linear_model.PAClassifier(
            C=0.01, mode=1
        ),
        "GaussianNB+Std": preprocessing.StandardScaler() | naive_bayes.GaussianNB(),
        "EFDT": tree.ExtremelyFastDecisionTreeClassifier(),
        "ComplementNB+OH": preprocessing.OneHotEncoder() | naive_bayes.ComplementNB(),
    }

    def fresh_metrics():
        # One independent set of metric objects per model.
        return {
            "acc": metrics.Accuracy(),
            "f1": metrics.F1(),
            "prec": metrics.Precision(),
            "rec": metrics.Recall(),
            "roc": metrics.ROCAUC(),  # original note: no pos_label in the author's river version
        }

    metrics_map = {name: fresh_metrics() for name in models}
    times = dict.fromkeys(models, 0.0)
    checkpoints = []
    n = 0

    for x, y in stream:
        if max_n is not None and n >= max_n:
            break
        n += 1

        for name, model in models.items():
            started = time.perf_counter()

            # --- positive-class score ---
            if hasattr(model, "predict_proba_one"):
                proba = model.predict_proba_one(x)
                # Anything that is not a dict, or lacks the positive key, scores 0.0.
                p1 = proba.get(1, 0.0) if isinstance(proba, dict) else 0.0
            else:
                # No probabilities available: fall back to the hard prediction.
                p1 = 1.0 if model.predict_one(x) == 1 else 0.0

            y_hat = int(p1 >= threshold)

            # --- metrics ---
            model_metrics = metrics_map[name]
            for key in ("acc", "f1", "prec", "rec"):
                model_metrics[key].update(y, y_hat)
            model_metrics["roc"].update(y, p1)

            # --- learning (the model is updated in place; do not reassign it) ---
            model.learn_one(x, y)

            times[name] += time.perf_counter() - started

        if report_every and n % report_every == 0:
            row = [n]
            for key in ("acc", "f1", "roc"):
                row.extend(metrics_map[k][key].get() for k in models)
            checkpoints.append(tuple(row))

    print("\n=========== PHISHING — Final (v2, fixed) ===========")
    print(f"samples: {n}")
    for name in models:
        model_metrics = metrics_map[name]
        seconds_per_elem = times[name] / max(n, 1)
        ms_per_elem = seconds_per_elem * 1000
        print(f"\n[{name}]")
        print(f"Acc={model_metrics['acc'].get():.4f} | F1={model_metrics['f1'].get():.4f} | "
              f"Prec={model_metrics['prec'].get():.4f} | Rec={model_metrics['rec'].get():.4f} | "
              f"ROC-AUC={model_metrics['roc'].get():.4f} | time/elem={ms_per_elem:.3f} ms")

    if checkpoints:
        print("\nLearning curve checkpoints:")
        heads = ["N"]
        for prefix in ("Acc", "F1", "AUC"):
            heads.extend(f"{prefix}_{k}" for k in models)
        print("\t".join(heads))
        for count, *values in checkpoints:
            print("\t".join([str(count), *(f"{v:.4f}" for v in values)]))

# Entry point for the Phishing benchmark. The recorded run of this cell
# processed 1,250 samples, so a checkpoint interval of 20,000 never fired and
# no learning-curve table was printed; 250 yields a handful of checkpoints.
if __name__ == "__main__":
    run_phishing_supervised_v2(max_n=100_000, report_every=250)



samples: 1250

[PassiveAggressive+Std]
Acc=0.8968 | F1=0.8833 | Prec=0.8761 | Rec=0.8905 | ROC-AUC=0.9552 | time/elem=0.078 ms

[GaussianNB+Std]
Acc=0.8792 | F1=0.8660 | Prec=0.8428 | Rec=0.8905 | ROC-AUC=0.9173 | time/elem=0.238 ms

[EFDT]
Acc=0.8872 | F1=0.8726 | Prec=0.8640 | Rec=0.8814 | ROC-AUC=0.9082 | time/elem=0.785 ms

[ComplementNB+OH]
Acc=0.9048 | F1=0.8944 | Prec=0.8705 | Rec=0.9197 | ROC-AUC=0.9569 | time/elem=0.381 ms


## CapyMOA

In [None]:
!pip install capymoa


Collecting capymoa
  Downloading capymoa-0.11.0-py3-none-any.whl.metadata (5.0 kB)
Collecting deprecated (from capymoa)
  Downloading Deprecated-1.2.18-py2.py3-none-any.whl.metadata (5.7 kB)
Collecting jpype1>=v1.5.1 (from capymoa)
  Downloading jpype1-1.6.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (5.0 kB)
Collecting wget (from capymoa)
  Downloading wget-3.2.zip (10 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Downloading capymoa-0.11.0-py3-none-any.whl (60.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.5/60.5 MB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading jpype1-1.6.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (495 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m495.9/495.9 kB[0m [31m36.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading Deprecated-1.2.18-py2.py3-none-any.whl (10.0 kB)
Building wheels for collected packages: wget
  Building wheel for wget (setup.py) ... 