# GE461 Project 5

#### Install dependencies

In [15]:
import numpy as np

from skmultiflow.trees import HoeffdingTreeClassifier
from skmultiflow.data import SEAGenerator, AGRAWALGenerator, ConceptDriftStream

In [16]:
# ----------------------------------------------------------------------
# Helper: wrap any two generators into an abrupt ConceptDriftStream ----
def make_drift(base_gen, drift_gen, position, width=1, seed=None):
    """Combine two generators into one ConceptDriftStream.

    Parameters
    ----------
    base_gen : stream emitted before the drift (passed as ``stream``).
    drift_gen : stream emitted after the drift (passed as ``drift_stream``).
    position : forwarded unchanged as ``position``.
    width : forwarded unchanged as ``width``; defaults to 1.
    seed : forwarded as ``random_state``; ``None`` leaves it unseeded.

    Returns
    -------
    The newly constructed ``ConceptDriftStream``.
    """
    drift_options = {
        "stream": base_gen,
        "drift_stream": drift_gen,
        "position": position,
        "width": width,
        "random_state": seed,
    }
    return ConceptDriftStream(**drift_options)

In [17]:
def nested_drifts(gen0, gen1, gen2, pos1, pos2, width=1):
    """Chain three generators into one stream containing two drifts.

    The inner stream switches from ``gen0`` to ``gen1`` at ``pos1``; the outer
    stream wraps it and switches to ``gen2`` at ``pos2``.  Both drifts share
    the same ``width``.  The outer stream is returned after
    ``prepare_for_use()`` has been called on it.
    """
    inner_stream = make_drift(gen0, gen1, position=pos1, width=width)
    outer_stream = make_drift(inner_stream, gen2, position=pos2, width=width)

    # Initialise the stream before handing it to the caller.
    outer_stream.prepare_for_use()
    return outer_stream

In [18]:
from tqdm import tqdm

In [None]:
from collections import deque
from pathlib import Path

import numpy as np
from skmultiflow.data import SEAGenerator, ConceptDriftStream
from skmultiflow.meta import AdaptiveRandomForestClassifier


# ──────────────────────────────────────────────────────────────────────
# 1. Build a SEA stream with drifts at 35 k and 60 k (width = 1)
POS1, POS2, WIDTH, N_SAMPLES = 35_000, 60_000, 1, 100_000

# One generator per concept; each has its own seed so the run is repeatable.
sea0 = SEAGenerator(classification_function=0, random_state=1)
sea1 = SEAGenerator(classification_function=1, random_state=2)
sea2 = SEAGenerator(classification_function=2, random_state=3)

# Nest two drift streams: sea0 -> sea1 at POS1, then (that stream) -> sea2 at POS2.
drift1 = ConceptDriftStream(stream=sea0, drift_stream=sea1,
                            position=POS1, width=WIDTH, random_state=7)
sea_stream = ConceptDriftStream(stream=drift1, drift_stream=sea2,
                                position=POS2, width=WIDTH, random_state=8)
# NOTE(review): the captured output shows a notice that streams are ready to use
# after instantiation, so this call looks redundant on the installed release --
# confirm before removing it.
sea_stream.prepare_for_use()

# (optional) persist once for reproducibility.
# ``save_path`` names the file that is actually written (gzip-compressed CSV).
# Checking a different name ("SEADataset.csv") never found the existing file,
# so the whole stream was regenerated and rewritten on every run.
save_path = Path("SEADataset.csv.gz")
if not save_path.exists():
    import csv
    import gzip

    with gzip.open(save_path, "wt", newline="") as gz:
        wr = csv.writer(gz)
        # Header: x0 .. x{n_features-1}, followed by the label column "y".
        wr.writerow([f"x{i}" for i in range(sea_stream.n_features)] + ["y"])
        for _ in range(N_SAMPLES):
            X, y = sea_stream.next_sample()
            wr.writerow(np.append(X[0], y[0]))
    print(f"✓ Saved stream to {save_path}")
    sea_stream.restart()               # rewind for evaluation


New instances of the Stream class are now ready to use after instantiation.
  probability_drift = 1.0 / (1.0 + np.exp(x))


✓ Saved stream to SEADataset.csv.gz


In [23]:
import pandas as pd
from skmultiflow.meta import AdaptiveRandomForestClassifier

# ──────────────────────────────────────────────────────────────────────
# 1. Load the saved stream with pandas
csv_file = "SEADataset.csv.gz"          # gzip is inferred from the ".gz" suffix
chunk_sz = 100                          # read in manageable pieces
n_expected = 100_000                    # rows expected in the file; progress bar only

reader = pd.read_csv(
    csv_file,
    compression="infer",                # auto‑detect .gz
    chunksize=chunk_sz                  # returns an iterator of DataFrame chunks
)

# 2. Re‑initialise the classifier (fresh run)
arf = AdaptiveRandomForestClassifier()

n_samples   = 0
correct_cnt = 0

# ──────────────────────────────────────────────────────────────────────
# 3. Iterate chunk‑by‑chunk, row‑by‑row (simulating an online stream)
# Ceiling division so a trailing partial chunk is still counted in the total.
for chunk in tqdm(reader, total=-(-n_expected // chunk_sz)):
    # fast vector → NumPy for efficiency (but still loop row‑wise)
    X_chunk = chunk.drop(columns="y").values
    y_chunk = chunk["y"].values

    for X_i, y_i in zip(X_chunk, y_chunk):
        X_i = X_i.reshape(1, -1)        # scikit‑multiflow expects 2‑D array

        # test phase (predict before the label is revealed to the model)
        y_pred = arf.predict(X_i)
        if y_pred.size and (y_pred[0] == y_i):
            correct_cnt += 1

        # train phase
        arf.partial_fit(X_i, [y_i])

        n_samples += 1

# ──────────────────────────────────────────────────────────────────────
# Guard the ratio so a header-only file reports 0.000 instead of raising
# ZeroDivisionError.
accuracy = correct_cnt / n_samples if n_samples else 0.0
print("\nAdaptive Random Forest (restored via pandas)")
print(f"{n_samples:,d} samples analysed")
print(f"Accuracy: {accuracy:0.3f}")

100%|██████████| 1000/1000 [02:09<00:00,  7.74it/s]


Adaptive Random Forest (restored via pandas)
100,000 samples analysed
Accuracy: 0.991





In [None]:
#!/usr/bin/env python

import argparse
from collections import deque
from itertools import cycle
from typing import List, Type

import numpy as np
import pandas as pd
from tqdm import tqdm

from skmultiflow.trees import HoeffdingTreeClassifier
from skmultiflow.meta import AdaptiveRandomForestClassifier
from skmultiflow.drift_detection import DDM, EDDM, ADWIN
# If you need SAM‑kNN later:
# from skmultiflow.lazy import SAMKNNClassifier


# ──────────────────────────────────────────────────────────────────────
# 1 ·  Building‑block classes for the two custom ensembles
class SlidingWindowAccuracy:
    """Running accuracy over the most recent ``size`` prediction outcomes."""

    def __init__(self, size: int = 100):
        # Bounded deque: once ``size`` outcomes are held, the oldest is dropped.
        self.win = deque(maxlen=size)

    def update(self, correct: bool):
        """Record one outcome as 1 (correct) or 0 (incorrect)."""
        self.win.append(int(bool(correct)))

    @property
    def value(self) -> float:
        """Mean of the stored outcomes, or 0.0 when none have been recorded."""
        if not self.win:
            return 0.0
        return np.mean(self.win)


class ActiveLearner:
    """One ensemble member: a Hoeffding tree, a drift detector and an accuracy window.

    Parameters
    ----------
    detector_cls : zero-argument callable (normally a drift-detector class such
        as DDM, EDDM or ADWIN) used to build this member's detector.
    window : size of the sliding accuracy window.
    """

    def __init__(self, detector_cls: type, window: int = 100):
        self.clf = HoeffdingTreeClassifier()
        self.det = detector_cls()
        self.acc = SlidingWindowAccuracy(window)

    def predict(self, X):
        """Return the predicted label for one sample.

        ``X`` may be a 1-D feature vector or an already-shaped
        ``(1, n_features)`` row; both are normalised to a single 2-D row
        before being handed to the classifier.
        """
        return self.clf.predict(np.asarray(X).reshape(1, -1))[0]

    def update(self, X, y):
        """Test-then-train on one labelled sample; return whether the prediction was right."""
        y_hat = self.predict(X)
        correct = bool(y_hat == y)
        self.acc.update(correct)
        # The detector is fed the error indicator: 1 for a miss, 0 for a hit.
        self.det.add_element(int(not correct))
        self.clf.partial_fit(np.asarray(X).reshape(1, -1), [y], classes=[0, 1])
        return correct

    def drift_detected(self):
        """Return True when the detector currently reports a change.

        ``detected_change`` is a *method* on the detectors; returning the
        attribute itself yields a bound method, which is always truthy and
        made every learner look drifted after every sample.  A plain
        attribute, or a detector without the member, is still tolerated.
        """
        probe = getattr(self.det, "detected_change", None)
        if probe is None:
            return False
        return bool(probe() if callable(probe) else probe)


class _BaseEnsemble:
    """Accuracy-weighted voting ensemble of ``ActiveLearner`` members.

    Subclasses implement ``_should_reset`` to decide when a member is replaced
    by a fresh one.
    """

    # Detector classes handed to the members in round-robin order.
    DETECTORS = [DDM, EDDM, ADWIN]

    def __init__(self, n_learners=3, window=100, **kwargs):
        # Extra keyword arguments are accepted but not used here.
        rotation = cycle(self.DETECTORS)
        self.window = window
        self.learners: List[ActiveLearner] = []
        for _ in range(n_learners):
            self.learners.append(ActiveLearner(next(rotation), window))

    def predict(self, X):
        """Return the label with the largest summed member weight.

        Each member votes with its windowed accuracy (1e-6 when that is zero).
        Ties on weight go to the smaller label.
        """
        tally = {}
        for member in self.learners:
            choice = member.predict(X)
            weight = member.acc.value
            if not weight:
                weight = 1e-6
            tally.setdefault(choice, 0.0)
            tally[choice] += weight
        return max(tally, key=lambda lbl: (tally[lbl], -lbl))

    def update(self, X, y):
        """Train every member on ``(X, y)`` and return the pre-update ensemble prediction."""
        ensemble_guess = self.predict(X)
        for slot, member in enumerate(self.learners):
            member.update(X, y)
            if not self._should_reset(member):
                continue
            # Replace the member with a fresh one that uses the same detector type.
            self.learners[slot] = ActiveLearner(type(member.det), self.window)
        return ensemble_guess

    def _should_reset(self, lr: ActiveLearner):
        """Decide whether ``lr`` must be replaced; subclasses must override."""
        raise NotImplementedError


class ActiveDriftEnsemble(_BaseEnsemble):
    """Ensemble that replaces a member when that member's detector signals drift."""

    def _should_reset(self, lr):
        """Return the member's own drift signal as the reset decision."""
        drift_signal = lr.drift_detected()
        return drift_signal


class PassiveDriftEnsemble(_BaseEnsemble):
    """Ensemble that replaces a member when its windowed accuracy falls below ``tau``.

    Parameters
    ----------
    tau : accuracy threshold below which a member is replaced.
    **kw : forwarded to ``_BaseEnsemble.__init__`` (e.g. ``n_learners``, ``window``).
    """

    def __init__(self, tau=0.70, **kw):
        super().__init__(**kw)
        self.tau = tau

    def _should_reset(self, lr):
        """Return True when ``lr`` has a full accuracy window that averages below ``tau``.

        The threshold is only applied once the window is full.  Judging a
        member on its first few outcomes discards a fresh learner after one
        early mistake, so it is reset over and over and never gets to train.
        """
        outcomes = lr.acc.win
        if outcomes.maxlen is not None and len(outcomes) < outcomes.maxlen:
            return False  # still warming up
        return lr.acc.value < self.tau


# ──────────────────────────────────────────────────────────────────────
# 2 ·  Factory for the four evaluation models
def make_model(name: str):
    """Build a fresh evaluation model from its short name (case-insensitive).

    Supported names: ``"arf"`` (AdaptiveRandomForestClassifier), ``"active"``
    (ActiveDriftEnsemble) and ``"passive"`` (PassiveDriftEnsemble).

    Raises
    ------
    ValueError
        If ``name`` is not one of the supported names.
    """
    key = name.lower()
    if key == "arf":
        return AdaptiveRandomForestClassifier()
    if key == "active":
        return ActiveDriftEnsemble()
    if key == "passive":
        return PassiveDriftEnsemble()
    # NOTE: SAM-kNN is not wired in yet, so it is not advertised as a valid
    # choice; add a branch here once SAMKNNClassifier is imported.
    raise ValueError(
        f"unknown model {name!r}; model must be arf | active | passive"
    )


# ──────────────────────────────────────────────────────────────────────
# 3 ·  Main evaluation routine
def evaluate(file, model, chunksize=1000, target="y"):
    """Run a test-then-train evaluation of ``model`` over a CSV stream.

    Parameters
    ----------
    file : path or buffer accepted by ``pandas.read_csv``; compression is inferred.
    model : either an estimator exposing ``predict`` / ``partial_fit`` on 2-D
        rows, or a custom ensemble exposing ``predict`` / ``update`` on a
        single 1-D sample.
    chunksize : number of rows read from the file at a time.
    target : name of the label column; ``None`` selects the last column.

    Returns
    -------
    (accuracy, n_samples) : fraction of correct predictions and the number of
        rows processed; ``(0.0, 0)`` when the file holds no data rows.

    Raises
    ------
    KeyError
        If the label column is not present in the file.
    """
    reader = pd.read_csv(file, compression="infer", chunksize=chunksize)
    # Decide the calling convention once instead of once per row.
    uses_partial_fit = hasattr(model, "partial_fit")
    n_samples = correct = 0

    # The bar is advanced by rows processed so the "rows" unit is accurate.
    with tqdm(unit="rows") as bar:
        for chunk in reader:
            label_col = chunk.columns[-1] if target is None else target
            if label_col not in chunk.columns:
                raise KeyError(
                    f"label column {label_col!r} not found; available columns: "
                    f"{list(chunk.columns)} (pass target=<name>, or target=None "
                    f"to use the last column)"
                )

            X_chunk = chunk.drop(columns=label_col).values
            y_chunk = chunk[label_col].values

            for Xi, yi in zip(X_chunk, y_chunk):
                # partial_fit-style models take a (1, n_features) row; the
                # custom ensembles take the bare 1-D sample.
                sample = Xi.reshape(1, -1) if uses_partial_fit else Xi

                # np.ravel lets array and scalar predictions be compared alike.
                y_pred = np.ravel(model.predict(sample))
                if y_pred.size and (y_pred[0] == yi):
                    correct += 1

                if uses_partial_fit:
                    model.partial_fit(sample, [yi])
                else:
                    model.update(sample, yi)
                n_samples += 1

            bar.update(len(chunk))

    if not n_samples:
        return 0.0, 0
    return correct / n_samples, n_samples


0rows [00:00, ?rows/s]


KeyError: "['y'] not found in axis"