
# Predictive Prefetch Simulation (WPA Page Faults → Offline Model)
This notebook lets you:
- Load a **Windows Performance Analyzer** (WPA) **Page Faults** CSV export
- Preprocess and **bucket** virtual addresses
- Run **baselines**: no-prefetch, Stride, Markov-1, Markov-2 (simulated alongside an LRU working-set cache)
- Evaluate **prefetch** decisions against a look-ahead horizon (counted in fault events) and a cache budget
- (Optional) Compare with a lightweight **ML classifier** if `scikit-learn` is available


In [None]:

import os, re, math, json, random
from pathlib import Path
from collections import defaultdict, Counter, deque

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Widen pandas' display limits (up to 120 columns, 180 characters per line)
# so the wide trace tables shown below are not truncated or wrapped early.
pd.set_option('display.max_columns', 120)
pd.set_option('display.width', 180)


In [None]:

# --- Configuration ---
CSV_PATH = r"/mnt/data/data.csv - Sheet1.csv"  # input CSV passed to pd.read_csv
PROCESS_NAME_FILTER = "chrome"  # set None for system-wide
BUCKET_SIZE = 65536  # address-bucket size: BucketID = VA // BUCKET_SIZE

PREFETCH_K = 1  # max candidates taken from a policy at each event
HORIZON_EVENTS = 10  # look-ahead window, in events, used to score a prefetch as a hit
CACHE_PAGES = 32768  # capacity handed to PrefetchSimulator (its entries are bucket IDs)

# Seed both RNGs used in this notebook so repeated runs are reproducible.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)


In [None]:

# --- Load CSV ---
df_raw = pd.read_csv(CSV_PATH)

# Map the export's column headers onto the short names used by the rest of
# the notebook. Headers that are absent from the file are simply ignored.
col_map = {
    'Log Time (s)': 'LogTimeSec',
    'ByteCount': 'Bytes',
    'Thread ID': 'TID',
    'Virtual Address': 'VirtualAddress',
}
df_raw.rename(columns=col_map, inplace=True)

# Rows without a process label cannot be attributed to a PID; drop them and
# work on an independent copy so df_raw stays untouched.
df = df_raw[df_raw['Process'].notna()].copy()

def split_proc(s: str):
    """Split a process label such as ``"chrome.exe (1234)"`` into name and PID.

    Args:
        s: Raw value of the ``Process`` column. Non-string values are
            converted with ``str()`` before parsing.

    Returns:
        A ``(name, pid)`` tuple. ``name`` has surrounding whitespace removed
        and ``pid`` is an ``int``. When the label does not end in a
        parenthesised run of digits, the whole stripped label is returned as
        the name and ``pid`` is ``np.nan``.
    """
    text = str(s).strip()
    m = re.match(r'^(.*)\s+\((\d+)\)$', text)
    if m:
        return m.group(1).strip(), int(m.group(2))
    # Return the stringified text: calling .strip() on the raw value raised
    # AttributeError for non-string cells (e.g. a purely numeric label).
    return text, np.nan

# Split the combined "name (pid)" label into two dedicated columns. PID uses
# the nullable Int64 dtype because labels without a PID yield NaN.
pinfo = df['Process'].apply(split_proc)
df['ProcessName'] = [proc_name for proc_name, _ in pinfo]
df['PID'] = pd.Series([proc_id for _, proc_id in pinfo], index=df.index).astype('Int64')

# Coerce the numeric columns that are present; unparseable cells become NaN.
for c in ('LogTimeSec', 'Bytes', 'Count'):
    if c not in df.columns:
        continue
    df[c] = pd.to_numeric(df[c], errors='coerce')

# Optional case-insensitive process filter. Note that str.contains treats the
# filter text as a regular expression (the pandas default).
if PROCESS_NAME_FILTER:
    mask = (
        df['ProcessName']
        .str.lower()
        .str.contains(PROCESS_NAME_FILTER.lower(), na=False)
    )
    df = df.loc[mask].copy()

# Keep only rows that have both a timestamp and an address.
df = df[df[['LogTimeSec', 'VirtualAddress']].notna().all(axis=1)].copy()

def parse_va(v):
    """Convert one ``VirtualAddress`` cell into an integer address.

    Args:
        v: Raw cell value. Integers (Python or NumPy) are taken as the
            address itself. Anything else is stringified and parsed.

    Returns:
        The address as an ``int``, or ``np.nan`` when it cannot be parsed.
        Text is read as hexadecimal with or without a ``0x`` prefix; text
        that is not valid hex is then tried as a float literal and truncated.
    """
    # A cell that is already an integer (e.g. pandas parsed the column as
    # numbers) must not be re-read as hex text: str(4096) -> 0x4096 is wrong.
    if isinstance(v, (int, np.integer)) and not isinstance(v, bool):
        return int(v)

    s = str(v).strip()
    try:
        # Base-16 int() accepts both '0x1F' and '1F', so one call covers
        # the prefixed and unprefixed forms.
        return int(s, 16)
    except ValueError:
        pass
    try:
        return int(float(s))
    except (ValueError, OverflowError):
        # ValueError: not a number (or NaN); OverflowError: +/- infinity.
        return np.nan

# Parse every address, discard the rows that failed to parse, and store the
# result as a plain 64-bit integer column.
df['VA'] = df['VirtualAddress'].map(parse_va)
df = df[df['VA'].notna()].copy()
df['VA'] = df['VA'].astype('int64')

# Two granularities of the same address: 4096-byte pages and BUCKET_SIZE buckets.
df = df.assign(
    PageID=df['VA'].floordiv(4096).astype('int64'),
    BucketID=df['VA'].floordiv(BUCKET_SIZE).astype('int64'),
)

# Order events by time within each process and renumber the rows from zero.
df = df.sort_values(by=['PID', 'LogTimeSec'], ignore_index=True)

print("Rows:", len(df), "Processes:", df['PID'].nunique())
df.head(10)


In [None]:

# --- Build sequences per PID ---
# One list of (timestamp, bucket) pairs per process. Processes with fewer
# than three events are left out.
sequences = {
    int(pid): list(zip(grp['LogTimeSec'].to_numpy(), grp['BucketID'].to_numpy()))
    for pid, grp in df.groupby('PID')
    if len(grp) >= 3
}

len(sequences), list(sequences.keys())[:5]


In [None]:

# --- Evaluation Utilities ---
from typing import List, Tuple, Dict
class PrefetchSimulator:
    """Replay per-process bucket traces and score a prefetch policy.

    Every event in a trace is counted as one fault. After each event the
    policy is asked for candidate buckets. A step scores a hit when at least
    one candidate appears among the next ``horizon_events`` events of the
    same trace.

    An LRU cache of ``cache_pages`` entries is maintained while replaying
    (demand accesses refresh recency, prefetches only insert). None of the
    reported metrics currently read the cache state.

    Args:
        sequences: Mapping ``pid -> [(timestamp, bucket), ...]``.
        cache_pages: LRU capacity in entries. ``None`` means unbounded and
            ``0`` means nothing is cached.
        horizon_events: Number of upcoming events inspected when scoring.
    """

    def __init__(self, sequences: Dict[int, List[tuple]], cache_pages=32768, horizon_events=10):
        self.sequences = sequences
        self.cache_pages = cache_pages
        self.horizon_events = horizon_events

    @staticmethod
    def _lru_insert(cache, key, capacity):
        """Insert ``key`` as the most recent entry, evicting the oldest if full."""
        if capacity is not None:
            if capacity <= 0:
                return
            while len(cache) >= capacity:
                cache.popitem(last=False)
        cache[key] = None

    def evaluate(self, policy_fn, prefetch_k=1, name="policy"):
        """Run ``policy_fn`` over every trace and return per-PID metrics.

        Args:
            policy_fn: Callable receiving a context dict with keys ``pid``,
                ``i`` (current index), ``seq`` (list of buckets) and
                ``horizon_events``. It returns an iterable of candidate
                buckets, or a falsy value for "no prefetch".
            prefetch_k: Maximum number of candidates used per event.
            name: Label written to the ``Policy`` column.

        Returns:
            A DataFrame with one row per PID and the columns ``PID``,
            ``Faults``, ``Prefetches``, ``PrefetchHits``,
            ``PrefetchPrecision``, ``FaultHitRate``, ``Overfetch`` and
            ``Policy``. ``PrefetchHits`` counts steps with at least one
            useful candidate. ``Overfetch`` counts individual candidates
            that did not occur within the horizon.

        Raises:
            ValueError: If ``cache_pages`` is negative.
        """
        # Local import keeps this block self-contained; OrderedDict gives
        # O(1) recency updates where deque.remove() was O(n) per access.
        from collections import OrderedDict

        capacity = self.cache_pages
        if capacity is not None and capacity < 0:
            raise ValueError("cache_pages must be non-negative")

        columns = ['PID', 'Faults', 'Prefetches', 'PrefetchHits',
                   'PrefetchPrecision', 'FaultHitRate', 'Overfetch']
        results = []
        for pid, seq in self.sequences.items():
            cache = OrderedDict()
            preds = hits = faults = overfetch = 0
            buckets = [b for _, b in seq]
            total = len(buckets)

            for i, b in enumerate(buckets):
                faults += 1
                # Demand access: refresh recency if cached, otherwise insert.
                if b in cache:
                    cache.move_to_end(b)
                else:
                    self._lru_insert(cache, b, capacity)

                ctx = {'pid': pid, 'i': i, 'seq': buckets, 'horizon_events': self.horizon_events}
                cand = list(policy_fn(ctx) or [])[:prefetch_k]
                if not cand:
                    continue

                preds += len(cand)
                for c in cand:
                    # A prefetch of an already-cached bucket leaves its
                    # recency position unchanged.
                    if c not in cache:
                        self._lru_insert(cache, c, capacity)

                horizon_end = min(total, i + 1 + self.horizon_events)
                fut = set(buckets[i + 1:horizon_end])
                useful = sum(1 for c in cand if c in fut)
                # Only candidates that never show up in the horizon are
                # wasted; previously every candidate was counted, which made
                # Overfetch identical to Prefetches.
                overfetch += len(cand) - useful
                if useful:
                    hits += 1

            results.append({'PID': pid, 'Faults': faults, 'Prefetches': preds, 'PrefetchHits': hits,
                            'PrefetchPrecision': (hits/preds) if preds else 0.0,
                            'FaultHitRate': (hits/faults) if faults else 0.0,
                            'Overfetch': overfetch})

        # Explicit columns keep the schema stable even when there are no traces.
        out = pd.DataFrame(results, columns=columns)
        out['Policy'] = name
        return out

def plot_bar(df_summary, metric, title):
    """Show a bar chart with the mean of ``metric`` for each policy.

    Args:
        df_summary: Frame with a ``Policy`` column and a ``metric`` column.
        metric: Name of the column to average; also used as the y-axis label.
        title: Chart title.
    """
    plt.figure()
    per_policy = df_summary.groupby('Policy')[metric].mean().reset_index()
    positions = range(len(per_policy))
    plt.bar(positions, per_policy[metric].values)
    plt.xticks(positions, per_policy['Policy'].tolist(), rotation=0, ha='center')
    plt.title(title)
    plt.ylabel(metric)
    plt.tight_layout()
    plt.show()


In [None]:

# --- Policies ---
def policy_none(ctx):
    """Baseline policy that never prefetches; always yields no candidates."""
    return list()

def policy_stride(ctx):
    """Predict the next bucket by extending a constant non-zero stride.

    Looks at the current bucket and the two before it. When both consecutive
    differences are equal and non-zero, the current bucket plus that
    difference is returned as the single candidate; otherwise no candidate.
    """
    idx, trace = ctx['i'], ctx['seq']
    if idx < 2:
        return []
    oldest, middle, newest = trace[idx - 2], trace[idx - 1], trace[idx]
    step = newest - middle
    if step == 0 or step != middle - oldest:
        return []
    return [newest + step]

def build_markov(sequences, order=1):
    """Build, per PID, a table mapping a bucket history to its likeliest successor.

    Args:
        sequences: Mapping ``pid -> [(timestamp, bucket), ...]``.
        order: Number of preceding buckets that form the lookup key.

    Returns:
        ``{pid: {history_tuple: most_frequent_next_bucket}}``. PIDs whose
        trace has ``order`` or fewer events are omitted.
    """
    models = {}
    for pid, seq in sequences.items():
        trace = [bucket for _, bucket in seq]
        if len(trace) <= order:
            continue
        # transitions[history][successor] -> number of times it was observed
        transitions = defaultdict(Counter)
        for end in range(order, len(trace)):
            history = tuple(trace[end - order:end])
            transitions[history][trace[end]] += 1
        models[pid] = {
            history: successors.most_common(1)[0][0]
            for history, successors in transitions.items()
            if successors
        }
    return models

# Fit the order-1 and order-2 lookup tables once; the Markov policies read
# these module-level dicts.
# NOTE(review): the tables are built from the same `sequences` that the
# simulator later replays, so the Markov scores are in-sample. Consider a
# train/test split before comparing them with the other policies.
mk1 = build_markov(sequences, order=1)
mk2 = build_markov(sequences, order=2)

def policy_markov1(ctx):
    """Suggest the successor stored in ``mk1`` for the current bucket.

    Returns a one-element list, or an empty list when the index is 0, the
    PID has no table, or the current bucket has no recorded successor.
    """
    pid, i, seq = ctx['pid'], ctx['i'], ctx['seq']
    if i >= 1 and pid in mk1:
        successor = mk1[pid].get((seq[i],))
        if successor is not None:
            return [successor]
    return []

def policy_markov2(ctx):
    """Suggest the successor stored in ``mk2`` for the last two buckets.

    Returns a one-element list, or an empty list when fewer than two earlier
    events exist, the PID has no table, or the pair has no recorded successor.
    """
    pid, i, seq = ctx['pid'], ctx['i'], ctx['seq']
    if i >= 2 and pid in mk2:
        successor = mk2[pid].get((seq[i - 1], seq[i]))
        if successor is not None:
            return [successor]
    return []


In [None]:

# --- Optional ML (scikit-learn) ---
# Probe for scikit-learn. SKLEARN_OK records the outcome; the ML training and
# policy functions check it and return empty results when it is False, so the
# rest of the notebook still runs without the library.
try:
    from sklearn.linear_model import SGDClassifier
    from sklearn.feature_extraction import FeatureHasher
    SKLEARN_OK = True
except Exception as e:
    SKLEARN_OK = False
    print("scikit-learn not available; skipping ML section. Error:", e)

ML_LAST_M = 3  # number of preceding buckets turned into features
ML_CLASSES_LIMIT = 5000  # only this many most frequent buckets are kept as target classes

def train_ml_models(sequences):
    """Train one hashed-feature SGD classifier per PID to predict the next bucket.

    Each training sample uses the ``ML_LAST_M`` buckets preceding a position
    as features (encoded as ``"b=<bucket>@<offset>"`` strings and hashed) and
    the bucket at that position as the label. Labels outside the
    ``ML_CLASSES_LIMIT`` most frequent buckets of the trace are skipped.

    Args:
        sequences: Mapping ``pid -> [(timestamp, bucket), ...]``.

    Returns:
        ``{pid: (classifier, hasher, top_classes)}``. Empty when scikit-learn
        is unavailable. PIDs with too little data, or whose labels contain
        fewer than two distinct classes, are omitted.
    """
    if not SKLEARN_OK:
        return {}
    models = {}
    hasher = FeatureHasher(n_features=4096, input_type='string')
    for pid, seq in sequences.items():
        buckets = [b for _, b in seq]
        if len(buckets) <= ML_LAST_M:
            continue
        top_classes = {b for b, _ in Counter(buckets).most_common(ML_CLASSES_LIMIT)}

        X_raw, y = [], []
        # Every position with a full history is a valid target, including the
        # last one (the previous upper bound dropped the final transition).
        for i in range(ML_LAST_M, len(buckets)):
            nxt = buckets[i]
            if nxt not in top_classes:
                continue
            history = buckets[i - ML_LAST_M:i]
            X_raw.append([f"b={b}@{j}" for j, b in enumerate(history)])
            y.append(nxt)

        # SGDClassifier.fit raises when the labels hold a single class, which
        # would abort training for every PID; skip such traces instead.
        if len(set(y)) < 2:
            continue
        X = hasher.transform(X_raw)
        clf = SGDClassifier(loss='log_loss', max_iter=1000, random_state=42)
        clf.fit(X, y)
        models[pid] = (clf, hasher, top_classes)
    return models

# Train the per-PID classifiers once; the result is an empty dict when
# scikit-learn is unavailable.
ml_models = train_ml_models(sequences)

def policy_ml(ctx):
    """Suggest the bucket predicted by the PID's trained classifier.

    The last ``ML_LAST_M`` buckets up to and including the current one are
    encoded the same way as during training. Returns a one-element list with
    the prediction when it belongs to the PID's known classes, otherwise an
    empty list (also when scikit-learn is missing, the PID has no model, the
    index is below ``ML_LAST_M``, or prediction fails).
    """
    if not SKLEARN_OK:
        return []
    pid, i, seq = ctx['pid'], ctx['i'], ctx['seq']
    entry = ml_models.get(pid)
    if not entry or i < ML_LAST_M:
        return []
    clf, hasher, top_classes = entry
    window = seq[i - ML_LAST_M + 1:i + 1]
    feats = [f"b={bucket}@{pos}" for pos, bucket in enumerate(window)]
    X = hasher.transform([feats])
    try:
        pred = clf.predict(X)[0]
        return [int(pred)] if pred in top_classes else []
    except Exception:
        # Best effort: a failed prediction means "no prefetch" for this step.
        return []


In [None]:

# --- Run Evaluations ---
sim = PrefetchSimulator(sequences, cache_pages=CACHE_PAGES, horizon_events=HORIZON_EVENTS)

# Score the always-available policies in a fixed order.
res = [
    sim.evaluate(policy, name=label, prefetch_k=PREFETCH_K)
    for label, policy in (
        ("None", policy_none),
        ("Stride", policy_stride),
        ("Markov-1", policy_markov1),
        ("Markov-2", policy_markov2),
    )
]
# The ML policy is scored only when at least one model was actually trained.
if 'ml_models' in globals() and ml_models:
    res.append(sim.evaluate(policy_ml, name="ML-Hashed@SGD", prefetch_k=PREFETCH_K))

summary = pd.concat(res, ignore_index=True).sort_values(['Policy', 'PID'])
summary.head(20)


In [None]:

# --- Aggregate metrics and plots ---
# Pool the raw counts per policy first, then derive the ratios from the
# pooled totals. A zero denominator becomes NaN rather than a division error.
agg = (
    summary.groupby('Policy', as_index=False)[['Faults', 'Prefetches', 'PrefetchHits']]
    .sum()
    .assign(
        PrefetchPrecision=lambda t: t['PrefetchHits'] / t['Prefetches'].replace(0, np.nan),
        FaultHitRate=lambda t: t['PrefetchHits'] / t['Faults'].replace(0, np.nan),
    )
)
agg


In [None]:

# plot_bar takes the unweighted mean of the per-PID rows for each policy, so
# these charts can differ from the pooled ratios computed in `agg`.
plot_bar(summary, 'PrefetchPrecision', 'Prefetch Precision (avg across PIDs)')
plot_bar(summary, 'FaultHitRate', 'Fault Hit Rate (avg across PIDs)')


In [None]:

# --- Save CSVs ---
out_dir = Path('/mnt/data')
# Create the output directory when it is missing; to_csv does not create
# parent directories and would otherwise fail on a fresh machine.
out_dir.mkdir(parents=True, exist_ok=True)
summary_path = out_dir / 'prefetch_results_per_pid.csv'
agg_path = out_dir / 'prefetch_results_aggregate.csv'
summary.to_csv(summary_path, index=False)
agg.to_csv(agg_path, index=False)
print("Saved:", summary_path, agg_path)
