## X) Test load .WAV file

In [1]:
# quick_test.py
from dotenv import load_dotenv
import os, torch
from pyannote.audio import Pipeline

load_dotenv()
token = os.getenv("HF_TOKEN")
assert token and token.startswith("hf_")

print("CUDA:", torch.cuda.is_available())

pipeline = Pipeline.from_pretrained(
    "pyannote/speaker-diarization-3.1",
    use_auth_token=token,
)
# Run on your short file
diar = pipeline("A-data/1-raw/1-2-speaker-enroll/001-Justin-Anderson/Justin-mini.wav")
print(diar)
#show segments
for i, (seg, track, spk) in enumerate(diar.itertracks(yield_label=True)):
    print(f"{i:02d}  {seg.start:.2f}–{seg.end:.2f}s  track={track}  -> {spk}")
    if i >= 5: break

# show first few segments
#for i, ((seg, _), label) in enumerate(diar.itertracks(yield_label=True)):
 #   print(f"{i:02d}  {seg.start:.2f}–{seg.end:.2f}s  -> {label}")
  #  if i >= 5: break


  torchaudio.set_audio_backend("soundfile")


CUDA: True


  backend = torchaudio.get_audio_backend()
  from speechbrain.pretrained import (
  torchaudio.set_audio_backend(backend)
  from torchaudio.backend.common import AudioMetaData
  std = sequences.std(dim=-1, correction=1)


[ 00:00:00.636 -->  00:00:05.237] A SPEAKER_00
[ 00:00:06.052 -->  00:00:08.616] B SPEAKER_00
[ 00:00:09.091 -->  00:00:13.047] C SPEAKER_00
[ 00:00:13.455 -->  00:00:18.259] D SPEAKER_00
[ 00:00:18.752 -->  00:00:21.689] E SPEAKER_00
[ 00:00:21.723 -->  00:00:22.555] F SPEAKER_00
00  0.64–5.24s  track=A  -> SPEAKER_00
01  6.05–8.62s  track=B  -> SPEAKER_00
02  9.09–13.05s  track=C  -> SPEAKER_00
03  13.46–18.26s  track=D  -> SPEAKER_00
04  18.75–21.69s  track=E  -> SPEAKER_00
05  21.72–22.56s  track=F  -> SPEAKER_00


In [2]:
for i, (seg, track, spk) in enumerate(diar.itertracks(yield_label=True)):
    print(f"{i:02d}  {seg.start:.2f}–{seg.end:.2f}s  track={track}  -> {spk}")
    #if i >= 5: break

00  0.64–5.24s  track=A  -> SPEAKER_00
01  6.05–8.62s  track=B  -> SPEAKER_00
02  9.09–13.05s  track=C  -> SPEAKER_00
03  13.46–18.26s  track=D  -> SPEAKER_00
04  18.75–21.69s  track=E  -> SPEAKER_00
05  21.72–22.56s  track=F  -> SPEAKER_00


In [3]:
print(diar)

[ 00:00:00.636 -->  00:00:05.237] A SPEAKER_00
[ 00:00:06.052 -->  00:00:08.616] B SPEAKER_00
[ 00:00:09.091 -->  00:00:13.047] C SPEAKER_00
[ 00:00:13.455 -->  00:00:18.259] D SPEAKER_00
[ 00:00:18.752 -->  00:00:21.689] E SPEAKER_00
[ 00:00:21.723 -->  00:00:22.555] F SPEAKER_00


In [4]:
diar.itertracks

<bound method Annotation.itertracks of <pyannote.core.annotation.Annotation object at 0x000001E0734ADB50>>

## 3) Build a manifest CSV (path,label,duration_s)

In [5]:
#3) Build a manifest CSV (path,label,duration_s)

import os, csv, librosa

WAVROOT = "A-data/1-raw/1-1-datasets" ## Just take from raw cause' 50-person dataset already in WAV 16khz files (ignore filename issues)
OUTCSV  = "A-data/2-processed/2-2-manifests/all.csv"

rows = []
for root, _, files in os.walk(WAVROOT):
    for f in files:
        if f.lower().endswith(".wav"):
            path = os.path.join(root, f)
            # label = immediate parent folder
            label = os.path.basename(os.path.dirname(path))
            try:
                dur = librosa.get_duration(path=path)
            except:
                dur = 0.0
            rows.append((path, label, f"{dur:.2f}"))

rows.sort()
os.makedirs(os.path.dirname(OUTCSV), exist_ok=True)
with open(OUTCSV, "w", newline="", encoding="utf-8") as fh:
    w = csv.writer(fh)
    w.writerow(["path","label","duration_s"])
    w.writerows(rows)
print(f"✓ Wrote {OUTCSV} with {len(rows)} rows")


✓ Wrote A-data/2-processed/2-2-manifests/all.csv with 2511 rows


## 3b) Create splits (train/val/test) first

Definitions (with analogy): 
Train → what the model actually learns from (study material).
Validation (val) → what you use to tune and choose the model (practice quiz you can take multiple times to adjust your study strategy).
Test → what you keep hidden until the very end, to report final performance (the real exam you only take once).

#### Why do we need validation (val)?

When you train, you often have knobs to adjust:
* Which algorithm (LogReg vs SVM)
* Hyperparameters (e.g., C=2.0 vs C=0.1 in LogisticRegression)
* Data prep choices (balance classes? scale differently?)
* Early stopping (stop training when performance stops improving)

If you use the test set to make those choices, you “peek” at your exam answers. The model will look artificially good, but you’ve actually tuned it to that test set.

So instead:

1. Train on train.
2. Evaluate on val while you adjust hyperparameters.
3. Once you’ve decided the “best” setup → lock it down.

Run one final report on test.

This way:

* val guides development.

* test measures generalization fairly.

In [6]:
## 3b) Create splits (train/val/test) first

import csv, os, random, collections

ALL_CSV  = "A-data/2-processed/2-2-manifests/all.csv"
SPLIT_CSV= "A-data/2-processed/2-2-manifests/split.csv"

random.seed(42)

# group by label
by_label = collections.defaultdict(list)
with open(ALL_CSV, newline="", encoding="utf-8") as fh:
    rdr = csv.DictReader(fh)
    for r in rdr:
        by_label[r["label"]].append(r["path"])

rows = []
for label, paths in by_label.items():
    random.shuffle(paths)
    n = len(paths)
    # 80/10/10 split, but be robust for tiny classes
    n_val  = max(1, int(0.10 * n)) if n >= 10 else max(0, int(0.10 * n))
    n_test = max(1, int(0.10 * n)) if n >= 10 else max(0, int(0.10 * n))
    # keep at least 1 in train if possible
    if n - (n_val + n_test) <= 0:
        n_val = 0
        n_test = min(1, n-1) if n > 1 else 0
    val  = set(paths[:n_val])
    test = set(paths[n_val:n_val+n_test])
    for p in paths:
        split = "train"
        if p in val:  split = "val"
        elif p in test: split = "test"
        rows.append({"path": p, "split": split})

os.makedirs(os.path.dirname(SPLIT_CSV), exist_ok=True)
with open(SPLIT_CSV, "w", newline="", encoding="utf-8") as fh:
    w = csv.DictWriter(fh, fieldnames=["path","split"])
    w.writeheader(); w.writerows(rows)

print(f"✓ Wrote {SPLIT_CSV} with {len(rows)} rows")


✓ Wrote A-data/2-processed/2-2-manifests/split.csv with 2511 rows


### Check the "split" (train/val/test) amounts

In [7]:
import pandas as pd

csv_path = "A-data/2-processed/2-2-manifests/split.csv"
df = pd.read_csv(csv_path)

# Count raw counts
print(df["split"].value_counts())

# Count percentages
print(df["split"].value_counts(normalize=True) * 100)


split
train    2051
val       230
test      230
Name: count, dtype: int64
split
train    81.680605
val       9.159697
test      9.159697
Name: proportion, dtype: float64


## 4) Extract ECAPA embeddings - had to "Turn on Windows “Developer Mode” (fastest, 1-time OS setting)"

In [8]:
# 4) Extract ECAPA embeddings

import os, csv, numpy as np, torch, torchaudio, pathlib
from speechbrain.pretrained import EncoderClassifier
from tqdm import tqdm

MANIFEST   = "A-data/2-processed/2-2-manifests/all.csv"
SPLIT_FILE = "A-data/2-processed/2-2-manifests/split.csv"
OUTDIR     = "B-work/1-dataset-ecapa-embeds"
MODEL_DIR  = "B-work/0-ecapa-model-cache"  # local snapshot folder

os.makedirs(OUTDIR, exist_ok=True)

# load split mapping
path2split = {}
with open(SPLIT_FILE, newline="", encoding="utf-8") as fh:
    for r in csv.DictReader(fh):
        path2split[r["path"]] = r["split"]

device = "cuda" if torch.cuda.is_available() else "cpu"
enc = EncoderClassifier.from_hparams(
    source=MODEL_DIR,         # load from local folder
    savedir=MODEL_DIR,        # keep caches here
    run_opts={"device": device}
)

def load_mono16k(path):
    wav, sr = torchaudio.load(path)
    if wav.shape[0] > 1:
        wav = torch.mean(wav, dim=0, keepdim=True)
    if sr != 16000:
        wav = torchaudio.functional.resample(wav, sr, 16000)
    return wav

paths, labels = [], []
with open(MANIFEST, newline="", encoding="utf-8") as fh:
    rdr = csv.DictReader(fh)
    for r in rdr:
        paths.append(r["path"])
        labels.append(r["label"])

out_index = []
for p, y in tqdm(list(zip(paths, labels)), desc="Embedding"):
    wav = load_mono16k(p).to(device)
    with torch.no_grad():
        emb = enc.encode_batch(wav).squeeze().cpu().numpy()
    spkdir = os.path.join(OUTDIR, y)
    pathlib.Path(spkdir).mkdir(parents=True, exist_ok=True)
    outnpy = os.path.join(spkdir, pathlib.Path(p).stem + ".npy")
    np.save(outnpy, emb)
    split = path2split.get(p, "train")  # default safe fallback
    out_index.append((outnpy, y, p, split))

# write index with split + original path
index_csv = os.path.join(OUTDIR, "index.csv")
with open(index_csv, "w", newline="", encoding="utf-8") as fh:
    w = csv.writer(fh)
    w.writerow(["npy","label","src_path","split"])
    w.writerows(out_index)

print("✓ ECAPA embeddings saved under:", OUTDIR)
print("✓ Index written:", index_csv)


  wrapped_fwd = torch.cuda.amp.custom_fwd(fwd, cast_inputs=cast_inputs)
Embedding: 100%|██████████| 2511/2511 [03:13<00:00, 12.95it/s]

✓ ECAPA embeddings saved under: B-work/1-dataset-ecapa-embeds
✓ Index written: B-work/1-dataset-ecapa-embeds\index.csv





In [9]:
# sanity test to make sure it can connect

import os
os.environ["HF_HUB_DISABLE_SYMLINKS"] = "1"  # force copy instead of symlink on Windows


import torch
from speechbrain.pretrained import EncoderClassifier
enc = EncoderClassifier.from_hparams(
    source="speechbrain/spkrec-ecapa-voxceleb",
    savedir="B-work/0-ecapa-model-cache",
    run_opts={"device": "cuda" if torch.cuda.is_available() else "cpu"},
)
print("Loaded ECAPA ✔")


Loaded ECAPA ✔


## 5) CANCELLED (break by test): Train a simple classifier (LogReg / LinearSVC) + metrics

In [10]:
## 5) Train a simple classifier (LogReg / LinearSVC) + metrics
## 5) Train a simple classifier (LogReg / LinearSVC) + metrics

import os, csv, numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
import joblib

INDEX = "B-work/1-dataset-ecapa-embeds/index.csv"
MODEL_OUT = "C-models/speaker_id_model.joblib"

X, y = [], []

# 1) Load embeddings and labels from index.csv
with open(INDEX, newline="", encoding="utf-8") as fh:
    rdr = csv.DictReader(fh)
    for r in rdr:
        X.append(np.load(r["npy"]))
        y.append(r["label"])

X = np.vstack(X)
y = np.array(y)

# 2) Encode string labels → integers
le = LabelEncoder()
y_enc = le.fit_transform(y)

# 3) Train/test split (stratified)
Xtr, Xte, ytr, yte = train_test_split(
    X, y_enc, test_size=0.2, random_state=42, stratify=y_enc
)

# 4) Standardize features
scaler = StandardScaler(with_mean=True, with_std=True)
Xtr_s = scaler.fit_transform(Xtr)
Xte_s = scaler.transform(Xte)

# 5) Train classifier
clf = LogisticRegression(max_iter=2000, n_jobs=None, class_weight="balanced")
clf.fit(Xtr_s, ytr)

# 6) Evaluate
yhat = clf.predict(Xte_s)

print("== Classification Report ==")
print(classification_report(yte, yhat, target_names=le.classes_))
print("== Confusion Matrix ==")
print(confusion_matrix(yte, yhat))

# 7) Save bundle (model + scaler + encoder)
os.makedirs(os.path.dirname(MODEL_OUT), exist_ok=True)
joblib.dump(
    {"clf": clf, "scaler": scaler, "label_encoder": le},
    MODEL_OUT
)
print(f"✓ Saved {MODEL_OUT}")


== Classification Report ==
              precision    recall  f1-score   support

 Speaker0026       1.00      0.89      0.94         9
 Speaker0027       1.00      1.00      1.00         9
 Speaker0028       1.00      1.00      1.00        12
 Speaker0029       1.00      1.00      1.00         6
 Speaker0030       1.00      1.00      1.00         7
 Speaker0031       1.00      1.00      1.00         9
 Speaker0032       1.00      1.00      1.00         7
 Speaker0033       1.00      1.00      1.00         7
 Speaker0034       1.00      1.00      1.00         7
 Speaker0035       1.00      1.00      1.00         6
 Speaker0036       1.00      1.00      1.00         7
 Speaker0037       1.00      1.00      1.00        11
 Speaker0038       0.70      1.00      0.82         7
 Speaker0039       1.00      1.00      1.00        10
 Speaker0040       1.00      1.00      1.00         6
 Speaker0041       1.00      1.00      1.00         6
 Speaker0042       1.00      1.00      1.00         8

## 5a) Train Eval Classifier

In [11]:
# scripts/train_eval_classifier.py
import os, csv, pathlib, numpy as np, joblib
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score
import matplotlib.pyplot as plt

INDEX_CSV = "B-work/1-dataset-ecapa-embeds/index.csv"
MODEL_OUT = "C-models/speaker_id_model.joblib"
REPORT_DIR= "D-reports"  # adjust if you use a different reports folder

pathlib.Path(REPORT_DIR).mkdir(parents=True, exist_ok=True)
pathlib.Path(os.path.dirname(MODEL_OUT)).mkdir(parents=True, exist_ok=True)

def load_split(split_name: str):
    X, y = [], []
    with open(INDEX_CSV, newline="", encoding="utf-8") as fh:
        rdr = csv.DictReader(fh)
        for r in rdr:
            if r["split"] != split_name:
                continue
            X.append(np.load(r["npy"]))
            y.append(r["label"])
    if not X:
        raise RuntimeError(f"No rows for split={split_name}. Did you generate split.csv and index.csv with 'split'?")
    return np.vstack(X), np.array(y)

# 1) Load splits
Xtr, ytr = load_split("train")
Xva, yva = load_split("val")
Xte, yte = load_split("test")

# 2) LabelEncoder: fit on TRAIN ONLY (avoid leakage)
le = LabelEncoder()
ytr_i = le.fit_transform(ytr)
yva_i = le.transform(yva)   # assumes val/test speakers ⊆ train speakers
yte_i = le.transform(yte)

# 3) Scale features: fit on TRAIN ONLY
scaler = StandardScaler(with_mean=True, with_std=True)
Xtr_s = scaler.fit_transform(Xtr)
Xva_s = scaler.transform(Xva)
Xte_s = scaler.transform(Xte)

# 4) Train
clf = LogisticRegression(
    max_iter=4000,
    class_weight="balanced",  # helpful when per-speaker counts differ
    n_jobs=None
)
clf.fit(Xtr_s, ytr_i)

# 5) Evaluate helper
def evaluate(Xs, ys_i, split_name):
    yhat = clf.predict(Xs)
    acc  = accuracy_score(ys_i, yhat)
    f1m  = f1_score(ys_i, yhat, average="macro")
    rep  = classification_report(ys_i, yhat, target_names=le.classes_)
    cm   = confusion_matrix(ys_i, yhat)

    # print to console
    print(f"\n== {split_name.upper()} ==")
    print(f"Accuracy: {acc:.4f} | Macro-F1: {f1m:.4f}")
    print(rep)

    # save text report
    with open(os.path.join(REPORT_DIR, f"{split_name}_report.txt"), "w", encoding="utf-8") as f:
        f.write(f"Accuracy: {acc:.4f}\nMacro-F1: {f1m:.4f}\n\n{rep}")

    # save confusion matrix fig
    plt.figure(figsize=(max(6, 0.5*len(le.classes_)), max(5, 0.5*len(le.classes_))))
    plt.imshow(cm, interpolation="nearest")
    plt.title(f"Confusion Matrix - {split_name}")
    plt.xlabel("Predicted"); plt.ylabel("True")
    plt.xticks(range(len(le.classes_)), le.classes_, rotation=90)
    plt.yticks(range(len(le.classes_)), le.classes_)
    plt.colorbar()
    plt.tight_layout()
    plt.savefig(os.path.join(REPORT_DIR, f"{split_name}_confusion_matrix.png"), dpi=150)
    plt.close()

# 6) Run evals
evaluate(Xtr_s, ytr_i, "train")
evaluate(Xva_s, yva_i, "val")
evaluate(Xte_s, yte_i, "test")

# 7) Save model bundle
joblib.dump(
    {"clf": clf, "scaler": scaler, "label_encoder": le},
    MODEL_OUT
)
print(f"\n✓ Saved model bundle → {MODEL_OUT}")
print(f"✓ Reports & confusion matrices → {REPORT_DIR}")



== TRAIN ==
Accuracy: 0.9990 | Macro-F1: 0.9991
              precision    recall  f1-score   support

 Speaker0026       1.00      1.00      1.00        37
 Speaker0027       1.00      1.00      1.00        39
 Speaker0028       1.00      1.00      1.00        49
 Speaker0029       1.00      1.00      1.00        25
 Speaker0030       1.00      1.00      1.00        27
 Speaker0031       1.00      1.00      1.00        39
 Speaker0032       1.00      1.00      1.00        31
 Speaker0033       1.00      1.00      1.00        29
 Speaker0034       1.00      1.00      1.00        28
 Speaker0035       1.00      1.00      1.00        26
 Speaker0036       1.00      1.00      1.00        27
 Speaker0037       1.00      1.00      1.00        45
 Speaker0038       1.00      1.00      1.00        28
 Speaker0039       1.00      1.00      1.00        41
 Speaker0040       1.00      1.00      1.00        26
 Speaker0041       1.00      1.00      1.00        26
 Speaker0042       1.00      1.0

### # scripts/predict_one_random_val.py

In [12]:
# scripts/predict_one_random_val.py
import csv, random, numpy as np, joblib

INDEX   = "B-work/1-dataset-ecapa-embeds/index.csv"
MODEL   = "C-models/speaker_id_model.joblib"

# 1) Load model bundle
bundle = joblib.load(MODEL)
clf    = bundle["clf"]
scaler = bundle["scaler"]
le     = bundle["label_encoder"]

# 2) Collect only VAL rows
val_rows = []
with open(INDEX, newline="", encoding="utf-8") as fh:
    for r in csv.DictReader(fh):
        if r.get("split") == "val":
            val_rows.append(r)

if not val_rows:
    raise RuntimeError("No 'val' rows found. Did you generate split.csv and index.csv with a val split?")

# 3) Pick ONE at random (set seed for repeatability if you want)
random.seed()  # or random.seed(42)
row = random.choice(val_rows)

# 4) Load embedding, scale, predict
emb   = np.load(row["npy"]).reshape(1, -1)
emb_s = scaler.transform(emb)
pred_i = clf.predict(emb_s)[0]
pred   = le.inverse_transform([pred_i])[0]

print(f"Truth: {row['label']}")
print(f"Pred : {pred}")

# Optional: show top-3 if classifier supports probabilities
if hasattr(clf, "predict_proba"):
    probs = clf.predict_proba(emb_s)[0]
    topk  = np.argsort(probs)[::-1][:3]
    print("\nTop-3:")
    for i in topk:
        print(f"  {le.classes_[i]}: {probs[i]:.3f}")


Truth: Speaker0029
Pred : Speaker0029

Top-3:
  Speaker0029: 0.998
  Speaker0032: 0.000
  Speaker0039: 0.000


### # 📦 Cell 1 — Setup & helpers (paths, loaders, plotting)

In [13]:
# === Setup & helpers ===
import os, csv, pathlib, numpy as np, joblib
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, StandardScaler, Normalizer
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix

# Your project paths
INDEX_CSV  = "B-work/1-dataset-ecapa-embeds/index.csv"
REPORT_DIR = "D-reports/comparison"
MODEL_DIR  = "C-models/comparison"

pathlib.Path(REPORT_DIR).mkdir(parents=True, exist_ok=True)
pathlib.Path(MODEL_DIR).mkdir(parents=True, exist_ok=True)

def load_split(index_csv, split):
    X, y = [], []
    with open(index_csv, newline="", encoding="utf-8") as fh:
        rdr = csv.DictReader(fh)
        for r in rdr:
            if r["split"] != split:
                continue
            X.append(np.load(r["npy"]))
            y.append(r["label"])
    if not X:
        raise RuntimeError(f"No rows for split={split}. Regenerate index.csv with split?")
    return np.vstack(X), np.array(y)

def save_cm(cm, classes, out_png, title):
    plt.figure(figsize=(max(6, 0.5*len(classes)), max(5, 0.5*len(classes))))
    plt.imshow(cm, interpolation="nearest")
    plt.title(title)
    plt.xlabel("Predicted"); plt.ylabel("True")
    plt.xticks(range(len(classes)), classes, rotation=90)
    plt.yticks(range(len(classes)), classes)
    plt.colorbar()
    plt.tight_layout()
    plt.savefig(out_png, dpi=150)
    plt.close()


### 📥 Cell 2 — Load data & fit preprocessors (train-only fit)

In [14]:
# === Load splits ===
Xtr, ytr = load_split(INDEX_CSV, "train")
Xva, yva = load_split(INDEX_CSV, "val")
Xte, yte = load_split(INDEX_CSV, "test")

# === Label encoder: fit on TRAIN only ===
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
ytr_i = le.fit_transform(ytr)
yva_i = le.transform(yva)
yte_i = le.transform(yte)

# === Feature scaling: fit on TRAIN only ===
scaler = StandardScaler(with_mean=True, with_std=True)
Xtr_s  = scaler.fit_transform(Xtr)
Xva_s  = scaler.transform(Xva)
Xte_s  = scaler.transform(Xte)

print("Data ready:",
      "\n  train:", Xtr_s.shape,
      "\n  val  :", Xva_s.shape,
      "\n  test :", Xte_s.shape,
      "\n  classes:", len(le.classes_))


Data ready: 
  train: (2051, 192) 
  val  : (230, 192) 
  test : (230, 192) 
  classes: 50


### 🧪 Cell 3 — Evaluate helper (reused by all models)

In [15]:
# === Evaluation helper used by all models ===
def evaluate_and_report(name, clf, Xtr_s, ytr_i, Xva_s, yva_i, Xte_s, yte_i, save_bundle=True):
    clf.fit(Xtr_s, ytr_i)

    def do_eval(Xs, ys_i, split):
        yhat = clf.predict(Xs)
        acc  = accuracy_score(ys_i, yhat)
        f1m  = f1_score(ys_i, yhat, average="macro")
        rep  = classification_report(ys_i, yhat, target_names=le.classes_)
        cm   = confusion_matrix(ys_i, yhat)

        # save report & confusion matrix
        prefix = f"{name}_{split}"
        with open(os.path.join(REPORT_DIR, f"{prefix}_report.txt"), "w", encoding="utf-8") as f:
            f.write(f"Accuracy: {acc:.4f}\nMacro-F1: {f1m:.4f}\n\n{rep}")
        save_cm(cm, le.classes_, os.path.join(REPORT_DIR, f"{prefix}_cm.png"),
                f"{name} – Confusion Matrix ({split})")

        print(f"\n== {name} :: {split.upper()} ==")
        print(f"Accuracy: {acc:.4f} | Macro-F1: {f1m:.4f}")
        return acc, f1m

    acc_tr, f1_tr = do_eval(Xtr_s, ytr_i, "train")
    acc_va, f1_va = do_eval(Xva_s, yva_i, "val")
    acc_te, f1_te = do_eval(Xte_s, yte_i, "test")

    # optional: save model bundle
    if save_bundle:
        joblib.dump(
            {"clf": clf, "scaler": scaler, "label_encoder": le},
            os.path.join(MODEL_DIR, f"{name}.joblib")
        )

    return {
        "model": name,
        "acc_train": acc_tr, "f1_train": f1_tr,
        "acc_val": acc_va,   "f1_val": f1_va,
        "acc_test": acc_te,  "f1_test": f1_te
    }


#### 📈 Cell 4 — (Type 1: default) Logistic Regression

In [16]:
from sklearn.linear_model import LogisticRegression

logreg = LogisticRegression(max_iter=4000, class_weight="balanced", n_jobs=None)
res_logreg = evaluate_and_report("logreg", logreg, Xtr_s, ytr_i, Xva_s, yva_i, Xte_s, yte_i)
res_logreg



== logreg :: TRAIN ==
Accuracy: 0.9990 | Macro-F1: 0.9991

== logreg :: VAL ==
Accuracy: 0.9913 | Macro-F1: 0.9958

== logreg :: TEST ==
Accuracy: 0.9826 | Macro-F1: 0.9822


{'model': 'logreg',
 'acc_train': 0.9990248659190639,
 'f1_train': 0.9991243025566553,
 'acc_val': 0.991304347826087,
 'f1_val': 0.9958235294117647,
 'acc_test': 0.9826086956521739,
 'f1_test': 0.9821680319680319}

#### 📈 Cell 5 — (Type 2) Linear SVM

In [17]:
from sklearn.svm import LinearSVC

lsvc = LinearSVC(class_weight="balanced")
res_lsvc = evaluate_and_report("linear_svc", lsvc, Xtr_s, ytr_i, Xva_s, yva_i, Xte_s, yte_i)
res_lsvc



== linear_svc :: TRAIN ==
Accuracy: 0.9990 | Macro-F1: 0.9991

== linear_svc :: VAL ==
Accuracy: 0.9957 | Macro-F1: 0.9975

== linear_svc :: TEST ==
Accuracy: 0.9739 | Macro-F1: 0.9752


{'model': 'linear_svc',
 'acc_train': 0.9990248659190639,
 'f1_train': 0.9991243025566553,
 'acc_val': 0.9956521739130435,
 'f1_val': 0.9974901960784314,
 'acc_test': 0.9739130434782609,
 'f1_test': 0.9751621711621712}

#### 📈 Cell 6 — (Type 3) kNN (optionally with L2 normalization)

In [18]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import Normalizer

# Toggle these if you want to experiment:
USE_L2_NORMALIZE = True   # often helps kNN on embedding spaces
K_FOR_KNN        = 5      # try 3, 5, 7

if USE_L2_NORMALIZE:
    l2 = Normalizer(norm="l2")
    Xtr_knn = l2.fit_transform(Xtr_s)
    Xva_knn = l2.transform(Xva_s)
    Xte_knn = l2.transform(Xte_s)
else:
    Xtr_knn, Xva_knn, Xte_knn = Xtr_s, Xva_s, Xte_s

knn = KNeighborsClassifier(n_neighbors=K_FOR_KNN, weights="distance")
res_knn = evaluate_and_report(f"knn_k{K_FOR_KNN}{'_l2' if USE_L2_NORMALIZE else ''}",
                              knn, Xtr_knn, ytr_i, Xva_knn, yva_i, Xte_knn, yte_i)
res_knn



== knn_k5_l2 :: TRAIN ==
Accuracy: 1.0000 | Macro-F1: 1.0000

== knn_k5_l2 :: VAL ==
Accuracy: 0.9870 | Macro-F1: 0.9935

== knn_k5_l2 :: TEST ==
Accuracy: 0.9696 | Macro-F1: 0.9665


{'model': 'knn_k5_l2',
 'acc_train': 1.0,
 'f1_train': 1.0,
 'acc_val': 0.9869565217391304,
 'f1_val': 0.9934527160146357,
 'acc_test': 0.9695652173913043,
 'f1_test': 0.9665051439788281}

### 🧾 Cell 7 — Summary table across models

In [19]:
import pandas as pd

summary = pd.DataFrame([res_logreg, res_lsvc, res_knn])
display(summary)

# Save for your PPT/report
out_csv = os.path.join(REPORT_DIR, "summary.csv")
summary.to_csv(out_csv, index=False)
print("✓ Summary saved →", out_csv)


Unnamed: 0,model,acc_train,f1_train,acc_val,f1_val,acc_test,f1_test
0,logreg,0.999025,0.999124,0.991304,0.995824,0.982609,0.982168
1,linear_svc,0.999025,0.999124,0.995652,0.99749,0.973913,0.975162
2,knn_k5_l2,1.0,1.0,0.986957,0.993453,0.969565,0.966505


✓ Summary saved → D-reports/comparison\summary.csv


# Live Data Pipeline

## 📥 1a. all sample voices: Normalize enrollment audio + build manifest

In [20]:
# Normalize A-data/1-raw/1-2-speaker-enroll --> A-data/2-processed/2-1-wav16k/2-1-2-speaker-enroll
# And create A-data/2-processed/2-2-manifests/enroll_index.csv

import os, uuid, pathlib, csv
import librosa, soundfile as sf

RAW_ENROLL       = pathlib.Path("A-data/1-raw/1-2-speaker-enroll")
PROC_ENROLL      = pathlib.Path("A-data/2-processed/2-1-wav16k/2-1-2-speaker-enroll")
MANIFESTS_DIR    = pathlib.Path("A-data/2-processed/2-2-manifests")
ENROLL_MANIFEST  = MANIFESTS_DIR / "enroll_index.csv"

PROC_ENROLL.mkdir(parents=True, exist_ok=True)
MANIFESTS_DIR.mkdir(parents=True, exist_ok=True)

def write_wav16k(src_path: pathlib.Path, dst_path: pathlib.Path, sr=16000):
    # librosa handles wav/flac/mp3/m4a/ogg via audioread
    y, _ = librosa.load(str(src_path), sr=sr, mono=True)
    dst_path.parent.mkdir(parents=True, exist_ok=True)
    sf.write(str(dst_path), y, sr, subtype="PCM_16")

rows = []
count_files = 0
people = []

valid_ext = {".wav", ".flac", ".mp3", ".m4a", ".ogg"}

for person_dir in sorted(RAW_ENROLL.glob("*")):
    if not person_dir.is_dir():
        continue
    person = person_dir.name
    people.append(person)
    out_person_dir = PROC_ENROLL / person

    for src in person_dir.rglob("*"):
        if src.suffix.lower() not in valid_ext:
            continue
        # create anonymized, collision-proof filename (keep stems if you prefer)
        dst = out_person_dir / f"{src.stem}.wav"
        try:
            write_wav16k(src, dst)
        except Exception as e:
            print(f"⚠️ Skipped {src} due to error: {e}")
            continue

        # duration for manifest (use librosa on the written file)
        dur = librosa.get_duration(path=str(dst))
        rows.append({"path": str(dst), "person": person, "duration_s": f"{dur:.2f}"})
        count_files += 1

# write manifest
with open(ENROLL_MANIFEST, "w", newline="", encoding="utf-8") as fh:
    w = csv.DictWriter(fh, fieldnames=["path","person","duration_s"])
    w.writeheader()
    w.writerows(rows)

print("✓ Normalized enrollment audio →", PROC_ENROLL)
print("✓ Wrote manifest →", ENROLL_MANIFEST)
print(f"People: {people}")
print(f"Clips processed: {count_files}")


✓ Normalized enrollment audio → A-data\2-processed\2-1-wav16k\2-1-2-speaker-enroll
✓ Wrote manifest → A-data\2-processed\2-2-manifests\enroll_index.csv
People: ['001-Justin-Anderson', '002-Sam-COACH--Cassidy', '003-Charlie-OWNER--Archer', '004-Claire-CS-LEAD--Hope', '005-Paul-FLEET--Mark']
Clips processed: 5


## 🧠 1b. Sample Voices: Build ECAPA mean embeddings per person

In [21]:
# Build per-person ECAPA mean embeddings from processed enrollment WAVs
# Uses your local model cache at B-work/0-ecapa-model-cache

import json, numpy as np, torch, torchaudio, pathlib
from speechbrain.pretrained import EncoderClassifier

PROC_ENROLL     = pathlib.Path("A-data/2-processed/2-1-wav16k/2-1-2-speaker-enroll")
LOCAL_MODEL_DIR = pathlib.Path("B-work/0-ecapa-model-cache")  # your local ECAPA snapshot folder
OUT_JSON        = pathlib.Path("B-work/2-speaker-enroll-ecapa/ecapa_means.json")

OUT_JSON.parent.mkdir(parents=True, exist_ok=True)

device = "cuda" if torch.cuda.is_available() else "cpu"
enc = EncoderClassifier.from_hparams(
    source=str(LOCAL_MODEL_DIR),  # load from local cache, no symlinks
    savedir=str(LOCAL_MODEL_DIR),
    run_opts={"device": device},
)

def load_mono16k_torch(path: str):
    wav, sr = torchaudio.load(path)
    if wav.shape[0] > 1:
        wav = torch.mean(wav, dim=0, keepdim=True)
    if sr != 16000:
        wav = torchaudio.functional.resample(wav, sr, 16000)
    return wav

means = {}
for person_dir in sorted(PROC_ENROLL.glob("*")):
    if not person_dir.is_dir():
        continue
    person = person_dir.name
    embs = []
    for wav_path in person_dir.glob("*.wav"):
        wav = load_mono16k_torch(str(wav_path)).to(device)
        with torch.no_grad():
            emb = enc.encode_batch(wav).squeeze().cpu().numpy()
        embs.append(emb)
    if embs:
        means[person] = np.mean(np.stack(embs), axis=0).tolist()
        print(f"✓ {person}: {len(embs)} clips → mean embedding computed")
    else:
        print(f"⚠️ {person}: no WAVs found in {person_dir}")

with open(OUT_JSON, "w", encoding="utf-8") as fh:
    json.dump(means, fh, indent=2)

print("✓ Saved enrollment means →", OUT_JSON)




✓ 001-Justin-Anderson: 1 clips → mean embedding computed
✓ 002-Sam-COACH--Cassidy: 1 clips → mean embedding computed
✓ 003-Charlie-OWNER--Archer: 1 clips → mean embedding computed
✓ 004-Claire-CS-LEAD--Hope: 1 clips → mean embedding computed
✓ 005-Paul-FLEET--Mark: 1 clips → mean embedding computed
✓ Saved enrollment means → B-work\2-speaker-enroll-ecapa\ecapa_means.json


## X 2a. Normalize meeting audio

In [22]:
# Normalize A-data/1-raw/1-3-client-meetings --> A-data/2-processed/2-1-wav16k/2-1-3-client-meetings
# And create A-data/2-processed/2-2-manifests/meetings_index.csv

import os, pathlib, csv
import librosa, soundfile as sf

RAW_MEETINGS    = pathlib.Path("A-data/1-raw/1-3-client-meetings")
PROC_MEETINGS   = pathlib.Path("A-data/2-processed/2-1-wav16k/2-1-3-client-meetings")
MANIFESTS_DIR   = pathlib.Path("A-data/2-processed/2-2-manifests")
MEETING_MANIFEST= MANIFESTS_DIR / "meetings_index.csv"

PROC_MEETINGS.mkdir(parents=True, exist_ok=True)
MANIFESTS_DIR.mkdir(parents=True, exist_ok=True)

def write_wav16k(src_path: pathlib.Path, dst_path: pathlib.Path, sr=16000):
    y, _ = librosa.load(str(src_path), sr=sr, mono=True)
    dst_path.parent.mkdir(parents=True, exist_ok=True)
    sf.write(str(dst_path), y, sr, subtype="PCM_16")

rows, count_files, meetings = [], 0, []

valid_ext = {".wav", ".flac", ".mp3", ".m4a", ".ogg"}

for meeting_dir in sorted(RAW_MEETINGS.glob("*")):
    if not meeting_dir.is_dir():
        continue
    meeting_id = meeting_dir.name
    meetings.append(meeting_id)
    out_meeting_dir = PROC_MEETINGS / meeting_id

    for src in meeting_dir.rglob("*"):
        if src.suffix.lower() not in valid_ext:
            continue
        dst = out_meeting_dir / f"{src.stem}.wav"
        try:
            write_wav16k(src, dst)
        except Exception as e:
            print(f"⚠️ Skipped {src} due to error: {e}")
            continue
        dur = librosa.get_duration(path=str(dst))
        rows.append({"path": str(dst), "meeting_id": meeting_id, "duration_s": f"{dur:.2f}"})
        count_files += 1

with open(MEETING_MANIFEST, "w", newline="", encoding="utf-8") as fh:
    w = csv.DictWriter(fh, fieldnames=["path","meeting_id","duration_s"])
    w.writeheader(); w.writerows(rows)

print("✓ Normalized meeting audio →", PROC_MEETINGS)
print("✓ Wrote manifest →", MEETING_MANIFEST)
print(f"Meetings: {meetings}")
print(f"Clips processed: {count_files}")


✓ Normalized meeting audio → A-data\2-processed\2-1-wav16k\2-1-3-client-meetings
✓ Wrote manifest → A-data\2-processed\2-2-manifests\meetings_index.csv
Meetings: ['2025-09-08-Fake-Meeting-01']
Clips processed: 1


## 🧭 2b. (pre-Speaker Attempt) Diarization + STT

In [23]:
# Diarize all meetings under:
#   A-data/2-processed/2-1-wav16k/2-1-3-client-meetings/<meeting_id>/*.wav
# Save outputs to:
#   B-work/3-client-meetings-diarization/<meeting_id>/{diarization.rttm, segments.(tsv|json)}

import os, json, pathlib
from dotenv import load_dotenv
from pyannote.audio import Pipeline
import pandas as pd
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


MEET_ROOT = pathlib.Path("A-data/2-processed/2-1-wav16k/2-1-3-client-meetings")
OUT_ROOT  = pathlib.Path("B-work/3-client-meetings-diarization")

load_dotenv()
import os as _os
hf_token = _os.getenv("HF_TOKEN")
assert hf_token, "Add HF_TOKEN=... to your .env"

pipeline = Pipeline.from_pretrained(
    "pyannote/speaker-diarization-3.1",
    use_auth_token=hf_token,
).to(device)




def diarize_file(wav_path: pathlib.Path, out_dir: pathlib.Path):
    out_dir.mkdir(parents=True, exist_ok=True)
    diar = pipeline(str(wav_path))

    # Save RTTM (standard who-spoke-when)
    with (out_dir / "diarization.rttm").open("w", encoding="utf-8") as f:
        diar.write_rttm(f)

    # Save segments as TSV + JSON
    seg_rows = []
    for seg, _, label in diar.itertracks(yield_label=True):
        seg_rows.append({
            "start": round(seg.start, 2),
            "end": round(seg.end, 2),
            "duration": round(seg.end - seg.start, 2),
            "cluster": label,
        })

    with (out_dir / "segments.tsv").open("w", encoding="utf-8") as f:
        f.write("start\tend\tduration\tcluster\n")
        for r in seg_rows:
            f.write(f"{r['start']}\t{r['end']}\t{r['duration']}\t{r['cluster']}\n")

    (out_dir / "segments.json").write_text(json.dumps(seg_rows, indent=2), encoding="utf-8")
    print(f"✓ Diarized: {wav_path.name} → {out_dir}")
    df = pd.DataFrame(seg_rows)
    return df

# Process each meeting folder (first .wav inside each)
for meeting_dir in sorted(MEET_ROOT.glob("*")):
    if not meeting_dir.is_dir():
        continue
    wavs = sorted(meeting_dir.glob("*.wav"))
    if not wavs:
        print(f"⚠️ No .wav in {meeting_dir}")
        continue
    df = diarize_file(wavs[0], OUT_ROOT / meeting_dir.name)

display(df)


It can be re-enabled by calling
   >>> import torch
   >>> torch.backends.cuda.matmul.allow_tf32 = True
   >>> torch.backends.cudnn.allow_tf32 = True
See https://github.com/pyannote/pyannote-audio/issues/1370 for more details.

  std = sequences.std(dim=-1, correction=1)


✓ Diarized: ElevenLabs_Fake_Meeting_01.wav → B-work\3-client-meetings-diarization\2025-09-08-Fake-Meeting-01


Unnamed: 0,start,end,duration,cluster
0,0.01,1.38,1.38,SPEAKER_02
1,1.98,3.78,1.80,SPEAKER_02
2,4.39,7.38,2.99,SPEAKER_00
3,7.75,9.01,1.26,SPEAKER_00
4,9.31,10.55,1.24,SPEAKER_00
...,...,...,...,...
72,166.75,168.16,1.41,SPEAKER_02
73,168.63,169.48,0.85,SPEAKER_02
74,170.57,171.59,1.02,SPEAKER_00
75,171.99,172.27,0.27,SPEAKER_00


## 🧭 2c. (pre-Speaker Attempt) Diarization + STT

In [24]:
# Name diarized speakers using enrollment ECAPA means
# Outputs:
#   B-work/3-client-meetings-diarization-named/<meeting_id>/named_segments.(json|tsv)
# Returns:
#   A Pandas DataFrame (df_named) for the last meeting processed

import os, json, pathlib, numpy as np, torch, torchaudio, pandas as pd
from sklearn.preprocessing import normalize
from scipy.spatial.distance import cdist
from speechbrain.pretrained import EncoderClassifier

ENROLL_MEANS = pathlib.Path("B-work/2-speaker-enroll-ecapa/ecapa_means.json")
MODEL_DIR    = pathlib.Path("B-work/0-ecapa-model-cache")
IN_ROOT      = pathlib.Path("B-work/3-client-meetings-diarization")
OUT_ROOT     = pathlib.Path("B-work/4-client-meetings-named-diary")
AUDIO_ROOT   = pathlib.Path("A-data/2-processed/2-1-wav16k/2-1-3-client-meetings")

assert ENROLL_MEANS.exists(), "Missing enrollment means JSON. Run the enrollment mean step first."
OUT_ROOT.mkdir(parents=True, exist_ok=True)

device = "cuda" if torch.cuda.is_available() else "cpu"
enc = EncoderClassifier.from_hparams(
    source=str(MODEL_DIR),
    savedir=str(MODEL_DIR),
    run_opts={"device": device},
)

def load_mono16k(path: str):
    wav, sr = torchaudio.load(path)
    if wav.shape[0] > 1:
        wav = torch.mean(wav, dim=0, keepdim=True)
    if sr != 16000:
        wav = torchaudio.functional.resample(wav, sr, 16000)
    return wav, 16000

# Enrollment means → L2-normalized
enroll = json.loads(ENROLL_MEANS.read_text(encoding="utf-8"))
E_keys = list(enroll.keys())
E_mat  = np.stack([normalize(np.array(enroll[k]).reshape(1, -1))[0] for k in E_keys])

COSINE_SIM_THRESHOLD = 0.65  # tune later

df_named = None  # will hold last meeting's DataFrame

for meeting_dir in sorted(IN_ROOT.glob("*")):
    if not meeting_dir.is_dir():
        continue
    meeting_id = meeting_dir.name
    out_dir = OUT_ROOT / meeting_id
    out_dir.mkdir(parents=True, exist_ok=True)

    # Load audio
    wavs = list((AUDIO_ROOT/meeting_id).glob("*.wav"))
    if not wavs:
        print(f"⚠️ No audio for {meeting_id}")
        continue
    wav_path = wavs[0]
    wav, sr = load_mono16k(str(wav_path))

    # Load diarization segments
    segs_path = meeting_dir / "segments.json"
    if not segs_path.exists():
        print(f"⚠️ No segments.json in {meeting_dir}")
        continue
    segs = json.loads(segs_path.read_text(encoding="utf-8"))

    # Name each segment
    named = []
    for s in segs:
        s0, s1 = s["start"], s["end"]
        if s1 <= s0:
            continue
        seg_wav = wav[:, int(s0*sr):int(s1*sr)]
        if seg_wav.shape[-1] < int(0.2*sr):
            continue
        with torch.no_grad():
            emb = enc.encode_batch(seg_wav.to(device)).squeeze().cpu().numpy()
        emb_n = normalize(emb.reshape(1, -1))
        dists = cdist(emb_n, E_mat, metric="cosine")[0]
        best_idx = int(np.argmin(dists))
        sim = float(1 - dists[best_idx])
        who = E_keys[best_idx] if sim >= COSINE_SIM_THRESHOLD else "Unknown"
        named.append({
            "start": round(s0, 2),
            "end": round(s1, 2),
            "duration": round(s1 - s0, 2),
            "cluster": s["cluster"],
            "who": who,
            "cosine_sim": round(sim, 3),
        })

    # Save outputs
    (out_dir / "named_segments.json").write_text(json.dumps(named, indent=2), encoding="utf-8")
    df_named = pd.DataFrame(named)
    df_named.to_csv(out_dir / "named_segments.tsv", sep="\t", index=False)
    print(f"✓ Named segments → {out_dir}")

# 👉 df_named now contains the DataFrame of the last meeting processed
display(df_named)




✓ Named segments → B-work\4-client-meetings-named-diary\2025-09-08-Fake-Meeting-01


Unnamed: 0,start,end,duration,cluster,who,cosine_sim
0,0.01,1.38,1.37,SPEAKER_02,002-Sam-COACH--Cassidy,0.765
1,1.98,3.78,1.80,SPEAKER_02,002-Sam-COACH--Cassidy,0.686
2,4.39,7.38,2.99,SPEAKER_00,003-Charlie-OWNER--Archer,0.757
3,7.75,9.01,1.26,SPEAKER_00,003-Charlie-OWNER--Archer,0.747
4,9.31,10.55,1.24,SPEAKER_00,003-Charlie-OWNER--Archer,0.677
...,...,...,...,...,...,...
69,166.75,168.16,1.41,SPEAKER_02,002-Sam-COACH--Cassidy,0.728
70,168.63,169.48,0.85,SPEAKER_02,Unknown,0.572
71,170.57,171.59,1.02,SPEAKER_00,Unknown,0.567
72,171.99,172.27,0.28,SPEAKER_00,Unknown,0.217


In [25]:
df_named.describe()

Unnamed: 0,start,end,duration,cosine_sim
count,74.0,74.0,74.0,74.0
mean,85.695,87.527973,1.832973,0.695297
std,53.343536,53.315491,0.923005,0.133006
min,0.01,1.38,0.28,0.217
25%,35.7525,37.885,1.1925,0.631
50%,85.605,87.54,1.61,0.7475
75%,130.365,132.075,2.4525,0.784
max,172.67,173.52,4.63,0.872


### X 2d. Review Speaker Attempt (grouped with 1 speaker per-row, gap collapse)

In [26]:
import pandas as pd
import numpy as np

# df_named is the detailed per-segment table you already have
# Required cols: start, end, duration, cluster, who, cosine_sim
d = df_named.sort_values("start").reset_index(drop=True).copy()

# mark boundaries where the speaker changes (run-length encoding by 'who')
change = (d["who"] != d["who"].shift(1)).astype(int)
group_id = change.cumsum()

# helper to compute gap totals and pause counts per run
def run_gaps(g):
    # gaps only between consecutive rows inside the run
    gaps = (g["start"].iloc[1:].to_numpy() - g["end"].iloc[:-1].to_numpy())
    gaps = gaps[gaps > 0] if len(gaps) else np.array([])
    return pd.Series({
        "pauses": len(gaps),                 # number of gaps merged
        "gap_total": float(gaps.sum()) if len(gaps) else 0.0
    })

# aggregate per run
agg = d.groupby(group_id).apply(lambda g: pd.Series({
    "start": g["start"].iloc[0],
    "end": g["end"].iloc[-1],
    # spoken time only (sum of segment durations; gaps excluded)
    "talk_duration": float(g["duration"].sum()),
    # optional: wall-clock span (includes gaps) if you want it
    "span_duration": float(g["end"].iloc[-1] - g["start"].iloc[0]),
    "who": g["who"].iloc[0],
    # keep the first cluster label in the run (you can join all if you want)
    "cluster_first": g["cluster"].iloc[0],
    # confidence summaries
    "cosine_sim_mean": float(g["cosine_sim"].mean()),
    "cosine_sim_min": float(g["cosine_sim"].min()),
    "segments_merged": int(len(g)),
})).reset_index(drop=True)

# add gaps info
gaps_info = d.groupby(group_id).apply(run_gaps).reset_index(drop=True)
df_compact = pd.concat([agg, gaps_info], axis=1)

# nice ordering
df_compact = df_compact[
    ["start","end","talk_duration","span_duration",
     "pauses","gap_total","segments_merged",
     "who","cluster_first","cosine_sim_mean","cosine_sim_min"]
].sort_values("start").reset_index(drop=True)

display(df_named.head(10))    # original detailed segments (unchanged)
display(df_compact)  # new compacted view


Unnamed: 0,start,end,duration,cluster,who,cosine_sim
0,0.01,1.38,1.37,SPEAKER_02,002-Sam-COACH--Cassidy,0.765
1,1.98,3.78,1.8,SPEAKER_02,002-Sam-COACH--Cassidy,0.686
2,4.39,7.38,2.99,SPEAKER_00,003-Charlie-OWNER--Archer,0.757
3,7.75,9.01,1.26,SPEAKER_00,003-Charlie-OWNER--Archer,0.747
4,9.31,10.55,1.24,SPEAKER_00,003-Charlie-OWNER--Archer,0.677
5,10.98,12.27,1.29,SPEAKER_00,003-Charlie-OWNER--Archer,0.707
6,12.91,14.69,1.78,SPEAKER_00,Unknown,0.616
7,15.07,16.41,1.34,SPEAKER_00,003-Charlie-OWNER--Archer,0.703
8,16.97,17.41,0.44,SPEAKER_00,Unknown,0.546
9,17.89,20.35,2.46,SPEAKER_00,003-Charlie-OWNER--Archer,0.77


Unnamed: 0,start,end,talk_duration,span_duration,pauses,gap_total,segments_merged,who,cluster_first,cosine_sim_mean,cosine_sim_min
0,0.01,3.78,3.17,3.77,1.0,0.6,2,002-Sam-COACH--Cassidy,SPEAKER_02,0.7255,0.686
1,4.39,12.27,6.78,7.88,3.0,1.1,4,003-Charlie-OWNER--Archer,SPEAKER_00,0.722,0.677
2,12.91,14.69,1.78,1.78,0.0,0.0,1,Unknown,SPEAKER_00,0.616,0.616
3,15.07,16.41,1.34,1.34,0.0,0.0,1,003-Charlie-OWNER--Archer,SPEAKER_00,0.703,0.703
4,16.97,17.41,0.44,0.44,0.0,0.0,1,Unknown,SPEAKER_00,0.546,0.546
5,17.89,20.35,2.46,2.46,0.0,0.0,1,003-Charlie-OWNER--Archer,SPEAKER_00,0.77,0.77
6,20.76,21.81,1.05,1.05,0.0,0.0,1,Unknown,SPEAKER_00,0.622,0.622
7,22.47,26.63,3.61,4.16,1.0,0.55,2,003-Charlie-OWNER--Archer,SPEAKER_00,0.6915,0.676
8,27.56,29.09,1.53,1.53,0.0,0.0,1,002-Sam-COACH--Cassidy,SPEAKER_02,0.771,0.771
9,29.38,30.3,0.92,0.92,0.0,0.0,1,Unknown,SPEAKER_02,0.603,0.603


In [27]:
df_compact.loc[~(df_compact['who'] == 'Unknown')].describe()

Unnamed: 0,start,end,talk_duration,span_duration,pauses,gap_total,segments_merged,cosine_sim_mean,cosine_sim_min
count,21.0,21.0,21.0,21.0,21.0,21.0,21.0,21.0,21.0
mean,71.111905,77.438571,5.559048,6.326667,1.571429,0.767619,2.571429,0.748243,0.722
std,50.219499,51.827323,5.736545,6.75528,2.357359,1.08444,2.357359,0.045349,0.047413
min,0.01,3.78,1.12,1.12,0.0,0.0,1.0,0.662,0.651
25%,27.56,29.09,1.61,1.61,0.0,0.0,1.0,0.722,0.677
50%,67.12,68.24,3.17,3.77,0.0,0.0,1.0,0.754333,0.742
75%,116.54,117.89,6.78,7.88,3.0,1.1,4.0,0.781667,0.763
max,156.71,168.16,20.64,24.36,8.0,3.72,9.0,0.8215,0.799


In [28]:
df_compact.loc[df_compact['who'] == 'Unknown'].describe()

Unnamed: 0,start,end,talk_duration,span_duration,pauses,gap_total,segments_merged,cosine_sim_mean,cosine_sim_min
count,15.0,15.0,15.0,15.0,15.0,15.0,15.0,15.0,15.0
mean,76.213333,77.643333,1.26,1.43,0.266667,0.17,1.333333,0.536467,0.504
std,49.896556,50.504558,0.66674,1.120886,0.798809,0.505272,0.816497,0.085185,0.133737
min,12.91,14.69,0.44,0.44,0.0,0.0,1.0,0.353,0.217
25%,31.445,32.465,0.77,0.77,0.0,0.0,1.0,0.47975,0.4175
50%,65.9,66.66,1.12,1.12,0.0,0.0,1.0,0.546,0.546
75%,116.475,117.62,1.485,1.485,0.0,0.0,1.0,0.612,0.612
max,168.63,173.52,3.0,4.89,3.0,1.89,4.0,0.637,0.637


In [29]:
df_compact.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36 entries, 0 to 35
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   start            36 non-null     float64
 1   end              36 non-null     float64
 2   talk_duration    36 non-null     float64
 3   span_duration    36 non-null     float64
 4   pauses           36 non-null     float64
 5   gap_total        36 non-null     float64
 6   segments_merged  36 non-null     int64  
 7   who              36 non-null     object 
 8   cluster_first    36 non-null     object 
 9   cosine_sim_mean  36 non-null     float64
 10  cosine_sim_min   36 non-null     float64
dtypes: float64(8), int64(1), object(2)
memory usage: 3.2+ KB


### AUDIT: See how it goes

In [30]:
import pandas as pd

def summarize_speakers(df, time_col="duration", label_col="who"):
    """
    Summarize how much each unique speaker talked.
    df: DataFrame with a column for time and a column for speaker labels.
    time_col: which col to sum (use 'duration' from df_named, or 'talk_duration' from df_compact)
    label_col: which col holds speaker names (default 'who')
    """
    total = df[time_col].sum()
    summary = (
        df.groupby(label_col)[time_col]
          .sum()
          .reset_index()
          .rename(columns={time_col: "seconds"})
    )
    summary["percent"] = 100 * summary["seconds"] / total
    summary = summary.sort_values("seconds", ascending=False).reset_index(drop=True)
    return summary

# Example usage:
summary_named   = summarize_speakers(df_named, time_col="duration")
summary_compact = summarize_speakers(df_compact, time_col="talk_duration")

print("From raw segments:")
display(summary_named)

print("From compacted runs:")
display(summary_compact)


From raw segments:


Unnamed: 0,who,seconds,percent
0,004-Claire-CS-LEAD--Hope,38.66,28.501917
1,005-Paul-FLEET--Mark,29.84,21.99941
2,002-Sam-COACH--Cassidy,24.98,18.416396
3,003-Charlie-OWNER--Archer,23.26,17.148334
4,Unknown,18.9,13.933943


From compacted runs:


Unnamed: 0,who,seconds,percent
0,004-Claire-CS-LEAD--Hope,38.66,28.501917
1,005-Paul-FLEET--Mark,29.84,21.99941
2,002-Sam-COACH--Cassidy,24.98,18.416396
3,003-Charlie-OWNER--Archer,23.26,17.148334
4,Unknown,18.9,13.933943
