In [1]:
# Cell 1: imports and basic paths

from pathlib import Path
import json

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    roc_auc_score,
    RocCurveDisplay,
)

import matplotlib.pyplot as plt

BASE_DIR = Path(".")
DATA_DIR = BASE_DIR / "data"
MODELS_DIR = BASE_DIR / "models"

FEATURES_CSV = MODELS_DIR / "scene_features_val_from_yolo.csv"  # from notebook 5
MERGED_CSV   = MODELS_DIR / "workzone_frame_features_with_clip.csv"
LABEL_CSV    = DATA_DIR / "workzone_manual_labels.csv"

print("features csv exists:", FEATURES_CSV.exists())
print("merged csv exists  :", MERGED_CSV.exists())
print("label csv exists   :", LABEL_CSV.exists())


features csv exists: True
merged csv exists  : True
label csv exists   : True


In [2]:
from pathlib import Path
import pandas as pd

DATA_DIR = Path("data")
MODELS_DIR = Path("models")

MERGED_CSV = MODELS_DIR / "workzone_frame_features_with_clip.csv"
LABEL_CSV  = DATA_DIR / "workzone_manual_labels.csv"

df_merged = pd.read_csv(MERGED_CSV)
df_labels = pd.read_csv(LABEL_CSV)

print("Merged CLIP+YOLO rows:", len(df_merged))
print("Label CSV rows:", len(df_labels))
print("\nLabel CSV head:")
print(df_labels.head())

print("\nLabel value counts (including NaN):")
print(df_labels["workzone_label"].value_counts(dropna=False))

# Join and see how many match
df_labeled = df_merged.merge(df_labels, on="file_name", how="inner")
print("\nAfter merge, rows:", len(df_labeled))
print(df_labeled[["file_name", "workzone_label"]].head())


Merged CLIP+YOLO rows: 2098
Label CSV rows: 400

Label CSV head:
                                           file_name  workzone_label
0  philadelphia_828ed3aacd9442cf922819cdee32dbe4_...             NaN
1  washington_dc_3fc86386a4eb4ceea382f68ffa5ead84...             NaN
2  indianapolis_6b8389b36b7f40ad9aa94b8c2c34d148_...             NaN
3  san_francisco_0efc638c166b4d05b3e6b50f9115ac38...             NaN
4  houston_b0e38fe839374b7e90fe124e4cc4b5f0_00000...             NaN

Label value counts (including NaN):
workzone_label
NaN    400
Name: count, dtype: int64

After merge, rows: 400
                                           file_name  workzone_label
0  columbus_e9e7af8c07ee421797f325aaa827ed29_0000...             NaN
1  columbus_b30da26bf8f344eebd0d1266908baacf_0000...             NaN
2  columbus_ac5aa017fffd46a58c2df7ae14c3bd27_0000...             NaN
3  columbus_77fb08eee2ed43e490c27f40cccf8976_0000...             NaN
4  columbus_77fb08eee2ed43e490c27f40cccf8976_0000...           

In [4]:
# Cell 2: load features + clip score

if MERGED_CSV.exists():
    df = pd.read_csv(MERGED_CSV)
    print("Loaded merged features from:", MERGED_CSV)
else:
    # Fallback: rebuild a minimal merged table using features + clip score
    # Assumes you saved a CSV with columns ["file_name", "clip_workzone_score"]
    CLIP_CSV = MODELS_DIR / "clip_workzone_scores.csv"
    if not CLIP_CSV.exists():
        raise FileNotFoundError(
            "Missing workzone_frame_features_with_clip.csv and clip_workzone_scores.csv. "
            "Re-run Notebook 6 to create them."
        )

    df_feat = pd.read_csv(FEATURES_CSV)
    df_clip = pd.read_csv(CLIP_CSV)  # should have file_name + clip_workzone_score
    df = df_feat.merge(df_clip, on="file_name", how="left")

    # Recompute z scores and heuristic score as in notebook 6
    for col in ["frac_channelization", "frac_workers", "frac_vehicles", "clip_workzone_score"]:
        mu = df[col].mean()
        sigma = df[col].std() + 1e-6
        df[f"{col}_z"] = (df[col] - mu) / sigma

    weights = {
        "frac_channelization_z": 0.9,
        "frac_workers_z": 0.7,
        "frac_vehicles_z": 0.4,
        "clip_workzone_score_z": 0.6,
    }

    raw = np.zeros(len(df), dtype="float32")
    for name, w in weights.items():
        raw += w * df[name].values.astype("float32")

    def logistic(x):
        return 1.0 / (1.0 + np.exp(-x))

    df["workzone_raw"] = raw
    df["workzone_score"] = logistic(raw)

    df.to_csv(MERGED_CSV, index=False)
    print("Rebuilt and saved merged features to:", MERGED_CSV)

df.head()


Loaded merged features from: models/workzone_frame_features_with_clip.csv


Unnamed: 0,image_id,file_name,total_objs,count_channelization,frac_channelization,count_workers,frac_workers,count_vehicles,frac_vehicles,count_ttc_signs,...,img_path,clip_workzone_score,frac_channelization_z,frac_workers_z,frac_vehicles_z,clip_workzone_score_z,workzone_raw,workzone_score,seq_id,frame_idx
0,1,columbus_ed065d9b86d545b2af0042a058e7e907_0000...,9,3,0.333333,0,0.0,6,0.666667,0,...,data/images/columbus_ed065d9b86d545b2af0042a05...,-0.007094,-0.547021,-1.043189,2.311754,-0.239283,-0.44142,0.391403,columbus_ed065d9b86d545b2af0042a058e7e907_000000,23970
1,2,columbus_ed065d9b86d545b2af0042a058e7e907_0000...,21,6,0.285714,14,0.666667,1,0.047619,0,...,data/images/columbus_ed065d9b86d545b2af0042a05...,0.009551,-0.688553,0.975418,-0.254538,0.750493,0.411575,0.601466,columbus_ed065d9b86d545b2af0042a058e7e907_000000,14640
2,3,columbus_ed065d9b86d545b2af0042a058e7e907_0000...,3,1,0.333333,0,0.0,2,0.666667,0,...,data/images/columbus_ed065d9b86d545b2af0042a05...,-0.003145,-0.547021,-1.043189,2.311754,-0.004454,-0.300523,0.42543,columbus_ed065d9b86d545b2af0042a058e7e907_000000,10740
3,4,columbus_ed065d9b86d545b2af0042a058e7e907_0000...,7,2,0.285714,0,0.0,5,0.714286,0,...,data/images/columbus_ed065d9b86d545b2af0042a05...,0.011783,-0.688553,-1.043189,2.509161,0.88323,0.183672,0.545789,columbus_ed065d9b86d545b2af0042a058e7e907_000000,10590
4,5,columbus_ed065d9b86d545b2af0042a058e7e907_0000...,5,1,0.2,0,0.0,4,0.8,0,...,data/images/columbus_ed065d9b86d545b2af0042a05...,-0.003656,-0.943311,-1.043189,2.864493,-0.034861,-0.454332,0.388331,columbus_ed065d9b86d545b2af0042a058e7e907_000000,10530


In [5]:
# Cell 3: create manual label template (run once, then edit the CSV externally)

if LABEL_CSV.exists():
    print("Label file already exists at:", LABEL_CSV)
else:
    # sample up to 400 frames spread across videos
    # you can edit n_samples if you want more or less
    n_samples = min(400, len(df))
    df_sample = (
        df[["file_name"]]
        .drop_duplicates()
        .sample(n_samples, random_state=0)
        .reset_index(drop=True)
    )
    df_sample["workzone_label"] = ""  # fill with 1 or 0 by hand

    LABEL_CSV.parent.mkdir(parents=True, exist_ok=True)
    df_sample.to_csv(LABEL_CSV, index=False)
    print("Created label template at:", LABEL_CSV)
    print("Open it in a spreadsheet editor and set workzone_label = 1 or 0.")


Created label template at: data/workzone_manual_labels.csv
Open it in a spreadsheet editor and set workzone_label = 1 or 0.


In [6]:
# Cell 4: load manual labels and merge

if not LABEL_CSV.exists():
    raise FileNotFoundError(
        f"{LABEL_CSV} does not exist. Run Cell 3, then fill the labels by hand."
    )

df_labels = pd.read_csv(LABEL_CSV)
print("Label head:")
display(df_labels.head())

# Keep only labeled rows and merge on file_name
df_labeled = df.merge(df_labels, on="file_name", how="inner")
df_labeled = df_labeled.dropna(subset=["workzone_label"])

# normalize label to 0 / 1
def parse_label(x):
    if isinstance(x, str):
        x = x.strip().lower()
        if x in ["1", "yes", "y", "inside", "workzone"]:
            return 1
        if x in ["0", "no", "n", "outside", "none"]:
            return 0
    return int(x)

df_labeled["workzone_label"] = df_labeled["workzone_label"].apply(parse_label).astype(int)

print("Label distribution:")
print(df_labeled["workzone_label"].value_counts())

df_labeled.head()


Label head:


Unnamed: 0,file_name,workzone_label
0,philadelphia_828ed3aacd9442cf922819cdee32dbe4_...,
1,washington_dc_3fc86386a4eb4ceea382f68ffa5ead84...,
2,indianapolis_6b8389b36b7f40ad9aa94b8c2c34d148_...,
3,san_francisco_0efc638c166b4d05b3e6b50f9115ac38...,
4,houston_b0e38fe839374b7e90fe124e4cc4b5f0_00000...,


Label distribution:
Series([], Name: count, dtype: int64)


Unnamed: 0,image_id,file_name,total_objs,count_channelization,frac_channelization,count_workers,frac_workers,count_vehicles,frac_vehicles,count_ttc_signs,...,clip_workzone_score,frac_channelization_z,frac_workers_z,frac_vehicles_z,clip_workzone_score_z,workzone_raw,workzone_score,seq_id,frame_idx,workzone_label


In [7]:
# Cell 5: build feature matrix X and target y

feature_cols = [
    # basic counts and fractions
    "total_objs",
    "frac_channelization",
    "frac_workers",
    "frac_vehicles",
    # spatial distribution of workers and cones
    "workers_left",
    "workers_mid",
    "workers_right",
    "channelization_left",
    "channelization_mid",
    "channelization_right",
    "workers_near",
    "workers_mid.1",     # beware name collision, we will clean this below
    "workers_far",
    "channelization_near",
    "channelization_mid.1",
    "channelization_far",
    # CLIP semantic score
    "clip_workzone_score",
]

# Some CSV writers duplicate names like "workers_mid" -> "workers_mid.1".
# Let us fix that safely.
feature_cols_clean = [c for c in feature_cols if c in df_labeled.columns]

print("Using feature columns:")
print(feature_cols_clean)

X = df_labeled[feature_cols_clean].values.astype("float32")
y = df_labeled["workzone_label"].values.astype("int64")

X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=0,
    stratify=y,
)

print("Train size:", X_train.shape, "Val size:", X_val.shape)


Using feature columns:
['total_objs', 'frac_channelization', 'frac_workers', 'frac_vehicles', 'workers_left', 'workers_mid', 'workers_right', 'channelization_left', 'channelization_mid', 'channelization_right', 'workers_near', 'workers_far', 'channelization_near', 'channelization_far', 'clip_workzone_score']


ValueError: With n_samples=0, test_size=0.25 and train_size=None, the resulting train set will be empty. Adjust any of the aforementioned parameters.