# Birth-year label-leak experiments (regex-based)

TF-IDF + LinearSVC classification on binned birth-year labels, comparing original vs regex-cleaned text.

In [None]:
from pathlib import Path
import re

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, f1_score

# Data directory: the notebook is assumed to run from model/, with the data at ../data
# (the path is resolved relative to the current working directory).
DATA_DIR = Path("..") / "data"
if not DATA_DIR.exists():
    raise FileNotFoundError(f"Expected data directory at {DATA_DIR.resolve()} but it does not exist")

print("Using DATA_DIR:", DATA_DIR.resolve())

# Load and clean birth_year dataset (read as a headerless two-column CSV: text, label)
print("\nLoading birth_year.csv ...")
age_df = pd.read_csv(DATA_DIR / "birth_year.csv", header=None, names=["text", "label"])

# Labels that are not numeric become NaN and are dropped.
# Rows with missing text are dropped as well: a NaN document makes the TF-IDF step fail on the
# raw text, while the str() coercion in the cleaning step would turn it into the literal token
# "nan", so the raw and cleaned models would no longer see comparable inputs.
age_df["label"] = pd.to_numeric(age_df["label"], errors="coerce")
age_df = age_df.dropna(subset=["text", "label"])
age_df["label"] = age_df["label"].astype(int)

# Keep only reasonable birth years
age_df = age_df[(age_df["label"] >= 1940) & (age_df["label"] <= 2010)].copy()

# Fail early with a clear message instead of the opaque int(NaN) error the summary prints
# below would raise on an empty frame.
if age_df.empty:
    raise ValueError("No rows with text and a birth year between 1940 and 2010 found in birth_year.csv")

# Subsample for speed (fixed seed so the same rows are drawn on every run)
age_df = age_df.sample(n=min(20000, len(age_df)), random_state=42).reset_index(drop=True)
print("Dataset shape:", age_df.shape)
print("Birth year range:", int(age_df["label"].min()), "-", int(age_df["label"].max()))


# Bin birth years into coarse categories for classification

def bin_year(y: int) -> str:
    """Return the label of the birth-year bin that *y* falls into.

    Bins are checked in ascending order against their exclusive upper bound,
    so any year below 1960 maps to "1940-1959" and any year from 2000 upward
    maps to "2000-2010". No range validation is performed here.
    """
    # (exclusive upper bound, bin label), in ascending order
    upper_bounds = (
        (1960, "1940-1959"),
        (1970, "1960-1969"),
        (1980, "1970-1979"),
        (1990, "1980-1989"),
        (2000, "1990-1999"),
    )
    for limit, bin_label in upper_bounds:
        if y < limit:
            return bin_label
    # Everything at or above the last bound lands in the final bin.
    return "2000-2010"


# Attach the coarse bin label to every row, then report how many rows each bin holds.
age_df["label_bin"] = age_df["label"].map(bin_year)
bin_counts = age_df["label_bin"].value_counts()
print("Label bins:")
print(bin_counts)


# Regex patterns for label-leaking tokens (adapted from the gender experiments).
# Every alternative is anchored on word boundaries so it cannot fire inside a longer word.
LEAK_PATTERNS = {
    # Age next to a gender letter, e.g. "25M" or "f 31".
    "age_gender_combo": r"\b(\d{1,2})\s*[MFmf]\b|\b[MFmf]\s*(\d{1,2})\b",
    # "I'm 25" / "im 25" / "I am 25". The leading \b keeps words that merely end in
    # "im"/"i" (e.g. "him 25", "Tim 30") from being treated as a self-description.
    "i_am_age": r"\b[Ii]'?m\s+(\d{1,2})\b|\b[Ii]\s+am\s+(\d{1,2})\b",
    # "I'm a guy" / "im female"; same leading-boundary reasoning as above.
    "i_am_gender": r"\b[Ii]'?m\s+a?\s*(male|female|man|woman|guy|girl)\b",
    # "25 years old" / "25yo" / "25 y.o.". The boundary sits before the optional final dot:
    # a \b placed after "." can only match when a word character follows immediately, so
    # "25 y.o. " or a trailing "25 y.o." would never be removed.
    "age_years_old": r"\b(\d{1,2})\s*(?:years?\s*old\b|yo\b|y\.o\b\.?)",
    # Bracketed age/gender tags, e.g. "(25M)" or "( f 31 )".
    "age_brackets": r"\(\s*(\d{1,2})\s*[MFmf]\s*\)|\(\s*[MFmf]\s*(\d{1,2})\s*\)",
}

# One combined, case-insensitive alternation over all patterns (tried in dict order).
LEAK_REGEX = re.compile("|".join(LEAK_PATTERNS.values()), re.IGNORECASE)


def remove_leak_tokens(text: str) -> str:
    """Return *text* with each leak-pattern match replaced by a single space.

    The argument is coerced with ``str()`` first, so non-string values are
    processed through their string representation.
    """
    as_string = str(text)
    scrubbed = LEAK_REGEX.sub(" ", as_string)
    return scrubbed


print("\nCreating cleaned text column (regex-based leak removal) ...")
age_df["text_clean"] = age_df["text"].apply(remove_leak_tokens)

# Shared train/test split
print("\nCreating train/test split ...")
X_raw = age_df["text"].values
X_clean = age_df["text_clean"].values
y = age_df["label_bin"].values

# Split raw text, cleaned text and labels in a single call: train_test_split applies one set
# of shuffled indices to every array it is given, so the raw and cleaned variants of a row
# always land on the same side of the split. This replaces two separate calls that were only
# aligned because their seed and stratification arguments happened to match.
(
    X_train_raw,
    X_test_raw,
    X_train_clean,
    X_test_clean,
    y_train,
    y_test,
) = train_test_split(X_raw, X_clean, y, test_size=0.2, random_state=42, stratify=y)

# Both models use identical vectorizer and classifier settings so that any score difference
# comes from the text cleaning alone. LinearSVC is seeded because its solver uses a random
# number generator; without a fixed seed the raw-vs-cleaned gap can shift between runs.

# Original text model
print("\nTraining TF-IDF + LinearSVC on ORIGINAL text ...")
vec_raw = TfidfVectorizer(max_features=20000, ngram_range=(1, 2), min_df=5)
X_train_vec_raw = vec_raw.fit_transform(X_train_raw)  # vocabulary learned on train only
X_test_vec_raw = vec_raw.transform(X_test_raw)

clf_raw = LinearSVC(random_state=42)
clf_raw.fit(X_train_vec_raw, y_train)
y_pred_raw = clf_raw.predict(X_test_vec_raw)

# Cleaned text model
print("Training TF-IDF + LinearSVC on CLEANED text ...")
vec_clean = TfidfVectorizer(max_features=20000, ngram_range=(1, 2), min_df=5)
X_train_vec_clean = vec_clean.fit_transform(X_train_clean)  # vocabulary learned on train only
X_test_vec_clean = vec_clean.transform(X_test_clean)

clf_clean = LinearSVC(random_state=42)
clf_clean.fit(X_train_vec_clean, y_train)
y_pred_clean = clf_clean.predict(X_test_vec_clean)


def show_scores(name: str, y_true, y_pred) -> None:
    """Print *name* followed by accuracy and macro-averaged F1, each rounded to 3 decimals.

    Args:
        name: Heading line printed before the metrics.
        y_true: Reference labels.
        y_pred: Predicted labels, compared against ``y_true``.
    """
    print(name)
    accuracy = accuracy_score(y_true, y_pred)
    print("  Accuracy:", round(accuracy, 3))
    macro_f1 = f1_score(y_true, y_pred, average="macro")
    print("  Macro F1:", round(macro_f1, 3))


# Final report: the same held-out labels scored against each model's predictions,
# with a blank line separating the two result blocks.
print("\nTF-IDF + LinearSVC (birth-year bins)")
score_reports = (
    ("Original text (with potential label leaks):", y_pred_raw),
    ("Cleaned text (regex-based leak removal):", y_pred_clean),
)
for position, (heading, predictions) in enumerate(score_reports):
    if position > 0:
        print()
    show_scores(heading, y_test, predictions)
