# Gender label-leak experiments (regex-based)\n
\n
This notebook runs TF-IDF + LinearSVC for gender classification, comparing:\n
- Original text\n
- Text with label-leaking patterns removed using regex (age + gender cues)\n

In [None]:
import torch

print(torch.is_cuda_available())

In [1]:
from pathlib import Path
import re

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, f1_score

# Resolve data directory (works from repo root or model/)
DATA_DIR_CANDIDATES = [
    Path("..") / "data",
]

for p in DATA_DIR_CANDIDATES:
    if p.exists():
        DATA_DIR = p
        break
else:
    raise FileNotFoundError(
        "Could not find data directory. Checked: "
        + ", ".join(str(p) for p in DATA_DIR_CANDIDATES)
    )

print("Using DATA_DIR:", DATA_DIR.resolve())

# Load and clean gender dataset
print("\nLoading gender.csv ...")
gender_df = pd.read_csv(DATA_DIR / "gender.csv", header=None, names=["text", "label"])

# Keep only binary labels 0/1
gender_df = gender_df[gender_df["label"].isin([0, 1, "0", "1"])].copy()
gender_df["label"] = gender_df["label"].astype(int)

# Subsample for speed
gender_df = gender_df.sample(n=min(20000, len(gender_df)), random_state=42).reset_index(drop=True)
print("Dataset shape:", gender_df.shape)
print(gender_df["label"].value_counts())

# Regex patterns for label-leaking tokens (from run_analysis.py)
LEAK_PATTERNS = {
    "age_gender_combo": r"\b(\d{1,2})\s*[MFmf]\b|\b[MFmf]\s*(\d{1,2})\b",
    "i_am_age": r"[Ii]'?m\s+(\d{1,2})\b|[Ii]\s+am\s+(\d{1,2})\b",
    "i_am_gender": r"[Ii]'?m\s+a?\s*(male|female|man|woman|guy|girl)\b",
    "age_years_old": r"\b(\d{1,2})\s*(?:years?\s*old|yo|y\.o\.)\b",
    "age_brackets": r"\(\s*(\d{1,2})\s*[MFmf]\s*\)|\(\s*[MFmf]\s*(\d{1,2})\s*\)",
}

LEAK_REGEX = re.compile("|".join(LEAK_PATTERNS.values()), re.IGNORECASE)


def remove_leak_tokens(text: str) -> str:
    """Remove substrings that match any of the leak patterns."""
    return LEAK_REGEX.sub(" ", str(text))


print("\nCreating cleaned text column (regex-based leak removal) ...")
gender_df["text_clean"] = gender_df["text"].apply(remove_leak_tokens)

# Shared train/test split
print("\nCreating train/test split ...")
df_shuffled = gender_df.sample(frac=1, random_state=123).reset_index(drop=True)
split_idx = int(0.8 * len(df_shuffled))
train_df = df_shuffled.iloc[:split_idx]
test_df = df_shuffled.iloc[split_idx:]

# Original text model
print("\nTraining TF-IDF + LinearSVC on ORIGINAL text ...")
vec_raw = TfidfVectorizer(max_features=20000, ngram_range=(1, 2), min_df=5)
X_train_raw = vec_raw.fit_transform(train_df["text"])
X_test_raw = vec_raw.transform(test_df["text"])

clf_raw = LinearSVC()
clf_raw.fit(X_train_raw, train_df["label"])
y_pred_raw = clf_raw.predict(X_test_raw)

# Cleaned text model
print("Training TF-IDF + LinearSVC on CLEANED text ...")
vec_clean = TfidfVectorizer(max_features=20000, ngram_range=(1, 2), min_df=5)
X_train_clean = vec_clean.fit_transform(train_df["text_clean"])
X_test_clean = vec_clean.transform(test_df["text_clean"])

clf_clean = LinearSVC()
clf_clean.fit(X_train_clean, train_df["label"])
y_pred_clean = clf_clean.predict(X_test_clean)


def show_scores(name: str, y_true, y_pred) -> None:
    print(name)
    print("  Accuracy:", round(accuracy_score(y_true, y_pred), 3))
    print("  Macro F1:", round(f1_score(y_true, y_pred, average="macro"), 3))


print("\nTF-IDF + LinearSVC (gender)")
show_scores("Original text (with potential label leaks):", test_df["label"], y_pred_raw)
print()
show_scores("Cleaned text (regex-based leak removal):", test_df["label"], y_pred_clean)


Using DATA_DIR: C:\Users\muham\OneDrive - TU Eindhoven\q2\lang-and-ai\data

Loading gender.csv ...
Dataset shape: (20000, 2)
label
0    10634
1     9366
Name: count, dtype: int64

Creating cleaned text column (regex-based leak removal) ...

Creating train/test split ...

Training TF-IDF + LinearSVC on ORIGINAL text ...
Training TF-IDF + LinearSVC on CLEANED text ...

TF-IDF + LinearSVC (gender)
Original text (with potential label leaks):
  Accuracy: 0.913
  Macro F1: 0.913

Cleaned text (regex-based leak removal):
  Accuracy: 0.914
  Macro F1: 0.914
