# Gender stylometric baselines

Stylometric-feature-only baselines (Logistic Regression + Random Forest) for gender classification.

In [None]:
from pathlib import Path
from collections import Counter

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score

# The notebook is expected to live in model/, with the datasets one level up in ../data.
DATA_DIR = Path("..", "data")
if not DATA_DIR.exists():
    raise FileNotFoundError(f"Expected data directory at {DATA_DIR.resolve()} but it does not exist")

print("Using DATA_DIR:", DATA_DIR.resolve())

# gender.csv is read without a header row, so the column names are supplied here.
print("\nLoading gender.csv ...")
gender_df = pd.read_csv(
    DATA_DIR / "gender.csv",
    header=None,
    names=["text", "label"],
)

# Drop every row whose label is not 0/1 (numeric or string form), then store labels as ints.
gender_df = gender_df.loc[gender_df["label"].isin([0, 1, "0", "1"])].copy()
gender_df["label"] = gender_df["label"].astype(int)

# Work on a fixed-seed random subset of at most 20000 rows to keep experiments fast.
gender_df = gender_df.sample(n=min(20000, len(gender_df)), random_state=42)
gender_df = gender_df.reset_index(drop=True)
print("Dataset shape:", gender_df.shape)
print(gender_df["label"].value_counts())


# Punctuation marks counted by the punctuation-ratio features.
_PUNCT_CHARS = ",.!?;:-"

# Characters trimmed from both ends of a whitespace token before word-list lookups,
# so that tokens such as "you?" or "(I," still match the sets below.
_TOKEN_EDGE_CHARS = _PUNCT_CHARS + "\"'()[]{}"

_FUNCTION_WORDS = frozenset({
    "the", "a", "an", "is", "are", "was", "were", "be", "been", "to", "of", "in",
    "for", "on", "with", "at", "by", "from", "i", "you", "he", "she", "it", "we",
    "they", "my", "your",
})
_FIRST_PERSON = frozenset({"i", "me", "my", "mine", "myself", "we", "us", "our", "ours"})
_SECOND_PERSON = frozenset({"you", "your", "yours", "yourself"})


def extract_stylometric_features(text: str) -> dict:
    """Compute surface-level stylometric features for one text.

    Args:
        text: The document to analyse. Non-string values are coerced with
            ``str()``; ``None`` and float NaN (what pandas produces for an
            empty CSV cell) are treated as the empty string instead of the
            literal words "None" / "nan".

    Returns:
        A dict mapping feature name to a number, in this fixed key order:
        char_count, word_count, sentence_count, avg_word_length,
        avg_sentence_length, vocab_richness, punct_ratio, exclamation_ratio,
        question_ratio, comma_ratio, caps_ratio, function_word_ratio,
        first_person_ratio, second_person_ratio. Every ratio uses a
        denominator clamped to at least 1, so empty input yields zeros
        rather than a division error.
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        text = ""
    text = str(text)
    words = text.split()

    # Crude sentence split: treat '?', '!' and '.' as terminators and drop
    # empty fragments (e.g. those produced by "...").
    temp = text.replace("?", ".").replace("!", ".")
    sentences = [s.strip() for s in temp.split(".") if s.strip()]

    # Clamped denominators shared by the ratio features below.
    n_chars = max(len(text), 1)
    n_words = max(len(words), 1)

    features = {}
    features["char_count"] = len(text)
    features["word_count"] = len(words)
    features["sentence_count"] = max(len(sentences), 1)
    features["avg_word_length"] = float(np.mean([len(w) for w in words])) if words else 0.0
    features["avg_sentence_length"] = features["word_count"] / features["sentence_count"]

    # Lexical features compare lower-cased tokens with edge punctuation
    # removed; otherwise "I," / "you?" never match the word lists and
    # "dog." / "dog" count as two distinct vocabulary items.
    lexical_tokens = [w.strip(_TOKEN_EDGE_CHARS).lower() for w in words]
    unique_words = {tok for tok in lexical_tokens if tok}
    features["vocab_richness"] = len(unique_words) / n_words

    punct_counts = Counter(c for c in text if c in _PUNCT_CHARS)
    total_punct = sum(punct_counts.values())
    n_punct = max(total_punct, 1)
    features["punct_ratio"] = total_punct / n_chars
    features["exclamation_ratio"] = punct_counts.get("!", 0) / n_punct
    features["question_ratio"] = punct_counts.get("?", 0) / n_punct
    features["comma_ratio"] = punct_counts.get(",", 0) / n_punct
    features["caps_ratio"] = sum(1 for c in text if c.isupper()) / n_chars

    features["function_word_ratio"] = (
        sum(1 for tok in lexical_tokens if tok in _FUNCTION_WORDS) / n_words
    )
    features["first_person_ratio"] = (
        sum(1 for tok in lexical_tokens if tok in _FIRST_PERSON) / n_words
    )
    features["second_person_ratio"] = (
        sum(1 for tok in lexical_tokens if tok in _SECOND_PERSON) / n_words
    )

    return features


print("\nExtracting stylometric features ...")
# One feature dict per document, stacked into a frame; labels are attached positionally.
gender_features = pd.DataFrame(list(map(extract_stylometric_features, gender_df["text"])))
gender_features["label"] = gender_df["label"].values
feature_cols = [col for col in gender_features.columns if col != "label"]
print("Number of stylometric features:", len(feature_cols))

X = gender_features[feature_cols].values
y = gender_features["label"].values

# 80/20 split with a fixed seed, stratified on the label.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Scaling statistics are fitted on the training split only and reused for the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Baseline 1: logistic regression on the standardized features.
lr = LogisticRegression(max_iter=1000, random_state=42)
lr.fit(X_train_scaled, y_train)
y_pred_lr = lr.predict(X_test_scaled)

# Baseline 2: random forest on the raw (unscaled) features.
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

print("\nStylometric baselines (gender)")
# The labels carry their own padding so both result lines stay column-aligned.
print(
    "Logistic Regression - Acc:",
    round(accuracy_score(y_test, y_pred_lr), 3),
    "Macro F1:",
    round(f1_score(y_test, y_pred_lr, average="macro"), 3),
)
print(
    "Random Forest      - Acc:",
    round(accuracy_score(y_test, y_pred_rf), 3),
    "Macro F1:",
    round(f1_score(y_test, y_pred_rf, average="macro"), 3),
)
