## Logistic Regression and SVM at the Prompt Level (with GroupKFold)


In [None]:
import sqlite3
from contextlib import closing

import pandas as pd

# Load the whole prompts table into a DataFrame.
# closing() guarantees the connection is released even when the query raises
# (e.g. missing table or unreadable file); the original close() call was
# skipped in that case. A sqlite3 connection used directly in `with` only
# manages the transaction and does not close itself, hence closing().
with closing(sqlite3.connect('../../giicg.db')) as conn:
    prompts_df = pd.read_sql('SELECT * FROM expanded_roberta_prompts', conn)


## Filter genders & Normalize text


In [None]:
from helpers.normalization import (remove_punctuation_and_newlines, remove_capitalization)

# Restrict the data to the two gender categories used as classification labels.
# The explicit .copy() makes the filtered frame independent of the original, so
# the column assignment below writes to a real DataFrame instead of a slice
# (avoids pandas' SettingWithCopyWarning and its ambiguous write semantics).
kept_genders = ['Woman (cisgender)', 'Man (cisgender)']
prompts_df = prompts_df[prompts_df['gender'].isin(kept_genders)].copy()

# Apply the two normalization helpers in order to every prompt text:
# first remove_punctuation_and_newlines, then remove_capitalization.
prompts_df['conversational_norm'] = (
    prompts_df['conversational']
    .apply(remove_punctuation_and_newlines)
    .apply(remove_capitalization)
)

# Bare expression: shows the resulting frame as the notebook cell output.
prompts_df


## Vectorize prompts


In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Binary target encoding: 1 = woman, 0 = man.
gender_to_label = {'Woman (cisgender)': 1, 'Man (cisgender)': 0}

# Bag-of-words token counts over the normalized prompt text.
vectorizer = CountVectorizer()
normalized_prompts = prompts_df['conversational_norm']
X = vectorizer.fit_transform(normalized_prompts)

# Label vector aligned row-by-row with X.
y = prompts_df['gender'].map(gender_to_label).values

# One group id per prompt (the authoring user), consumed by the group-aware split
# so that no user's prompts end up in both the train and the test fold.
groups = prompts_df['user_id'].values

# Sanity check: feature-matrix shape followed by label-vector shape.
print(X.shape, y.shape)


## Set up group-aware 5-fold cross-validation (GroupKFold via scikit-learn)


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import GroupKFold, cross_val_predict
from sklearn.svm import LinearSVC

# Number of cross-validation folds shared by every model evaluated below.
N_SPLITS = 5

# Group-aware splitter: all samples sharing a group id land in the same fold.
gkf = GroupKFold(n_splits=N_SPLITS)


## Logistic Regression (prompt-level, grouped by user)


In [None]:
# Logistic regression evaluated with out-of-fold predictions: each prompt is
# predicted by a model that was fitted on the other folds only.
logreg = LogisticRegression(max_iter=1000)

# Hand the splitter and the group ids to cross_val_predict directly; it calls
# gkf.split(X, y, groups) internally, producing the same folds as a pre-built
# split generator would.
pred_logreg = cross_val_predict(logreg, X, y, groups=groups, cv=gkf)

# target_names follow the sorted label order: 0 -> man, 1 -> woman.
logreg_class_names = ['Man (cisgender)', 'Woman (cisgender)']
print('--- Logistic Regression ---')
logreg_report = classification_report(y, pred_logreg, target_names=logreg_class_names)
print(logreg_report)


## Linear SVM (prompt-level, grouped by user)


In [None]:
# Linear SVM evaluated with out-of-fold predictions: each prompt is predicted
# by a model that was fitted on the other folds only.
svm = LinearSVC(max_iter=1000)

# Hand the splitter and the group ids to cross_val_predict directly; it calls
# gkf.split(X, y, groups) internally, producing the same folds as a pre-built
# split generator would.
pred_svm = cross_val_predict(svm, X, y, groups=groups, cv=gkf)

# target_names follow the sorted label order: 0 -> man, 1 -> woman.
svm_class_names = ['Man (cisgender)', 'Woman (cisgender)']
print('--- Linear SVM ---')
svm_report = classification_report(y, pred_svm, target_names=svm_class_names)
print(svm_report)
