## Prompt-level Logistic Regression and SVM (with GroupKFold)


In [4]:
import sqlite3
import pandas as pd
# Load the full `expanded_roberta_prompts` table into a DataFrame.
# The try/finally guarantees the SQLite connection is released even when the
# query fails (e.g. missing table), instead of leaking the handle in the kernel.
conn = sqlite3.connect('../../giicg.db')
try:
    prompts_df = pd.read_sql('SELECT * FROM expanded_roberta_prompts', conn)
finally:
    conn.close()


In [5]:

from IPython.display import display

# Number of messages per user, most active users first.
# NOTE(review): this cell runs before the gender filter (applied in a later
# cell), so the "after filtering" wording in the printed label only holds if
# the table already contains just the two genders of interest -- confirm.
messages_per_user = (
    prompts_df['user_id']
    .value_counts()
    .rename('message_count')
    .rename_axis('user_id')
    .reset_index()
    .sort_values('message_count', ascending=False)
)

print(f"Total users (after filtering): {len(messages_per_user)}")
# Show only the ten most active users rather than dumping the whole table.
messages_per_user.head(10)



Total users (after filtering): 27


Unnamed: 0,user_id,message_count
0,91,81
1,34,66
2,79,61
3,47,51
4,73,50
5,55,36
6,89,31
7,16,25
8,28,22
9,77,20


In [6]:
# How many distinct users fall under each gender label, largest group first.
# One row per (user, gender) pair, so repeated messages are not counted twice.
user_gender_pairs = prompts_df[['user_id', 'gender']].drop_duplicates()
distinct_users_by_gender = user_gender_pairs.groupby('gender')['user_id'].nunique()

users_per_gender = (
    distinct_users_by_gender
    .rename('unique_users')
    .reset_index()
    .sort_values('unique_users', ascending=False)
)

users_per_gender

Unnamed: 0,gender,unique_users
0,Man (cisgender),15
1,Woman (cisgender),12


## Filter genders & Normalize text


In [7]:
from helpers.normalization import (remove_punctuation_and_newlines, remove_capitalization)
# Keep only the two gender labels being compared. `.copy()` detaches the
# filtered rows from the original frame so the column assignment below writes
# to a real DataFrame instead of a slice (avoids pandas' SettingWithCopyWarning
# and any ambiguity about whether the new column is actually stored).
prompts_df = prompts_df[prompts_df['gender'].isin(['Woman (cisgender)', 'Man (cisgender)'])].copy()

# Normalised text used for vectorisation: the punctuation/newline helper runs
# first, then the capitalisation helper (both are project helpers; see
# helpers.normalization for their exact rules).
prompts_df['conversational_norm'] = (
    prompts_df['conversational']
    .apply(remove_punctuation_and_newlines)
    .apply(remove_capitalization)
)
prompts_df


  from .autonotebook import tqdm as notebook_tqdm


Unnamed: 0,index,message_id,conversation_id,role,message_text,conversational,code,other,gender,user_id,language,label,conversational_norm
0,0,1,1,user,"parsing data from python iterator, how it coul...","parsing data from python iterator, how it coul...",,,Man (cisgender),6,en,0,parsing data from python iterator how it could...
1,1,730,32,user,Write python function to do operations with in...,Write python function to do operations with in...,,report_dt\tsource\tmetric_name\tmetric_num\tme...,Man (cisgender),6,en,0,write python function to do operations with in...
2,2,1133,55,user,Write shortest tutorial on creating RAG on ema...,Write shortest tutorial on creating RAG on ema...,,,Man (cisgender),6,en,0,write shortest tutorial on creating rag on ema...
3,3,1135,55,user,what is FAISS,what is FAISS,,,Man (cisgender),6,en,0,what is faiss
4,4,1137,55,user,Transform given code to process large .mbox file,Transform given code to process large .mbox file,,Transform given code to process large .mbox file,Man (cisgender),6,en,0,transform given code to process large mbox file
...,...,...,...,...,...,...,...,...,...,...,...,...,...
562,391,1234,65,user,can we add peid for when pefile fails?,can we add peid for when pefile fails?,,,Woman (cisgender),73,en,1,can we add peid for when pefile fails
563,429,1322,65,user,"param_grid = {\n 'min_samples': [5, 10, 20]...",provide more steps,"param_grid = {\n 'min_samples': [5, 10, 20]...",,Woman (cisgender),73,en,1,provide more steps
564,334,484,21,user,i think i onlz want to think about the imbalan...,i think i only want to think about the imbalan...,,,Woman (cisgender),73,en,1,i think i only want to think about the imbalan...
565,444,1364,65,user,from sklearn.cluster import OPTICS\nfrom sklea...,this worked. but i do not have visualizations ...,from sklearn.cluster import OPTICS\nfrom sklea...,,Woman (cisgender),73,en,1,this worked but i do not have visualizations a...


## Vectorize prompts


In [8]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words features: one row per prompt, one column per vocabulary token.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(prompts_df['conversational_norm'])

# Binary target: 1 = woman, 0 = man.
gender_to_label = {'Woman (cisgender)': 1, 'Man (cisgender)': 0}
y = prompts_df['gender'].map(gender_to_label).values

# User id per row, so the cross-validation splitter can keep each user's
# prompts together in one fold.
groups = prompts_df['user_id'].values
print(X.shape, y.shape)


(567, 1861) (567,)


## Set up group-aware 5-fold cross-validation (GroupKFold via scikit-learn)


In [9]:
from sklearn.model_selection import GroupKFold, cross_val_predict
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report
# 5-fold splitter driven by a `groups` array: rows that share a group value
# (here the user id) are never split between the training and test side of
# a fold, so a user's prompts cannot leak across the split.
gkf = GroupKFold(n_splits=5)


## Logistic Regression (prompt-level, grouped by user)


In [10]:
# Out-of-fold predictions for every prompt: each row is predicted by a model
# that never saw any prompt from the same user. The splitter and the group
# labels are handed to cross_val_predict, which builds the folds itself.
logreg = LogisticRegression(max_iter=1000)
pred_logreg = cross_val_predict(logreg, X, y, groups=groups, cv=gkf)

print('--- Logistic Regression ---')
logreg_report = classification_report(
    y,
    pred_logreg,
    target_names=['Man (cisgender)', 'Woman (cisgender)'],
)
print(logreg_report)


--- Logistic Regression ---
                   precision    recall  f1-score   support

  Man (cisgender)       0.43      0.38      0.40       282
Woman (cisgender)       0.45      0.50      0.47       285

         accuracy                           0.44       567
        macro avg       0.44      0.44      0.44       567
     weighted avg       0.44      0.44      0.44       567



## Linear SVM (prompt-level, grouped by user)


In [11]:
# Out-of-fold predictions from a linear SVM, grouped by user.
# LinearSVC's solver uses a random number generator when it runs in dual mode,
# so without a fixed seed the reported scores can drift between runs; pinning
# `random_state` makes Restart & Run All reproduce the same numbers.
svm = LinearSVC(max_iter=1000, random_state=42)

# Pass the splitter and group labels directly instead of a one-shot generator
# of pre-computed splits; the resulting folds are the same.
pred_svm = cross_val_predict(svm, X, y, groups=groups, cv=gkf)

print('--- Linear SVM ---')
print(classification_report(y, pred_svm, target_names=['Man (cisgender)', 'Woman (cisgender)']))


--- Linear SVM ---
                   precision    recall  f1-score   support

  Man (cisgender)       0.47      0.40      0.43       282
Woman (cisgender)       0.48      0.55      0.51       285

         accuracy                           0.47       567
        macro avg       0.47      0.47      0.47       567
     weighted avg       0.47      0.47      0.47       567



In [12]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
import numpy as np
import pandas as pd

# Per-fold evaluation: fit both models on every GroupKFold training split,
# score them on the held-out users, then report mean/std across the folds.
n_splits = 5
gkf = GroupKFold(n_splits=n_splits)
logreg = LogisticRegression(max_iter=1000)
# LinearSVC's dual solver is randomised; a fixed seed keeps the fold scores
# identical from one run of the notebook to the next.
svm = LinearSVC(max_iter=1000, random_state=42)

# Row label in the summary table -> scoring function. precision/recall/f1 use
# scikit-learn's default binary averaging, i.e. they describe the class
# labelled 1 in `y`.
scorers = {
    'accuracy': accuracy_score,
    'precision': precision_score,
    'recall': recall_score,
    'f1': f1_score,
}

# One list of fold scores per metric, per model.
lr_metrics = {name: [] for name in scorers}
svm_metrics = {name: [] for name in scorers}

for train_idx, test_idx in gkf.split(X, y, groups=groups):
    X_train, y_train = X[train_idx], y[train_idx]
    X_test, y_test = X[test_idx], y[test_idx]

    # Same fit -> predict -> score sequence for both models (LogReg first).
    for model, fold_scores in ((logreg, lr_metrics), (svm, svm_metrics)):
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        for name, scorer in scorers.items():
            fold_scores[name].append(scorer(y_test, y_pred))

# Build results summary (np.std is the population std over the folds).
results_summary = pd.DataFrame({
    'LogReg_mean': {m: np.mean(val) for m, val in lr_metrics.items()},
    'LogReg_std': {m: np.std(val) for m, val in lr_metrics.items()},
    'SVM_mean': {m: np.mean(val) for m, val in svm_metrics.items()},
    'SVM_std': {m: np.std(val) for m, val in svm_metrics.items()},
})
print(results_summary)

           LogReg_mean  LogReg_std  SVM_mean   SVM_std
accuracy      0.439122    0.088807  0.474539  0.067998
precision     0.529563    0.350307  0.514641  0.342621
recall        0.450356    0.263349  0.471878  0.258261
f1            0.433766    0.224752  0.453716  0.244435


## User weights

In [13]:

import numpy as np
from sklearn.model_selection import GroupKFold
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score

# Per-row weight = 1 / (# messages of that user), so every user contributes the
# same total weight regardless of how many prompts they wrote. The weights are
# then rescaled so that they average 1 over the full dataset.
user_counts = prompts_df['user_id'].value_counts()
weights = prompts_df['user_id'].map(1.0 / user_counts).values
weights = weights * (len(weights) / weights.sum())

n_splits = 5
gkf = GroupKFold(n_splits=n_splits)

logreg = LogisticRegression(max_iter=1000)
# LinearSVC's dual solver is randomised; a fixed seed keeps the reported
# scores reproducible across notebook runs.
svm = LinearSVC(max_iter=1000, random_state=42)

# Collect out-of-fold predictions for fair reporting. The buffers start at -1
# (not a valid label) rather than uninitialised memory, so a row that was
# never predicted cannot be silently scored as a real prediction.
y_pred_lr = np.full_like(y, -1)
y_pred_svm = np.full_like(y, -1)

for train_idx, test_idx in gkf.split(X, y, groups=groups):
    # Train with sample weights so each user has equal influence
    logreg.fit(X[train_idx], y[train_idx], sample_weight=weights[train_idx])
    svm.fit(X[train_idx], y[train_idx], sample_weight=weights[train_idx])

    y_pred_lr[test_idx] = logreg.predict(X[test_idx])
    y_pred_svm[test_idx] = svm.predict(X[test_idx])

# Note: the reports below are unweighted, i.e. per-message metrics.
print('--- Logistic Regression (user-balanced weights) ---')
print(classification_report(y, y_pred_lr, target_names=['Man (cisgender)', 'Woman (cisgender)']))

print('--- Linear SVM (user-balanced weights) ---')
print(classification_report(y, y_pred_svm, target_names=['Man (cisgender)', 'Woman (cisgender)']))

--- Logistic Regression (user-balanced weights) ---
                   precision    recall  f1-score   support

  Man (cisgender)       0.43      0.38      0.41       282
Woman (cisgender)       0.45      0.50      0.48       285

         accuracy                           0.44       567
        macro avg       0.44      0.44      0.44       567
     weighted avg       0.44      0.44      0.44       567

--- Linear SVM (user-balanced weights) ---
                   precision    recall  f1-score   support

  Man (cisgender)       0.47      0.39      0.42       282
Woman (cisgender)       0.49      0.57      0.52       285

         accuracy                           0.48       567
        macro avg       0.48      0.48      0.47       567
     weighted avg       0.48      0.48      0.48       567

