## Prompt-level Logistic Regression and SVM (with GroupKFold)


In [1]:
import sqlite3
import pandas as pd
# Read the whole `expanded_roberta_prompts` table into a DataFrame.
# try/finally guarantees the SQLite connection is closed even when the query
# raises (e.g. the table is missing), so a failed cell does not leak the handle.
conn = sqlite3.connect('../../../data/giicg.db')
try:
    prompts_df = pd.read_sql('SELECT * FROM expanded_roberta_prompts', conn)
finally:
    conn.close()


In [2]:

from IPython.display import display

# Number of messages per user, most active users first.
# NOTE(review): the gender filter is applied in a later cell, so the
# "after filtering" wording in the printed labels only holds if the table is
# already restricted to the two genders when it is loaded — confirm.
messages_per_user = (
    prompts_df['user_id']
    .value_counts()
    .rename('message_count')
    .rename_axis('user_id')
    .reset_index()
    .sort_values('message_count', ascending=False)
)

print(f"Total users (after filtering): {len(messages_per_user)}")
print(f"mean message count: {messages_per_user['message_count'].mean()}")
# Show only the ten most active users rather than the whole table.
messages_per_user.head(10)  # top 10 users by message count



Total users (after filtering): 28
mean message count: 19.142857142857142


Unnamed: 0,user_id,message_count
0,91,81
1,34,66
2,79,61
3,47,51
4,55,36
5,89,31
6,16,25
7,28,22
8,77,20
9,73,19


In [3]:
# Distinct users per gender label, largest group first.
# Step 1: one row per (user, gender) combination.
user_gender_pairs = prompts_df[['user_id', 'gender']].drop_duplicates()
# Step 2: count the different user ids seen under each gender.
unique_user_counts = user_gender_pairs.groupby('gender')['user_id'].nunique()
# Step 3: turn the counts into a two-column table sorted by group size.
users_per_gender = (
    unique_user_counts
    .rename('unique_users')
    .reset_index()
    .sort_values('unique_users', ascending=False)
)

users_per_gender

Unnamed: 0,gender,unique_users
0,Man (cisgender),15
1,Woman (cisgender),13


## Filter genders & Normalize text


In [4]:
from helpers.normalization import (remove_punctuation_and_newlines, remove_capitalization)
# Keep only the two gender classes used as labels further down.
# `.copy()` turns the filtered selection into an independent frame, so the
# column assignment below writes to a real DataFrame instead of a slice of the
# loaded table (this is what triggers pandas' SettingWithCopyWarning).
prompts_df = prompts_df[
    prompts_df['gender'].isin(['Woman (cisgender)', 'Man (cisgender)'])
].copy()

# Normalised prompt text: the punctuation/newline helper runs first, then the
# capitalization helper. Both come from helpers.normalization; their exact
# rules are not visible in this notebook.
prompts_df['masked_translated_norm'] = (
    prompts_df['masked_translated']
    .apply(remove_punctuation_and_newlines)
    .apply(remove_capitalization)
)
prompts_df


  from .autonotebook import tqdm as notebook_tqdm


Unnamed: 0,level_0,index,message_id,conversation_id,role,message_text,conversational,code,other,gender,user_id,language,label,masked_prompt,masked_translated,masked_translated_norm
0,0,0,1,1,user,"parsing data from python iterator, how it coul...","parsing data from python iterator, how it coul...",,,Man (cisgender),6,en,0,"parsing data from [TERM], how it could be hand...","parsing data from [TERM], how it could be hand...",parsing data from term how it could be handled...
1,1,1,730,32,user,Write python function to do operations with in...,Write python function to do operations with in...,,report_dt\tsource\tmetric_name\tmetric_num\tme...,Man (cisgender),6,en,0,Write python function to do operations with in...,Write python function to do operations with in...,write python function to do operations with in...
2,2,2,1133,55,user,Write shortest tutorial on creating RAG on ema...,Write shortest tutorial on creating RAG on ema...,,,Man (cisgender),6,en,0,Write shortest tutorial on creating [TERM] on ...,Write shortest tutorial on creating [TERM] on ...,write shortest tutorial on creating term on em...
3,3,3,1135,55,user,what is FAISS,what is FAISS,,,Man (cisgender),6,en,0,what is [TERM],what is [TERM],what is term
4,4,4,1137,55,user,Transform given code to process large .mbox file,Transform given code to process large .mbox file,,Transform given code to process large .mbox file,Man (cisgender),6,en,0,Transform given code to process large [TERM] file,Transform given code to process large [TERM] file,transform given code to process large term file
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
531,531,501,1674,87,user,Accuracy: 1.0\n Count: 2\nMetrics for neptune...,these are the results. i to calculate a statis...,,Accuracy: 1.0\n Count: 2\nMetrics for neptune...,Woman (cisgender),73,en,1,[INFO]\n[INFO]\n[TERM]:\n [TERM]: [OTHER]\n ...,[INFO]\n[INFO]\n[TERM]:\n [TERM]: [OTHER]\n ...,info info term term other term other ter...
532,532,416,1290,65,user,how are we currently processing non numerical ...,how are we currently processing non numerical ...,"def perform_optics_clustering(file_path, outpu...",,Woman (cisgender),73,en,1,how are we currently processing non numerical ...,how are we currently processing non numerical ...,how are we currently processing non numerical ...
533,533,425,1314,65,user,what is the reachability score,what is the reachability score,,,Woman (cisgender),73,en,1,what is the [TERM],what is the [TERM],what is the term
534,534,309,372,21,user,"my features are saved in ""train_features.npy"" ...","my features are saved in ""train_features.npy"" ...",,,Woman (cisgender),73,en,1,my features are saved in [TERM] and the file n...,my features are saved in [TERM] and the file n...,my features are saved in term and the file nam...


## Vectorize prompts


In [5]:
from sklearn.feature_extraction.text import CountVectorizer
# Binary target encoding used throughout the notebook: 1 = woman, 0 = man.
gender_to_label = {'Woman (cisgender)': 1, 'Man (cisgender)': 0}

# Bag-of-words token counts over the normalised prompts.
# Note: the vocabulary is fitted on every row, i.e. before any CV split.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(prompts_df['masked_translated_norm'])

# One user id per row, so that folds can keep each user's prompts together.
groups = prompts_df['user_id'].values # Needed for group-aware split
y = prompts_df['gender'].map(gender_to_label).values
print(X.shape, y.shape)


(536, 2211) (536,)


## Set up group-aware 5-fold cross-validation (GroupKFold via scikit-learn)


In [6]:
from sklearn.model_selection import GroupKFold, cross_val_predict
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report
# Five group-aware folds; the grouping key is supplied when the splitter is used.
n_splits = 5
gkf = GroupKFold(n_splits=n_splits)


## Logistic Regression (prompt-level, grouped by user)


In [7]:
# Out-of-fold predictions for every prompt: the splitter and the user ids are
# handed to cross_val_predict, which builds the same grouped folds itself.
logreg = LogisticRegression(max_iter=1000)
pred_logreg = cross_val_predict(logreg, X, y, groups=groups, cv=gkf)

# Class names in label order (0 = man, 1 = woman).
gender_names = ['Man (cisgender)', 'Woman (cisgender)']
print('--- Logistic Regression ---')
print(classification_report(y, pred_logreg, target_names=gender_names))


--- Logistic Regression ---
                   precision    recall  f1-score   support

  Man (cisgender)       0.45      0.47      0.46       282
Woman (cisgender)       0.38      0.36      0.37       254

         accuracy                           0.42       536
        macro avg       0.41      0.41      0.41       536
     weighted avg       0.41      0.42      0.41       536



## Linear SVM (prompt-level, grouped by user)


In [8]:
# Linear SVM evaluated with the same grouped folds as the logistic regression.
# random_state is fixed because LinearSVC's dual coordinate-descent solver
# shuffles the data with a random number generator; without a seed the
# reported scores can change from one run to the next.
svm = LinearSVC(max_iter=1000, random_state=42)

# Passing the splitter plus `groups` lets cross_val_predict build the
# user-grouped folds itself instead of consuming a one-shot generator.
pred_svm = cross_val_predict(svm, X, y, groups=groups, cv=gkf)
print('--- Linear SVM ---')
print(classification_report(y, pred_svm, target_names=['Man (cisgender)', 'Woman (cisgender)']))


--- Linear SVM ---
                   precision    recall  f1-score   support

  Man (cisgender)       0.50      0.50      0.50       282
Woman (cisgender)       0.44      0.43      0.44       254

         accuracy                           0.47       536
        macro avg       0.47      0.47      0.47       536
     weighted avg       0.47      0.47      0.47       536





In [9]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
import numpy as np
import pandas as pd

# Per-fold metrics (positive class = label 1) for both models, summarised as
# mean and standard deviation across the grouped folds.

# Set up GroupKFold
n_splits = 5
gkf = GroupKFold(n_splits=n_splits)
logreg = LogisticRegression(max_iter=1000)
# Seeded so the dual solver's internal shuffling is repeatable between runs.
svm = LinearSVC(max_iter=1000, random_state=42)


def _record_fold(store, y_true, y_hat):
    """Append one fold's accuracy, precision, recall and F1 to `store`.

    `store` is a dict of lists keyed by metric name. `zero_division=0` makes the
    undefined case (no predicted or no true positives in a fold) an explicit 0
    — the same value the default produces, but without the warning.
    """
    store['accuracy'].append(accuracy_score(y_true, y_hat))
    store['precision'].append(precision_score(y_true, y_hat, zero_division=0))
    store['recall'].append(recall_score(y_true, y_hat, zero_division=0))
    store['f1'].append(f1_score(y_true, y_hat, zero_division=0))


# Store metrics for each model and fold
lr_metrics = {'accuracy': [], 'precision': [], 'recall': [], 'f1': []}
svm_metrics = {'accuracy': [], 'precision': [], 'recall': [], 'f1': []}

for train_idx, test_idx in gkf.split(X, y, groups=groups):
    X_train, y_train = X[train_idx], y[train_idx]
    X_test, y_test = X[test_idx], y[test_idx]

    # Logistic Regression
    y_pred_lr = logreg.fit(X_train, y_train).predict(X_test)
    _record_fold(lr_metrics, y_test, y_pred_lr)

    # SVM
    y_pred_svm = svm.fit(X_train, y_train).predict(X_test)
    _record_fold(svm_metrics, y_test, y_pred_svm)

# Build results summary (np.std is the population standard deviation, ddof=0)
results_summary = pd.DataFrame({
    'LogReg_mean': {m: np.mean(val) for m, val in lr_metrics.items()},
    'LogReg_std': {m: np.std(val) for m, val in lr_metrics.items()},
    'SVM_mean': {m: np.mean(val) for m, val in svm_metrics.items()},
    'SVM_std': {m: np.std(val) for m, val in svm_metrics.items()},
})
print(results_summary)

           LogReg_mean  LogReg_std  SVM_mean   SVM_std
accuracy      0.415834    0.119283  0.470077  0.083604
precision     0.500716    0.402272  0.499784  0.410870
recall        0.386624    0.274604  0.387176  0.219436
f1            0.322076    0.200120  0.362908  0.231440


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


## User weights

In [10]:

import numpy as np
from sklearn.model_selection import GroupKFold
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score

# Per-row weight = 1 / (# messages of that user), normalized to average ~ 1
user_counts = prompts_df['user_id'].value_counts()
weights = prompts_df['user_id'].map(1.0 / user_counts).values
weights = weights * (len(weights) / weights.sum())

n_splits = 5
gkf = GroupKFold(n_splits=n_splits)

logreg = LogisticRegression(max_iter=1000)
# Seeded so the dual solver's internal shuffling is repeatable between runs.
svm = LinearSVC(max_iter=1000, random_state=42)

# Collect out-of-fold predictions for fair reporting
y_pred_lr = np.empty_like(y)
y_pred_svm = np.empty_like(y)

# The prediction buffers start uninitialised, so track which rows have been
# written and verify below that every row was predicted exactly via some fold.
covered_rows = set()

for train_idx, test_idx in gkf.split(X, y, groups=groups):
    # Train with sample weights so each user has equal influence
    logreg.fit(X[train_idx], y[train_idx], sample_weight=weights[train_idx])
    svm.fit(X[train_idx], y[train_idx], sample_weight=weights[train_idx])

    y_pred_lr[test_idx] = logreg.predict(X[test_idx])
    y_pred_svm[test_idx] = svm.predict(X[test_idx])
    covered_rows.update(test_idx.tolist())

assert len(covered_rows) == len(y), "some rows never received an out-of-fold prediction"

# NOTE(review): the weights are used for training only; the reports below count
# every prompt equally, so users with many prompts still dominate the scores.
print('--- Logistic Regression (user-balanced weights) ---')
print(classification_report(y, y_pred_lr, target_names=['Man (cisgender)', 'Woman (cisgender)']))

print('--- Linear SVM (user-balanced weights) ---')
print(classification_report(y, y_pred_svm, target_names=['Man (cisgender)', 'Woman (cisgender)']))



--- Logistic Regression (user-balanced weights) ---
                   precision    recall  f1-score   support

  Man (cisgender)       0.46      0.43      0.44       282
Woman (cisgender)       0.41      0.44      0.42       254

         accuracy                           0.43       536
        macro avg       0.43      0.43      0.43       536
     weighted avg       0.43      0.43      0.43       536

--- Linear SVM (user-balanced weights) ---
                   precision    recall  f1-score   support

  Man (cisgender)       0.48      0.48      0.48       282
Woman (cisgender)       0.43      0.43      0.43       254

         accuracy                           0.46       536
        macro avg       0.45      0.45      0.45       536
     weighted avg       0.46      0.46      0.46       536



