# Gender Prediction based on Bag of Words
- all prompts of a user are concatenated and then vectorized
use expanded prompts wihtout capitalization, new lines and punctuation

In [24]:
import sqlite3
import pandas as pd

conn = sqlite3.connect('../../giicg.db')

prompts = pd.read_sql("SELECT * FROM expanded_prompts", conn)
conn.close()
prompts

Unnamed: 0,message_id,conversation_id,role,message_text,conversational,code,other,gender,user_id,language
0,1,1,user,"parsing data from python iterator, how it coul...","parsing data from python iterator, how it coul...",,,Man (cisgender),6,en
1,730,32,user,Write python function to do operations with in...,Write python function to do operations with in...,,report_dt\tsource\tmetric_name\tmetric_num\tme...,Man (cisgender),6,en
2,1133,55,user,Write shortest tutorial on creating RAG on ema...,Write shortest tutorial on creating RAG on ema...,,,Man (cisgender),6,en
3,1135,55,user,what is FAISS,what is FAISS,,,Man (cisgender),6,en
4,1137,55,user,Transform given code to process large .mbox file,Transform given code to process large .mbox file,,Transform given code to process large .mbox file,Man (cisgender),6,en
...,...,...,...,...,...,...,...,...,...,...
748,1131,54,user,import pandas as pd\nimport numpy as np\nfrom ...,"I want to tune optimal thresholds. Currently, ...",import pandas as pd\nimport numpy as np\nfrom ...,The narratives list looks like this:\nnarrativ...,Man (cisgender),92,en
749,1532,71,user,"from transformers import AutoTokenizer, AutoMo...",I want to use an LLM for listwise reranking in...,"from transformers import AutoTokenizer, AutoMo...",,Man (cisgender),92,en
750,1646,82,user,"def run_query(query, n_results):\n query_em...",this is my code. I want to: Get nodes and edge...,"def run_query(query, n_results):\n query_em...",,Man (cisgender),92,en
751,1849,2,user,\n I am working on the problem of reconstru...,\n I am working on the problem of reconstru...,,Classic CV - Drone navigation\nIf you ever tho...,Man (cisgender),8,en


## Normalize

In [25]:
from helpers.normalization import remove_punctuation_and_newlines, remove_capitalization

prompts['conversational'] = prompts['conversational'].apply(remove_punctuation_and_newlines)
prompts['conversational'] = prompts['conversational'].apply(remove_capitalization)

prompts

Unnamed: 0,message_id,conversation_id,role,message_text,conversational,code,other,gender,user_id,language
0,1,1,user,"parsing data from python iterator, how it coul...",parsing data from python iterator how it could...,,,Man (cisgender),6,en
1,730,32,user,Write python function to do operations with in...,write python function to do operations with in...,,report_dt\tsource\tmetric_name\tmetric_num\tme...,Man (cisgender),6,en
2,1133,55,user,Write shortest tutorial on creating RAG on ema...,write shortest tutorial on creating rag on ema...,,,Man (cisgender),6,en
3,1135,55,user,what is FAISS,what is faiss,,,Man (cisgender),6,en
4,1137,55,user,Transform given code to process large .mbox file,transform given code to process large mbox file,,Transform given code to process large .mbox file,Man (cisgender),6,en
...,...,...,...,...,...,...,...,...,...,...
748,1131,54,user,import pandas as pd\nimport numpy as np\nfrom ...,i want to tune optimal thresholds currently i ...,import pandas as pd\nimport numpy as np\nfrom ...,The narratives list looks like this:\nnarrativ...,Man (cisgender),92,en
749,1532,71,user,"from transformers import AutoTokenizer, AutoMo...",i want to use an llm for listwise reranking in...,"from transformers import AutoTokenizer, AutoMo...",,Man (cisgender),92,en
750,1646,82,user,"def run_query(query, n_results):\n query_em...",this is my code i want to get nodes and edges ...,"def run_query(query, n_results):\n query_em...",,Man (cisgender),92,en
751,1849,2,user,\n I am working on the problem of reconstru...,i am working on the problem of reconstruc...,,Classic CV - Drone navigation\nIf you ever tho...,Man (cisgender),8,en


## Concatenate prompts per user

In [26]:
user_prompts = (
    prompts.groupby(['user_id', 'gender'])['conversational']
    .apply(' '.join)
    .reset_index()    # Reset index to create a DataFrame
)

user_prompts.columns = ['user_id', 'gender', 'combined_prompts']
user_prompts = user_prompts[user_prompts['gender'].isin(['Woman (cisgender)', 'Man (cisgender)'])].reset_index()

user_prompts

Unnamed: 0,index,user_id,gender,combined_prompts
0,0,6,Man (cisgender),parsing data from python iterator how it could...
1,1,8,Man (cisgender),i am working on the problem of reconstruc...
2,2,11,Woman (cisgender),can you adapt the following code so that inste...
3,3,15,Man (cisgender),setalltables action is currently not fetching ...
4,4,16,Woman (cisgender),i want to use dummy hot encoding to replace th...
5,5,25,Man (cisgender),what is the best way to encode and compress a ...
6,6,28,Woman (cisgender),i have a pandas dataframe like this i want to...
7,8,31,Man (cisgender),how can i make use of an observablehqdatabasec...
8,9,34,Man (cisgender),blender and python i have a collection of hund...
9,10,46,Man (cisgender),how to run a python future without blocking ie...


## Vectorize and label

In [27]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(user_prompts['combined_prompts'])
y = user_prompts['gender'].apply(lambda x: 1 if x == 'Woman (cisgender)' else 0)


## Logistic Regression

In [34]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import classification_report
import numpy as np

log_reg = LogisticRegression(max_iter=1000)
y_pred_lr = cross_val_predict(log_reg, X, y, cv=5)
print("Logistic Regression Results (5-fold CV):")
print(classification_report(y, y_pred_lr))

# Fit on full data to inspect coefficients
log_reg.fit(X, y)
coefficients = log_reg.coef_[0]
words = vectorizer.get_feature_names_out()
coeff_df = pd.DataFrame({'word': words, 'coefficient': coefficients})

top_women_words = coeff_df.sort_values(by='coefficient', ascending=False).head(10)
top_men_words = coeff_df.sort_values(by='coefficient', ascending=True).head(10)
print("Top words indicating 'Woman (cisgender)':")
print(top_women_words)
print("Top words indicating 'Man (cisgender)':")
print(top_men_words)


Logistic Regression Results (5-fold CV):
              precision    recall  f1-score   support

           0       0.67      0.80      0.73        15
           1       0.67      0.50      0.57        12

    accuracy                           0.67        27
   macro avg       0.67      0.65      0.65        27
weighted avg       0.67      0.67      0.66        27

Top words indicating 'Woman (cisgender)':
         word  coefficient
2096      you     0.322304
298       can     0.319397
1189      new     0.254626
1106       me     0.226543
133       and     0.217548
2060  without     0.206613
1372   please     0.195602
174        as     0.182050
317    change     0.168213
2045    where     0.150212
Top words indicating 'Man (cisgender)':
          word  coefficient
958         is    -0.311814
889         if    -0.166107
779       from    -0.151577
1234        of    -0.147671
781   function    -0.131132
1973       use    -0.116053
1210       not    -0.112035
1848       the    -0.108791
1

In [31]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

skf = StratifiedKFold(n_splits=5)
accs, f1s, precs, recs = [], [], [], []

for train_idx, test_idx in skf.split(X, y):
    log_reg.fit(X[train_idx], y[train_idx])
    y_pred = log_reg.predict(X[test_idx])
    accs.append(accuracy_score(y[test_idx], y_pred))
    f1s.append(f1_score(y[test_idx], y_pred, average='weighted'))
    precs.append(precision_score(y[test_idx], y_pred, average='weighted'))
    recs.append(recall_score(y[test_idx], y_pred, average='weighted'))

results = pd.DataFrame({
    'accuracy': accs,
    'f1': f1s,
    'precision': precs,
    'recall': recs
})
results_summary = results.agg(['mean', 'std']).transpose()

# Export
results_summary.to_latex('cv_metrics_summary.tex')
results_summary

Unnamed: 0,mean,std
accuracy,0.66,0.173845
f1,0.639286,0.194471
precision,0.688333,0.243784
recall,0.66,0.173845


## SVM

In [29]:
from sklearn.svm import LinearSVC

svm = LinearSVC(max_iter=10000)
y_pred_svm = cross_val_predict(svm, X, y, cv=5)
print("SVM Results (5-fold CV):")
print(classification_report(y, y_pred_svm))

# Fit on full data to inspect coefficients
svm.fit(X, y)
coefs = svm.coef_[0]
coef_df = pd.DataFrame({'word': words, 'coefficient': coefs})

top_women_words = coef_df.sort_values(by='coefficient', ascending=False).head(10)
top_men_words = coef_df.sort_values(by='coefficient', ascending=True).head(10)
print("Top words indicating 'Woman (cisgender)':")
print(top_women_words)
print("Top words indicating 'Man (cisgender)':")
print(top_men_words)


SVM Results (5-fold CV):
              precision    recall  f1-score   support

           0       0.68      0.87      0.76        15
           1       0.75      0.50      0.60        12

    accuracy                           0.70        27
   macro avg       0.72      0.68      0.68        27
weighted avg       0.71      0.70      0.69        27

Top words indicating 'Woman (cisgender)':
         word  coefficient
2096      you     0.089907
1189      new     0.081720
298       can     0.081071
174        as     0.076709
1106       me     0.056069
2060  without     0.055288
133       and     0.052628
1372   please     0.051426
2045    where     0.050012
1707    small     0.048284
Top words indicating 'Man (cisgender)':
              word  coefficient
958             is    -0.113281
1234            of    -0.061490
889             if    -0.050823
2051           why    -0.045049
179             at    -0.042706
476      currently    -0.038787
890         iframe    -0.038067
560   distrib

In [36]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
import pandas as pd


skf = StratifiedKFold(n_splits=5)
svm_accs, svm_f1s, svm_precs, svm_recs = [], [], [], []

for train_idx, test_idx in skf.split(X, y):
    svm.fit(X[train_idx], y[train_idx])
    y_pred = svm.predict(X[test_idx])
    svm_accs.append(accuracy_score(y[test_idx], y_pred))
    svm_f1s.append(f1_score(y[test_idx], y_pred, average='weighted'))
    svm_precs.append(precision_score(y[test_idx], y_pred, average='weighted'))
    svm_recs.append(recall_score(y[test_idx], y_pred, average='weighted'))

svm_results = pd.DataFrame({
    'accuracy': svm_accs,
    'f1': svm_f1s,
    'precision': svm_precs,
    'recall': svm_recs
})
svm_results_summary = svm_results.agg(['mean', 'std']).transpose()

svm_results_summary

Unnamed: 0,mean,std
accuracy,0.706667,0.089443
f1,0.68619,0.095981
precision,0.783333,0.106719
recall,0.706667,0.089443
