# Gender Prediction based on Bag of Words
- each prompt is vectorized individually
- test and train splits are user based to avoid data leakage (data from the same user in test and train data)
- use expanded prompts wihtout capitalization, new lines and punctuation

In [1]:
import sqlite3
import pandas as pd

conn = sqlite3.connect('../../../giicg.db')

prompts = pd.read_sql("SELECT * FROM expanded_prompts", conn)
conn.close()
prompts

Unnamed: 0,message_id,conversation_id,role,message_text,conversational,code,other,gender,user_id,language
0,1,1,user,"parsing data from python iterator, how it coul...","parsing data from python iterator, how it coul...",,,Man (cisgender),6,en
1,730,32,user,Write python function to do operations with in...,Write python function to do operations with in...,,report_dt\tsource\tmetric_name\tmetric_num\tme...,Man (cisgender),6,en
2,1133,55,user,Write shortest tutorial on creating RAG on ema...,Write shortest tutorial on creating RAG on ema...,,,Man (cisgender),6,en
3,1135,55,user,what is FAISS,what is FAISS,,,Man (cisgender),6,en
4,1137,55,user,Transform given code to process large .mbox file,Transform given code to process large .mbox file,,Transform given code to process large .mbox file,Man (cisgender),6,en
...,...,...,...,...,...,...,...,...,...,...
748,1131,54,user,import pandas as pd\nimport numpy as np\nfrom ...,"I want to tune optimal thresholds. Currently, ...",import pandas as pd\nimport numpy as np\nfrom ...,The narratives list looks like this:\nnarrativ...,Man (cisgender),92,en
749,1532,71,user,"from transformers import AutoTokenizer, AutoMo...",I want to use an LLM for listwise reranking in...,"from transformers import AutoTokenizer, AutoMo...",,Man (cisgender),92,en
750,1646,82,user,"def run_query(query, n_results):\n query_em...",this is my code. I want to: Get nodes and edge...,"def run_query(query, n_results):\n query_em...",,Man (cisgender),92,en
751,1849,2,user,\n I am working on the problem of reconstru...,\n I am working on the problem of reconstru...,,Classic CV - Drone navigation\nIf you ever tho...,Man (cisgender),8,en


## Normalize and Filter

In [2]:
from helpers.normalization import remove_punctuation_and_newlines, remove_capitalization

prompts['conversational'] = prompts['conversational'].apply(remove_punctuation_and_newlines)
prompts['conversational'] = prompts['conversational'].apply(remove_capitalization)

prompts = prompts[prompts['gender'].isin(['Woman (cisgender)', 'Man (cisgender)'])].reset_index()

prompts

  from .autonotebook import tqdm as notebook_tqdm


Unnamed: 0,index,message_id,conversation_id,role,message_text,conversational,code,other,gender,user_id,language
0,0,1,1,user,"parsing data from python iterator, how it coul...",parsing data from python iterator how it could...,,,Man (cisgender),6,en
1,1,730,32,user,Write python function to do operations with in...,write python function to do operations with in...,,report_dt\tsource\tmetric_name\tmetric_num\tme...,Man (cisgender),6,en
2,2,1133,55,user,Write shortest tutorial on creating RAG on ema...,write shortest tutorial on creating rag on ema...,,,Man (cisgender),6,en
3,3,1135,55,user,what is FAISS,what is faiss,,,Man (cisgender),6,en
4,4,1137,55,user,Transform given code to process large .mbox file,transform given code to process large mbox file,,Transform given code to process large .mbox file,Man (cisgender),6,en
...,...,...,...,...,...,...,...,...,...,...,...
741,748,1131,54,user,import pandas as pd\nimport numpy as np\nfrom ...,i want to tune optimal thresholds currently i ...,import pandas as pd\nimport numpy as np\nfrom ...,The narratives list looks like this:\nnarrativ...,Man (cisgender),92,en
742,749,1532,71,user,"from transformers import AutoTokenizer, AutoMo...",i want to use an llm for listwise reranking in...,"from transformers import AutoTokenizer, AutoMo...",,Man (cisgender),92,en
743,750,1646,82,user,"def run_query(query, n_results):\n query_em...",this is my code i want to get nodes and edges ...,"def run_query(query, n_results):\n query_em...",,Man (cisgender),92,en
744,751,1849,2,user,\n I am working on the problem of reconstru...,i am working on the problem of reconstruc...,,Classic CV - Drone navigation\nIf you ever tho...,Man (cisgender),8,en


## Vectorize

In [3]:
from sklearn.feature_extraction.text import CountVectorizer
# Vectorize the prompts to create a document-term matrix
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(prompts['conversational'])

## Logistic Regression

In [5]:
from sklearn.model_selection import GroupShuffleSplit
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import pandas as pd

# Convert to a DataFrame for better interpretability
word_frequencies = pd.DataFrame(
    X.toarray(),
    columns=vectorizer.get_feature_names_out()
)

# Add gender as the target variable
word_frequencies['gender'] = prompts['gender'].apply(lambda x: 1 if x == 'Woman (cisgender)' else 0)
# Add user_id as group identifier
word_frequencies['user_id'] = prompts['user_id']

# Separate features (X), target (y), and groups
X_features = word_frequencies.drop(columns=['gender', 'user_id'])
y = word_frequencies['gender']
groups = word_frequencies['user_id']

# User-level train-test split
gss = GroupShuffleSplit(n_splits=1, test_size=0.20, random_state=42)
train_idx, test_idx = next(gss.split(X_features, y, groups=groups))

X_train, X_test = X_features.iloc[train_idx], X_features.iloc[test_idx]
y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

# Initialize the logistic regression model
log_reg = LogisticRegression(max_iter=1000)

# Train the model
log_reg.fit(X_train, y_train)

# Get feature names and coefficients
coefficients = log_reg.coef_[0]
words = vectorizer.get_feature_names_out()

# Create a DataFrame for easier interpretation
coeff_df = pd.DataFrame({
    'word': words,
    'coefficient': coefficients
})

# Sort by absolute value of coefficients for most discriminative words
discriminative_words = coeff_df.sort_values(by='coefficient', ascending=False)

# Top words indicating 'Woman (cisgender)'
top_women_words = discriminative_words.head(10)

# Top words indicating 'Man (cisgender)'
top_men_words = discriminative_words.tail(10)

print(top_women_words)
print(top_men_words)

# Make predictions
y_pred = log_reg.predict(X_test)

# Generate a classification report
report = classification_report(y_test, y_pred)
print(report)

         word  coefficient
1714       so     1.192011
2026     want     1.090670
380    column     1.058017
479      data     1.027068
1990   values     1.001054
723      file     0.929085
1447   python     0.855483
1158     more     0.853406
1121   method     0.791863
1433  provide     0.749003
           word  coefficient
933      inside    -0.850487
328        chat    -0.893409
2067       work    -0.913101
1017       left    -0.996114
2061       woff    -1.010160
1224     nunito    -1.044823
1355  photoshop    -1.221669
721      figure    -1.225112
781    function    -1.329530
757        font    -1.434745
              precision    recall  f1-score   support

           0       0.93      0.14      0.24       101
           1       0.15      0.94      0.25        16

    accuracy                           0.25       117
   macro avg       0.54      0.54      0.25       117
weighted avg       0.83      0.25      0.24       117



## SVM

In [6]:
from sklearn.model_selection import GroupShuffleSplit
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report
import pandas as pd
import numpy as np

# Prepare the target just like before
word_frequencies['gender'] = prompts['gender'].apply(lambda x: 1 if x == 'Woman (cisgender)' else 0)
word_frequencies['user_id'] = prompts['user_id']

X_features = word_frequencies.drop(columns=['gender', 'user_id'])
y = word_frequencies['gender']
groups = word_frequencies['user_id']

# User-level train-test split
gss = GroupShuffleSplit(n_splits=1, test_size=0.10, random_state=42)
train_idx, test_idx = next(gss.split(X_features, y, groups=groups))

X_train, X_test = X_features.iloc[train_idx], X_features.iloc[test_idx]
y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

# Train SVM
svm = LinearSVC(max_iter=10000)
svm.fit(X_train, y_train)

y_pred = svm.predict(X_test)
report = classification_report(y_test, y_pred)
print(report)

coefs = svm.coef_[0]
words = vectorizer.get_feature_names_out()
coef_df = pd.DataFrame({'word': words, 'coefficient': coefs})

# Top words for 'Woman (cisgender)'
top_women_words = coef_df.sort_values(by='coefficient', ascending=False).head(10)
print(top_women_words)

# Top words for 'Man (cisgender)'
top_men_words = coef_df.sort_values(by='coefficient', ascending=True).head(10)
print(top_men_words)

              precision    recall  f1-score   support

           0       1.00      0.25      0.40        77
           1       0.00      0.00      0.00         0

    accuracy                           0.25        77
   macro avg       0.50      0.12      0.20        77
weighted avg       1.00      0.25      0.40        77

            word  coefficient
380       column     0.727119
1433     provide     0.625525
1579         run     0.602093
1343     perfect     0.595014
690           f1     0.580812
1714          so     0.576123
992          key     0.575640
610           eh     0.574357
723         file     0.571744
1987  validation     0.564319
            word  coefficient
2090       yayyy    -1.013399
1950  underscore    -0.964576
388       coming    -0.961711
781     function    -0.954810
700        faiss    -0.948291
1355   photoshop    -0.854709
328         chat    -0.817405
784        funny    -0.785476
906    important    -0.776544
721       figure    -0.773215


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
