# Gender Prediction based on Bag of Words
- each prompt is vectorized individually
- test and train splits are user based to avoid data leakage (data from the same user in test and train data)
- use expanded prompts without capitalization, newlines, or punctuation

In [1]:
import sqlite3
import pandas as pd

# Load the full `expanded_prompts` table from the project SQLite database.
conn = sqlite3.connect('../../giicg.db')
try:
    prompts = pd.read_sql("SELECT * FROM expanded_prompts", conn)
finally:
    # Release the connection even when the query fails (e.g. missing table),
    # so a failed run does not leave the database file open in the kernel.
    conn.close()
prompts

Unnamed: 0,message_id,conversation_id,role,message_text,conversational,code,other,gender,user_id,language
0,1,1,user,"parsing data from python iterator, how it coul...","parsing data from python iterator, how it coul...",,,Man (cisgender),6,en
1,730,32,user,Write python function to do operations with in...,Write python function to do operations with in...,,report_dt\tsource\tmetric_name\tmetric_num\tme...,Man (cisgender),6,en
2,1133,55,user,Write shortest tutorial on creating RAG on ema...,Write shortest tutorial on creating RAG on ema...,,,Man (cisgender),6,en
3,1135,55,user,what is FAISS,what is FAISS,,,Man (cisgender),6,en
4,1137,55,user,Transform given code to process large .mbox file,Transform given code to process large .mbox file,,Transform given code to process large .mbox file,Man (cisgender),6,en
...,...,...,...,...,...,...,...,...,...,...
755,724,31,user,import pandas as pd\nimport numpy as np\nfrom ...,Please replace my retrieval pipeline here with...,import pandas as pd\nimport numpy as np\nfrom ...,You are tasked with separating user prompts in...,Man (cisgender),92,en
756,726,31,user,"please update my code accordingly, no comments...","please update my code accordingly, no comments...",,,Man (cisgender),92,en
757,1131,54,user,import pandas as pd\nimport numpy as np\nfrom ...,"I want to tune optimal thresholds. Currently, ...",import pandas as pd\nimport numpy as np\nfrom ...,The narratives list looks like this:\nnarrativ...,Man (cisgender),92,en
758,1532,71,user,"from transformers import AutoTokenizer, AutoMo...",I want to use an LLM for listwise reranking in...,"from transformers import AutoTokenizer, AutoMo...",,Man (cisgender),92,en


## Normalize and Filter

In [2]:
from helpers.normalization import remove_punctuation_and_newlines, remove_capitalization

# Normalize the conversational text of every prompt. The helpers are applied in
# the same order as before: punctuation/newline removal first, then the
# capitalization helper.
prompts['conversational'] = (
    prompts['conversational']
    .apply(remove_punctuation_and_newlines)
    .apply(remove_capitalization)
)

# Keep only the two gender labels that are used as classes downstream.
is_target_gender = prompts['gender'].isin(['Woman (cisgender)', 'Man (cisgender)'])

# reset_index() stores the previous row labels in a new 'index' column. On a
# re-run of this cell that column already exists and reset_index() would raise
# "cannot insert index, already exists", so the old labels are only
# materialised the first time; afterwards the index is simply renumbered.
index_already_saved = 'index' in prompts.columns
prompts = prompts[is_target_gender].reset_index(drop=index_already_saved)

prompts

  from .autonotebook import tqdm as notebook_tqdm


Unnamed: 0,index,message_id,conversation_id,role,message_text,conversational,code,other,gender,user_id,language
0,0,1,1,user,"parsing data from python iterator, how it coul...",parsing data from python iterator how it could...,,,Man (cisgender),6,en
1,1,730,32,user,Write python function to do operations with in...,write python function to do operations with in...,,report_dt\tsource\tmetric_name\tmetric_num\tme...,Man (cisgender),6,en
2,2,1133,55,user,Write shortest tutorial on creating RAG on ema...,write shortest tutorial on creating rag on ema...,,,Man (cisgender),6,en
3,3,1135,55,user,what is FAISS,what is faiss,,,Man (cisgender),6,en
4,4,1137,55,user,Transform given code to process large .mbox file,transform given code to process large mbox file,,Transform given code to process large .mbox file,Man (cisgender),6,en
...,...,...,...,...,...,...,...,...,...,...,...
748,755,724,31,user,import pandas as pd\nimport numpy as np\nfrom ...,please replace my retrieval pipeline here with...,import pandas as pd\nimport numpy as np\nfrom ...,You are tasked with separating user prompts in...,Man (cisgender),92,en
749,756,726,31,user,"please update my code accordingly, no comments...",please update my code accordingly no comments ...,,,Man (cisgender),92,en
750,757,1131,54,user,import pandas as pd\nimport numpy as np\nfrom ...,i want to tune optimal thresholds currently i ...,import pandas as pd\nimport numpy as np\nfrom ...,The narratives list looks like this:\nnarrativ...,Man (cisgender),92,en
751,758,1532,71,user,"from transformers import AutoTokenizer, AutoMo...",i want to use an llm for listwise reranking in...,"from transformers import AutoTokenizer, AutoMo...",,Man (cisgender),92,en


## Vectorize

In [3]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words step: learn the vocabulary from the normalized prompt text and
# count word occurrences per prompt, giving a document-term matrix with one
# row per prompt and one column per vocabulary word.
prompt_texts = prompts['conversational']
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(prompt_texts)

## Logistic Regression

In [4]:
from sklearn.model_selection import GroupShuffleSplit
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import pandas as pd

# Dense document-term frame: one row per prompt, one column per vocabulary word.
# It deliberately holds ONLY word counts. Writing the label/group into this
# frame (and dropping them again) would overwrite and then remove a vocabulary
# token that happens to be called 'gender' or 'user_id', which would misalign
# the coefficients with the word list further down.
word_frequencies = pd.DataFrame(
    X.toarray(),
    columns=vectorizer.get_feature_names_out(),
    index=prompts.index,
)

# Features (X), target (y) and groups are kept as separate, row-aligned objects.
# Independent copy so later in-place edits to `word_frequencies` cannot touch
# the model inputs.
X_features = word_frequencies.copy()
# Target: 1 = 'Woman (cisgender)', 0 = every other remaining label.
y = (prompts['gender'] == 'Woman (cisgender)').astype('int64')
# Group identifier used to keep all prompts of one user on the same side of the split.
groups = prompts['user_id']

# User-level train-test split
gss = GroupShuffleSplit(n_splits=1, test_size=0.10, random_state=42)
train_idx, test_idx = next(gss.split(X_features, y, groups=groups))

# Guard against leakage: no user may contribute prompts to both splits.
assert set(groups.iloc[train_idx]).isdisjoint(groups.iloc[test_idx]), \
    "a user appears in both the train and the test split"

X_train, X_test = X_features.iloc[train_idx], X_features.iloc[test_idx]
y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

# Initialize and train the logistic regression model
log_reg = LogisticRegression(max_iter=1000)
log_reg.fit(X_train, y_train)

# One coefficient per vocabulary word, in the vectorizer's column order
coefficients = log_reg.coef_[0]
words = vectorizer.get_feature_names_out()

# Create a DataFrame for easier interpretation
coeff_df = pd.DataFrame({
    'word': words,
    'coefficient': coefficients
})

# Sort by SIGNED coefficient (not absolute value): the most positive weights
# push towards class 1 ('Woman (cisgender)'), the most negative towards class 0.
discriminative_words = coeff_df.sort_values(by='coefficient', ascending=False)

# Top words indicating 'Woman (cisgender)'
top_women_words = discriminative_words.head(10)

# Top words indicating 'Man (cisgender)' (most negative coefficient is listed last)
top_men_words = discriminative_words.tail(10)

print(top_women_words)
print(top_men_words)

# Make predictions on the held-out users
y_pred = log_reg.predict(X_test)

# Generate a classification report (class 0 = man, class 1 = woman)
report = classification_report(y_test, y_pred)
print(report)

            word  coefficient
382       column     1.313872
1161        more     1.033701
481         data     1.016589
1717          so     0.978600
1857       these     0.970342
2029        want     0.931875
1450      python     0.923260
1171          my     0.869537
2080       write     0.865340
592   duplicates     0.822524
           word  coefficient
1227     nunito    -0.706009
675     explain    -0.743795
721      figure    -0.765886
937      inside    -0.794202
1034       line    -0.824775
1219        now    -0.844009
782    function    -0.853358
1357  photoshop    -0.954212
379      colors    -0.969440
757        font    -0.973129
              precision    recall  f1-score   support

           0       0.05      0.36      0.09        14
           1       0.94      0.59      0.72       238

    accuracy                           0.58       252
   macro avg       0.49      0.47      0.40       252
weighted avg       0.89      0.58      0.69       252



## SVM

In [5]:
from sklearn.model_selection import GroupShuffleSplit
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report
import pandas as pd
import numpy as np

# Build the inputs directly from the vectorizer output instead of re-using the
# frame mutated by the previous cell. Keeping the label and group out of the
# feature frame means a vocabulary token called 'gender' or 'user_id' can never
# be overwritten and dropped, so coefficients stay aligned with the word list.
X_features = pd.DataFrame(
    X.toarray(),
    columns=vectorizer.get_feature_names_out(),
    index=prompts.index,
)
# Target: 1 = 'Woman (cisgender)', 0 = every other remaining label.
y = (prompts['gender'] == 'Woman (cisgender)').astype('int64')
groups = prompts['user_id']

# User-level train-test split (same parameters and seed as the logistic
# regression cell, so both models are evaluated on the same held-out users)
gss = GroupShuffleSplit(n_splits=1, test_size=0.10, random_state=42)
train_idx, test_idx = next(gss.split(X_features, y, groups=groups))

# Guard against leakage: no user may contribute prompts to both splits.
assert set(groups.iloc[train_idx]).isdisjoint(groups.iloc[test_idx]), \
    "a user appears in both the train and the test split"

X_train, X_test = X_features.iloc[train_idx], X_features.iloc[test_idx]
y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

# Train SVM. random_state pins the data shuffling used by the dual solver so
# that re-running the notebook reproduces the same coefficients.
svm = LinearSVC(max_iter=10000, random_state=42)
svm.fit(X_train, y_train)

y_pred = svm.predict(X_test)
report = classification_report(y_test, y_pred)
print(report)

# One coefficient per vocabulary word, in the vectorizer's column order
coefs = svm.coef_[0]
words = vectorizer.get_feature_names_out()
coef_df = pd.DataFrame({'word': words, 'coefficient': coefs})

# Top words for 'Woman (cisgender)': largest positive coefficients
top_women_words = coef_df.nlargest(10, 'coefficient')
print(top_women_words)

# Top words for 'Man (cisgender)': most negative coefficients, most negative first
top_men_words = coef_df.nsmallest(10, 'coefficient')
print(top_men_words)

              precision    recall  f1-score   support

           0       0.07      0.43      0.12        14
           1       0.95      0.66      0.78       238

    accuracy                           0.65       252
   macro avg       0.51      0.55      0.45       252
weighted avg       0.90      0.65      0.75       252

                       word  coefficient
611                      eh     0.823424
382                  column     0.784822
1656  setcountersubfigures3     0.755414
444                    cool     0.698954
2053                  whole     0.685131
1134               minipage     0.678303
1375                   plot     0.677524
1205         nonfunctioning     0.675828
1857                  these     0.659502
481                    data     0.654712
            word  coefficient
1034        line    -0.893501
1049        load    -0.794180
1952  underscore    -0.728137
1357   photoshop    -0.709820
782     function    -0.661645
2093       yayyy    -0.657506
1998        