# Gender Prediction based on Bag of Words
- all prompts of a user are concatenated and then vectorized
use expanded prompts wihtout capitalization, new lines and punctuation

In [1]:
import sqlite3
import pandas as pd

conn = sqlite3.connect('../../giicg.db')

prompts = pd.read_sql("SELECT * FROM expanded_prompts", conn)
conn.close()
prompts

Unnamed: 0,message_id,conversation_id,role,message_text,conversational,code,other,gender,user_id,language
0,1,1,user,"parsing data from python iterator, how it coul...","parsing data from python iterator, how it coul...",,,Man (cisgender),6,en
1,730,32,user,Write python function to do operations with in...,Write python function to do operations with in...,,report_dt\tsource\tmetric_name\tmetric_num\tme...,Man (cisgender),6,en
2,1133,55,user,Write shortest tutorial on creating RAG on ema...,Write shortest tutorial on creating RAG on ema...,,,Man (cisgender),6,en
3,1135,55,user,what is FAISS,what is FAISS,,,Man (cisgender),6,en
4,1137,55,user,Transform given code to process large .mbox file,Transform given code to process large .mbox file,,Transform given code to process large .mbox file,Man (cisgender),6,en
...,...,...,...,...,...,...,...,...,...,...
755,724,31,user,import pandas as pd\nimport numpy as np\nfrom ...,Please replace my retrieval pipeline here with...,import pandas as pd\nimport numpy as np\nfrom ...,You are tasked with separating user prompts in...,Man (cisgender),92,en
756,726,31,user,"please update my code accordingly, no comments...","please update my code accordingly, no comments...",,,Man (cisgender),92,en
757,1131,54,user,import pandas as pd\nimport numpy as np\nfrom ...,"I want to tune optimal thresholds. Currently, ...",import pandas as pd\nimport numpy as np\nfrom ...,The narratives list looks like this:\nnarrativ...,Man (cisgender),92,en
758,1532,71,user,"from transformers import AutoTokenizer, AutoMo...",I want to use an LLM for listwise reranking in...,"from transformers import AutoTokenizer, AutoMo...",,Man (cisgender),92,en


## Normalize

In [2]:
from helpers.normalization import remove_punctuation_and_newlines, remove_capitalization

prompts['conversational'] = prompts['conversational'].apply(remove_punctuation_and_newlines)
prompts['conversational'] = prompts['conversational'].apply(remove_capitalization)

prompts

  from .autonotebook import tqdm as notebook_tqdm


Unnamed: 0,message_id,conversation_id,role,message_text,conversational,code,other,gender,user_id,language
0,1,1,user,"parsing data from python iterator, how it coul...",parsing data from python iterator how it could...,,,Man (cisgender),6,en
1,730,32,user,Write python function to do operations with in...,write python function to do operations with in...,,report_dt\tsource\tmetric_name\tmetric_num\tme...,Man (cisgender),6,en
2,1133,55,user,Write shortest tutorial on creating RAG on ema...,write shortest tutorial on creating rag on ema...,,,Man (cisgender),6,en
3,1135,55,user,what is FAISS,what is faiss,,,Man (cisgender),6,en
4,1137,55,user,Transform given code to process large .mbox file,transform given code to process large mbox file,,Transform given code to process large .mbox file,Man (cisgender),6,en
...,...,...,...,...,...,...,...,...,...,...
755,724,31,user,import pandas as pd\nimport numpy as np\nfrom ...,please replace my retrieval pipeline here with...,import pandas as pd\nimport numpy as np\nfrom ...,You are tasked with separating user prompts in...,Man (cisgender),92,en
756,726,31,user,"please update my code accordingly, no comments...",please update my code accordingly no comments ...,,,Man (cisgender),92,en
757,1131,54,user,import pandas as pd\nimport numpy as np\nfrom ...,i want to tune optimal thresholds currently i ...,import pandas as pd\nimport numpy as np\nfrom ...,The narratives list looks like this:\nnarrativ...,Man (cisgender),92,en
758,1532,71,user,"from transformers import AutoTokenizer, AutoMo...",i want to use an llm for listwise reranking in...,"from transformers import AutoTokenizer, AutoMo...",,Man (cisgender),92,en


## Concatenate all prompts

In [3]:
user_prompts = (
    prompts.groupby(['user_id', 'gender'])['conversational']
    .apply(' '.join)
    .reset_index()    # Reset index to create a DataFrame
)

user_prompts.columns = ['user_id', 'gender', 'combined_prompts']
user_prompts = user_prompts[user_prompts['gender'].isin(['Woman (cisgender)', 'Man (cisgender)'])].reset_index()

user_prompts

Unnamed: 0,index,user_id,gender,combined_prompts
0,0,6,Man (cisgender),parsing data from python iterator how it could...
1,1,11,Woman (cisgender),can you adapt the following code so that inste...
2,2,15,Man (cisgender),setalltables action is currently not fetching ...
3,3,16,Woman (cisgender),i want to use dummy hot encoding to replace th...
4,4,25,Man (cisgender),what is the best way to encode and compress a ...
5,5,28,Woman (cisgender),i have a pandas dataframe like this i want to...
6,7,31,Man (cisgender),how can i make use of an observablehqdatabasec...
7,8,34,Man (cisgender),blender and python i have a collection of hund...
8,9,46,Man (cisgender),how to run a python future without blocking ie...
9,10,47,Man (cisgender),can you create photoshop scripts in which form...


## Vectorize

In [4]:
from sklearn.feature_extraction.text import CountVectorizer
# Vectorize the prompts to create a document-term matrix
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(user_prompts['combined_prompts'])

## Logistic Regression

In [5]:

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Convert to a DataFrame for better interpretability
word_frequencies = pd.DataFrame(
    X.toarray(),
    columns=vectorizer.get_feature_names_out()
)

# Add gender as the target variable
word_frequencies['gender'] = user_prompts['gender'].apply(lambda x: 1 if x == 'Woman (cisgender)' else 0)


# Separate features (X) and target (y)
X = word_frequencies.drop('gender', axis=1)
y = word_frequencies['gender']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)


# Initialize the logistic regression model
log_reg = LogisticRegression(max_iter=1000)

# Train the model
log_reg.fit(X_train, y_train)


# Get feature names and coefficients
coefficients = log_reg.coef_[0]
words = vectorizer.get_feature_names_out()

# Create a DataFrame for easier interpretation
coeff_df = pd.DataFrame({
    'word': words,
    'coefficient': coefficients
})

# Sort by absolute value of coefficients for most discriminative words
discriminative_words = coeff_df.sort_values(by='coefficient', ascending=False)

# Top words indicating 'Woman (cisgender)'
top_women_words = discriminative_words.head(10)

# Top words indicating 'Man (cisgender)'
top_men_words = discriminative_words.tail(10)


print(top_women_words)
print(top_men_words)

# Make predictions
y_pred = log_reg.predict(X_test)

# Generate a classification report
report = classification_report(y_test, y_pred)
print(report)

         word  coefficient
2099      you     0.310579
300       can     0.216447
2063  without     0.212364
1109       me     0.208998
134       and     0.198019
1192      new     0.181977
1374   please     0.168503
1269       or     0.167267
382    column     0.156943
804      give     0.140481
           word  coefficient
478   currently    -0.094468
782    function    -0.096545
872         how    -0.101838
128          an    -0.102818
966          it    -0.108563
1976        use    -0.115051
893          if    -0.115303
1889         to    -0.116423
1237         of    -0.121988
961          is    -0.228590
              precision    recall  f1-score   support

           0       0.57      1.00      0.73         4
           1       1.00      0.25      0.40         4

    accuracy                           0.62         8
   macro avg       0.79      0.62      0.56         8
weighted avg       0.79      0.62      0.56         8



## SVM

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
import numpy as np
import pandas as pd

y = user_prompts['gender'].apply(lambda x: 1 if x == 'Woman (cisgender)' else 0)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.40, random_state=42)

svm = LinearSVC(max_iter=10000)
svm.fit(X_train, y_train)

y_pred = svm.predict(X_test)
report = classification_report(y_test, y_pred)
print(report)



coefs = svm.coef_[0]
words = vectorizer.get_feature_names_out()
coef_df = pd.DataFrame({'word': words, 'coefficient': coefs})

# Top words for 'Woman (cisgender)'
top_women_words = coef_df.sort_values(by='coefficient', ascending=False).head(10)
print(top_women_words)

# Top words for 'Man (cisgender)'
top_men_words = coef_df.sort_values(by='coefficient', ascending=True).head(10)
print(top_men_words)



              precision    recall  f1-score   support

           0       0.50      1.00      0.67         5
           1       1.00      0.17      0.29         6

    accuracy                           0.55        11
   macro avg       0.75      0.58      0.48        11
weighted avg       0.77      0.55      0.46        11

         word  coefficient
2099      you     0.083422
2063  without     0.061494
1269       or     0.050785
2048    where     0.048330
1192      new     0.048133
175        as     0.047725
1374   please     0.047214
1109       me     0.044260
1710    small     0.043112
804      give     0.042726
              word  coefficient
961             is    -0.056152
872            how    -0.049843
1889            to    -0.048738
128             an    -0.042447
1976           use    -0.033327
562   distribution    -0.030826
894         iframe    -0.030826
1850          that    -0.030471
966             it    -0.029183
893             if    -0.027231
