# Gender Prediction based on Bag of Words
- all prompts of a user are concatenated and then vectorized
- uses expanded prompts without capitalization, newlines, and punctuation

In [None]:
import sqlite3
import pandas as pd

# Load every row of the `expanded_prompts` table into a DataFrame.
# The connection is closed in `finally` so that a failing query (missing
# table, locked file, ...) cannot leave the database handle open. A plain
# `with sqlite3.connect(...)` would not do this: that context manager only
# commits or rolls back the transaction, it does not close the connection.
conn = sqlite3.connect('../../giicg.db')
try:
    prompts = pd.read_sql("SELECT * FROM expanded_prompts", conn)
finally:
    conn.close()
prompts

## Normalize

In [None]:
from helpers.normalization import remove_punctuation_and_newlines, remove_capitalization

# Normalize the `conversational` column in a single chained pass:
# `remove_punctuation_and_newlines` is applied first, and
# `remove_capitalization` is applied to its output.
prompts['conversational'] = (
    prompts['conversational']
    .apply(remove_punctuation_and_newlines)
    .apply(remove_capitalization)
)

prompts

## Concatenate prompts per user

In [None]:
# Build one row per (user_id, gender) pair whose text is all of that user's
# prompts joined with single spaces.
user_prompts = (
    prompts.groupby(['user_id', 'gender'])['conversational']
    .apply(' '.join)
    .reset_index()    # turn the group keys back into ordinary columns
)

user_prompts.columns = ['user_id', 'gender', 'combined_prompts']

# Keep only the two gender labels that serve as the classes of the binary
# prediction task. `drop=True` discards the pre-filter index; without it
# reset_index() would insert the old row numbers as a stray 'index' column.
user_prompts = (
    user_prompts[user_prompts['gender'].isin(['Woman (cisgender)', 'Man (cisgender)'])]
    .reset_index(drop=True)
)

user_prompts

## Vectorize and label

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Learn the vocabulary from the per-user texts and build the token-count
# matrix (one row per user).
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(user_prompts['combined_prompts'])

# Binary target: 1 for 'Woman (cisgender)', 0 for every other label.
y = user_prompts['gender'].eq('Woman (cisgender)').astype('int64')


## Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import classification_report
import numpy as np

# Evaluate with out-of-fold predictions: every sample is predicted by a model
# trained on the other four folds.
log_reg = LogisticRegression(max_iter=1000)
y_pred_lr = cross_val_predict(log_reg, X, y, cv=5)
print("Logistic Regression Results (5-fold CV):")
print(classification_report(y, y_pred_lr))

# Refit on all samples so the learned per-word weights can be inspected.
log_reg.fit(X, y)
words = vectorizer.get_feature_names_out()
coefficients = log_reg.coef_[0]
coeff_df = pd.DataFrame({'word': words, 'coefficient': coefficients})

# The positive class (1) is 'Woman (cisgender)', so the largest coefficients
# point toward that label and the smallest toward 'Man (cisgender)'.
top_women_words = coeff_df.sort_values('coefficient', ascending=False).head(10)
top_men_words = coeff_df.sort_values('coefficient', ascending=True).head(10)
for heading, table in (
    ("Top words indicating 'Woman (cisgender)':", top_women_words),
    ("Top words indicating 'Man (cisgender)':", top_men_words),
):
    print(heading)
    print(table)


## SVM

In [None]:
from sklearn.svm import LinearSVC

# Evaluate the linear SVM with out-of-fold predictions from 5-fold CV.
svm = LinearSVC(max_iter=10000)
y_pred_svm = cross_val_predict(svm, X, y, cv=5)
print("SVM Results (5-fold CV):")
print(classification_report(y, y_pred_svm))

# Refit on all samples so the learned per-word weights can be inspected.
# `words` is the vocabulary array produced in the logistic-regression cell.
svm.fit(X, y)
coefs = svm.coef_[0]
coef_df = pd.DataFrame({'word': words, 'coefficient': coefs})

# Positive weights lean toward class 1 ('Woman (cisgender)'), negative
# weights toward class 0 ('Man (cisgender)').
top_women_words = coef_df.sort_values('coefficient', ascending=False).head(10)
top_men_words = coef_df.sort_values('coefficient', ascending=True).head(10)
for heading, table in (
    ("Top words indicating 'Woman (cisgender)':", top_women_words),
    ("Top words indicating 'Man (cisgender)':", top_men_words),
):
    print(heading)
    print(table)
