# Top Used Words
Use the expanded prompts, lowercased and with punctuation removed:
- concatenate all prompts from all conversations per user
- then count frequencies per user
- normalize by total token count
- fuse counts within each gender group
- take top 10 or 15 words
- use logistic regression or a chi-square test to find the most discriminative words


In [7]:
import sqlite3
import pandas as pd

# Open the SQLite database that holds the study data.
# NOTE(review): `conn` is never closed in this cell — close it once no later
# cell needs it.
conn = sqlite3.connect(database='../../data/giicg.db')

# Pull every column and row of `expanded_prompts` into a DataFrame.
prompts = pd.read_sql(sql="SELECT * FROM expanded_prompts", con=conn)


In [8]:
from helpers.normalization import remove_punctuation_and_newlines, remove_capitalization

# Run both text normalizers over the `conversational` column:
# punctuation/newline removal first, capitalization removal second.
prompts['conversational'] = (
    prompts['conversational']
    .apply(remove_punctuation_and_newlines)
    .apply(remove_capitalization)
)

## Concatenate all prompts per user

In [9]:
# Build one document per user: all of a user's prompts joined into a single
# space-separated string, keyed by (user_id, gender).
user_prompts = (
    prompts.groupby(['user_id', 'gender'])['conversational']
    .apply(' '.join)
    # Turn the group keys back into columns and name the joined-text column
    # explicitly instead of overwriting `.columns` positionally.
    .reset_index(name='combined_prompts')
)

# Keep only the two gender groups compared in this analysis.
# drop=True discards the pre-filter row labels; without it they are inserted
# as a stray 'index' column next to user_id / gender / combined_prompts.
user_prompts = (
    user_prompts[user_prompts['gender'].isin(['Woman (cisgender)', 'Man (cisgender)'])]
    .reset_index(drop=True)
)

## Count unigrams and bigrams

In [10]:
from sklearn.feature_extraction.text import CountVectorizer


def _ngram_frequency_tables(vectorizer, user_docs):
    """Count n-grams per user and convert them to relative frequencies.

    Parameters
    ----------
    vectorizer : CountVectorizer
        Vectorizer defining the n-gram range; it is fitted on the documents here.
    user_docs : pandas.DataFrame
        One row per user with the columns 'combined_prompts', 'user_id'
        and 'gender'.

    Returns
    -------
    matrix
        The raw count matrix returned by ``vectorizer.fit_transform``.
    terms
        Feature names of the fitted vectorizer (one per matrix column).
    per_user : pandas.DataFrame
        One row per user: a relative-frequency column per term, followed by
        the 'user_id' and 'gender' metadata columns.
    gender_means : pandas.DataFrame
        Mean relative frequency of every term, indexed by gender.
    """
    matrix = vectorizer.fit_transform(user_docs['combined_prompts'])
    terms = vectorizer.get_feature_names_out()
    counts = pd.DataFrame(matrix.toarray(), columns=terms)

    # 1. Normalize counts per user (row) by that user's total n-gram count.
    #    A user with zero n-grams yields 0/0 = NaN in that row; the mean
    #    below skips NaN values.
    per_user = counts.div(counts.sum(axis=1), axis=0)

    # 2. Compute the mean vector per gender (average normalized frequencies).
    #    Grouping uses an external key and happens BEFORE the metadata columns
    #    are attached, so a vocabulary term that happens to be spelled
    #    'gender' or 'user_id' cannot be clobbered by (or mixed with) metadata
    #    while normalizing and averaging.
    gender_key = pd.Series(user_docs['gender'].values, index=per_user.index, name='gender')
    gender_means = per_user.groupby(gender_key).mean()

    # Attach metadata last. On a name collision with a vocabulary term the
    # metadata column takes precedence in `per_user`; `gender_means` is
    # unaffected.
    per_user['user_id'] = user_docs['user_id'].values
    per_user['gender'] = user_docs['gender'].values

    return matrix, terms, per_user, gender_means


vectorizer_unigrams = CountVectorizer()
vectorizer_bigrams = CountVectorizer(ngram_range=(2, 2))

unigram_matrix, words, unigram_df, gender_unigrams = _ngram_frequency_tables(
    vectorizer_unigrams, user_prompts
)
word_cols = words  # all column names for words

bigram_matrix, bigrams, bigram_fd, gender_bigrams = _ngram_frequency_tables(
    vectorizer_bigrams, user_prompts
)
bigram_cols = bigrams  # all column names for bigrams


In [11]:
N = 10  # how many top-ranked entries to keep per gender


def _top_terms(freq_by_gender, gender, column_names):
    """Return the N highest-frequency terms of one gender row as a two-column frame."""
    ranked = freq_by_gender.loc[gender].sort_values(ascending=False)
    top = ranked.head(N).reset_index()
    top.columns = column_names
    return top


male_uni = _top_terms(gender_unigrams, 'Man (cisgender)', ['word_m', 'freq_m'])
male_bi = _top_terms(gender_bigrams, 'Man (cisgender)', ['bigr_m', 'bi_freq_m'])
female_uni = _top_terms(gender_unigrams, 'Woman (cisgender)', ['word_f', 'freq_f'])
female_bi = _top_terms(gender_bigrams, 'Woman (cisgender)', ['bigr_f', 'bi_freq_f'])

# Side-by-side layout: the male columns come first, then the female columns.
combined_df = pd.concat([male_uni, male_bi, female_uni, female_bi], axis=1)

# Export the table for the paper, then show it as the cell output.
combined_df.to_latex(
    "most_used_n_grams.tex",
    float_format="%.4f",
    header=["Word", "Freq.", "Bigram", "Freq.", "Word", "Freq." , "Bigram", "Freq."],
)
combined_df


Unnamed: 0,word_m,freq_m,bigr_m,bi_freq_m,word_f,freq_f,bigr_f,bi_freq_f
0,the,0.074242,of the,0.005596,the,0.067518,can you,0.013407
1,to,0.034931,in the,0.005528,to,0.031762,want to,0.008138
2,is,0.029539,at the,0.005458,and,0.025825,of the,0.006636
3,of,0.02727,how can,0.005372,can,0.021263,the same,0.005318
4,in,0.018219,on the,0.004774,you,0.017356,in the,0.005018
5,and,0.017033,is the,0.004667,of,0.016486,give me,0.00494
6,it,0.014231,to the,0.003846,in,0.015971,how can,0.004778
7,that,0.01394,need to,0.003581,this,0.01446,you give,0.00452
8,how,0.013723,what is,0.003489,code,0.014315,judgment balanced,0.004049
9,this,0.011637,want to,0.003239,is,0.013461,have the,0.003865
