# Top Used Words
use expanded, decapitalited and punctuation removed prompts
- concatenate all prompts from all conversations per user
- then count frequencies per user
- normalize by total token count
- fuse counts within each gender group
- take top 10 or 15 words
- logistic regression or chi square to find most discriminative word

Problems:
- chi square is not interpretable

In [1]:
import sqlite3
import pandas as pd

conn = sqlite3.connect('../../giicg.db')

prompts = pd.read_sql("SELECT * FROM expanded_prompts", conn)
prompts

Unnamed: 0,message_id,conversation_id,role,message_text,conversational,code,other,gender,user_id,language
0,1,1,user,"parsing data from python iterator, how it coul...","parsing data from python iterator, how it coul...",,,Man (cisgender),6,en
1,730,32,user,Write python function to do operations with in...,Write python function to do operations with in...,,report_dt\tsource\tmetric_name\tmetric_num\tme...,Man (cisgender),6,en
2,1133,55,user,Write shortest tutorial on creating RAG on ema...,Write shortest tutorial on creating RAG on ema...,,,Man (cisgender),6,en
3,1135,55,user,what is FAISS,what is FAISS,,,Man (cisgender),6,en
4,1137,55,user,Transform given code to process large .mbox file,Transform given code to process large .mbox file,,Transform given code to process large .mbox file,Man (cisgender),6,en
...,...,...,...,...,...,...,...,...,...,...
748,1131,54,user,import pandas as pd\nimport numpy as np\nfrom ...,"I want to tune optimal thresholds. Currently, ...",import pandas as pd\nimport numpy as np\nfrom ...,The narratives list looks like this:\nnarrativ...,Man (cisgender),92,en
749,1532,71,user,"from transformers import AutoTokenizer, AutoMo...",I want to use an LLM for listwise reranking in...,"from transformers import AutoTokenizer, AutoMo...",,Man (cisgender),92,en
750,1646,82,user,"def run_query(query, n_results):\n query_em...",this is my code. I want to: Get nodes and edge...,"def run_query(query, n_results):\n query_em...",,Man (cisgender),92,en
751,1849,2,user,\n I am working on the problem of reconstru...,I am working on the problem of reconstruction ...,,Classic CV - Drone navigation\nIf you ever tho...,Man (cisgender),8,en


In [2]:
from helpers.normalization import remove_punctuation_and_newlines, remove_capitalization

prompts['conversational'] = prompts['conversational'].apply(remove_punctuation_and_newlines)
prompts['conversational'] = prompts['conversational'].apply(remove_capitalization)

prompts

  from .autonotebook import tqdm as notebook_tqdm


Unnamed: 0,message_id,conversation_id,role,message_text,conversational,code,other,gender,user_id,language
0,1,1,user,"parsing data from python iterator, how it coul...",parsing data from python iterator how it could...,,,Man (cisgender),6,en
1,730,32,user,Write python function to do operations with in...,write python function to do operations with in...,,report_dt\tsource\tmetric_name\tmetric_num\tme...,Man (cisgender),6,en
2,1133,55,user,Write shortest tutorial on creating RAG on ema...,write shortest tutorial on creating rag on ema...,,,Man (cisgender),6,en
3,1135,55,user,what is FAISS,what is faiss,,,Man (cisgender),6,en
4,1137,55,user,Transform given code to process large .mbox file,transform given code to process large mbox file,,Transform given code to process large .mbox file,Man (cisgender),6,en
...,...,...,...,...,...,...,...,...,...,...
748,1131,54,user,import pandas as pd\nimport numpy as np\nfrom ...,i want to tune optimal thresholds currently i ...,import pandas as pd\nimport numpy as np\nfrom ...,The narratives list looks like this:\nnarrativ...,Man (cisgender),92,en
749,1532,71,user,"from transformers import AutoTokenizer, AutoMo...",i want to use an llm for listwise reranking in...,"from transformers import AutoTokenizer, AutoMo...",,Man (cisgender),92,en
750,1646,82,user,"def run_query(query, n_results):\n query_em...",this is my code i want to get nodes and edges ...,"def run_query(query, n_results):\n query_em...",,Man (cisgender),92,en
751,1849,2,user,\n I am working on the problem of reconstru...,i am working on the problem of reconstruction ...,,Classic CV - Drone navigation\nIf you ever tho...,Man (cisgender),8,en


## Find most used words throughout the entire corpus

In [30]:
from collections import Counter

all_text = ' '.join(prompts['conversational'])
words = all_text.split()
word_freq = Counter(words)
common_words = word_freq.most_common(20)
print(common_words)


[('the', 874), ('i', 431), ('to', 406), ('a', 288), ('and', 221), ('is', 219), ('of', 211), ('it', 189), ('in', 181), ('that', 153), ('can', 152), ('this', 146), ('for', 133), ('with', 125), ('are', 116), ('want', 115), ('not', 115), ('how', 114), ('you', 111), ('have', 106)]


## 1. Concatenate all prompts per user

In [3]:
user_prompts = (
    prompts.groupby(['user_id', 'gender'])['conversational']
    .apply(' '.join)
    .reset_index()    # Reset index to create a DataFrame
)

user_prompts.columns = ['user_id', 'gender', 'combined_prompts']
user_prompts = user_prompts[user_prompts['gender'].isin(['Woman (cisgender)', 'Man (cisgender)'])].reset_index()

user_prompts

Unnamed: 0,index,user_id,gender,combined_prompts
0,0,6,Man (cisgender),parsing data from python iterator how it could...
1,1,8,Man (cisgender),i am working on the problem of reconstruction ...
2,2,11,Woman (cisgender),can you adapt the following code so that inste...
3,3,15,Man (cisgender),setalltables action is currently not fetching ...
4,4,16,Woman (cisgender),i want to use dummy hot encoding to replace th...
5,5,25,Man (cisgender),what is the best way to encode and compress a ...
6,6,28,Woman (cisgender),i have a pandas dataframe like this i want to...
7,8,31,Man (cisgender),how can i make use of an observablehqdatabasec...
8,9,34,Man (cisgender),blender and python i have a collection of hund...
9,10,46,Man (cisgender),how to run a python future without blocking ie...


## Logistic Regression

In [5]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Vectorize the prompts to create a document-term matrix
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(user_prompts['combined_prompts'])

# Convert to a DataFrame for better interpretability
word_frequencies = pd.DataFrame(
    X.toarray(),
    columns=vectorizer.get_feature_names_out()
)

# Add gender as the target variable
word_frequencies['gender'] = user_prompts['gender'].apply(lambda x: 1 if x == 'Woman (cisgender)' else 0)


# Separate features (X) and target (y)
X = word_frequencies.drop('gender', axis=1)
y = word_frequencies['gender']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# Initialize the logistic regression model
log_reg = LogisticRegression(max_iter=1000)

# Train the model
log_reg.fit(X_train, y_train)


# Get feature names and coefficients
coefficients = log_reg.coef_[0]
words = vectorizer.get_feature_names_out()

# Create a DataFrame for easier interpretation
coeff_df = pd.DataFrame({
    'word': words,
    'coefficient': coefficients
})

# Sort by absolute value of coefficients for most discriminative words
discriminative_words = coeff_df.sort_values(by='coefficient', ascending=False)

# Top words indicating 'Woman (cisgender)'
top_female_words = discriminative_words.head(10)

# Top words indicating 'Man (cisgender)'
top_men_words = discriminative_words.tail(10)


print(top_female_words)
print(top_men_words)

# Make predictions
y_pred = log_reg.predict(X_test)

# Generate a classification report
report = classification_report(y_test, y_pred)
print(report)




         word  coefficient
2095      you     0.264209
298       can     0.251166
1189      new     0.225952
2059  without     0.218596
133       and     0.211172
1266       or     0.174576
174        as     0.169954
871       how     0.167279
1106       me     0.149475
2044    where     0.137678
              word  coefficient
476      currently    -0.097426
560   distribution    -0.100389
890         iframe    -0.100389
781       function    -0.101238
889             if    -0.113901
1210           not    -0.114626
127             an    -0.130929
1972           use    -0.139127
1234            of    -0.235134
958             is    -0.307355
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         5
           1       1.00      1.00      1.00         1

    accuracy                           1.00         6
   macro avg       1.00      1.00      1.00         6
weighted avg       1.00      1.00      1.00         6



In [11]:
vectorizer_unigrams = CountVectorizer()
vectorizer_bigrams = CountVectorizer(ngram_range=(2,2))

unigram_matrix = vectorizer_unigrams.fit_transform(user_prompts['combined_prompts'])
words = vectorizer_unigrams.get_feature_names_out()
unigram_df = pd.DataFrame(unigram_matrix.toarray(), columns=words)
unigram_df['user_id'] = user_prompts['user_id'].values
unigram_df['gender'] = user_prompts['gender'].values

bigram_matrix = vectorizer_bigrams.fit_transform(user_prompts['combined_prompts'])
bigrams = vectorizer_bigrams.get_feature_names_out()
bigram_fd = pd.DataFrame(bigram_matrix.toarray(), columns=bigrams)
bigram_fd['user_id'] = user_prompts['user_id'].values
bigram_fd['gender'] = user_prompts['gender'].values

# 1. Normalize counts per user (row) by their total word count
word_cols = words  # all column names for words
user_totals = unigram_df[word_cols].sum(axis=1)
unigram_df[word_cols] = unigram_df[word_cols].div(user_totals, axis=0)

bigram_cols = bigrams  # all column names for words
user_totals = bigram_fd[bigram_cols].sum(axis=1)
bigram_fd[bigram_cols] = bigram_fd[bigram_cols].div(user_totals, axis=0)

# 2. Compute the mean vector per gender (i.e., average normalized word frequencies)
gender_unigrams = unigram_df.groupby('gender')[word_cols].mean()
gender_bigrams = bigram_fd.groupby('gender')[bigram_cols].mean()


In [14]:
N = 10
female_uni = gender_unigrams.loc['Woman (cisgender)'].sort_values(ascending=False).head(N).reset_index()
male_uni = gender_unigrams.loc['Man (cisgender)'].sort_values(ascending=False).head(N).reset_index()
female_uni.columns = ['word_f', 'freq_f']
male_uni.columns = ['word_m', 'freq_m']

female_bi= gender_bigrams.loc['Woman (cisgender)'].sort_values(ascending=False).head(N).reset_index()
male_bi = gender_bigrams.loc['Man (cisgender)'].sort_values(ascending=False).head(N).reset_index()
female_bi.columns = ['bigr_f', 'bi_freq_f']
male_bi.columns = ['bigr_m', 'bi_freq_m']


combined_df = pd.concat([male_uni, male_bi, female_uni, female_bi], axis=1)
combined_df.to_latex("most_used_n_grams.tex", float_format="%.4f", header=["Word", "Freq.", "Bigram", "Freq.", "Word", "Freq." , "Bigram", "Freq."])
combined_df


Unnamed: 0,word_m,freq_m,bigr_m,bi_freq_m,word_f,freq_f,bigr_f,bi_freq_f
0,the,0.074242,of the,0.005596,the,0.064811,can you,0.013783
1,to,0.034931,in the,0.005528,to,0.032486,want to,0.007786
2,is,0.029539,at the,0.005458,and,0.023764,of the,0.007189
3,of,0.02727,how can,0.005372,can,0.022119,give me,0.005352
4,in,0.018219,on the,0.004774,you,0.01807,how can,0.00499
5,and,0.017033,is the,0.004667,of,0.017676,you give,0.004896
6,it,0.014231,to the,0.003846,in,0.015562,in the,0.004406
7,that,0.01394,need to,0.003581,code,0.015508,the code,0.004175
8,how,0.013723,what is,0.003489,is,0.014217,the same,0.0037
9,this,0.011637,want to,0.003239,it,0.014213,all of,0.00367
