# Top Used Words
use expanded, decapitalited and punctuation removed prompts
- concatenate all prompts from all conversations per user
- then count frequencies per user
- normalize by total token count
- fuse counts within each gender group
- take top 10 or 15 words
- logistic regression or chi square to find most discriminative word

Problems:
- chi square is not interpretable

In [2]:
import sqlite3
import pandas as pd

conn = sqlite3.connect('../../giicg.db')

prompts = pd.read_sql("SELECT * FROM expanded_prompts", conn)
prompts

Unnamed: 0,message_id,conversation_id,role,message_text,conversational,code,other,gender,user_id,language
0,1,1,user,"parsing data from python iterator, how it coul...","parsing data from python iterator, how it coul...",,,Man (cisgender),6,en
1,730,32,user,Write python function to do operations with in...,Write python function to do operations with in...,,report_dt\tsource\tmetric_name\tmetric_num\tme...,Man (cisgender),6,en
2,1133,55,user,Write shortest tutorial on creating RAG on ema...,Write shortest tutorial on creating RAG on ema...,,,Man (cisgender),6,en
3,1135,55,user,what is FAISS,what is FAISS,,,Man (cisgender),6,en
4,1137,55,user,Transform given code to process large .mbox file,Transform given code to process large .mbox file,,Transform given code to process large .mbox file,Man (cisgender),6,en
...,...,...,...,...,...,...,...,...,...,...
748,1131,54,user,import pandas as pd\nimport numpy as np\nfrom ...,"I want to tune optimal thresholds. Currently, ...",import pandas as pd\nimport numpy as np\nfrom ...,The narratives list looks like this:\nnarrativ...,Man (cisgender),92,en
749,1532,71,user,"from transformers import AutoTokenizer, AutoMo...",I want to use an LLM for listwise reranking in...,"from transformers import AutoTokenizer, AutoMo...",,Man (cisgender),92,en
750,1646,82,user,"def run_query(query, n_results):\n query_em...",this is my code. I want to: Get nodes and edge...,"def run_query(query, n_results):\n query_em...",,Man (cisgender),92,en
751,1849,2,user,\n I am working on the problem of reconstru...,\n I am working on the problem of reconstru...,,Classic CV - Drone navigation\nIf you ever tho...,Man (cisgender),8,en


In [3]:
from helpers.normalization import remove_punctuation_and_newlines, remove_capitalization

prompts['conversational'] = prompts['conversational'].apply(remove_punctuation_and_newlines)
prompts['conversational'] = prompts['conversational'].apply(remove_capitalization)

prompts

  from .autonotebook import tqdm as notebook_tqdm


Unnamed: 0,message_id,conversation_id,role,message_text,conversational,code,other,gender,user_id,language
0,1,1,user,"parsing data from python iterator, how it coul...",parsing data from python iterator how it could...,,,Man (cisgender),6,en
1,730,32,user,Write python function to do operations with in...,write python function to do operations with in...,,report_dt\tsource\tmetric_name\tmetric_num\tme...,Man (cisgender),6,en
2,1133,55,user,Write shortest tutorial on creating RAG on ema...,write shortest tutorial on creating rag on ema...,,,Man (cisgender),6,en
3,1135,55,user,what is FAISS,what is faiss,,,Man (cisgender),6,en
4,1137,55,user,Transform given code to process large .mbox file,transform given code to process large mbox file,,Transform given code to process large .mbox file,Man (cisgender),6,en
...,...,...,...,...,...,...,...,...,...,...
748,1131,54,user,import pandas as pd\nimport numpy as np\nfrom ...,i want to tune optimal thresholds currently i ...,import pandas as pd\nimport numpy as np\nfrom ...,The narratives list looks like this:\nnarrativ...,Man (cisgender),92,en
749,1532,71,user,"from transformers import AutoTokenizer, AutoMo...",i want to use an llm for listwise reranking in...,"from transformers import AutoTokenizer, AutoMo...",,Man (cisgender),92,en
750,1646,82,user,"def run_query(query, n_results):\n query_em...",this is my code i want to get nodes and edges ...,"def run_query(query, n_results):\n query_em...",,Man (cisgender),92,en
751,1849,2,user,\n I am working on the problem of reconstru...,i am working on the problem of reconstruc...,,Classic CV - Drone navigation\nIf you ever tho...,Man (cisgender),8,en


## Find most used words throughout the entire corpus

In [13]:
from collections import Counter

all_text = ' '.join(prompts['conversational'])
words = all_text.split()
word_freq = Counter(words)
common_words = word_freq.most_common(20)
print(common_words)


[('the', 874), ('i', 433), ('to', 406), ('a', 290), ('and', 221), ('is', 219), ('of', 211), ('it', 189), ('in', 181), ('that', 153), ('can', 152), ('this', 146), ('for', 133), ('with', 125), ('are', 116), ('want', 115), ('not', 115), ('how', 114), ('you', 112), ('have', 106)]


## 1. Concatenate all prompts per user

In [7]:
user_prompts = (
    prompts.groupby(['user_id', 'gender'])['conversational']
    .apply(' '.join)
    .reset_index()    # Reset index to create a DataFrame
)

user_prompts.columns = ['user_id', 'gender', 'combined_prompts']
user_prompts = user_prompts[user_prompts['gender'].isin(['Woman (cisgender)', 'Man (cisgender)'])].reset_index()

user_prompts

Unnamed: 0,index,user_id,gender,combined_prompts
0,0,6,Man (cisgender),parsing data from python iterator how it could...
1,1,8,Man (cisgender),i am working on the problem of reconstruc...
2,2,11,Woman (cisgender),can you adapt the following code so that inste...
3,3,15,Man (cisgender),setalltables action is currently not fetching ...
4,4,16,Woman (cisgender),i want to use dummy hot encoding to replace th...
5,5,25,Man (cisgender),what is the best way to encode and compress a ...
6,6,28,Woman (cisgender),i have a pandas dataframe like this i want to...
7,8,31,Man (cisgender),how can i make use of an observablehqdatabasec...
8,9,34,Man (cisgender),blender and python i have a collection of hund...
9,10,46,Man (cisgender),how to run a python future without blocking ie...


## 1. Tokenize and count

In [8]:
import spacy
from collections import Counter

tokenizer = spacy.blank("en")

def spacy_tokenizer(text):
    #stopwords = {"a", "an", "the", "this", "that", "in", "on", "at"}  # Set of articles to remove
    #return [token.text for token in tokenizer(text) if token.text.lower() not in stopwords]
    return [token.text for token in tokenizer(text)]


def divide_counts(counts_dict, total_tokens):
    return Counter({k: v / total_tokens for k, v in counts_dict.items()})


user_prompts['tokens'] = user_prompts['combined_prompts'].apply(spacy_tokenizer)

user_prompts['counts'] = user_prompts['tokens'].apply(Counter)
user_prompts['n_tokens'] = user_prompts['tokens'].apply(lambda x: len(x))

user_prompts['normalized_counts'] = user_prompts.apply(lambda row: divide_counts(row['counts'], row['n_tokens']), axis=1)

user_prompts

Unnamed: 0,index,user_id,gender,combined_prompts,tokens,counts,n_tokens,normalized_counts
0,0,6,Man (cisgender),parsing data from python iterator how it could...,"[parsing, data, from, python, iterator, how, i...","{'parsing': 1, 'data': 2, 'from': 4, 'python':...",201,"{'parsing': 0.004975124378109453, 'data': 0.00..."
1,1,8,Man (cisgender),i am working on the problem of reconstruc...,"[ , i, am, working, on, the, problem, of, ...","{' ': 2, 'i': 1, 'am': 1, 'working': 1, 'o...",39,"{' ': 0.05128205128205128, 'i': 0.02564102..."
2,2,11,Woman (cisgender),can you adapt the following code so that inste...,"[can, you, adapt, the, following, code, so, th...","{'can': 9, 'you': 8, 'adapt': 3, 'the': 15, 'f...",194,"{'can': 0.04639175257731959, 'you': 0.04123711..."
3,3,15,Man (cisgender),setalltables action is currently not fetching ...,"[setalltables, action, is, currently, not, fet...","{'setalltables': 1, 'action': 1, 'is': 2, 'cur...",36,"{'setalltables': 0.027777777777777776, 'action..."
4,4,16,Woman (cisgender),i want to use dummy hot encoding to replace th...,"[i, want, to, use, dummy, hot, encoding, to, r...","{'i': 28, 'want': 6, 'to': 23, 'use': 2, 'dumm...",548,"{'i': 0.051094890510948905, 'want': 0.01094890..."
5,5,25,Man (cisgender),what is the best way to encode and compress a ...,"[what, is, the, best, way, to, encode, and, co...","{'what': 1, 'is': 6, 'the': 5, 'best': 1, 'way...",108,"{'what': 0.009259259259259259, 'is': 0.0555555..."
6,6,28,Woman (cisgender),i have a pandas dataframe like this i want to...,"[i, have, a, pandas, dataframe, like, this, ,...","{'i': 24, 'have': 7, 'a': 10, 'pandas': 1, 'da...",569,"{'i': 0.0421792618629174, 'have': 0.0123022847..."
7,8,31,Man (cisgender),how can i make use of an observablehqdatabasec...,"[how, can, i, make, use, of, an, observablehqd...","{'how': 3, 'can': 3, 'i': 3, 'make': 1, 'use':...",93,"{'how': 0.03225806451612903, 'can': 0.03225806..."
8,9,34,Man (cisgender),blender and python i have a collection of hund...,"[blender, and, python, i, have, a, collection,...","{'blender': 2, 'and': 21, 'python': 3, 'i': 26...",1323,"{'blender': 0.0015117157974300832, 'and': 0.01..."
9,10,46,Man (cisgender),how to run a python future without blocking ie...,"[how, to, run, a, python, future, without, blo...","{'how': 3, 'to': 4, 'run': 1, 'a': 4, 'python'...",69,"{'how': 0.043478260869565216, 'to': 0.05797101..."


## Top 10 per User

In [9]:
def get_top_n_tokens(counter, n=10):
    # Get the n most common tokens from the counts dictionary

    return counter.most_common(n)


# Apply the function to get the top 5 tokens for each row and create a new column
user_prompts['top_10'] = user_prompts['normalized_counts'].apply(get_top_n_tokens, n=10)

# Check the updated DataFrame
user_prompts

Unnamed: 0,index,user_id,gender,combined_prompts,tokens,counts,n_tokens,normalized_counts,top_10
0,0,6,Man (cisgender),parsing data from python iterator how it could...,"[parsing, data, from, python, iterator, how, i...","{'parsing': 1, 'data': 2, 'from': 4, 'python':...",201,"{'parsing': 0.004975124378109453, 'data': 0.00...","[(to, 0.04477611940298507), (rows, 0.029850746..."
1,1,8,Man (cisgender),i am working on the problem of reconstruc...,"[ , i, am, working, on, the, problem, of, ...","{' ': 2, 'i': 1, 'am': 1, 'working': 1, 'o...",39,"{' ': 0.05128205128205128, 'i': 0.02564102...","[(the, 0.10256410256410256), (of, 0.1025641025..."
2,2,11,Woman (cisgender),can you adapt the following code so that inste...,"[can, you, adapt, the, following, code, so, th...","{'can': 9, 'you': 8, 'adapt': 3, 'the': 15, 'f...",194,"{'can': 0.04639175257731959, 'you': 0.04123711...","[(the, 0.07731958762886598), (can, 0.046391752..."
3,3,15,Man (cisgender),setalltables action is currently not fetching ...,"[setalltables, action, is, currently, not, fet...","{'setalltables': 1, 'action': 1, 'is': 2, 'cur...",36,"{'setalltables': 0.027777777777777776, 'action...","[(is, 0.05555555555555555), (why, 0.0555555555..."
4,4,16,Woman (cisgender),i want to use dummy hot encoding to replace th...,"[i, want, to, use, dummy, hot, encoding, to, r...","{'i': 28, 'want': 6, 'to': 23, 'use': 2, 'dumm...",548,"{'i': 0.051094890510948905, 'want': 0.01094890...","[(the, 0.07116788321167883), (i, 0.05109489051..."
5,5,25,Man (cisgender),what is the best way to encode and compress a ...,"[what, is, the, best, way, to, encode, and, co...","{'what': 1, 'is': 6, 'the': 5, 'best': 1, 'way...",108,"{'what': 0.009259259259259259, 'is': 0.0555555...","[(is, 0.05555555555555555), (type, 0.055555555..."
6,6,28,Woman (cisgender),i have a pandas dataframe like this i want to...,"[i, have, a, pandas, dataframe, like, this, ,...","{'i': 24, 'have': 7, 'a': 10, 'pandas': 1, 'da...",569,"{'i': 0.0421792618629174, 'have': 0.0123022847...","[(the, 0.07205623901581722), (i, 0.04217926186..."
7,8,31,Man (cisgender),how can i make use of an observablehqdatabasec...,"[how, can, i, make, use, of, an, observablehqd...","{'how': 3, 'can': 3, 'i': 3, 'make': 1, 'use':...",93,"{'how': 0.03225806451612903, 'can': 0.03225806...","[(of, 0.043010752688172046), (a, 0.04301075268..."
8,9,34,Man (cisgender),blender and python i have a collection of hund...,"[blender, and, python, i, have, a, collection,...","{'blender': 2, 'and': 21, 'python': 3, 'i': 26...",1323,"{'blender': 0.0015117157974300832, 'and': 0.01...","[(the, 0.07634164777021919), (a, 0.02947845804..."
9,10,46,Man (cisgender),how to run a python future without blocking ie...,"[how, to, run, a, python, future, without, blo...","{'how': 3, 'to': 4, 'run': 1, 'a': 4, 'python'...",69,"{'how': 0.043478260869565216, 'to': 0.05797101...","[(to, 0.057971014492753624), (a, 0.05797101449..."


## Top 10 per Gender

In [10]:
#filtered_data = user_prompts[user_prompts['gender'].isin(['Woman (cisgender)', 'Man (cisgender)'])]


gender_aggregated_counts = {}
for gender, group in user_prompts.groupby('gender'):
    # Start with an empty Counter and update it with counts from each row
    combined_counter = Counter()
    for row_counter in group['normalized_counts']:
        combined_counter.update(row_counter)
    gender_aggregated_counts[gender] = combined_counter

# Function to get top N tokens from a Counter
def get_top_n_from_counter(counter, n=10):
    return counter.most_common(n)

# Get the top N tokens for each gender
top_n_per_gender = {
    gender: get_top_n_from_counter(counter, n=10)
    for gender, counter in gender_aggregated_counts.items()
}

# Display the results
for gender, top_tokens in top_n_per_gender.items():
    print(f"Top tokens for {gender.capitalize()}:")
    for token, freq in top_tokens:
        print(f"{token}: {freq:.4f}")
    print()


Top tokens for Man (cisgender):
the: 1.0217
to: 0.4788
i: 0.4553
a: 0.4456
is: 0.4016
of: 0.3727
in: 0.2485
and: 0.2343
it: 0.1942
that: 0.1903

Top tokens for Woman (cisgender):
the: 0.7129
i: 0.4387
to: 0.3583
a: 0.3154
and: 0.2606
can: 0.2484
you: 0.1990
of: 0.1935
in: 0.1726
code: 0.1709



In [11]:
import numpy as np
from scipy.stats import chisquare

# Combine frequencies for unified vocab
unified_vocab = set(gender_aggregated_counts['Man (cisgender)']) | set(gender_aggregated_counts['Woman (cisgender)'])
#print(unified_vocab)
#print(gender_aggregated_counts["Man (cisgender)"])
results = {}

for word in unified_vocab:
    male_count = gender_aggregated_counts["Man (cisgender)"].get(word, 0)
    female_count = gender_aggregated_counts["Woman (cisgender)"].get(word, 0)
    
    observed = [male_count, female_count]
    total = sum(observed)
    expected = [total / 2, total / 2]  # Equal division under null hypothesis

    # Perform chi-square test
    chi2, p = chisquare(f_obs=observed, f_exp=expected)
    results[word] = {"chi2": chi2, "p_value": p}

# Sort words by discriminative power (lower p-value, higher chi2)
discriminative_words = sorted(results.items(), key=lambda x: x[1]['p_value'])

#print(discriminative_words)

# Filter by p-value < 0.05 and sort by chi2 in descending order
filtered_discriminative_words = [
    (word, values["chi2"], values["p_value"])
    for word, values in results.items()
    if values["p_value"] < 0.05
]
filtered_discriminative_words.sort(key=lambda x: x[1], reverse=True)

# Print each word with chi2 and p-value in one line
for word, stats in discriminative_words:
    print(f"Word: {word}, Chi2: {stats['chi2']:.4f}, P-value: {stats['p_value']:.4f}")




Word: 	, Chi2: 0.1144, P-value: 0.7352
Word: is, Chi2: 0.1076, P-value: 0.7429
Word: you, Chi2: 0.0721, P-value: 0.7883
Word: of, Chi2: 0.0567, P-value: 0.8119
Word: the, Chi2: 0.0550, P-value: 0.8146
Word: type, Chi2: 0.0514, P-value: 0.8206
Word:      , Chi2: 0.0513, P-value: 0.8208
Word: on, Chi2: 0.0512, P-value: 0.8209
Word: column, Chi2: 0.0450, P-value: 0.8320
Word: please, Chi2: 0.0444, P-value: 0.8332
Word: me, Chi2: 0.0431, P-value: 0.8355
Word: if, Chi2: 0.0408, P-value: 0.8399
Word: from, Chi2: 0.0405, P-value: 0.8405
Word: fetching, Chi2: 0.0394, P-value: 0.8427
Word: why, Chi2: 0.0392, P-value: 0.8430
Word: at, Chi2: 0.0389, P-value: 0.8436
Word: currently, Chi2: 0.0354, P-value: 0.8508
Word: matching, Chi2: 0.0345, P-value: 0.8527
Word: snippet, Chi2: 0.0345, P-value: 0.8527
Word: videos, Chi2: 0.0345, P-value: 0.8527
Word: gif, Chi2: 0.0345, P-value: 0.8527
Word: scenes, Chi2: 0.0345, P-value: 0.8527
Word: problem, Chi2: 0.0336, P-value: 0.8546
Word: new, Chi2: 0.0327, 

## Logistic Regression

In [12]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Vectorize the prompts to create a document-term matrix
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(user_prompts['combined_prompts'])

# Convert to a DataFrame for better interpretability
word_frequencies = pd.DataFrame(
    X.toarray(),
    columns=vectorizer.get_feature_names_out()
)

# Add gender as the target variable
word_frequencies['gender'] = user_prompts['gender'].apply(lambda x: 1 if x == 'Woman (cisgender)' else 0)


# Separate features (X) and target (y)
X = word_frequencies.drop('gender', axis=1)
y = word_frequencies['gender']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# Initialize the logistic regression model
log_reg = LogisticRegression(max_iter=1000)

# Train the model
log_reg.fit(X_train, y_train)


# Get feature names and coefficients
coefficients = log_reg.coef_[0]
words = vectorizer.get_feature_names_out()

# Create a DataFrame for easier interpretation
coeff_df = pd.DataFrame({
    'word': words,
    'coefficient': coefficients
})

# Sort by absolute value of coefficients for most discriminative words
discriminative_words = coeff_df.sort_values(by='coefficient', ascending=False)

# Top words indicating 'Woman (cisgender)'
top_women_words = discriminative_words.head(10)

# Top words indicating 'Man (cisgender)'
top_men_words = discriminative_words.tail(10)


print(top_women_words)
print(top_men_words)

# Make predictions
y_pred = log_reg.predict(X_test)

# Generate a classification report
report = classification_report(y_test, y_pred)
print(report)




         word  coefficient
2096      you     0.264083
298       can     0.251456
1189      new     0.226016
2060  without     0.218490
133       and     0.211458
1266       or     0.174434
174        as     0.169691
871       how     0.167468
1106       me     0.149551
2045    where     0.137734
              word  coefficient
476      currently    -0.097399
890         iframe    -0.100317
560   distribution    -0.100317
781       function    -0.101065
889             if    -0.113763
1210           not    -0.114608
127             an    -0.130801
1973           use    -0.139075
1234            of    -0.235331
958             is    -0.307072
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         5
           1       1.00      1.00      1.00         1

    accuracy                           1.00         6
   macro avg       1.00      1.00      1.00         6
weighted avg       1.00      1.00      1.00         6

