# Bigram Counts

- bigrams are counted per prompt and normalized by the number of possible bigrams per prompt (n_tokens - 1)
aggregation method: Per-sample Normalization & Averaging
- all prompts are combined per user
- counts per combined prompt are normalized by the number of possible bigrams in that prompt
- counts are fused per gender and normalized by the size of each group
 - this prevents longer prompts from dominating the results, as they would under pooling/global normalization

In [1]:
import sqlite3
import pandas as pd
import os

from helpers.occurences import load_spacy_lanuage_model, count_bigrams

db_path = "../../giicg.db"
# sqlite3.connect() creates an empty database file when the path does not exist,
# so check up front and fail with a clear error instead.
if not os.path.exists(db_path):
    raise FileNotFoundError(f"Database file does not exist: {db_path}")

conn = sqlite3.connect(db_path)
try:
    # One row per expanded prompt, enriched with the author's age and work experience.
    prompts = pd.read_sql(
        "SELECT ep.*, u.age, u.work_exp_years "
        "FROM expanded_prompts ep "
        "JOIN users u ON ep.user_id = u.user_id",
        conn,
    )
finally:
    # The connection is only needed for this single query. Close it explicitly:
    # `with sqlite3.connect(...)` would only scope a transaction, not close the handle.
    conn.close()

# Concatenate all prompts of a user into one text so that every user contributes
# exactly one sample to the bigram statistics below.
user_prompts = (
    prompts
    # groupby drops rows whose key is NaN by default; the 'None' placeholder keeps
    # users without a recorded work experience in the result.
    .fillna({'work_exp_years': 'None'})
    .groupby(['user_id', 'gender', 'age', 'work_exp_years'])['conversational']
    .apply(' '.join)
    .reset_index()    # Reset index to create a DataFrame
)
user_prompts

Unnamed: 0,user_id,gender,age,work_exp_years,conversational
0,6,Man (cisgender),19-25,3.0,parsing data from python iterator how it could...
1,11,Woman (cisgender),26-30,1.0,Can you adapt the following code so that inste...
2,15,Man (cisgender),26-30,6.0,SET_ALL_TABLES action is currently not fetchin...
3,16,Woman (cisgender),19-25,,I want to use Dummy Hot encoding to replace th...
4,25,Man (cisgender),26-30,6.0,what is the best way to encode and compress a ...
5,28,Woman (cisgender),31-35,5.0,I have a pandas dataframe like this:\n\nI want...
6,30,Non-binary,26-30,,as a NLP and LLM researcher I am recently down...
7,31,Man (cisgender),36-40,22.0,How can I make use of an ObservableHQDatabaseC...
8,34,Man (cisgender),51-60,20.0,Blender and Python. I have a collection of hun...
9,46,Man (cisgender),31-35,12.0,how to run a Python future without blocking i....


In [2]:
from helpers.occurences import load_spacy_lanuage_model, count_bigrams

model = load_spacy_lanuage_model()


def _count_user_bigrams(text):
    """Return the bigram counts of one user's combined prompt text."""
    return count_bigrams(text, spacy_model=model)


def _whitespace_bigram_slots(text):
    """Number of adjacent whitespace-token pairs in ``text`` (token count minus one)."""
    return len(text.split()) - 1


def _relative_frequencies(row):
    """Divide every bigram count of the row by its ``n_bigrams``; 0 when that is not positive."""
    frequencies = {}
    for bigram, count in row['bigram_counts'].items():
        frequencies[bigram] = count / row['n_bigrams'] if row['n_bigrams'] > 0 else 0
    return frequencies


def _by_descending_frequency(frequencies):
    """Return a copy of ``frequencies`` ordered from most to least frequent bigram."""
    ranked = sorted(frequencies.items(), key=lambda pair: pair[1], reverse=True)
    return dict(ranked)


# Raw counts per user.
user_prompts['bigram_counts'] = user_prompts['conversational'].apply(_count_user_bigrams)

# Normalization denominator per user.
# NOTE(review): this is derived from whitespace tokens, while count_bigrams is handed a
# spaCy model -- confirm both tokenize alike, otherwise the frequencies do not sum to 1.
user_prompts['n_bigrams'] = user_prompts['conversational'].apply(_whitespace_bigram_slots)

# Per-user relative frequencies, plus a copy sorted for easier inspection.
user_prompts['normalized_bigram_counts'] = user_prompts.apply(_relative_frequencies, axis=1)
user_prompts['sorted_bigram_counts'] = (
    user_prompts['normalized_bigram_counts'].apply(_by_descending_frequency)
)

user_prompts


Unnamed: 0,user_id,gender,age,work_exp_years,conversational,bigram_counts,n_bigrams,normalized_bigram_counts,sorted_bigram_counts
0,6,Man (cisgender),19-25,3.0,parsing data from python iterator how it could...,"{('parsing', 'data'): 1, ('data', 'from'): 1, ...",195,"{('parsing', 'data'): 0.005128205128205128, ('...","{('need', 'to'): 0.020512820512820513, ('like'..."
1,11,Woman (cisgender),26-30,1.0,Can you adapt the following code so that inste...,"{('can', 'you'): 8, ('you', 'adapt'): 3, ('ada...",192,"{('can', 'you'): 0.041666666666666664, ('you',...","{('can', 'you'): 0.041666666666666664, ('you',..."
2,15,Man (cisgender),26-30,6.0,SET_ALL_TABLES action is currently not fetchin...,"{('set_all_tables', 'action'): 1, ('action', '...",35,"{('set_all_tables', 'action'): 0.0285714285714...","{('set_all_tables', 'action'): 0.0285714285714..."
3,16,Woman (cisgender),19-25,,I want to use Dummy Hot encoding to replace th...,"{('i', 'want'): 6, ('want', 'to'): 5, ('to', '...",543,"{('i', 'want'): 0.011049723756906077, ('want',...","{('can', 'i'): 0.01289134438305709, ('i', 'wan..."
4,25,Man (cisgender),26-30,6.0,what is the best way to encode and compress a ...,"{('what', 'is'): 1, ('is', 'the'): 1, ('the', ...",107,"{('what', 'is'): 0.009345794392523364, ('is', ...","{('a', 'type'): 0.018691588785046728, ('i', 'w..."
5,28,Woman (cisgender),31-35,5.0,I have a pandas dataframe like this:\n\nI want...,"{('i', 'have'): 5, ('have', 'a'): 3, ('a', 'pa...",561,"{('i', 'have'): 0.008912655971479501, ('have',...","{('want', 'to'): 0.0213903743315508, ('i', 'wa..."
6,30,Non-binary,26-30,,as a NLP and LLM researcher I am recently down...,"{('as', 'a'): 1, ('a', 'nlp'): 1, ('nlp', 'and...",91,"{('as', 'a'): 0.01098901098901099, ('a', 'nlp'...","{('rename', 'all'): 0.02197802197802198, ('all..."
7,31,Man (cisgender),36-40,22.0,How can I make use of an ObservableHQDatabaseC...,"{('how', 'can'): 2, ('can', 'i'): 3, ('i', 'ma...",91,"{('how', 'can'): 0.02197802197802198, ('can', ...","{('can', 'i'): 0.03296703296703297, ('how', 'c..."
8,34,Man (cisgender),51-60,20.0,Blender and Python. I have a collection of hun...,"{('blender', 'and'): 1, ('and', 'python'): 1, ...",1308,"{('blender', 'and'): 0.0007645259938837921, ('...","{('of', 'the'): 0.00764525993883792, ('the', '..."
9,46,Man (cisgender),31-35,12.0,how to run a Python future without blocking i....,"{('how', 'to'): 3, ('to', 'run'): 1, ('run', '...",68,"{('how', 'to'): 0.04411764705882353, ('to', 'r...","{('how', 'to'): 0.04411764705882353, ('a', 'co..."


In [3]:
from helpers.occurences import merge_counts

# One row per distinct gender; the averaged bigram frequencies are attached at the end.
genders = user_prompts[['gender']].drop_duplicates().reset_index(drop=True)
gender_bigram_dict = {}

for gender, members in user_prompts.groupby("gender"):
    # merge_counts presumably adds the per-user dicts key by key -- confirm in helpers.occurences.
    merged = merge_counts(members["normalized_bigram_counts"].tolist())
    group_size = len(members)
    print(gender, group_size)

    # Dividing by the group size turns the merged values into a per-user average,
    # so groups of different size stay comparable.
    averaged = {}
    for bigram, total in merged.items():
        averaged[bigram] = total / group_size

    ranked = sorted(averaged.items(), key=lambda pair: pair[1], reverse=True)
    gender_bigram_dict[gender] = dict(ranked)

genders['normalized_bigram_counts'] = genders['gender'].map(gender_bigram_dict)
genders



Another gender 1
Man (cisgender) 14
Non-binary 1
Woman (cisgender) 12


Unnamed: 0,gender,normalized_bigram_counts
0,Man (cisgender),"{('of', 'the'): 0.005577501049396659, ('in', '..."
1,Woman (cisgender),"{('can', 'you'): 0.0126163459928249, ('i', 'wa..."
2,Non-binary,"{('rename', 'all'): 0.02197802197802198, ('all..."
3,Another gender,"{('values', 'in'): 0.01195219123505976, ('the'..."


In [4]:
# Average the per-user normalized bigram frequencies within each age class.
age_classes = user_prompts[['age']].drop_duplicates().reset_index(drop=True)
# Dedicated dict for this grouping: storing the result in `gender_bigram_dict` would
# overwrite the per-gender frequencies computed in the previous cell.
age_bigram_dict = {}

for age_class, group in user_prompts.groupby("age"):
    counts_list = group["normalized_bigram_counts"].tolist()
    # merge_counts presumably adds the per-user dicts key by key -- confirm in helpers.occurences.
    total_counts = merge_counts(counts_list)
    # user_prompts holds one row per user, so this is the number of users in the age class.
    num_users = group.shape[0]
    print(age_class, num_users)
    # Divide by the group size so that larger age classes do not dominate.
    normalized_counts = {bigram: count / num_users for bigram, count in total_counts.items()}
    sorted_bigram_counts = dict(sorted(normalized_counts.items(), key=lambda item: item[1], reverse=True))
    age_bigram_dict[age_class] = sorted_bigram_counts

# Cross-tabulation of users per age class and gender, to show how the classes are composed.
counts = user_prompts.groupby(['age', 'gender']).size().unstack(fill_value=0)
print(counts)

age_classes['normalized_bigram_counts'] = age_classes['age'].map(age_bigram_dict)
age_classes


19-25 8
26-30 11
31-35 5
36-40 3
51-60 1
gender  Another gender  Man (cisgender)  Non-binary  Woman (cisgender)
age                                                                   
19-25                0                3           0                  5
26-30                0                5           1                  5
31-35                0                3           0                  2
36-40                1                2           0                  0
51-60                0                1           0                  0


Unnamed: 0,age,normalized_bigram_counts
0,19-25,"{('how', 'can'): 0.007146784554767512, ('can',..."
1,26-30,"{('can', 'you'): 0.010127451372739378, ('of', ..."
2,31-35,"{('how', 'to'): 0.009929319095521267, ('want',..."
3,36-40,"{('can', 'i'): 0.012317032237350964, ('i', 'wa..."
4,51-60,"{('of', 'the'): 0.00764525993883792, ('the', '..."


In [5]:
# Bucket users by years of work experience. With right=True the intervals are
# right-closed: (-inf, 1], (1, 3], (3, 5], (5, 10], (10, inf).
bins = [-float('inf'), 1, 3, 5, 10, float('inf')]
labels = ['<=1', '1-3', '3-5', '5-10', '>10']
# Non-numeric entries (such as the 'None' placeholder set when user_prompts was built)
# are coerced to NaN here.
user_prompts['work_exp_years'] = pd.to_numeric(user_prompts['work_exp_years'], errors='coerce')
user_prompts['work_exp_group'] = (
    pd.cut(user_prompts['work_exp_years'], bins=bins, labels=labels, right=True)
    .astype(str)
    .replace("nan", "")  # users without a numeric value form the "" group
)

exp_classes = user_prompts[['work_exp_group']].drop_duplicates().reset_index(drop=True)

# Dedicated dict for this grouping: storing the result in `gender_bigram_dict` would
# overwrite the per-gender frequencies computed in an earlier cell.
exp_bigram_dict = {}

# Groups are plain strings, so groupby iterates them in lexicographic order
# ("", "1-3", "3-5", "5-10", "<=1", ">10"), not in order of experience.
for exp_class, group in user_prompts.groupby("work_exp_group"):
    counts_list = group["normalized_bigram_counts"].tolist()
    # merge_counts presumably adds the per-user dicts key by key -- confirm in helpers.occurences.
    total_counts = merge_counts(counts_list)
    # user_prompts holds one row per user, so this is the number of users in the group.
    num_users = group.shape[0]
    print(exp_class, num_users)
    # Divide by the group size so that larger experience groups do not dominate.
    normalized_counts = {bigram: count / num_users for bigram, count in total_counts.items()}
    sorted_bigram_counts = dict(sorted(normalized_counts.items(), key=lambda item: item[1], reverse=True))
    exp_bigram_dict[exp_class] = sorted_bigram_counts

# Cross-tabulation of users per experience group and gender.
counts = user_prompts.groupby(['work_exp_group', 'gender']).size().unstack(fill_value=0)
print(counts)

exp_classes['normalized_bigram_counts'] = exp_classes['work_exp_group'].map(exp_bigram_dict)
exp_classes

 7
1-3 6
3-5 2
5-10 5
<=1 4
>10 4
gender          Another gender  Man (cisgender)  Non-binary  Woman (cisgender)
work_exp_group                                                                
                             0                1           1                  5
1-3                          0                3           0                  3
3-5                          1                0           0                  1
5-10                         0                4           0                  1
<=1                          0                2           0                  2
>10                          0                4           0                  0


Unnamed: 0,work_exp_group,normalized_bigram_counts
0,1-3,"{('of', 'the'): 0.011780353900220716, ('can', ..."
1,<=1,"{('can', 'you'): 0.014195278974991295, ('of', ..."
2,5-10,"{('in', 'the'): 0.009498946001535039, ('need',..."
3,,"{('how', 'can'): 0.008065633062215161, ('i', '..."
4,3-5,"{('want', 'to'): 0.01467925091079532, ('i', 'w..."
5,>10,"{('how', 'to'): 0.011029411764705883, ('can', ..."
