In [23]:
import numpy as np 
from sklearn.metrics.pairwise import cosine_similarity

In [39]:
from transformers import AutoTokenizer, AutoModel
import torch
import numpy as np

# The tokenizer and the encoder are loaded from the same pretrained checkpoint,
# so the checkpoint id is named once and reused for both.
MODEL_NAME = "BAAI/bge-large-en"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)

In [40]:
# List of words
words = ['doctor', 'engineer', 'scientist', 
         'nurse', 'teacher', 'receptionist', 
         'man', 'male', 'boy', 
         'woman', 'female', 'girl']

# Tokenize the whole list as one padded batch and run the encoder once.
inputs = tokenizer(words, return_tensors='pt', padding=True, truncation=True)
with torch.no_grad():
    token_states = model(**inputs).last_hidden_state

# Mean-pool over real tokens only. With padding=True every word is padded to
# the length of the longest one in the batch, so a plain .mean(dim=1) would
# also average the hidden states at padding positions and skew the embedding
# of every word that tokenizes shorter than the longest one. Weighting by the
# attention mask (1 = real token, 0 = padding) removes those positions.
mask = inputs['attention_mask'].unsqueeze(-1).to(token_states.dtype)
token_counts = mask.sum(dim=1).clamp(min=1)  # guard against division by zero
pooled = (token_states * mask).sum(dim=1) / token_counts

# Convert embeddings to numpy array (one row per word, same order as `words`)
embeddings = pooled.numpy()

# Save embeddings in key-value pairs
embeddings_dict = dict(zip(words, embeddings))

# Print the embeddings dictionary
for word, embedding in embeddings_dict.items():
    print(f"Word: {word}, Embedding: {embedding}")

Word: doctor, Embedding: [-0.12245136  0.09869877 -0.10951994 ... -0.43176612 -0.15773162
 -0.4168615 ]
Word: engineer, Embedding: [-0.42335233  0.2715545  -0.66090804 ... -0.3800297   0.2380309
 -0.44736806]
Word: scientist, Embedding: [-0.05474149  0.21385355 -0.13997476 ... -0.18834884 -0.35558525
 -0.00399417]
Word: nurse, Embedding: [ 0.03429724 -0.04092225 -0.6744635  ... -0.12078612  0.14811628
 -0.48568416]
Word: teacher, Embedding: [ 0.12671818  0.05624894 -0.25352243 ...  0.37910852 -0.5353386
 -0.2911121 ]
Word: receptionist, Embedding: [-0.48184884  0.01627805  0.01125895 ... -0.1010972  -0.4698733
 -0.3500704 ]
Word: man, Embedding: [ 0.21948044  0.23434158 -0.15649559 ... -0.09519699 -0.37215176
  0.11097046]
Word: male, Embedding: [ 0.3450103   0.0761928  -0.0778226  ... -0.3019996  -0.12306633
 -0.5901602 ]
Word: boy, Embedding: [ 0.11523373  0.25534704 -0.06905752 ... -0.19299932 -0.06677964
 -0.49130365]
Word: woman, Embedding: [-0.28373864  0.18534417 -0.54475147 ...

In [41]:

# Word sets used by the association test below.
# X / Y: the two groups of occupation words being compared.
X, Y = (
    ['doctor', 'engineer', 'scientist'],
    ['nurse', 'teacher', 'receptionist'],
)
# A / B: the male-associated and female-associated terms.
A, B = (
    ['man', 'male', 'boy'],
    ['woman', 'female', 'girl'],
)

### Computing Differential Association
- The function s computes the differential association of a word w with the sets X and Y.
- For each word in X, we compute its cosine similarity with w and then take the mean of these values to get sim_X.
- Similarly, we compute the average cosine similarity between w and each word in Y to get sim_Y.
- The function returns the difference between sim_X and sim_Y.

In [42]:
def s(w, X, Y, embeddings=None):
    """Differential association of word ``w`` with the word sets ``X`` and ``Y``.

    Computes the mean cosine similarity between ``w`` and every word in ``X``,
    the mean cosine similarity between ``w`` and every word in ``Y``, and
    returns the first minus the second.

    Parameters
    ----------
    w : hashable
        Key of the word whose association is measured.
    X, Y : iterable of hashable
        Keys of the two word sets to compare against. Each must be non-empty.
    embeddings : mapping, optional
        Lookup from key to a 1-D embedding vector. When omitted, the
        notebook-level ``embeddings_dict`` is used, as before.

    Returns
    -------
    float
        ``mean(cos(w, x) for x in X) - mean(cos(w, y) for y in Y)``.

    Raises
    ------
    KeyError
        If ``w`` or any member of ``X`` / ``Y`` is missing from the lookup.
    ValueError
        If ``X`` or ``Y`` is empty (the mean would otherwise be ``nan``).
    """
    # Fall back to the notebook-level table so existing s(w, X, Y) calls work.
    table = embeddings_dict if embeddings is None else embeddings

    # Reshape the query once instead of once per comparison.
    w_vec = np.asarray(table[w]).reshape(1, -1)

    def mean_similarity(group):
        group_vecs = [table[member] for member in group]
        if not group_vecs:
            raise ValueError("word sets X and Y must each contain at least one word")
        # One batched call returns a (1, len(group)) row of similarities.
        return cosine_similarity(w_vec, np.stack(group_vecs)).mean()

    return mean_similarity(X) - mean_similarity(Y)

In [43]:
# Spot-check: differential association of 'man' with set X versus set Y.
s(w='man', X=X, Y=Y)

0.020658374

### Calculating the WEAT Score
- For each word in set A, we compute its differential association with X and Y and sum these values.
- Similarly, we compute the sum of differential associations for each word in set B.
- The WEAT score is the difference between the two sums.
- A positive WEAT score indicates that words in A are, relative to Y, more strongly associated with words in X than words in B are. Conversely, a negative score indicates that words in B have the stronger relative association with X.

In [44]:
# Total differential association (X versus Y) of each attribute set,
# accumulated word by word in list order.
assoc_A = sum(s(term, X, Y) for term in A)
assoc_B = sum(s(term, X, Y) for term in B)

# The score is the gap between the two totals.
WEAT_score = assoc_A - assoc_B
print(f"WEAT score: {WEAT_score}")

WEAT score: 0.09056484699249268


The positive WEAT score indicates that the words in set A (male-associated terms) lean more strongly toward the occupations in set X (like 'doctor', 'engineer', 'scientist') over those in set Y (like 'nurse', 'teacher', 'receptionist') than the words in set B (female-associated terms) do. Equivalently, relative to set A, the words in set B lean more toward the occupations in set Y. Note that the score compares the two groups with each other; on its own it does not show that either group is closer to X or to Y in absolute terms.