In [1]:
import gensim
import pandas as pd
import spacy
import numpy as np
from wefe.word_embedding_model import WordEmbeddingModel
from wefe.metrics import RIPA
from wefe.query import Query
import re
from gensim.models import KeyedVectors


In [None]:
embeddings_file = "/Users/matthijstentije/University/MSc_Data-Science/Thesis/MSc_Data_Science_Thesis/data/numberbatch-19.08.txt.gz"

print("Loading Numberbatch embeddings...")
original_model = gensim.models.KeyedVectors.load_word2vec_format(embeddings_file, binary=False)
print("Embeddings loaded!")

# Clean Tokens
def further_clean(token):
    token = token.replace('#', '').replace('_', ' ')
    token = re.sub(r'^\d+\s*', '', token)
    token = re.sub(r'\d+', '', token)
    token = token.strip(".,;:!?\"'")
    token = re.sub(r'\s+', ' ', token)
    token = token.strip()
    return token

# Filter en schoon Nederlandse tokens
prefix = '/c/nl/'
cleaned_words = {}
for word in original_model.index_to_key:
    if word.startswith(prefix):
        clean_word = further_clean(word[len(prefix):])
        if clean_word and len(clean_word) > 1 and not re.match(r'^[A-Za-z]\s', clean_word):
            # Gebruik alleen woorden die nog niet in de dictionary zitten
            if clean_word not in cleaned_words:
                cleaned_words[clean_word] = original_model[word]

print(f"Number of unique NL tokens: {len(cleaned_words)}")

# Maak een nieuw KeyedVectors-model aan
vector_size = original_model.vector_size
nl_model = KeyedVectors(vector_size=vector_size)

# Voeg de opgeschoonde Nederlandse vectors toe
for word, vec in cleaned_words.items():
    nl_model.add_vector(word, vec)

print(f"Clean model with number of {len(nl_model.index_to_key)} tokens!")

nl_model.save_word2vec_format("nl_numberbatch.bin", binary=True)  


Loading Numberbatch embeddings...


In [None]:
nl_model = KeyedVectors.load_word2vec_format("nl_numberbatch.txt", binary=False)


In [None]:
# Load spaCy Dutch NLP model
nlp = spacy.load('nl_core_news_lg')

In [None]:
df = pd.read_csv("/Users/matthijstentije/University/MSc_Data-Science/Thesis/MSc_Data_Science_Thesis/data/Corpus_Hedendaags_Nederlands_Adjectives.csv", sep=";")

In [None]:
# -----------------------
# 3. Your CSV adjective extraction function
def extract_adjectives_from_csv(file_path):
    """
    Reads a CSV file (with phrases/word groups in the first column),
    parses each phrase with spaCy to extract adjectives (ADJ),
    lemmatizes them, and returns a list of unique, lowercased adjectives.
    """
    df = pd.read_csv(file_path, delimiter=';', usecols=[0], names=["Group"], header=0)
    df.dropna(subset=["Group"], inplace=True)
    adjectives = []
    for phrase in df["Group"]:
        doc = nlp(phrase)
        for token in doc:
            if token.pos_ == "ADJ" and token.is_alpha:
                adjectives.append(token.lemma_.lower())
    adjectives = list(dict.fromkeys(adjectives))  # Remove duplicates while preserving order
    return adjectives

# -----------------------
# 4. Use the CSV file path
csv_file_path = "/Users/matthijstentije/University/MSc_Data-Science/Thesis/MSc_Data_Science_Thesis/data/Corpus_Hedendaags_Nederlands_Adjectives.csv"
result_adjectives = extract_adjectives_from_csv(csv_file_path)
print("Extracted adjectives from CSV:", result_adjectives[:10])
print("Total unique adjectives:", len(result_adjectives))

# -----------------------
# 5. Compare Extracted Adjectives with Cleaned Embeddings
missing_words = [word for word in result_adjectives if word not in nl_model]
print(f"Total missing words: {len(missing_words)}")
print("Sample missing words:", missing_words[:10])

# Hou alleen adjectives die in het embedding model zitten
filtered_adjectives = [word for word in result_adjectives if word in nl_model]

# (Optioneel) filter uit op basis van substrings in target-woorden
target_words = [
    "man", "kerel", "jongen", "vader", "zoon", "vent", "gast", "meneer", "opa", "oom",
    "vrouw", "dame", "meisje", "moeder", "dochter", "tante", "oma", "mevrouw", "meid"
]

filtered_adjectives = [
    adj for adj in filtered_adjectives if not any(target in adj for target in target_words)
]

print(f"Remaining adjectives after final filtering: {len(filtered_adjectives)}")
print("Sample:", filtered_adjectives[:10])


Total cleaned Dutch embeddings: 190099
Extracted adjectives from CSV: ['groot', 'vreemd', 'prachtig', 'onschuldig', 'vrouwelijk', 'angstig', 'koppig', 'uitzonderlijk', 'verkiesbaar', 'wilsonbekwame', 'ongerust', 'polygaum', 'wereldwijaz', 'praatgraag', 'teder', 'grof', 'grooot', 'schimmig', 'wereldwijze', 'polygame', 'onvriendelijk', 'wantrouwen', 'vervaarlijk', 'eervol', 'onsterfelijk', 'snobistisch', 'heimelijk', 'atypisch', 'wilsonbekwaam', 'wantrouwend', 'tegenstrijdig', 'rafelig', 'wereldwijs', 'polygaam', 'nadenken', 'panisch', 'onbestemd', 'eindig', 'onzelfzuchtig', 'droogkomisch', 'maatschappijkritisch', 'diepzinnig', 'vreemder', 'hondsbrutaal', 'ongrondwettig', 'etherisch', 'onalledaags', 'prinselijk', 'extern', 'onwerkelijk', 'wisselvallig', 'tumultueus', 'levenskrachtig', 'contraproductief', 'ongerussen', 'afdoende', 'pronkzuchtig', 'stemmig', 'mannek', 'wiebelig', 'geelgroen', 'grover', 'hengstig', 'gortdroog', 'zuidwaarts', 'manneke', 'geestelijk', 'alleenstaan', 'afwezig'

In [None]:
def cosine_similarity(v1: np.ndarray, v2: np.ndarray) -> float:
    return np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2))

def compute_association(w: np.ndarray, A: np.ndarray) -> float:
    return np.mean([cosine_similarity(w, a) for a in A])

def compute_attribute_association(X: np.ndarray, A: np.ndarray) -> np.ndarray:
    return np.array([compute_association(x, A) for x in X])

def compute_attribute_association_L2(A: np.ndarray, T: np.ndarray) -> np.ndarray:
    return np.array([compute_association(a, T) for a in A])

def compute_joint_std(X_Associations: np.ndarray, Y_Associations: np.ndarray) -> float:
    return np.std(np.concatenate([X_Associations, Y_Associations]), ddof=1)

def compute_p_value(X_Diff: np.ndarray, Y_Diff: np.ndarray, permutations: int=1000) -> float:
    test_statistic = np.sum(X_Diff) - np.sum(Y_Diff)
    empirical_distribution = np.array([np.random.choice(np.concatenate([X_Diff, Y_Diff]), size=len(X_Diff) + len(Y_Diff), replace=False) for _ in range(permutations)])
    empirical_differences = np.sum(empirical_distribution[:, :len(X_Diff)], axis=1) - np.sum(empirical_distribution[:, len(X_Diff):], axis=1)
    
    return 1-norm.cdf(test_statistic, loc=np.mean(empirical_differences), scale=np.std(empirical_differences, ddof=1))

def level_1(X: np.ndarray, Y: np.ndarray, A: np.ndarray, B: np.ndarray, permutations: int=1000) -> float:
    X_Associations_A = compute_attribute_association(X, A)
    X_Associations_B = compute_attribute_association(X, B)
    X_Differential_Associations = X_Associations_A - X_Associations_B

    Y_Associations_A = compute_attribute_association(Y, A)
    Y_Associations_B = compute_attribute_association(Y, B)
    Y_Differential_Associations = Y_Associations_A - Y_Associations_B

    X_Mean = np.mean(X_Differential_Associations)
    Y_Mean = np.mean(Y_Differential_Associations)

    p_value = compute_p_value(X_Differential_Associations, Y_Differential_Associations, permutations=permutations)

    return (X_Mean - Y_Mean) / compute_joint_std(X_Differential_Associations, Y_Differential_Associations), p_value

def level_2(T: np.ndarray, A: np.ndarray, B: np.ndarray, permutations: int=1000) -> float:
    A_Associations_T = compute_attribute_association_L2(A, T)
    B_Associations_T = compute_attribute_association_L2(B, T)

    p_value = compute_p_value(A_Associations_T, B_Associations_T, permutations=permutations)

    return (np.mean(A_Associations_T) - np.mean(B_Associations_T)) / compute_joint_std(A_Associations_T, B_Associations_T), p_value

def level_3(T: np.ndarray, A: np.ndarray) -> float:
    T_Associations_A = [cosine_similarity(t, a) for t in T for a in A]
    return np.mean(T_Associations_A), np.std(T_Associations_A, ddof=1)

def ML_EAT(A: np.ndarray, B: np.ndarray, X: np.ndarray, Y: np.ndarray, permutations: int=1000) -> dict:
    L1_effect_size, L1_p_value = level_1(X, Y, A, B, permutations=permutations)
    L2_effect_size_X, L2_p_value_X = level_2(X, A, B, permutations=permutations)
    L2_effect_size_Y, L2_p_value_Y = level_2(Y, A, B, permutations=permutations)
    L3_mean_AX, L3_std_AX = level_3(X, A)
    L3_mean_BX, L3_std_BX = level_3(X, B)
    L3_mean_AY, L3_std_AY = level_3(Y, A)
    L3_mean_BY, L3_std_BY = level_3(Y, B)

    return {
        'L1_effect_size': L1_effect_size,
        'L1_p_value': L1_p_value,
        'L2_effect_size_X': L2_effect_size_X,
        'L2_p_value_X': L2_p_value_X,
        'L2_effect_size_Y': L2_effect_size_Y,
        'L2_p_value_Y': L2_p_value_Y,
        'L3_mean_AX': L3_mean_AX,
        'L3_std_AX': L3_std_AX,
        'L3_mean_BX': L3_mean_BX,
        'L3_std_BX': L3_std_BX,
        'L3_mean_AY': L3_mean_AY,
        'L3_std_AY': L3_std_AY,
        'L3_mean_BY': L3_mean_BY,
        'L3_std_BY': L3_std_BY,
    }
def get_np_embeddings(target_words: list,
                      vocab_dict: dict,
                      embeddings: np.ndarray) -> np.ndarray:
    """
    Get the embeddings for the target words.
    """

    return np.array([embeddings[vocab_dict[word]] for word in target_words])

In [None]:
MALE_WORDS = [
    "man", "kerel", "jongen", "vader", "zoon", "vent", "gast", "meneer", "opa", "oom"
]
FEMALE_WORDS = [
    "vrouw", "dame", "meisje", "moeder", "dochter", "tante", "oma", "mevrouw", "meid"
]

In [None]:
def compute_individual_bias(
    adjectives,
    male_terms,
    female_terms,
    model,
    exclude_substrings=True
):

    if exclude_substrings:
        all_target_words = set(male_terms + female_terms)
        def has_target_substring(adj):
            return any(tw in adj for tw in all_target_words)
        adjectives = [adj for adj in adjectives if not has_target_substring(adj)]
    print(f"Aantal overgebleven adjectives na substring-filter: {len(adjectives)}")

    records = []

    for adj in adjectives:
        if adj not in model:
            print(f"'{adj}' niet in model — overslaan")
            continue

        adj_vec = model[adj]
        male_sims = [cosine_similarity(adj_vec, model[m]) for m in male_terms if m in model]
        female_sims = [cosine_similarity(adj_vec, model[f]) for f in female_terms if f in model]

        if len(male_sims) == 0 or len(female_sims) == 0:
            print(f"Onvoldoende vergelijkbare woorden voor '{adj}' — overslaan")
            continue

        male_mean = np.mean(male_sims)
        female_mean = np.mean(female_sims)
        bias_value = male_mean - female_mean

        print(f"'{adj}': bias = {bias_value:.4f}")
        records.append({
            "word": adj,
            "male_mean": male_mean,
            "female_mean": female_mean,
            "bias_value": bias_value
        })

    df_bias = pd.DataFrame(records)
    print(f"Totaal aantal berekende bias-waarden: {len(df_bias)}")
    return df_bias


individual_bias = compute_individual_bias(adjectives=filtered_adjectives, male_terms=MALE_WORDS, female_terms=FEMALE_WORDS, model=nl_model)
print(f"Totaal aantal berekende bias-waarden: {len(individual_bias)}")
    
    # Sorteer oplopend op 'bias_value'
df_sorted = individual_bias.sort_values("bias_value", ascending=True)

    # Selecteer de 30 laagste scores
lowest_30 = df_sorted.head(30)

    # Selecteer de 30 hoogste scores
highest_30 = df_sorted.tail(30)

print("=== 30 Laagste Scores (bias_value) ===")
for _, row in lowest_30.iterrows():
    print(f"{row['word']}: {row['bias_value']:.4f} "
            f"(male_mean={row['male_mean']:.4f}, female_mean={row['female_mean']:.4f})")

print("\n=== 30 Hoogste Scores (bias_value) ===")
for _, row in highest_30.iloc[::-1].iterrows():
    print(f"{row['word']}: {row['bias_value']:.4f} "
        f"(male_mean={row['male_mean']:.4f}, female_mean={row['female_mean']:.4f})")
    
df_sorted_male = individual_bias.sort_values("male_mean", ascending=False)
top_30_male_mean = df_sorted_male.head(30)

print("\n=== 30 Hoogste male_mean (onafhankelijk van female_mean) ===")
for _, row in top_30_male_mean.iterrows():
    print(f"{row['word']} - male_mean={row['male_mean']:.4f}, "
          f"female_mean={row['female_mean']:.4f}, bias={row['bias_value']:.4f}")

# En idem voor female_mean
df_sorted_female = individual_bias.sort_values("female_mean", ascending=False)
top_30_female_mean = df_sorted_female.head(30)

print("\n=== 30 Hoogste female_mean (onafhankelijk van male_mean) ===")
for _, row in top_30_female_mean.iterrows():
    print(f"{row['word']} - female_mean={row['female_mean']:.4f}, "
          f"male_mean={row['male_mean']:.4f}, bias={row['bias_value']:.4f}")

Aantal overgebleven adjectives na substring-filter: 2335
'groot' niet in model — overslaan
'vreemd' niet in model — overslaan
'prachtig' niet in model — overslaan
'onschuldig' niet in model — overslaan
'angstig' niet in model — overslaan
'koppig' niet in model — overslaan
'uitzonderlijk' niet in model — overslaan
'verkiesbaar' niet in model — overslaan
'ongerust' niet in model — overslaan
'praatgraag' niet in model — overslaan
'teder' niet in model — overslaan
'grof' niet in model — overslaan
'schimmig' niet in model — overslaan
'polygame' niet in model — overslaan
'onvriendelijk' niet in model — overslaan
'wantrouwen' niet in model — overslaan
'vervaarlijk' niet in model — overslaan
'eervol' niet in model — overslaan
'onsterfelijk' niet in model — overslaan
'snobistisch' niet in model — overslaan
'heimelijk' niet in model — overslaan
'atypisch' niet in model — overslaan
'wantrouwend' niet in model — overslaan
'tegenstrijdig' niet in model — overslaan
'rafelig' niet in model — overslaa

KeyError: 'bias_value'

In [None]:
individual_bias_dict = individual_bias.set_index("word")["bias_value"].to_dict()
search_words = ["sterk", "zacht", "moedig", "emotioneel", "dominant", 
                "zorgzaam", "aardig", "knap", "schattig"]

print("Bias scores voor specifieke woorden:")
for word in search_words:
    # Let op: als je bij de filtering alles lowercase hebt gemaakt, doe je hier ook word.lower()
    w_lower = word.lower()
    bias_value = individual_bias_dict.get(w_lower)

    if bias_value is not None:
        print(f"{word}: {bias_value:.3f}")
    else:
        print(f"{word}: Niet gevonden in df_bias (of model).")


In [None]:
# Debugging print statements
print(f"Number of adjectives (filtered_lemmas): {len(filtered_adjectives)}")

# Define the query
query = Query(
    target_sets=[
        ["man", "kerel", "jongen", "vader", "zoon", "vent", "meneer", "opa", "oom"],
        ["vrouw", "dame", "meisje", "moeder", "dochter", "tante", "oma", "mevrouw", "meid"]],
    attribute_sets=[filtered_adjectives],  # Ensure it's a list of lists
    target_sets_names=["Male Terms", "Female Terms"],
    attribute_sets_names=["Adjectives"],
)

# Instantiate the metric and run the query
ripa = RIPA()
result = ripa.run_query(query, nl_model)

# Print results
print(result)

In [None]:
# 'result["word_values"]' bevat nu een dictionary {woord: {'mean': x, 'std': y}, ...}
# Daar kunnen we een DataFrame van maken:
df_ripa = pd.DataFrame({
    'Word': result["word_values"].keys(),
    'Mean Score': [val['mean'] for val in result["word_values"].values()],
    'Std Dev': [val['std'] for val in result["word_values"].values()],
})

# Sorteer op Mean Score (die RIPA per woord toekent) en bekijk
df_ripa = df_ripa.sort_values(by="Mean Score", ascending=False).reset_index(drop=True)
for word in search_words:
    if word in result["word_values"]:
        mean_val = result["word_values"][word]["mean"]
        std_val = result["word_values"][word]["std"]
        print(f"{word}: Mean={mean_val:.3f}, Std={std_val:.3f}")
    else:
        print(f"{word}: niet gevonden in RIPA-result.")


In [None]:
# 1) Voeg een Z-score-kolom toe:
mean_of_scores = df_ripa["Mean Score"].mean()
std_of_scores = df_ripa["Mean Score"].std()

df_ripa["Z-Score"] = (df_ripa["Mean Score"] - mean_of_scores) / std_of_scores

# 2) Als je daarna wilt sorteren op Z-Score (hoog -> laag), doe je:
df_ripa = df_ripa.sort_values("Z-Score", ascending=False).reset_index(drop=True)

# 3) Print de eerste rijen om te zien hoe de Z-scores eruitzien:
print(df_ripa.head(10))
print(df_ripa.tail(10))