In [None]:
# 0.  Imports & Logging
import logging
import os
from collections import Counter
from pathlib import Path

import pandas as pd
import spacy
from IPython.display import display
from rapidfuzz import fuzz

# Root logging configuration: INFO and above, each record carrying a
# timestamp, a left-aligned 8-character level name and the logger name.
logging.basicConfig(
    datefmt="%Y-%m-%d %H:%M:%S",
    format="%(asctime)s %(levelname)-8s %(name)s: %(message)s",
    level=logging.INFO,
)
# Logger shared by the cells below.
logger = logging.getLogger("sentence_cleaner")

# %% 
# 1.  Configuration
# Input CSV. The default is a machine-specific absolute path; set the
# SENTENCES_CSV_IN environment variable to point the notebook at another copy
# of the file without editing this cell. An unset or empty variable falls
# back to the default.
_DEFAULT_CSV_IN = "/Users/matthijstentije/University/MSc_Data-Science/Thesis/MSc_Data_Science_Thesis/Notebooks/Phase_02/aggregated_sentences.csv"
CSV_IN       = Path(os.environ.get("SENTENCES_CSV_IN") or _DEFAULT_CSV_IN)
# Destination of the cleaned and counted sentences (relative path).
CSV_OUT      = Path("output/sentences_cleaned.csv")
# Encoding used for the CSV reads and writes in this notebook.
ENCODING     = "utf-8"
# Minimum fuzz.ratio score for a lemma to count as a fuzzy match in count_adjs.
FUZZY_THRESH = 70

# %% 
# 2.  Preflight & Load
# Fail fast (with a logged error) when the input file is missing; otherwise
# announce the read.
if CSV_IN.exists():
    logger.info(f"Reading input CSV from {CSV_IN}")
else:
    logger.error(f"Input file not found: {CSV_IN}")
    raise FileNotFoundError(f"{CSV_IN} does not exist")
# on_bad_lines="warn": malformed rows produce a warning instead of aborting
# the load.
df = pd.read_csv(CSV_IN, on_bad_lines="warn", encoding=ENCODING)
logger.info(f"Loaded {len(df)} rows")

# %%
# 3.  Flag empty and punctuation-only sentences
# Normalise first: missing values become '' and surrounding whitespace is
# stripped. This statement used to sit on the cell-marker comment line and
# therefore never ran, so NaN and whitespace-only rows were not caught by
# `mask_empty` below.
df['sentence'] = df['sentence'].fillna('').astype(str).str.strip()

# One boolean mask per removal reason.
mask_dots   = df['sentence'].str.match(r'^\.+\s*$', na=False)   # only dots
mask_colons = df['sentence'].str.match(r'^:+\s*$', na=False)    # only colons
mask_empty  = df['sentence'].eq('')                             # nothing left
mask_removed = mask_dots | mask_colons | mask_empty

# Show what is about to be dropped so the removal can be inspected.
removed_df = df[mask_removed]
logger.info(f"Found {len(removed_df)} sentences to remove:")
display(removed_df[['sentence']])

# Log breakdown per category
logger.info(
    "Removal breakdown – empty: %d, only dots: %d, only colons: %d",
    mask_empty.sum(),
    mask_dots.sum(),
    mask_colons.sum()
)

# %% 
import re
import unicodedata

def clean_punctuation(s: str) -> str:
    """Reduce a sentence to lowercase letters, digits and single spaces.

    Steps:
      1. NFKD-normalise, so accented letters split into a base letter plus
         combining marks.
      2. Remove every character that is not a letter, digit or whitespace
         (this also removes the combining marks from step 1 and underscores).
      3. Collapse whitespace runs to one space and strip the ends.
      4. Lowercase.

    Args:
        s: Raw sentence. Non-string input (e.g. ``None`` or a float NaN coming
            from pandas) is treated as a missing sentence.

    Returns:
        The cleaned sentence; ``""`` for missing or punctuation-only input.
    """
    if not isinstance(s, str):
        return ""
    s = unicodedata.normalize("NFKD", s)
    # `\w` also matches "_", which is neither a letter nor a digit, so it is
    # removed explicitly.
    s = re.sub(r"[^\w\s]|_", "", s)
    s = re.sub(r"\s+", " ", s).strip()
    return s.lower()

# Keep only the rows that were not flagged for removal.
df = df[~mask_removed].copy()
# Guard against missing / non-string values before the per-row cleaner runs;
# a NaN reaching clean_punctuation would otherwise abort the whole apply.
# No separate strip / dot / quote passes are needed: clean_punctuation already
# strips whitespace and removes every non-word, non-space character, which
# includes '.' and '"'.
df['sentence'] = (
    df['sentence']
      .fillna('')
      .astype(str)
      .apply(clean_punctuation)
)
# Sentences made only of punctuation other than dots/colons pass the earlier
# masks and end up empty here; report them instead of letting them go unnoticed.
n_blank_after_clean = int(df['sentence'].eq('').sum())
if n_blank_after_clean:
    logger.warning("%d sentences are empty after punctuation cleaning", n_blank_after_clean)
logger.info(f"{len(df)} rows remain after cleaning")

# 4.  Prepare spaCy & adjective sets
# Load the spaCy pipeline "nl_core_news_sm" with its "ner" and "parser"
# components disabled.
logger.info("Loading spaCy model")
nlp = spacy.load("nl_core_news_sm", disable=["ner","parser"])

# Target words for the "male" list. count_adjs compares lowercased token
# lemmas against this set: exact membership first, fuzzy match as fallback.
# NOTE(review): the last two entries look like noun forms rather than
# adjectives — presumably added on purpose; confirm.
male_adjs = {
    "corrupt", "onoverwinnelijk", "plaatsvervangend", "impopulair", "goddeloos",
    "incompetent", "misdadig", "bekwaam", "sadistisch", "gewetenloos",
    "steenrijk", "vooraanstaand", "voortvluchtig", "geniaal", "planmatig", "bekwaamheid","genialiteit"
}
# Target words for the "female" list, used the same way as male_adjs.
# NOTE(review): "bloedmooie" looks like an inflected form, so an exact lemma
# match may never fire for it — verify against the lemmatizer's output.
female_adjs = {
    "blond", "beeldschoon", "bloedmooie", "donkerharig", "ongehuwd",
    "kinderloos", "glamoureus", "beeldig", "sensueel", "platinablond",
    "voorlijk", "feministisch", "stijlvol", "tuttig", "rimpelig"
}

# NOTE(review): SUFFIXES is not referenced anywhere else in the visible
# notebook — confirm whether it is still needed.
SUFFIXES     = ("heid", "iteit", "eerder", "er", "st")

# %% 
# 5.  Core counting function — match on lemma ongeacht POS
def _best_fuzzy_match(lemma, candidates):
    """Return ``(candidate, score)`` with the highest ``fuzz.ratio`` to ``lemma``.

    The first candidate wins ties, so the result is deterministic for an
    ordered ``candidates`` sequence. Returns ``(None, -1.0)`` when
    ``candidates`` is empty.
    """
    best_word, best_score = None, -1.0
    for cand in candidates:
        score = fuzz.ratio(lemma, cand)
        if score > best_score:
            best_word, best_score = cand, score
    return best_word, best_score


def count_adjs(text: str):
    """Count adjectives from the male/female lists in one sentence.

    Each token lemma (lowercased, any part of speech) is first looked up in
    ``male_adjs`` and then in ``female_adjs``. A lemma found in neither gets a
    fuzzy fallback: the best ``fuzz.ratio`` match in each list is determined
    and the lemma is attributed to the male list when its score reaches
    ``FUZZY_THRESH`` and is at least the female score, otherwise to the female
    list when that score reaches ``FUZZY_THRESH``.

    Args:
        text: Sentence to analyse.

    Returns:
        ``(male_count, female_count, male_hits, female_hits)``. The counts are
        the number of *distinct* matched list entries in the sentence; the hit
        lists hold one list entry per matching token, in token order.
    """
    # Sort once per call: iterating the sets directly would make the winner
    # among equally scoring entries depend on set iteration order, which for
    # strings can differ between interpreter runs.
    male_pool = sorted(male_adjs)
    female_pool = sorted(female_adjs)

    male_hits = []
    female_hits = []

    for tok in nlp(text):
        lemma = tok.lemma_.lower()

        # 1) Exact lemma match (male list is checked first).
        if lemma in male_adjs:
            male_hits.append(lemma)
            continue
        if lemma in female_adjs:
            female_hits.append(lemma)
            continue

        # 2) Fuzzy fallback; each score is computed exactly once.
        best_male, score_m = _best_fuzzy_match(lemma, male_pool)
        best_fem, score_f = _best_fuzzy_match(lemma, female_pool)

        if score_m >= FUZZY_THRESH and score_m >= score_f:
            male_hits.append(best_male)
        elif score_f >= FUZZY_THRESH:
            female_hits.append(best_fem)

    # Counts are de-duplicated per sentence; the hit lists are not.
    return len(set(male_hits)), len(set(female_hits)), male_hits, female_hits


# %% 
# 6.  Apply to DataFrame & log spaCy results
logger.info("Beginning sentence-level adjective counting")
results = df["sentence"].apply(count_adjs)
# Name the result columns explicitly so the frame has four columns even when
# there are no rows; an unnamed empty frame cannot be assigned to four columns.
count_cols = ["male_count", "female_count", "male_matches", "female_matches"]
df[count_cols] = pd.DataFrame(results.tolist(), index=df.index, columns=count_cols)
# Number of distinct list adjectives (male + female) found per sentence.
df["total_in_lists"] = df["male_count"] + df["female_count"]

# Overall stats
total_sentences = len(df)
total_male_adjs = df["male_count"].sum()
total_female_adjs = df["female_count"].sum()
logger.info(
    "Processed %d sentences: total male adjectives=%d, total female adjectives=%d",
    total_sentences, total_male_adjs, total_female_adjs
)

# Top 5 adjectives by frequency (counted over all hits, not de-duplicated
# per sentence). Previously these counters were built but never reported.
male_flat   = Counter(adj for lst in df["male_matches"]   for adj in lst)
female_flat = Counter(adj for lst in df["female_matches"] for adj in lst)
logger.info("Top 5 male adjectives: %s", male_flat.most_common(5))
logger.info("Top 5 female adjectives: %s", female_flat.most_common(5))

# %% 
# 7.  Save cleaned & counted data
# Create the destination folder if needed, then write the frame without its
# index column.
CSV_OUT.parent.mkdir(exist_ok=True, parents=True)
df.to_csv(CSV_OUT, encoding=ENCODING, index=False)
logger.info(f"Saved cleaned & counted data to {CSV_OUT}")

In [None]:
from pathlib import Path

# Sentences in which no list adjective was found are exported to their own file.
CSV_NO_HITS = Path("output/sentences_no_hits.csv")

# ensure output directory exists
CSV_NO_HITS.parent.mkdir(parents=True, exist_ok=True)

# rows where neither a male nor a female list adjective was counted
no_hits_df = df[df["total_in_lists"] == 0]

# log how many and save only the 'sentence' column
logger.info(f"Saving {len(no_hits_df)} sentences with no adjective hits to {CSV_NO_HITS}")
no_hits_df[["sentence"]].to_csv(CSV_NO_HITS, index=False, encoding=ENCODING)

# The full frame is already written to CSV_OUT by the save step of the
# previous cell and `df` is not modified here, so it is not written again.
logger.info("Done.")

In [None]:
# %% 
# 9.  Summary by noun_gender × adjective_gender (with percentages)
total_sentences = len(df)

# Step 1: number of sentences per (noun_gender, adjective_gender) pair.
summary = (
    df
    .groupby(['noun_gender', 'adjective_gender'])
    .size()
    .reset_index(name='sentence_count')
)
# Step 2: each count as a percentage of all sentences in `df`.
summary['percentage'] = (summary['sentence_count'] / total_sentences) * 100
# Step 3: largest groups first.
summary = summary.sort_values('sentence_count', ascending=False)

# Show the table in the notebook output.
print("Sentence counts by noun_gender and adjective_gender (with % of total):")
print(summary)

# Persist the table next to the cleaned data.
summary_path = CSV_OUT.parent / "summary_by_gender.csv"
summary.to_csv(summary_path, encoding=ENCODING, index=False)
logger.info(f"Saved summary breakdown (with percentages) to {summary_path}")


In [None]:

# Reload the cleaned data from disk, using the shared configuration constants
# rather than repeating the path and encoding as literals.
df = pd.read_csv(CSV_OUT, encoding=ENCODING)

# Summary by model × noun_gender × adjective_gender × temperature
summary = (
    df
    .groupby(['model', 'temperature', 'noun_gender', 'adjective_gender'])
    .size()
    .reset_index(name='sentence_count')
)

# Compute percentage within each model-temperature group
summary['percentage'] = (
    summary
    .groupby(['model', 'temperature'])['sentence_count']
    .transform(lambda x: x / x.sum() * 100)
)

# Sort for readability
summary = summary.sort_values(['model', 'temperature', 'sentence_count'], ascending=[True, True, False])

# Print the heading on its own line: passing the frame as a second print()
# argument puts the heading on the same line as the frame's header row.
print("Sentence counts by model, temperature, noun_gender & adjective_gender")
print(summary)

# Save to CSV, in the same folder as the cleaned data
summary.to_csv(CSV_OUT.parent / "summary_by_model_temp_gender.csv", index=False, encoding=ENCODING)


In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Ensure sentence_count is integer so the cells can be annotated with fmt="d"
summary["sentence_count"] = summary["sentence_count"].round().astype(int)

# Compact label per gender pair: first letter of noun_gender, "/", first
# letter of adjective_gender
summary["gender_pair"] = summary["noun_gender"].str[0] + "/" + summary["adjective_gender"].str[0]
models = summary["model"].unique()
n_models = len(models)

# Seaborn aesthetics
sns.set(style="white", font_scale=1.2)

# One panel per model. squeeze=False always returns a 2-D array of Axes, so
# the loop below also works when there is only one model (by default a single
# Axes object is returned in that case, which cannot be zipped).
fig, axes = plt.subplots(
    1, n_models, figsize=(5.5 * n_models, 5), sharey=True, squeeze=False
)
axes = axes.ravel()
# Shared colour scale across all panels.
vmin = int(summary["sentence_count"].min())
vmax = int(summary["sentence_count"].max())
cbar_ax = fig.add_axes([0.93, 0.15, 0.02, 0.7])  # Colorbar at right

# Create one subplot per model
for i, (ax, model) in enumerate(zip(axes, models)):
    df_model = summary[summary["model"] == model]
    # Rows: gender pair, columns: temperature, values: sentence count
    heatmap_data = df_model.pivot(index="gender_pair", columns="temperature", values="sentence_count")

    sns.heatmap(
        heatmap_data,
        annot=True,
        fmt="d",
        cmap="YlGnBu",
        linewidths=0.5,
        linecolor="white",
        # Only the last panel draws the (shared) colorbar.
        cbar=i == n_models - 1,
        cbar_ax=cbar_ax if i == n_models - 1 else None,
        vmin=vmin,
        vmax=vmax,
        square=True,
        ax=ax
    )

    ax.set_title(model, fontsize=13)
    ax.set_xlabel("Temperature")
    # The y-axis label is shown on the first panel only.
    if i == 0:
        ax.set_ylabel("Noun / Adjective Gender")
    else:
        ax.set_ylabel("")

# Add main title and adjust layout, leaving room on the right for the colorbar
plt.suptitle("Valid Sentence Count by Gender Pair and Temperature", fontsize=15, y=1.03)
plt.tight_layout(rect=[0, 0, 0.92, 1])
plt.show()


In [None]:
# %% 
# 10.  Summary by model × noun_gender × adjective_gender (with per-model %)
# Operates on the `df` that is already in memory; nothing is re-read here.

# Sentence count per (model, noun_gender, adjective_gender) combination.
summary = (
    df.groupby(['model', 'noun_gender', 'adjective_gender'])
      .size()
      .reset_index(name='sentence_count')
)

# Share of each row within its model: the row's count divided by the sum of
# counts for that model, as a percentage.
summary['percentage'] = (
    summary['sentence_count']
    / summary.groupby('model')['sentence_count'].transform('sum')
    * 100
)

# Per model, largest groups first.
summary = summary.sort_values(['model', 'sentence_count'], ascending=[True, False])

# Show the table without the index column.
print("Sentence counts by model, noun_gender & adjective_gender (out of {:,} sentences)\n".format(len(df)))
print(summary.to_string(index=False))

# Write the table to disk.
summary.to_csv("output/summary_by_model_gender.csv", encoding="utf-8", index=False)
print("\nSaved summary to output/summary_by_model_gender.csv")


In [None]:
import pandas as pd

# Load your cleaned dataset (adjust path if needed)
df = pd.read_csv("output/sentences_cleaned.csv", encoding="utf-8")

# Total number of rows per word, most frequent first.
# The index and the counts are named explicitly because the default labels
# produced by value_counts().reset_index() differ between pandas versions
# ("index"/"word" on 1.x, "word"/"count" on 2.x); renaming by the 1.x labels
# mislabels both columns on 2.x.
word_counts = (
    df['word']
    .value_counts()
    .rename_axis('word')
    .reset_index(name='occurrence_count')
)

# Show the resulting table
print(word_counts)

In [None]:
# ---------------------------------------------------------------------
# Leakage summary: percentage of sentences whose total_in_lists exceeds 1
# ---------------------------------------------------------------------
pct_multi = df['total_in_lists'].gt(1).mean() * 100

# Report the figure both in the cell output and in the log.
print(f"% sentences with more than one adjective: {pct_multi:.2f}%")
logger.info("Leakage summary — >1 count: %.2f%%;", pct_multi)

In [None]:
# Aggregate leakage by model × temperature: percentage of sentences whose
# total_in_lists exceeds 1.
# Computed as the group mean of a boolean flag instead of groupby().apply():
# same numbers, no Python-level callback per group, and no deprecation warning
# on pandas 2.2+ (where apply is still handed the grouping columns).
agg_model_temp = (
    df['total_in_lists'].gt(1)
    .groupby([df['model'], df['temperature']])
    .mean()
    .mul(100)
    .reset_index(name='pct_multi')
)

# Plot each model as its own line (one colour per model)
plt.figure()
for model in agg_model_temp['model'].unique():
    subset = agg_model_temp[agg_model_temp['model'] == model]
    plt.plot(
        subset['temperature'],
        subset['pct_multi'],
        marker='o',
        label=model
    )

plt.xlabel('Temperature')
plt.ylabel('Leakage (%)')
plt.title('Leakage vs. Temperature by Model')
plt.legend(title="Model")
plt.tight_layout()
plt.show()

In [None]:
# ——— 1. Derive prompt_type ———
# Map "male"→"M" and "female"→"F", then join as "<adjective>→<noun>",
# e.g. "M→F". Gender values missing from gender_map map to NaN, which makes
# the whole prompt_type NaN; such rows are left out by the groupby below.
gender_map = {"male":"M", "female":"F"}
df["prompt_type"] = (
    df["adjective_gender"].map(gender_map)
  + "→"
  + df["noun_gender"].map(gender_map)
)

# ——— 2. Prompt‑structure interaction ———
# Percentage of sentences with total_in_lists > 1 per prompt_type, rounded to
# two decimals. Uses the group mean of a boolean flag rather than
# groupby().apply(), which yields the same table without a per-group Python
# callback (and without the pandas 2.2+ deprecation warning).
pt = (
    df["total_in_lists"].gt(1)
    .groupby(df["prompt_type"])
    .mean()
    .mul(100)
    .round(2)
    .to_frame("pct_multi")
)
print("\nLeakage by prompt structure:\n", pt)

import numpy as np
# Bar chart of leakage per prompt structure.
# Only one series is drawn, so the bars are centred on their ticks; the
# previous -0.15 shift only makes sense when two series share a tick.
labels = pt.index.tolist()
x = np.arange(len(labels))
plt.bar(x, pt["pct_multi"], width=0.3, label="Multi‑adj")
plt.xticks(x, labels)
plt.ylabel("Leakage (%)")
plt.title("Leakage by Prompt Structure")
plt.legend()
plt.tight_layout()
plt.show()


In [None]:
# Leakage by prompt_type, as percentages rounded to two decimals:
#   pct_multi – sentences with total_in_lists > 1
#   pct_co    – sentences with at least one male AND one female list adjective
# Built as group means of boolean flags instead of groupby().apply(): same
# table, no per-group Python callback, no pandas 2.2+ deprecation warning.
pt = (
    df.assign(
        pct_multi=df['total_in_lists'] > 1,
        pct_co=(df['male_count'] > 0) & (df['female_count'] > 0),
    )
    .groupby('prompt_type')[['pct_multi', 'pct_co']]
    .mean()
    .mul(100)
    .round(2)
)

print(pt)

import numpy as np

# Grouped bar chart: two bars per prompt_type, shifted left/right of the tick.
labels = pt.index.tolist()
multi = pt['pct_multi'].values
co    = pt['pct_co'].values
x = np.arange(len(labels))

fig, ax = plt.subplots(figsize=(6,4))
ax.bar(x-0.15, multi, width=0.3, label='Multi‑adj')
ax.bar(x+0.15, co,    width=0.3, label='Co‑occur')
ax.set_xticks(x)
ax.set_xticklabels(labels)
ax.set_ylabel('Leakage (%)')
ax.set_title('Leakage by Prompt Structure')
ax.legend()
fig.tight_layout()
plt.show()


In [None]:
def _pct(part, whole):
    """Return ``part / whole`` as a percentage, or 0.0 when ``whole`` is 0."""
    return part / whole * 100 if whole else 0.0


def _count_pair(noun, adj):
    """Number of rows in ``df`` with the given noun_gender and adjective_gender."""
    return int(((df['noun_gender'] == noun) & (df['adjective_gender'] == adj)).sum())


# 1. Count each category (first letter: noun gender, second: adjective gender)
count_MM = _count_pair('male', 'male')
count_FF = _count_pair('female', 'female')
count_MF = _count_pair('male', 'female')
count_FM = _count_pair('female', 'male')

# 2. Totals: S = noun and adjective gender agree, C = they differ
total_S = count_MM + count_FF
total_C = count_MF + count_FM
grand_total = len(df)

# 3. Percentages (relative to each half of the dataset); _pct returns 0.0
#    instead of raising when a half is empty
pct_MM = _pct(count_MM, total_S)
pct_FF = _pct(count_FF, total_S)
pct_MF = _pct(count_MF, total_C)
pct_FM = _pct(count_FM, total_C)

# 4. Build summary table
# NOTE(review): the second row label ends in "(S)" although its numbers are the
# total_C ones — presumably "(C)" was intended; confirm before changing the label.
summary = pd.DataFrame({
    '': [
        'Consistent with gender stereotype (S)',
        'Contradictory to gender stereotype (S)',
        'Total'
    ],
    '#MM': [f"{count_MM} ({pct_MM:.1f}%)", f"{count_MF} ({pct_MF:.1f}%)", ''],
    '#FF': [f"{count_FF} ({pct_FF:.1f}%)", f"{count_FM} ({pct_FM:.1f}%)", ''],
    '#Total': [f"{total_S} ({_pct(total_S, grand_total):.1f}%)",
               f"{total_C} ({_pct(total_C, grand_total):.1f}%)",
               f"{grand_total}"]
})

# 5. Display the table; the heading goes on its own line so it does not share
#    a line with the frame's header row
print("Table 2: Labeling details with size & distribution")
print(summary)