In [None]:
import sys
from pathlib import Path

project_root = Path.cwd().parent
sys.path.append(str(project_root))

import pyterrier as pt
import pandas as pd

if not pt.started():
    pt.init()

  if not pt.started():


In [5]:
from src.run_experiments import bm25_fair

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
dataset = pt.get_dataset("irds:trec-fair/2021/eval")
topics = dataset.get_topics()

There are multiple query fields available: ('text', 'keywords', 'scope'). To use with pyterrier, provide variant or modify dataframe to add query column.


In [None]:
bm25_path = project_root / "data/processed/bm25_results.parquet"

if bm25_path.exists():
    print("Loaded cached BM25 results")
    bm25_results = pd.read_parquet(bm25_path)
else:
    print("⏳ Running BM25...")
    bm25 = bm25_fair()
    topics_renamed = topics.rename(columns={"text": "query"})
    bm25_results = bm25.transform(topics_renamed)
    bm25_results.to_parquet(bm25_path, index=False)
    print("Saved BM25 results to disk")

✅ Loaded cached BM25 results


In [None]:
import ir_datasets
from tqdm import tqdm
import pickle

ir_dataset = ir_datasets.load("trec-fair/2021")

def get_region(doc):
    if doc.geographic_locations:
        return doc.geographic_locations[0]
    return "Unknown"

def get_quality(doc):
    return doc.quality_score_disk if doc.quality_score_disk else "Unknown"

doc_metadata_map = {
    doc.doc_id: {
        "region": get_region(doc),
        "quality": get_quality(doc)
    }
    for doc in tqdm(ir_dataset.docs_iter(), desc="Building document metadata map")
}

output_path = project_root / "data/processed/doc_metadata_map.pkl"
with open(output_path, "wb") as f:
    pickle.dump(doc_metadata_map, f)

Building document metadata map: 0it [00:00, ?it/s]

Building document metadata map: 6280328it [02:42, 38580.42it/s]


In [26]:
with open(project_root / "data/processed/doc_metadata_map.pkl", "rb") as f:
    doc_metadata_map = pickle.load(f)

bm25_results["region"] = bm25_results["docno"].map(lambda d: doc_metadata_map.get(d, {}).get("region", "Unknown"))
bm25_results["quality"] = bm25_results["docno"].map(lambda d: doc_metadata_map.get(d, {}).get("quality", "Unknown"))

bm25_results.head()


Unnamed: 0,qid,docid,docno,rank,score,query,keywords,scope,group,label,iteration,region,quality
0,101,38196,80143,0,16.305211,Mathematicians,"[mathematician, arithmetician, trigonometricia...",WikiProject Mathematicians aims to improve Wik...,Unknown,1.0,0.0,Unknown,Stub
1,101,5361888,54715230,1,16.015399,Mathematicians,"[mathematician, arithmetician, trigonometricia...",WikiProject Mathematicians aims to improve Wik...,Europe,1.0,0.0,Europe,B
2,101,1608787,11353703,2,15.89062,Mathematicians,"[mathematician, arithmetician, trigonometricia...",WikiProject Mathematicians aims to improve Wik...,Europe,1.0,0.0,Europe,B
3,101,3237125,28842347,3,15.767974,Mathematicians,"[mathematician, arithmetician, trigonometricia...",WikiProject Mathematicians aims to improve Wik...,Europe,0.0,,Europe,B
4,101,9167,18902,4,15.415633,Mathematicians,"[mathematician, arithmetician, trigonometricia...",WikiProject Mathematicians aims to improve Wik...,Unknown,0.0,,Unknown,B


In [27]:
import numpy as np

def exposure(rank):
    return 1 / np.log2(rank + 2)

def compute_disparate_exposure(df, group_col="group"):
    df = df.copy()
    df["exposure"] = df["rank"].apply(exposure)
    group_exposure = df.groupby(group_col)["exposure"].sum()
    group_exposure /= group_exposure.sum()
    return group_exposure

def compute_disparate_impact(df, k=10, group_col="group"):
    topk = df[df["rank"] < k]
    group_success = topk[group_col].value_counts(normalize=True)
    return group_success

In [28]:
exposure_by_region = compute_disparate_exposure(bm25_results, group_col="region")
impact_by_region = compute_disparate_impact(bm25_results, group_col="region")

exposure_by_quality = compute_disparate_exposure(bm25_results, group_col="quality")
impact_by_quality = compute_disparate_impact(bm25_results, group_col="quality")

print("Disparate Exposure by Region:\n", exposure_by_region)
print("\nDisparate Impact by Region:\n", impact_by_region)

print("\nDisparate Exposure by Quality:\n", exposure_by_quality)
print("\nDisparate Impact by Quality:\n", impact_by_quality)

Disparate Exposure by Region:
 region
Africa                             0.000445
Asia                               0.002720
Europe                             0.939906
Latin America and the Caribbean    0.000528
Northern America                   0.011220
Oceania                            0.001425
Unknown                            0.043755
Name: exposure, dtype: float64

Disparate Impact by Region:
 Europe              0.906314
Unknown             0.085540
Northern America    0.006110
Oceania             0.002037
Name: region, dtype: float64

Disparate Exposure by Quality:
 quality
B        0.941955
C        0.020851
FA       0.002684
GA       0.005572
Start    0.018492
Stub     0.010447
Name: exposure, dtype: float64

Disparate Impact by Quality:
 B        0.922607
Start    0.028513
C        0.024440
Stub     0.012220
FA       0.006110
GA       0.006110
Name: quality, dtype: float64


In [29]:
from collections import Counter
import pandas as pd

corpus_region = Counter([v["region"] for v in doc_metadata_map.values()])
corpus_quality = Counter([v["quality"] for v in doc_metadata_map.values()])

corpus_region_dist = pd.Series(corpus_region).sort_index() / sum(corpus_region.values())
corpus_quality_dist = pd.Series(corpus_quality).sort_index() / sum(corpus_quality.values())

print("Corpus Distribution by Region:\n", corpus_region_dist)
print("\nCorpus Distribution by Quality:\n", corpus_quality_dist)

Corpus Distribution by Region:
 Africa                             0.000641
Antarctica                         0.000018
Asia                               0.003233
Europe                             0.950950
Latin America and the Caribbean    0.001068
Northern America                   0.016598
Oceania                            0.001333
Unknown                            0.026160
dtype: float64

Corpus Distribution by Quality:
 B        0.950151
C        0.018797
FA       0.002669
GA       0.003763
Start    0.015244
Stub     0.009377
dtype: float64


In [30]:
bm25_results["region_quality"] = bm25_results["region"] + " | " + bm25_results["quality"]

exposure_by_intersection = compute_disparate_exposure(bm25_results, group_col="region_quality")
impact_by_intersection = compute_disparate_impact(bm25_results, group_col="region_quality")

print("Disparate Exposure by Region + Quality:\n", exposure_by_intersection)
print("\nDisparate Impact (Top-10) by Region + Quality:\n", impact_by_intersection)

Disparate Exposure by Region + Quality:
 region_quality
Africa | B                                 0.000035
Africa | C                                 0.000189
Africa | Start                             0.000183
Africa | Stub                              0.000038
Asia | B                                   0.000374
Asia | C                                   0.000670
Asia | FA                                  0.000189
Asia | GA                                  0.000209
Asia | Start                               0.001039
Asia | Stub                                0.000240
Europe | B                                 0.931558
Europe | C                                 0.003412
Europe | FA                                0.000385
Europe | GA                                0.000519
Europe | Start                             0.003302
Europe | Stub                              0.000731
Latin America and the Caribbean | B        0.000099
Latin America and the Caribbean | C        0.000144
Latin Am

In [31]:
intersection_counts = Counter([
    f"{v['region']} | {v['quality']}" 
    for v in doc_metadata_map.values()
])

corpus_intersection_dist = pd.Series(intersection_counts).sort_index() / sum(intersection_counts.values())

print("Corpus Distribution by Region + Quality:\n", corpus_intersection_dist)

Corpus Distribution by Region + Quality:
 Africa | B                                 1.058862e-04
Africa | C                                 2.202114e-04
Africa | FA                                3.391543e-05
Africa | GA                                4.410598e-05
Africa | Start                             1.874106e-04
Africa | Stub                              4.904202e-05
Antarctica | B                             1.114591e-06
Antarctica | C                             3.025320e-06
Antarctica | FA                            1.592274e-07
Antarctica | GA                            1.114591e-06
Antarctica | Start                         4.776821e-06
Antarctica | Stub                          7.483686e-06
Asia | B                                   5.405769e-04
Asia | C                                   9.868911e-04
Asia | FA                                  2.077917e-04
Asia | GA                                  2.512608e-04
Asia | Start                               7.802140e-04
Asia |

In [33]:
qrels = dataset.get_qrels()

# Rename to match internal expectations
qrels = qrels.rename(columns={"relevance": "label"})

# Merge into results
bm25_results = bm25_results.merge(qrels, on=["qid", "docno"], how="left")
bm25_results["label"] = bm25_results["label"].fillna(0)

In [34]:
# Average relevance score per region
relevance_by_region = bm25_results.groupby("region")["label"].mean().sort_values(ascending=False)
relevance_by_quality = bm25_results.groupby("quality")["label"].mean().sort_values(ascending=False)
relevance_by_intersection = bm25_results.groupby("region_quality")["label"].mean().sort_values(ascending=False)

print("Average Relevance by Region:\n", relevance_by_region)
print("\nAverage Relevance by Quality:\n", relevance_by_quality)
print("\nAverage Relevance by Region + Quality:\n", relevance_by_intersection)

Average Relevance by Region:
 region
Unknown                            0.924819
Europe                             0.160242
Asia                               0.157143
Latin America and the Caribbean    0.142857
Africa                             0.083333
Northern America                   0.061620
Oceania                            0.044776
Name: label, dtype: float64

Average Relevance by Quality:
 quality
GA       0.991925
FA       0.258065
B        0.160970
C        0.079179
Start    0.048206
Stub     0.013972
Name: label, dtype: float64

Average Relevance by Region + Quality:
 region_quality
Unknown | GA                               0.994748
Oceania | GA                               0.666667
Asia | FA                                  0.666667
Asia | GA                                  0.545455
Latin America and the Caribbean | GA       0.500000
Africa | B                                 0.500000
Unknown | FA                               0.280000
Latin America and the Caribbean

In [None]:
bm25_results["click_prob"] = 1 / np.log2(bm25_results["rank"] + 2)

clicks_by_region = bm25_results.groupby("region")["click_prob"].sum()
clicks_by_quality = bm25_results.groupby("quality")["click_prob"].sum()
clicks_by_intersection = bm25_results.groupby("region_quality")["click_prob"].sum()

clicks_by_region /= clicks_by_region.sum()
clicks_by_quality /= clicks_by_quality.sum()
clicks_by_intersection /= clicks_by_intersection.sum()

print("Simulated Click Share by Region:\n", clicks_by_region)
print("\nSimulated Click Share by Quality:\n", clicks_by_quality)
print("\nSimulated Click Share by Region + Quality:\n", clicks_by_intersection)


Simulated Click Share by Region:
 region
Africa                             0.000217
Asia                               0.001327
Europe                             0.528215
Latin America and the Caribbean    0.000257
Northern America                   0.005473
Oceania                            0.000695
Unknown                            0.463815
Name: click_prob, dtype: float64

Simulated Click Share by Quality:
 quality
B        0.529215
C        0.010171
FA       0.001309
GA       0.445189
Start    0.009020
Stub     0.005096
Name: click_prob, dtype: float64

Simulated Click Share by Region + Quality:
 region_quality
Africa | B                                 0.000017
Africa | C                                 0.000092
Africa | Start                             0.000089
Africa | Stub                              0.000018
Asia | B                                   0.000182
Asia | C                                   0.000327
Asia | FA                                  0.000092
Asia | GA