# Code Documentation of CheckThat! Subtask 4b Neural Representation learning

## Base-setup trials

These trials employ Word2Vec with and without spaCy preprocessing. Run the following cells in order to reproduce the base-setup results.

Approach with Word2Vec without preprocessing

In [None]:
import os
import numpy as np
import pandas as pd
import random
from gensim.models import Word2Vec
from sklearn.metrics.pairwise import cosine_similarity
from datetime import datetime

# ---------------- REPRODUCIBILITY ------------------------------------------------
# One shared seed for NumPy's and Python's global random number generators.
seed = 42
np.random.seed(seed)
random.seed(seed)

# ---------------- CONFIG --------------------------------------------------------
# Experiment descriptor. Every field except the name is the placeholder "-",
# i.e. not set for this Word2Vec baseline.
exp = {
    "experiment_name": "word2vec_no_preprocessing",
    **dict.fromkeys(
        (
            "encoder_model",
            "query_field",
            "normalize",
            "fine_tune",
            "epochs",
            "batch_size",
            "lr",
            "use_hard_negatives",
        ),
        "-",
    ),
}

# Timestamped run id, so every execution gets its own directory under ../models.
RUN_ID = f"{datetime.now():%Y%m%d_%H%M%S}"
SAVE_DIR = f"../models/{exp['experiment_name']}_{RUN_ID}"
# The directory is created right away; exist_ok tolerates an existing one.
os.makedirs(SAVE_DIR, exist_ok=True)

# Input data location and the CSV the optional logging step appends to.
DATA_DIR = "../../data"
OUT_CSV = "../experiment_results/clef_neural_rep_exp_results_with_testdata.csv"

# ---------------- LOAD DATA ----------------------------------------------------
# NOTE: read_pickle unpickles arbitrary Python objects; only point this at a
# collection file from a trusted source.
df_collection = pd.read_pickle(f"{DATA_DIR}/subtask4b_collection_data.pkl")
# The query tweet splits are tab-separated files.
df_query_train = pd.read_csv(f"{DATA_DIR}/subtask4b_query_tweets_train.tsv", sep="\t")
df_query_dev = pd.read_csv(f"{DATA_DIR}/subtask4b_query_tweets_dev.tsv", sep="\t")
df_query_test = pd.read_csv(f"{DATA_DIR}/subtask4b_query_tweets_test_gold.tsv", sep="\t")  # TEST DATA

# One text per collection row: title and abstract joined by a single space.
corpus = (
    df_collection[['title', 'abstract']]
    .apply(lambda paper: f"{paper['title']} {paper['abstract']}", axis=1)
    .tolist()
)
# Document ids in the same row order as `corpus`.
cord_uids = df_collection['cord_uid'].tolist()

# ---------------- TOKENIZATION -------------------------------------------------
# Lowercase and split on whitespace only; punctuation stays attached to tokens.
tokenized_corpus = [text.lower().split() for text in corpus]

# ---------------- WORD2VEC TRAINING --------------------------------------------
# Train Word2Vec on the tokenized collection (100-dim vectors, window 5,
# words seen fewer than 2 times are dropped).
#
# workers=1 on purpose: gensim documents that `seed` only gives a
# deterministic run with a single worker thread, because multi-threaded
# training is subject to OS scheduling order. With several workers the
# metrics differ from run to run despite the fixed seed.
# NOTE: reproducibility across interpreter launches additionally requires a
# fixed PYTHONHASHSEED environment variable (see the gensim Word2Vec docs).
w2v_model = Word2Vec(
    sentences=tokenized_corpus,
    vector_size=100,
    window=5,
    min_count=2,
    workers=1,
    seed=seed,
)

# ---------------- EMBEDDINGS ---------------------------------------------------
def get_avg_embedding(tokens, model):
    """
    Average the word vectors of the tokens that `model.wv` contains.

    Tokens missing from `model.wv` are skipped. If none of the tokens is
    known (or `tokens` is empty), a zero vector of length
    `model.vector_size` is returned instead.
    """
    known_vectors = []
    for token in tokens:
        if token in model.wv:
            known_vectors.append(model.wv[token])
    if not known_vectors:
        return np.zeros(model.vector_size)
    return np.mean(known_vectors, axis=0)

# Embed every document once; row i belongs to cord_uids[i] because both are
# derived from df_collection in the same row order.
doc_embeddings = [get_avg_embedding(doc_tokens, w2v_model) for doc_tokens in tokenized_corpus]
doc_embeddings = np.array(doc_embeddings)

def encode_queries_w2v(queries, model):
    """
    Embed each query string as the average of its known word vectors.

    Queries are tokenized the same way as the corpus (lowercase, whitespace
    split) and averaged with `get_avg_embedding`. Returns one row per query,
    in input order, as a NumPy array.
    """
    token_lists = [query.lower().split() for query in queries]
    embeddings = [get_avg_embedding(query_tokens, model) for query_tokens in token_lists]
    return np.array(embeddings)

print("Encoding queries (Word2Vec) …")
# Encode the tweet text of each split with the model trained on the collection.
q_emb_train_w2v = encode_queries_w2v(
    df_query_train['tweet_text'].tolist(), w2v_model
)
q_emb_dev_w2v = encode_queries_w2v(
    df_query_dev['tweet_text'].tolist(), w2v_model
)
q_emb_test_w2v = encode_queries_w2v(  # TEST
    df_query_test['tweet_text'].tolist(), w2v_model
)

# ---------------- METRICS ------------------------------------------------------
def compute_metrics(q_emb, doc_emb, gt_ids, doc_ids):
    """
    Rank all documents for every query by cosine similarity and summarise
    where the ground-truth document ends up.

    Args:
        q_emb: Query embeddings, one row per query.
        doc_emb: Document embeddings, one row per document.
        gt_ids: Ground-truth document id per query; gt_ids[i] belongs to
            row i of q_emb.
        doc_ids: Document ids; position j belongs to row j of doc_emb.
            If an id occurs more than once, its first position is used.

    Returns:
        dict of floats with the keys "MRR@1", "MRR@5", "MRR@10",
        "Recall@5", "Recall@10" and "MedianRank" (ranks are 1-based).

    Raises:
        ValueError: if a ground-truth id does not occur in doc_ids.
    """
    # Build the id -> row lookup once instead of a linear doc_ids.index()
    # scan per query. setdefault keeps the first occurrence, matching
    # list.index() semantics for duplicated ids.
    row_of_doc = {}
    for pos, doc_id in enumerate(doc_ids):
        row_of_doc.setdefault(doc_id, pos)

    sims = cosine_similarity(q_emb, doc_emb)
    ranks = []
    for i, row in enumerate(sims):
        try:
            gt_idx = row_of_doc[gt_ids[i]]
        except KeyError as err:
            raise ValueError(
                f"ground-truth id {gt_ids[i]!r} of query {i} is not in doc_ids"
            ) from err
        order = np.argsort(-row)  # document rows, most similar first
        # 1-based position of the ground-truth document in that ranking.
        ranks.append(int(np.where(order == gt_idx)[0][0]) + 1)
    ranks = np.array(ranks)

    # Reciprocal rank counts only when the hit is inside the cutoff k.
    reciprocal = 1.0 / ranks

    def mrr_at(k):
        return float(np.where(ranks <= k, reciprocal, 0.0).mean())

    metrics = {
        "MRR@1":       mrr_at(1),
        "MRR@5":       mrr_at(5),
        "MRR@10":      mrr_at(10),
        "Recall@5":    float((ranks <= 5).mean()),
        "Recall@10":   float((ranks <= 10).mean()),
        "MedianRank":  float(np.median(ranks))
    }
    return metrics

print("Computing metrics for Word2Vec …")
# Every split is scored against the same document embeddings and id list.
train_gt_ids = df_query_train['cord_uid'].tolist()
train_metrics_w2v = compute_metrics(q_emb_train_w2v, doc_embeddings, train_gt_ids, cord_uids)
dev_gt_ids = df_query_dev['cord_uid'].tolist()
dev_metrics_w2v = compute_metrics(q_emb_dev_w2v, doc_embeddings, dev_gt_ids, cord_uids)
test_gt_ids = df_query_test['cord_uid'].tolist()  # TEST DATA
test_metrics_w2v = compute_metrics(q_emb_test_w2v, doc_embeddings, test_gt_ids, cord_uids)

# ---------------- PRINT METRICS ------------------------------------------------
# One header line per split, then one "name: value" line per metric.
for header, split_metrics in (
    ("=== Word2Vec Train Metrics ===", train_metrics_w2v),
    ("=== Word2Vec Dev Metrics ===", dev_metrics_w2v),
    ("=== Word2Vec Test Metrics ===", test_metrics_w2v),
):
    print(header)
    for k, v in split_metrics.items():
        print(f"{k:8s}: {v:.4f}")

# ---------------- Optional: LOGGING ---------------------------------------------
# log_w2v = {
#     **exp,
#     **{f"train_{k}": v for k, v in train_metrics_w2v.items()},
#     **{f"dev_{k}":   v for k, v in dev_metrics_w2v.items()},
#     **{f"test_{k}":  v for k, v in test_metrics_w2v.items()},
#     "timestamp": datetime.now().isoformat()
# }

# pd.DataFrame([log_w2v]).to_csv(
#     OUT_CSV, mode="a",
#     header=not os.path.exists(OUT_CSV),
#     index=False
# )

# print("Logged Word2Vec (no preprocessing) results to", OUT_CSV)

Encoding queries (Word2Vec) …
Computing metrics for Word2Vec …
=== Word2Vec Train Metrics ===
MRR@1   : 0.0591
MRR@5   : 0.0845
MRR@10  : 0.0902
Recall@5: 0.1299
Recall@10: 0.1737
MedianRank: 209.0000
=== Word2Vec Dev Metrics ===
MRR@1   : 0.0464
MRR@5   : 0.0737
MRR@10  : 0.0800
Recall@5: 0.1193
Recall@10: 0.1686
MedianRank: 211.5000
=== Word2Vec Test Metrics ===
MRR@1   : 0.0477
MRR@5   : 0.0674
MRR@10  : 0.0711
Recall@5: 0.1010
Recall@10: 0.1300
MedianRank: 404.0000


Approach with Word2Vec with spaCy preprocessing (run the previous cell first: this cell reuses the data frames it loads)

In [None]:
import numpy as np
import pandas as pd
from gensim.models import Word2Vec
from sklearn.metrics.pairwise import cosine_similarity
from datetime import datetime
import os
import spacy

# ---------------- REPRODUCIBILITY ------------------------------------------------
# `random` is missing from this cell's import list; import it here so the
# seeding below does not raise NameError when no earlier cell has imported it.
import random

seed = 42
random.seed(seed)
np.random.seed(seed)

# ---------------- CONFIG --------------------------------------------------------
# Experiment descriptor. Every field except the name is the placeholder "-",
# i.e. not set for this Word2Vec baseline.
exp = {
    "experiment_name":   "word2vec_spacy_preprocessing",
    "encoder_model":     "-",
    "query_field":       "-",
    "normalize":         "-",
    "fine_tune":         "-",
    "epochs":            "-",
    "batch_size":        "-",
    "lr":                "-",
    "use_hard_negatives": "-"
}

# ---------------- PREPROCESSING SPACY -------------------------------------------
# The parser and NER components are not used below, so they are disabled.
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])

def preprocess(text):
    """
    Run the spaCy pipeline on `text` and return the lowercased lemmas of the
    tokens that are kept.

    A token is kept when it is alphabetic or number-like and is not a stop
    word; every other token (punctuation, stop words, mixed strings) is
    dropped.
    """
    return [
        tok.lemma_.lower()
        for tok in nlp(text)
        if (tok.is_alpha or tok.like_num) and not tok.is_stop
    ]

# One text per collection row: title and abstract joined by a single space.
corpus = df_collection[['title', 'abstract']].apply(
    lambda paper: f"{paper['title']} {paper['abstract']}", axis=1
).tolist()
# Document ids in the same row order as `corpus`.
cord_uids = df_collection['cord_uid'].tolist()

# Tokenization: each document goes through the spaCy-based preprocess().
print("Tokenizing corpus …")
tokenized_corpus = [preprocess(text) for text in corpus]

# ---------------- WORD2VEC TRAINING --------------------------------------------
# Train Word2Vec on the preprocessed collection (100-dim vectors, window 5,
# words seen fewer than 2 times are dropped).
#
# workers=1 on purpose: gensim documents that `seed` only gives a
# deterministic run with a single worker thread, because multi-threaded
# training is subject to OS scheduling order. With several workers the
# metrics differ from run to run despite the fixed seed.
# NOTE: reproducibility across interpreter launches additionally requires a
# fixed PYTHONHASHSEED environment variable (see the gensim Word2Vec docs).
print("Training Word2Vec model …")
w2v_model = Word2Vec(
    sentences=tokenized_corpus,
    vector_size=100,
    window=5,
    min_count=2,
    workers=1,
    seed=seed,
)

# ---------------- EMBEDDINGS ---------------------------------------------------
def get_avg_embedding(tokens, model):
    """
    Average the word vectors of the tokens that `model.wv` contains.

    Tokens missing from `model.wv` are skipped. If none of the tokens is
    known (or `tokens` is empty), a zero vector of length
    `model.vector_size` is returned instead.
    """
    known_vectors = []
    for token in tokens:
        if token in model.wv:
            known_vectors.append(model.wv[token])
    if not known_vectors:
        return np.zeros(model.vector_size)
    return np.mean(known_vectors, axis=0)

print("Computing document embeddings …")
# Embed every document once; row i belongs to cord_uids[i] because both are
# derived from df_collection in the same row order.
doc_embeddings = np.array([
    get_avg_embedding(doc_tokens, w2v_model) for doc_tokens in tokenized_corpus
])

def encode_queries_w2v(queries, model):
    """
    Embed each query string as the average of its known word vectors.

    Queries go through the same spaCy-based `preprocess` as the corpus and
    are averaged with `get_avg_embedding`. Returns one row per query, in
    input order, as a NumPy array.
    """
    token_lists = [preprocess(query) for query in queries]
    embeddings = [get_avg_embedding(query_tokens, model) for query_tokens in token_lists]
    return np.array(embeddings)

print("Encoding queries (Word2Vec) …")
# Encode the tweet text of each split with the model trained on the collection.
q_emb_train_w2v = encode_queries_w2v(
    df_query_train['tweet_text'].tolist(), w2v_model
)
q_emb_dev_w2v = encode_queries_w2v(
    df_query_dev['tweet_text'].tolist(), w2v_model
)
q_emb_test_w2v = encode_queries_w2v(  # TEST DATA
    df_query_test['tweet_text'].tolist(), w2v_model
)

# ---------------- METRICS ------------------------------------------------------
def compute_metrics(q_emb, doc_emb, gt_ids, doc_ids):
    """
    Rank all documents for every query by cosine similarity and summarise
    where the ground-truth document ends up.

    Args:
        q_emb: Query embeddings, one row per query.
        doc_emb: Document embeddings, one row per document.
        gt_ids: Ground-truth document id per query; gt_ids[i] belongs to
            row i of q_emb.
        doc_ids: Document ids; position j belongs to row j of doc_emb.
            If an id occurs more than once, its first position is used.

    Returns:
        dict of floats with the keys "MRR@1", "MRR@5", "MRR@10",
        "Recall@5", "Recall@10" and "MedianRank" (ranks are 1-based).

    Raises:
        ValueError: if a ground-truth id does not occur in doc_ids.
    """
    # Build the id -> row lookup once instead of a linear doc_ids.index()
    # scan per query. setdefault keeps the first occurrence, matching
    # list.index() semantics for duplicated ids.
    row_of_doc = {}
    for pos, doc_id in enumerate(doc_ids):
        row_of_doc.setdefault(doc_id, pos)

    sims = cosine_similarity(q_emb, doc_emb)
    ranks = []
    for i, row in enumerate(sims):
        try:
            gt_idx = row_of_doc[gt_ids[i]]
        except KeyError as err:
            raise ValueError(
                f"ground-truth id {gt_ids[i]!r} of query {i} is not in doc_ids"
            ) from err
        order = np.argsort(-row)  # document rows, most similar first
        # 1-based position of the ground-truth document in that ranking.
        ranks.append(int(np.where(order == gt_idx)[0][0]) + 1)
    ranks = np.array(ranks)

    # Reciprocal rank counts only when the hit is inside the cutoff k.
    reciprocal = 1.0 / ranks

    def mrr_at(k):
        return float(np.where(ranks <= k, reciprocal, 0.0).mean())

    metrics = {
        "MRR@1":       mrr_at(1),
        "MRR@5":       mrr_at(5),
        "MRR@10":      mrr_at(10),
        "Recall@5":    float((ranks <= 5).mean()),
        "Recall@10":   float((ranks <= 10).mean()),
        "MedianRank":  float(np.median(ranks))
    }
    return metrics

print("Computing metrics for Word2Vec with spaCy-preprocessing …")
# Every split is scored against the same document embeddings and id list.
train_gt_ids = df_query_train['cord_uid'].tolist()
train_metrics_w2v = compute_metrics(q_emb_train_w2v, doc_embeddings, train_gt_ids, cord_uids)
dev_gt_ids = df_query_dev['cord_uid'].tolist()
dev_metrics_w2v = compute_metrics(q_emb_dev_w2v, doc_embeddings, dev_gt_ids, cord_uids)
test_gt_ids = df_query_test['cord_uid'].tolist()  # TEST DATA
test_metrics_w2v = compute_metrics(q_emb_test_w2v, doc_embeddings, test_gt_ids, cord_uids)

# One header line per split, then one "name: value" line per metric.
for header, split_metrics in (
    ("=== Word2Vec (spaCy-preprocessed) Train Metrics ===", train_metrics_w2v),
    ("=== Word2Vec (spaCy-preprocessed) Dev Metrics ===", dev_metrics_w2v),
    ("=== Word2Vec (spaCy-preprocessed) Test Metrics ===", test_metrics_w2v),
):
    print(header)
    for k, v in split_metrics.items():
        print(f"{k:8s}: {v:.4f}")

# ---------------- Optional: LOGGING ---------------------------------------------------
# log_w2v = {
#     **exp,
#     **{f"train_{k}": v for k, v in train_metrics_w2v.items()},
#     **{f"dev_{k}":   v for k, v in dev_metrics_w2v.items()},
#     **{f"test_{k}":  v for k, v in test_metrics_w2v.items()},
#     "timestamp": datetime.now().isoformat()
# }

# pd.DataFrame([log_w2v]).to_csv(
#     OUT_CSV, mode="a",
#     header=not os.path.exists(OUT_CSV),
#     index=False
# )

# print("Logged Word2Vec (spaCy-preprocessed) results to", OUT_CSV)

Tokenizing corpus …
Training Word2Vec model …
Computing document embeddings …
Encoding queries (Word2Vec) …
Computing metrics for Word2Vec with spaCy-preprocessing …
=== Word2Vec (spaCy-preprocessed) Train Metrics ===
MRR@1   : 0.1080
MRR@5   : 0.1512
MRR@10  : 0.1603
Recall@5: 0.2272
Recall@10: 0.2948
MedianRank: 52.0000
=== Word2Vec (spaCy-preprocessed) Dev Metrics ===
MRR@1   : 0.1107
MRR@5   : 0.1455
MRR@10  : 0.1551
Recall@5: 0.2121
Recall@10: 0.2850
MedianRank: 55.0000
=== Word2Vec (spaCy-preprocessed) Test Metrics ===
MRR@1   : 0.0740
MRR@5   : 0.1064
MRR@10  : 0.1137
Recall@5: 0.1632
Recall@10: 0.2192
MedianRank: 97.5000
