In [1]:
!pip install --no-cache-dir numpy==1.24.3 scipy==1.10.1 gensim==4.3.3

# Block 1: Import necessary libraries and setup environment
import os
import re
import json
import pickle
import random
import string
import unicodedata
import torch
import torch.nn as nn
import torch.nn.functional as F
from pathlib import Path
from collections import Counter
import numpy as np
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
import nltk
from nltk import pos_tag, word_tokenize
from nltk.corpus import stopwords, wordnet
from nltk.stem import PorterStemmer, WordNetLemmatizer
from torch.utils.data import random_split, DataLoader, Dataset
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import train_test_split

# Check if we're in Colab
try:
    import google.colab
    IN_COLAB = True
    print("Running in Google Colab")
except:
    IN_COLAB = False
    print("Running in local environment")

# Set random seeds for reproducibility
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(42)

import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')
nltk.download('punkt_tab')
nltk.download('all', quiet=True)

# Check device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Running in Google Colab


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


Using device: cuda


In [2]:
from google.colab import drive
drive.mount('/content/drive')
!ls "/content/drive/My Drive/Colab_Notebooks/nlp_a3/data"
from google.colab import drive
drive.mount('/content/drive')
!mkdir -p ./data
!ls "/content/drive/My Drive/Colab_Notebooks/nlp_a3/data"
!cp "/content/drive/My Drive/Colab_Notebooks/nlp_a3/data/evidence.json" ./data/evidence.json
!cp "/content/drive/My Drive/Colab_Notebooks/nlp_a3/data/dev-claims-baseline.json" ./data/ev-claims-baseline.json
!cp "/content/drive/My Drive/Colab_Notebooks/nlp_a3/data/dev-claims.json" ./data/dev-claims.json
!cp "/content/drive/My Drive/Colab_Notebooks/nlp_a3/data/train-claims.json" ./data/train-claims.json
!cp "/content/drive/My Drive/Colab_Notebooks/nlp_a3/data/test-claims-unlabelled.json" ./data/test-claims-unlabelled.json

Mounted at /content/drive
dev-claims-baseline.json  evidence.json  test-claims-unlabelled.json
dev-claims.json		  evidence.md	 train-claims.json
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
dev-claims-baseline.json  evidence.json  test-claims-unlabelled.json
dev-claims.json		  evidence.md	 train-claims.json


In [3]:
# Block 2: Load data functions
def load_json_data(file_path):
    """Load JSON data from file"""
    with open(file_path, 'r', encoding='utf-8') as f:
        return json.load(f)

def load_all_data(data_dir='data'):
    """Load all necessary data files"""
    data_dir = Path(data_dir)
    train_claims = load_json_data(data_dir / 'train-claims.json')
    dev_claims = load_json_data(data_dir / 'dev-claims.json')
    test_claims = load_json_data(data_dir / 'test-claims-unlabelled.json')
    evidences = load_json_data(data_dir / 'evidence.json')
    return train_claims, dev_claims, test_claims, evidences

try:
    train_data, dev_data, test_data, evidence_data = load_all_data()
    print(f"Loaded {len(train_data)} training claims, {len(dev_data)} development claims, {len(test_data)} test claims, and {len(evidence_data)} evidence passages")
except Exception as e:
    print(f"Error loading data: {e}")
    print("Please ensure data files are in the 'data' directory")


Loaded 1228 training claims, 154 development claims, 153 test claims, and 1208827 evidence passages


In [4]:
# Block 3: Text preprocessing functions
def get_wordnet_pos(treebank_tag):
    """Convert Penn Treebank POS tags to WordNet POS tags"""
    if treebank_tag.startswith("J"):
        return wordnet.ADJ
    elif treebank_tag.startswith("V"):
        return wordnet.VERB
    elif treebank_tag.startswith("N"):
        return wordnet.NOUN
    elif treebank_tag.startswith("R"):
        return wordnet.ADV
    else:
        return wordnet.NOUN

def clean_and_tokenize_text(
    text,
    remove_stopwords=False,
    lemmatize=False,
    stem=False,
    lowercase=True,
    remove_punctuation=True,
    pos_aware_lemmatization=False,
    replace_numbers=False,
    normalize_unicode=False,
    normalize_whitespace=False,
    return_pos_tags=False
):
    """Clean and tokenize text with various preprocessing options"""
    if not isinstance(text, str):
        return ""

    if normalize_unicode:
        text = unicodedata.normalize("NFKC", text)

    if normalize_whitespace:
        text = re.sub(r'\s+', ' ', text).strip()

    if lowercase:
        text = text.lower()

    if remove_punctuation:
        text = re.sub(r"[^\w\s]", " ", text)

    tokens = word_tokenize(text)

    if replace_numbers:
        tokens = ["<NUM>" if token.isdigit() else token for token in tokens]

    if remove_stopwords:
        stop_words = set(stopwords.words("english"))
        tokens = [word for word in tokens if word not in stop_words]

    if lemmatize:
        lemmatizer = WordNetLemmatizer()
        if pos_aware_lemmatization:
            tagged = pos_tag(tokens)
            tokens = [lemmatizer.lemmatize(word, get_wordnet_pos(tag)) for word, tag in tagged]
        else:
            tokens = [lemmatizer.lemmatize(word) for word in tokens]

    if stem:
        stemmer = PorterStemmer()
        tokens = [stemmer.stem(word) for word in tokens]

    if return_pos_tags:
        tagged = pos_tag(tokens)
        return list(zip(tokens, [tag for _, tag in tagged]))

    return " ".join(tokens)


In [5]:
# Block 4: Load Word2Vec model
from gensim.models import KeyedVectors
import gensim.downloader as api

print("Loading Google News Word2Vec model...")
try:
    try:
        word2vec
        print("Word2Vec model already loaded")
    except NameError:
        # Try to load the model
        word2vec = api.load('word2vec-google-news-300')
        print("Word2Vec model loaded successfully")
except Exception as e:
    print(f"Error loading Word2Vec model: {e}")

    # Fallback to simple word embeddings if Word2Vec fails
    print("Falling back to simple TF-IDF representation")
    use_tfidf_fallback = True
else:
    use_tfidf_fallback = False


Loading Google News Word2Vec model...
Word2Vec model loaded successfully


In [6]:
# Block 5: Word2Vec embedding extraction and SVD dimensionality reduction
def get_word2vec_embedding(text, word2vec_model, dim=300):
    """Get average Word2Vec vector representation for text"""
    tokens = clean_and_tokenize_text(text).split()
    vectors = []

    for token in tokens:
        if token in word2vec_model:
            vectors.append(word2vec_model[token])

    if len(vectors) == 0:
        return np.zeros(dim)

    # Average embeddings
    return np.mean(vectors, axis=0)

def create_embeddings_with_svd(texts, word2vec_model, n_components=256):
    """Create embeddings for all texts and apply SVD dimensionality reduction"""
    # Get raw embeddings
    print(f"Creating embeddings for {len(texts)} texts...")
    embeddings = []
    for text in tqdm(texts):
        embedding = get_word2vec_embedding(text, word2vec_model)
        embeddings.append(embedding)

    embeddings = np.array(embeddings)

    # Apply SVD
    print(f"Applying SVD dimensionality reduction to {embeddings.shape}...")
    svd = TruncatedSVD(n_components=n_components, random_state=42)
    reduced_embeddings = svd.fit_transform(embeddings)

    print(f"SVD complete. Shape after reduction: {reduced_embeddings.shape}")
    return reduced_embeddings, svd

# TF-IDF fallback if Word2Vec fails
def create_tfidf_embeddings(texts, n_components=256):
    """Create TF-IDF embeddings with SVD dimensionality reduction"""
    print(f"Creating TF-IDF embeddings for {len(texts)} texts...")

    # Create TF-IDF vectorizer
    vectorizer = TfidfVectorizer(
        preprocessor=lambda x: clean_and_tokenize_text(
            x, remove_stopwords=True, lemmatize=True
        ),
        max_features=10000
    )

    # Fit and transform texts
    tfidf_matrix = vectorizer.fit_transform(texts)

    # Apply SVD
    print(f"Applying SVD dimensionality reduction to {tfidf_matrix.shape}...")
    svd = TruncatedSVD(n_components=n_components, random_state=42)
    reduced_embeddings = svd.fit_transform(tfidf_matrix)

    print(f"SVD complete. Shape after reduction: {reduced_embeddings.shape}")
    return reduced_embeddings, svd, vectorizer

In [7]:
# Block 6: Evidence retrieval functions
def retrieve_top_k_evidence_word2vec(claim_text, evidence_dict, word2vec_model, svd_model, evidence_embeddings, k=5):
    """Retrieve top-k evidence using Word2Vec embeddings and cosine similarity"""
    # Get claim embedding
    claim_embedding = get_word2vec_embedding(claim_text, word2vec_model)
    claim_embedding = claim_embedding.reshape(1, -1)

    # Apply SVD reduction
    claim_embedding_reduced = svd_model.transform(claim_embedding)

    # Calculate cosine similarity
    similarities = cosine_similarity(claim_embedding_reduced, evidence_embeddings).flatten()

    # Get top-k evidence IDs
    ev_ids = list(evidence_dict.keys())
    top_k_idx = similarities.argsort()[-k:][::-1]

    # Return evidence IDs and their similarities for re-ranking
    return [(ev_ids[i], similarities[i]) for i in top_k_idx]

def retrieve_top_k_evidence_tfidf(claim_text, evidence_dict, vectorizer, svd_model, evidence_embeddings, k=5):
    """Retrieve top-k evidence using TF-IDF embeddings and cosine similarity"""
    # Get claim embedding
    claim_vector = vectorizer.transform([claim_text])
    claim_embedding_reduced = svd_model.transform(claim_vector)

    # Calculate cosine similarity
    similarities = cosine_similarity(claim_embedding_reduced, evidence_embeddings).flatten()

    # Get top-k evidence IDs
    ev_ids = list(evidence_dict.keys())
    top_k_idx = similarities.argsort()[-k:][::-1]

    # Return evidence IDs and their similarities for re-ranking
    return [(ev_ids[i], similarities[i]) for i in top_k_idx]

In [8]:
# Modify TransformerReranker Class (Block 7)
class TransformerReranker(nn.Module):
    def __init__(self, input_dim, hidden_dim, num_heads, num_layers, dropout=0.4):  
        super().__init__()

        # Input projection layer
        self.input_projection = nn.Linear(input_dim, hidden_dim)
        self.input_dropout = nn.Dropout(dropout)  

        # Transformer encoder
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=hidden_dim,
            nhead=num_heads,
            dropout=dropout,  # Using increased dropout value
            batch_first=True
        )
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)

        # Attention pooling layer to replace average pooling
        self.attention_pooling = AttentionPooling(hidden_dim)

        self.hidden_layer = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim),
            nn.LayerNorm(hidden_dim),  # Layer normalization helps training stability
            nn.ReLU(),
            nn.Dropout(dropout)
        )

        # Output layer
        self.dropout = nn.Dropout(dropout)

        self.ranking_head = nn.Linear(hidden_dim, 1)

        # Classification head (4 classes)
        self.classification_head = nn.Linear(hidden_dim, 4)

    def forward(self, x, task='ranking', return_attention=False):
        x = self.input_projection(x)
        x = self.input_dropout(x)  

        x = self.transformer(x)

        # Use attention pooling to replace average pooling
        pooled_x, attention_weights = self.attention_pooling(x)

        # NEW: Pass through additional hidden layer
        pooled_x = self.hidden_layer(pooled_x)

        pooled_x = self.dropout(pooled_x)

        if task == 'ranking':
            # Binary classification (relevant or not)
            logits = self.ranking_head(pooled_x)
            # Use log-sigmoid for numerical stability
            result = F.logsigmoid(logits)
        else:
            # Multi-class, use log-softmax for numerical stability
            result = F.log_softmax(self.classification_head(pooled_x), dim=1)

        if return_attention:
            return result, attention_weights
        return result

In [9]:
class AttentionPooling(nn.Module):
    def __init__(self, hidden_dim, attention_dim=None, dropout=0.4): 
        super().__init__()
        if attention_dim is None:
            attention_dim = hidden_dim // 2

        # Attention calculation layers
        self.attention = nn.Sequential(
            nn.Linear(hidden_dim, attention_dim),
            nn.Dropout(dropout/2),  
            nn.Tanh(),
            nn.Linear(attention_dim, 1)
        )
        self.dropout = nn.Dropout(dropout) 

    def forward(self, x):
        attn_weights = self.attention(x)  
        attn_weights = F.softmax(attn_weights, dim=1)  # Softmax along sequence dimension
        attn_weights = self.dropout(attn_weights)  

        # Apply attention weights
        weighted_output = x * attn_weights  # [batch_size, seq_len, hidden_dim]

        # Sum along sequence dimension
        output = weighted_output.sum(dim=1)  # [batch_size, hidden_dim]

        return output, attn_weights.squeeze(-1)  # Return pooled result and attention weights

In [10]:
# Block 8: Dataset for re-ranking
class RerankingDataset(Dataset):
    def __init__(self, claim_evidence_pairs, labels, embedding_func, embedding_params):

        self.pairs = claim_evidence_pairs
        self.labels = labels
        self.embedding_func = embedding_func
        self.embedding_params = embedding_params

    def __len__(self):
        return len(self.pairs)

    def __getitem__(self, idx):
        claim, evidence = self.pairs[idx]

        # Get embeddings using the provided function
        embedding = self.embedding_func(claim, evidence, **self.embedding_params)

        # Convert to tensors
        embedding_tensor = torch.tensor(embedding, dtype=torch.float32)
        label_tensor = torch.tensor(self.labels[idx], dtype=torch.float32)

        return embedding_tensor, label_tensor


In [11]:
# Block 9: Embedding functions for the dataset
def get_word2vec_combined_embedding(claim, evidence, word2vec_model, svd_model):
    """Get combined embedding for claim and evidence using Word2Vec"""
    claim_embedding = get_word2vec_embedding(claim, word2vec_model)
    evidence_embedding = get_word2vec_embedding(evidence, word2vec_model)

    # Concatenate and reshape
    combined_embedding = np.concatenate([claim_embedding, evidence_embedding])
    combined_embedding = combined_embedding.reshape(2, -1)

    # Apply SVD reduction
    combined_embedding_reduced = svd_model.transform(combined_embedding)

    return combined_embedding_reduced

def get_tfidf_combined_embedding(claim, evidence, vectorizer, svd_model):
    """Get combined embedding for claim and evidence using TF-IDF"""
    claim_vector = vectorizer.transform([claim])
    evidence_vector = vectorizer.transform([evidence])

    claim_embedding = svd_model.transform(claim_vector)[0]
    evidence_embedding = svd_model.transform(evidence_vector)[0]

    # Stack embeddings
    combined_embedding = np.vstack([claim_embedding, evidence_embedding])

    return combined_embedding

In [12]:
# Block 10: Training function for re-ranker
def train_reranker(model, train_loader, val_loader, num_epochs, device, patience=4):
    """Train the re-ranking model"""
    model = model.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=5e-5)
    # Use BCE with logits loss for numerical stability
    criterion = nn.BCEWithLogitsLoss()

    best_val_acc = 0.0
    patience_counter = 0

    for epoch in range(num_epochs):
        # Training phase
        model.train()
        total_loss = 0.0
        total_correct = 0
        total_count = 0

        for batch_x, batch_y in tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}"):
            batch_x, batch_y = batch_x.to(device), batch_y.to(device)

            optimizer.zero_grad()
            log_probs = model(batch_x, task='ranking').squeeze()
            # Convert log probabilities to actual probabilities for the loss
            probs = log_probs.exp()
            loss = criterion(log_probs, batch_y)
            loss.backward()
            optimizer.step()

            # Training metrics
            preds = (probs > 0.5).float()
            total_correct += (preds == batch_y).sum().item()
            total_count += len(batch_y)
            total_loss += loss.item() * len(batch_y)

        train_loss = total_loss / total_count
        train_acc = total_correct / total_count

        # Validation phase
        model.eval()
        val_total_correct = 0
        val_total_count = 0
        val_total_loss = 0.0

        with torch.no_grad():
            for batch_x, batch_y in tqdm(val_loader, desc="Validation"):
                batch_x, batch_y = batch_x.to(device), batch_y.to(device)
                log_probs = model(batch_x, task='ranking').squeeze()
                # Convert log probabilities to actual probabilities
                probs = log_probs.exp()
                loss = criterion(log_probs, batch_y)
                preds = (probs > 0.5).float()
                val_total_correct += (preds == batch_y).sum().item()
                val_total_count += len(batch_y)
                val_total_loss += loss.item() * len(batch_y)

        val_loss = val_total_loss / val_total_count
        val_acc = val_total_correct / val_total_count

        print(f"Epoch {epoch+1}/{num_epochs}")
        print(f"  Train Loss: {train_loss:.3f}, Train Acc: {train_acc:.3f}")
        print(f"  Val Loss: {val_loss:.3f}, Val Acc: {val_acc:.3f}")

        # Save best model
        if val_acc > best_val_acc:
            best_val_acc = val_acc
            patience_counter = 0
            torch.save(model.state_dict(), 'best_reranker.pth')
            print("  Saved new best model")
        else:
            patience_counter += 1
            print(f"  No improvement for {patience_counter} epochs")

        # Early stopping
        if patience_counter >= patience:
            print(f"Early stopping after {epoch+1} epochs")
            break

    # Load best model
    model.load_state_dict(torch.load('best_reranker.pth'))
    return model, best_val_acc


In [13]:
# Block 11: Build training data for re-ranking
def build_reranking_data(train_claims, evidence_dict):
    """Build training data for the re-ranking model"""
    pairs = []
    labels = []

    for cid, item in tqdm(train_claims.items(), desc="Building training data"):
        if "claim_label" not in item or "evidences" not in item:
            continue

        claim_text = item["claim_text"]
        relevant_evidence_ids = set(item["evidences"])

        # Sample positive examples (relevant evidence)
        for eid in relevant_evidence_ids:
            if eid in evidence_dict:
                evidence_text = evidence_dict[eid]
                pairs.append((claim_text, evidence_text))
                labels.append(1.0)  # Relevant

        # Sample negative examples (random evidence)
        all_evidence_ids = list(evidence_dict.keys())
        num_negatives = min(len(relevant_evidence_ids) * 3, 15)  # Increased negative samples

        for _ in range(num_negatives):
            random_eid = random.choice(all_evidence_ids)
            if random_eid not in relevant_evidence_ids:
                evidence_text = evidence_dict[random_eid]
                pairs.append((claim_text, evidence_text))
                labels.append(0.0)  # Not relevant

    return pairs, labels


In [14]:
class ClaimClassifier(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim=4, dropout=0.4):  
        """Claim classifier using evidence embeddings"""
        super().__init__()

        # Input projection
        self.input_projection = nn.Linear(input_dim, hidden_dim)
        self.input_dropout = nn.Dropout(dropout) 

        encoder_layer = nn.TransformerEncoderLayer(
            d_model=hidden_dim,
            nhead=8,
            dropout=dropout,
            batch_first=True
        )
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=3)  

        # Attention pooling layer
        self.attention_pooling = AttentionPooling(hidden_dim, dropout=dropout)  

        self.dropout = nn.Dropout(dropout)
        self.fc1 = nn.Linear(hidden_dim, hidden_dim)
        self.layer_norm = nn.LayerNorm(hidden_dim) 
        self.fc2 = nn.Linear(hidden_dim, hidden_dim // 2)
        self.fc3 = nn.Linear(hidden_dim // 2, output_dim) 

    def forward(self, x, return_attention=False):
        x = self.input_projection(x)
        x = self.input_dropout(x)  

        x = self.transformer(x)

        # Use attention pooling
        pooled_x, attention_weights = self.attention_pooling(x)

        x = self.fc1(pooled_x)
        x = self.layer_norm(x) 
        x = F.relu(x)
        x = self.dropout(x)

        x = F.relu(self.fc2(x))
        x = self.dropout(x)
        logits = self.fc3(x)  

        # Apply log-softmax for numerical stability
        result = F.log_softmax(logits, dim=1)

        if return_attention:
            return result, attention_weights
        return result

In [15]:
# Block 13: Dataset for classification
class ClassificationDataset(Dataset):
    def __init__(self, claims, evidence_dict, label_map, embedding_func, embedding_params):
        self.samples = []
        self.labels = []

        for cid, item in tqdm(claims.items(), desc="Building classification dataset"):
            if "claim_label" in item and "evidences" in item and "claim_text" in item:
                claim_text = item["claim_text"]
                evidence_ids = item["evidences"]
                label = item["claim_label"]

                # Get evidence texts
                evidence_texts = []
                for eid in evidence_ids:
                    if eid in evidence_dict:
                        evidence_texts.append(evidence_dict[eid])

                if evidence_texts:
                    # Concatenate all evidence for this claim
                    combined_evidence = " ".join(evidence_texts[:5]) 

                    # Save sample and label
                    self.samples.append((claim_text, combined_evidence))
                    self.labels.append(label_map[label])

        self.embedding_func = embedding_func
        self.embedding_params = embedding_params

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        claim_text, evidence_text = self.samples[idx]
        label = self.labels[idx]

        # Get embedding using the provided function
        embedding = self.embedding_func(claim_text, evidence_text, **self.embedding_params)

        # Convert to tensors
        embedding_tensor = torch.tensor(embedding, dtype=torch.float32)
        label_tensor = torch.tensor(label, dtype=torch.long)

        return embedding_tensor, label_tensor

In [16]:
# Block 14: Training function for classifier
def train_classifier(model, train_loader, val_loader, num_epochs, device, patience=3):
    """Train the classification model"""
    model = model.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=5e-5)
    # Use NLLLoss since our model outputs log-softmax probabilities
    criterion = nn.NLLLoss()

    best_val_acc = 0.0
    patience_counter = 0

    for epoch in range(num_epochs):
        # Training phase
        model.train()
        total_loss = 0.0
        total_correct = 0
        total_count = 0

        for batch_x, batch_y in tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}"):
            batch_x, batch_y = batch_x.to(device), batch_y.to(device)

            optimizer.zero_grad()
            log_probs = model(batch_x)  
            loss = criterion(log_probs, batch_y)
            loss.backward()
            optimizer.step()

            # Training metrics - still use argmax since highest log probability corresponds to highest probability
            preds = torch.argmax(log_probs, dim=1)
            total_correct += (preds == batch_y).sum().item()
            total_count += len(batch_y)
            total_loss += loss.item() * len(batch_y)

        train_loss = total_loss / total_count
        train_acc = total_correct / total_count

        # Validation phase
        model.eval()
        val_total_correct = 0
        val_total_count = 0
        val_total_loss = 0.0

        with torch.no_grad():
            for batch_x, batch_y in tqdm(val_loader, desc="Validation"):
                batch_x, batch_y = batch_x.to(device), batch_y.to(device)
                log_probs = model(batch_x)
                loss = criterion(log_probs, batch_y)
                preds = torch.argmax(log_probs, dim=1)
                val_total_correct += (preds == batch_y).sum().item()
                val_total_count += len(batch_y)
                val_total_loss += loss.item() * len(batch_y)

        val_loss = val_total_loss / val_total_count
        val_acc = val_total_correct / val_total_count

        print(f"Epoch {epoch+1}/{num_epochs}")
        print(f"  Train Loss: {train_loss:.3f}, Train Acc: {train_acc:.3f}")
        print(f"  Val Loss: {val_loss:.3f}, Val Acc: {val_acc:.3f}")

        # Save best model
        if val_acc > best_val_acc:
            best_val_acc = val_acc
            patience_counter = 0
            torch.save(model.state_dict(), 'best_classifier.pth')
            print("  Saved new best model")
        else:
            patience_counter += 1
            print(f"  No improvement for {patience_counter} epochs")

        # Early stopping
        if patience_counter >= patience:
            print(f"Early stopping after {epoch+1} epochs")
            break

    # Load best model
    model.load_state_dict(torch.load('best_classifier.pth'))
    return model, best_val_acc


In [17]:
# Block 15: Main training pipeline
def main_training_pipeline(top_k=5):
    """Main training pipeline with re-ranking and classification"""
    LABEL_MAP = {"SUPPORTS": 0, "REFUTES": 1, "NOT_ENOUGH_INFO": 2, "DISPUTED": 3}
    ID2LABEL = {v: k for k, v in LABEL_MAP.items()}

    # Decide which embedding method to use based on Word2Vec availability
    if use_tfidf_fallback:
        print("Using TF-IDF embeddings for training")
        # Create TF-IDF representations for all evidence
        evidence_texts = list(evidence_data.values())
        evidence_embeddings, svd_model, vectorizer = create_tfidf_embeddings(
            evidence_texts, n_components=256
        )

        # Use TF-IDF embedding functions
        retrieve_func = lambda claim, ev_dict, k: retrieve_top_k_evidence_tfidf(
            claim, ev_dict, vectorizer, svd_model, evidence_embeddings, k
        )

        embedding_func = get_tfidf_combined_embedding
        embedding_params = {"vectorizer": vectorizer, "svd_model": svd_model}
    else:
        print("Using Word2Vec embeddings for training")
        # Create evidence embeddings using Word2Vec
        evidence_texts = list(evidence_data.values())
        evidence_embeddings, svd_model = create_embeddings_with_svd(
            evidence_texts, word2vec, n_components=256
        )

        # Use Word2Vec embedding functions
        retrieve_func = lambda claim, ev_dict, k: retrieve_top_k_evidence_word2vec(
            claim, ev_dict, word2vec, svd_model, evidence_embeddings, k
        )

        embedding_func = get_word2vec_combined_embedding
        embedding_params = {"word2vec_model": word2vec, "svd_model": svd_model}

    # Prepare re-ranking data
    print("Building re-ranking training data...")
    rerank_pairs, rerank_labels = build_reranking_data(train_data, evidence_data)

    # Create re-ranking dataset
    rerank_dataset = RerankingDataset(rerank_pairs, rerank_labels, embedding_func, embedding_params)

    # Split into train and validation sets
    train_size = int(0.8 * len(rerank_dataset))
    val_size = len(rerank_dataset) - train_size
    train_set, val_set = random_split(rerank_dataset, [train_size, val_size])

    # Data loaders for re-ranker
    train_loader = DataLoader(train_set, batch_size=32, shuffle=True)
    val_loader = DataLoader(val_set, batch_size=32)

    # Initialize re-ranking model
    reranker = TransformerReranker(
        input_dim=256,  # SVD reduced dimension
        hidden_dim=256,
        num_heads=8,
        num_layers=3
    )

    # Train re-ranker
    print(f"Training re-ranker on {device}")
    reranker, best_reranker_acc = train_reranker(
        reranker, train_loader, val_loader, num_epochs=5, device=device
    )
    print(f"Best re-ranker validation accuracy: {best_reranker_acc:.3f}")

    # Generate enhanced evidence for all training data
    print("Retrieving and re-ranking evidence for training set...")
    train_data_with_retrieved = {}

    for claim_id, item in tqdm(train_data.items()):
        if "claim_text" not in item:
            continue

        claim_text = item["claim_text"]

        # Step 1: Retrieve initial evidence
        initial_evidence = retrieve_func(claim_text, evidence_data, top_k*2)

        # Step 2: Re-rank with transformer
        reranked_evidence = []

        with torch.no_grad():
            for eid, initial_score in initial_evidence:
                evidence_text = evidence_data[eid]

                # Create input for re-ranker
                embedding = embedding_func(claim_text, evidence_text, **embedding_params)
                input_tensor = torch.tensor(embedding, dtype=torch.float32).unsqueeze(0).to(device)

                # Get re-ranking score
                relevance_score = reranker(input_tensor, task='ranking').item()

                # Combine initial similarity and re-ranking score
                final_score = initial_score * 0.3 + relevance_score * 0.7
                reranked_evidence.append((eid, final_score))

        # Sort and take top-k
        reranked_evidence.sort(key=lambda x: x[1], reverse=True)
        top_evidence_ids = [eid for eid, _ in reranked_evidence[:top_k]]

        # Create enhanced item with retrieved evidence
        train_data_with_retrieved[claim_id] = {
            "claim_text": claim_text,
            "claim_label": item.get("claim_label", ""),
            "evidences": top_evidence_ids  # Use retrieved evidence
        }

    # Prepare classification dataset
    print("Building classification dataset...")
    classification_dataset = ClassificationDataset(
        train_data_with_retrieved, evidence_data, LABEL_MAP, embedding_func, embedding_params
    )

    # Split classification dataset
    train_size = int(0.8 * len(classification_dataset))
    val_size = len(classification_dataset) - train_size
    train_clf_set, val_clf_set = random_split(classification_dataset, [train_size, val_size])

    # Data loaders for classifier
    train_clf_loader = DataLoader(train_clf_set, batch_size=16, shuffle=True)
    val_clf_loader = DataLoader(val_clf_set, batch_size=16)

    # Initialize classifier
    classifier = ClaimClassifier(
        input_dim=256,  # SVD reduced dimension
        hidden_dim=256
    )

    # Train classifier
    print(f"Training classifier on {device}")
    classifier, best_clf_acc = train_classifier(
        classifier, train_clf_loader, val_clf_loader, num_epochs=5, device=device
    )
    print(f"Best classifier validation accuracy: {best_clf_acc:.3f}")

    return reranker, classifier, retrieve_func, embedding_func, embedding_params, ID2LABEL

In [18]:

# Block 16: Evaluation with re-ranking and classification
def evaluate(reranker, classifier, retrieve_func, embedding_func, embedding_params,
           id2label, data, evidence_data, device, top_k=5):
    """Evaluate models on given data"""
    reranker.eval()
    classifier.eval()
    reranker.to(device)
    classifier.to(device)

    results = {}

    for claim_id, item in tqdm(data.items(), desc="Evaluating"):
        if "claim_text" not in item:
            continue

        claim_text = item["claim_text"]

        # Step 1: Retrieve initial evidence with similarities
        initial_evidence = retrieve_func(claim_text, evidence_data, top_k*2)

        # Step 2: Re-rank evidence using transformer
        reranked_evidence = []

        with torch.no_grad():
            for eid, initial_score in initial_evidence:
                evidence_text = evidence_data[eid]

                # Create input for re-ranker
                embedding = embedding_func(claim_text, evidence_text, **embedding_params)
                input_tensor = torch.tensor(embedding, dtype=torch.float32).unsqueeze(0).to(device)

                # Get re-ranking score (convert from log space to probability)
                log_relevance_score = reranker(input_tensor, task='ranking').item()
                relevance_score = np.exp(log_relevance_score)  # Convert log probability to probability

                # Combine initial similarity and re-ranking score
                final_score = initial_score * 0.3 + relevance_score * 0.7
                reranked_evidence.append((eid, final_score))

        # Sort by final score and take top-k
        reranked_evidence.sort(key=lambda x: x[1], reverse=True)
        final_evidence_ids = [eid for eid, _ in reranked_evidence[:top_k]]

        # Step 3: Classification using top-k evidence
        evidence_texts = [evidence_data[eid] for eid in final_evidence_ids]
        combined_evidence = " ".join(evidence_texts)

        # Create input for classifier
        embedding = embedding_func(claim_text, combined_evidence, **embedding_params)
        input_tensor = torch.tensor(embedding, dtype=torch.float32).unsqueeze(0).to(device)

        # Classify using log probabilities
        with torch.no_grad():
            log_probs = classifier(input_tensor) 
            pred_idx = torch.argmax(log_probs, dim=1).item()  # argmax works the same on log probs
            pred_label = id2label[pred_idx]

        results[claim_id] = {
            "claim_label": pred_label,
            "evidences": final_evidence_ids
        }

    return results


In [19]:
# Block 17: Save predictions and evaluate
def save_and_evaluate(predictions, output_path, groundtruth_path=None):
    """Save predictions and optionally run evaluation"""
    # Save predictions
    with open(output_path, 'w') as f:
        json.dump(predictions, f, indent=2)

    print(f"Predictions saved to {output_path}")

    # Run evaluation if groundtruth is provided
    if groundtruth_path and os.path.exists(groundtruth_path):
        try:
            # Try to import and run eval directly
            import sys
            sys.path.append(os.path.dirname(os.path.abspath(__file__)))

            try:
                from eval import main as eval_main

                class Args:
                    def __init__(self):
                        self.predictions = output_path
                        self.groundtruth = groundtruth_path
                        self.verbose = False

                print("\nRunning evaluation...")
                eval_main(Args())
            except ImportError:
                # Fallback to subprocess
                import subprocess
                cmd = f"python eval.py --predictions {output_path} --groundtruth {groundtruth_path}"

                result = subprocess.run(cmd.split(), capture_output=True, text=True)
                print("\nEvaluation Results:")
                print(result.stdout)
        except Exception as e:
            print(f"Error running evaluation: {e}")
            print(f"Please run manually: python eval.py --predictions {output_path} --groundtruth {groundtruth_path}")

In [None]:
# Block 18: Main execution
if __name__ == "__main__":
    # Run full pipeline with different top-k values
    print("\n" + "="*80)
    print("STARTING MAIN TRAINING PIPELINE")
    print("="*80)

    # Train models
    reranker, classifier, retrieve_func, embedding_func, embedding_params, id2label = main_training_pipeline()

    # Evaluate on development set with top-k=4
    print("\n" + "="*80)
    print("EVALUATING WITH TOP-K=4")
    print("="*80)
    dev_predictions_top4 = evaluate(reranker, classifier, retrieve_func, embedding_func,
                                   embedding_params, id2label, dev_data, evidence_data, device, top_k=4)
    save_and_evaluate(dev_predictions_top4, "dev-claims-predictions-top4.json", "data/dev-claims.json")

In [21]:
!python eval.py --predictions dev-claims-predictions-top4.json --groundtruth data/dev-claims.json

Evidence Retrieval F-score (F)    = 0.07549989692846835
Claim Classification Accuracy (A) = 0.44155844155844154
Harmonic Mean of F and A          = 0.12895108479679399
