In [247]:
import os
import re
import json
import pickle
import random
import string
import unicodedata
import torch
import torch.nn as nn
from pathlib import Path
from collections import Counter
import numpy as np
import pandas as pd
import nltk
from nltk import pos_tag, word_tokenize
from nltk.corpus import stopwords, wordnet
from nltk.stem import PorterStemmer, WordNetLemmatizer
from torch.utils.data import random_split, DataLoader
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer
from torch.utils.data import Dataset

In [248]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /Users/duhaozhou/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/duhaozhou/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/duhaozhou/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/duhaozhou/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

## Part 1 : Exploratory Data Analysis (EDA session)

### (a). load data

In [249]:
def load_json_data(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        return json.load(f)

def load_all_data(data_dir='data'):
    data_dir = Path(data_dir)

    train_claims = load_json_data(data_dir / 'train-claims.json')
    dev_claims = load_json_data(data_dir / 'dev-claims.json')
    test_claims = load_json_data(data_dir / 'test-claims-unlabelled.json')
    evidences = load_json_data(data_dir / 'evidence.json')

    return train_claims, dev_claims, test_claims, evidences
train_data, dev_data, test_data, evidence_data = load_all_data()

### (b). Breif summary of train dataset & evidence dataset (Max /Min /Mean / count)

In [250]:
def summarize_train(train_data):
    claim_lengths = [len(c["claim_text"]) for c in train_data.values()]
    evidence_counts = [len(c["evidences"]) for c in train_data.values()]
    labels = [c["claim_label"] for c in train_data.values()]

    print(f"\nTrain claim count: {len(train_data)}")
    print(f"Max claim length: {max(claim_lengths)}")
    print(f"Min claim length: {min(claim_lengths)}")
    print(f"Mean claim length: {np.mean(claim_lengths)}")

    print(f"Max evidence count per claim: {max(evidence_counts)}")
    print(f"Min evidence count per claim: {min(evidence_counts)}")
    print(f"Mean evidence count per claim: {np.mean(evidence_counts)}")

    print(f"Label distribution: {Counter(labels)}")

def summarize_evidence(evidence_data):
    evidence_lengths = [len(evi) for evi in evidence_data.values()]
    print(f"\nTotal evidence paragraphs: {len(evidence_data)}")
    print(f"Max evidence length: {max(evidence_lengths)}")
    print(f"Min evidence length: {min(evidence_lengths)}")
    print(f"Mean evidence length: {np.mean(evidence_lengths)}")

summarize_train(train_data)
summarize_evidence(evidence_data)


Train claim count: 1228
Max claim length: 332
Min claim length: 26
Mean claim length: 122.95521172638436
Max evidence count per claim: 5
Min evidence count per claim: 1
Mean evidence count per claim: 3.3566775244299674
Label distribution: Counter({'SUPPORTS': 519, 'NOT_ENOUGH_INFO': 386, 'REFUTES': 199, 'DISPUTED': 124})

Total evidence paragraphs: 1208827
Max evidence length: 3148
Min evidence length: 1
Mean evidence length: 119.51412319546138


# Global Part

## 1. Text preprocess function

In [251]:
def get_wordnet_pos(treebank_tag):
    if treebank_tag.startswith("J"):
        return wordnet.ADJ
    elif treebank_tag.startswith("V"):
        return wordnet.VERB
    elif treebank_tag.startswith("N"):
        return wordnet.NOUN
    elif treebank_tag.startswith("R"):
        return wordnet.ADV
    else:
        return wordnet.NOUN

def clean_and_tokenize_text(
    text,
    remove_stopwords=False,
    lemmatize=False,
    stem=False,
    lowercase=True,
    remove_punctuation=True,
    pos_aware_lemmatization=False,
    replace_numbers=False,
    normalize_unicode=False,
    normalize_whitespace=False,
    return_pos_tags=False
):
    if not isinstance(text, str):
        return ""
        
    if normalize_unicode: # Normalization Form Compatibility Composition e.g. (1 / 2km -> 1/2km)
        text = unicodedata.normalize("NFKC", text)
    
    if normalize_whitespace: # remove tabs / whitespace
        text = re.sub(r'\s+', ' ', text).strip()

    if lowercase:
        text = text.lower()

    if remove_punctuation:
        text = re.sub(r"[^\w\s]", " ", text)

    tokens = word_tokenize(text)

    if replace_numbers: # regard all numbers as same token <NUM>, when not care about different numbers meaning
        tokens = ["<NUM>" if token.isdigit() else token for token in tokens]

    if remove_stopwords:
        stop_words = set(stopwords.words("english"))
        tokens = [word for word in tokens if word not in stop_words]

    if lemmatize:
        lemmatizer = WordNetLemmatizer()
        if pos_aware_lemmatization:
            tagged = pos_tag(tokens)
            tokens = [lemmatizer.lemmatize(word, get_wordnet_pos(tag)) for word, tag in tagged]
        else:
            tokens = [lemmatizer.lemmatize(word) for word in tokens]

    if stem:
        stemmer = PorterStemmer()
        tokens = [stemmer.stem(word) for word in tokens]
        
    if return_pos_tags: # part of speech 
        tagged = pos_tag(tokens)
        return list(zip(tokens, [tag for _, tag in tagged]))

    return " ".join(tokens)


## 2. Construct the dataframe to store the information

In [252]:
X_train_text = [v["claim_text"] for v in train_data.values()]
Y_train_evidence = [v["evidences"] for v in train_data.values()]
Y_train_label = [v["claim_label"] for v in train_data.values()]

X_dev_text = [v["claim_text"] for v in dev_data.values()]
Y_dev_evidence = [v["evidences"] for v in dev_data.values()]
Y_dev_label = [v["claim_label"] for v in dev_data.values()]

X_test_text = [v["claim_text"] for v in test_data.values()]

evidence_content = [v for v in evidence_data.values()]

# train dataset
df_train = pd.DataFrame({
    "claim_id": list(train_data.keys()), # claim id
    "claim_text": X_train_text,
    "evidences": Y_train_evidence,
    "label": Y_train_label
})

# development dataframe
df_dev = pd.DataFrame({
    "claim_id": list(dev_data.keys()), # claim ID
    "claim_text": X_dev_text,
    "evidences": Y_dev_evidence,
    "label": Y_dev_label
})

# test dataset
df_test = pd.DataFrame({
    "claim_id": list(test_data.keys()),
    "claim_text": X_test_text
})

# evidence dataframe
df_evidence = pd.DataFrame({
    "evidence_id": list(evidence_data.keys()), # evidence ID
    "evidence_content": evidence_content
})


# LSTM Model

## Build the Vocabulary

In [253]:
# build the vocabulary
# pairs is the list of tuples, tuple is (claim, evidence)
# min_freq means that just remain the words appears two times
def build_vocab(pairs, min_freq = 2):
    # Count the frequency of words
    counter = Counter()
    # after text cleaning and split the text to word, count the frequency of words
    for claim, evidence in pairs:
        text = clean_and_tokenize_text(claim) + " " + clean_and_tokenize_text(evidence)
        tokens = text.split()
        counter.update(tokens)
    
    # just remain the frequency of words is more than two times
    vocab = {"<PAD>": 0, "<UNK>": 1}
    for word, freq in counter.items():
        if freq >= min_freq:
            vocab[word] = len(vocab)
    return vocab

## Build Claim and Evidence Dataset

In [254]:
# Set the maximum length for claim and evidence
claim_lengths = [len(c["claim_text"]) for c in train_data.values()]
evidence_lengths = [len(evi) for evi in evidence_data.values()]
max_len_claim_evi = np.mean(claim_lengths) + np.mean(evidence_lengths) + 1

max_len_claim_evi = int(max_len_claim_evi)

In [255]:
# use self-defined pytorh dataset to change the text dataset
class ClaimEvidenceDataset(Dataset):
    def __init__(self, pairs, labels, vocab, max_len = max_len_claim_evi):
        self.pairs = pairs
        self.labels = labels
        self.vocab = vocab
        self.max_len = max_len
    
    # get the frequency of pairs in dataset
    def __len__(self): return len(self.pairs)
    
    # convert the word to the number ids
    def text_to_ids(self, text):
        # convert the cleaning dataset to list of id
        tokens = clean_and_tokenize_text(text).split()
        ids = [self.vocab.get(t, self.vocab["<UNK>"]) for t in tokens]
        # if the length is not enough, then add '<PAD>'
        # if the length is longer, the longer parts will be deleted
        if len(ids) < self.max_len:
            ids += [self.vocab["<PAD>"]] * (self.max_len - len(ids))
        else:
            ids = ids[:self.max_len]
        return ids
    
    # get the claim and evidence of idx sample and add separate '[SEP]' between claim and evidence
    def __getitem__(self, idx):
        claim, evidence = self.pairs[idx]
        input_ids = self.text_to_ids(claim + " [SEP] " + evidence)
        return torch.tensor(input_ids), torch.tensor(self.labels[idx])

## Build the LSTM Model

In [207]:
class LSTMClassifier(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, padding_idx):
        super().__init__()

        # Embedding layer
        self.embedding = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=embedding_dim,
            # ensures the padding token does not affect learning
            padding_idx=padding_idx  
        )

        # LSTM layer
        self.lstm = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_dim,

            # input shape will be (batch_size, seq_len, embedding_dim)
            batch_first=True, 
                
            bidirectional=False    
        )

        # Dropout layer: helps prevent overfitting
        self.dropout = nn.Dropout(p=0.3)

        # Fully connected layer: maps LSTM hidden states to output classes
        self.fc = nn.Linear(in_features=hidden_dim, out_features=output_dim)

    def forward(self, input_ids):
        
        # Convert token IDs to embeddings [B, T, E]
        embedded = self.embedding(input_ids)

        # Feed embeddings into the LSTM
        # hidden: [1, B, H] (we ignore the full output sequence and the cell state)
        _, (hidden, _) = self.lstm(embedded)

        # Remove the first dimension (num_layers = 1) to [B, H]
        hidden = hidden.squeeze(0)

        # Apply dropout
        dropped = self.dropout(hidden)

        # Final linear layer
        output = self.fc(dropped)

        return output


## Train Model

In [208]:
def build_training_pairs(train_claims, evidence_dict):
    #Mapping from textual labels to numeric labels for classification
    LABEL_MAP = {
        "SUPPORTS": 0,
        "REFUTES": 1,
        "NOT_ENOUGH_INFO": 2,
        "DISPUTED": 3
    }

    #Initialize lists to store (claim, evidence) pairs and labels
    pairs = []
    labels = []

    #Iterate over all claims in the dataset
    for cid, item in train_claims.items():
        #Skip claims without a label or without evidence references
        if "claim_label" not in item or "evidences" not in item:
            continue

        #Extract the claim text and convert its label to numeric
        claim_text = item["claim_text"]
        label = LABEL_MAP[item["claim_label"]]

        #For each associated evidence ID, retrieve the evidence text and form a training pair
        for eid in item["evidences"]:
            evidence_text = evidence_dict.get(eid, "")
            pairs.append((claim_text, evidence_text))
            labels.append(label)

    #Return the list of claim-evidence pairs and their labels
    return pairs, labels

In [209]:
# train the Pytorch model
def train_model(model, train_loader, val_loader, num_epochs, device):
    # initialise the the model
    model = model.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
    criterion = nn.CrossEntropyLoss()
    
    # start to train the epoch
    for epoch in range(num_epochs):
        model.train()
        total_loss = 0.0
        # loop the small batch of data from training dataset for training
        for batch_x, batch_y in train_loader:
            batch_x, batch_y = batch_x.to(device), batch_y.to(device)
            optimizer.zero_grad()
            logits = model(batch_x)
            loss = criterion(logits, batch_y)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        
        # Then evaluate the model with validation dataset
        model.eval()
        total_correct = 0
        total_count = 0
        with torch.no_grad():
            for batch_x, batch_y in val_loader:
                batch_x, batch_y = batch_x.to(device), batch_y.to(device)
                preds = model(batch_x).argmax(dim=1)
                total_correct += (preds == batch_y).sum().item()
                total_count += len(batch_y)

        print(f"Epoch {epoch+1}. Training Loss: {total_loss:.3f}. Validation Accuracy: {total_correct / total_count:.3f}")


In [210]:
if __name__ == "__main__":

    #create training pairs
    pairs, labels = build_training_pairs(train_data, evidence_data)

    #create vocabulary
    vocab = build_vocab(pairs)

    #create dataset + dataLoader
    full_dataset = ClaimEvidenceDataset(pairs, labels, vocab)
    train_size = int(0.8 * len(full_dataset))
    val_size = len(full_dataset) - train_size
    train_set, val_set = random_split(full_dataset, [train_size, val_size])

    train_loader = DataLoader(train_set, batch_size=32, shuffle=True)
    val_loader = DataLoader(val_set, batch_size=32)

    #initialize model
    model = LSTMClassifier(
        vocab_size=len(vocab),
        embedding_dim=100,
        hidden_dim=128,
        output_dim=4,
        padding_idx=vocab["<PAD>"]
    )

    #train model
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    train_model(model, train_loader, val_loader, num_epochs=5, device=device)

Epoch 1. Training Loss: 127.261. Validation Accuracy: 0.458
Epoch 2. Training Loss: 124.532. Validation Accuracy: 0.458
Epoch 3. Training Loss: 124.683. Validation Accuracy: 0.458
Epoch 4. Training Loss: 125.923. Validation Accuracy: 0.458
Epoch 5. Training Loss: 124.598. Validation Accuracy: 0.458


## Test Model

In [211]:
#Use TF-IDF and Cosine Similarity
def retrieve_top_k_evidence_ids(claim_text, evidence_dict, k=5):
 
    #Extract evidence IDs and their corresponding texts
    ev_ids = list(evidence_dict.keys())
    ev_texts = [evidence_dict[eid] for eid in ev_ids]

    #Vectorize claim and all evidences using TF-IDF
    all_texts = [claim_text] + ev_texts
    vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
    tfidf = vectorizer.fit_transform(all_texts)

    # Separate claim vector and evidence vectors
    claim_vec = tfidf[0]           # Vector for the claim
    ev_vecs = tfidf[1:]            # Vectors for all evidences

    #Compute cosine similarity between claim and each evidence
    sims = cosine_similarity(claim_vec, ev_vecs).flatten()

    #Get indices of top-k highest similarity scores
    top_k_idx = sims.argsort()[-k:][::-1]

    #Return corresponding evidence IDs
    return [ev_ids[i] for i in top_k_idx]


def predict_and_write_json(model, data_dict, evidence_dict, vocab, output_path, device):
    model.eval()
    model.to(device)

    # Mapping from numeric label to text label
    ID2LABEL = {0: "SUPPORTS", 1: "REFUTES", 2: "NOT_ENOUGH_INFO", 3: "DISPUTED"}
    results = {}

    for claim_id, item in data_dict.items():
        claim_text = item["claim_text"]

        #Retrieve top-k similar evidences using TF-IDF
        evidence_ids = retrieve_top_k_evidence_ids(claim_text, evidence_dict, k=5)

        #Connect the first top-3 evidence
        evidence_texts = [evidence_dict.get(eid, "") for eid in evidence_ids[:3]]
        evidence_text = " ".join(evidence_texts)


        #Convert text to token IDs
        tokens = clean_and_tokenize_text(claim_text + " [SEP] " + evidence_text).split()
        ids = [vocab.get(tok, vocab["<UNK>"]) for tok in tokens]
        ids = ids[:100] + [vocab["<PAD>"]] * max(0, 100 - len(ids))

        input_tensor = torch.tensor([ids]).to(device)

        #Run the model to get prediction
        with torch.no_grad():
            logits = model(input_tensor)
            pred_id = logits.argmax(dim=1).item()

        #Store prediction result
        results[claim_id] = {
            "claim_label": ID2LABEL[pred_id],
            "evidences": evidence_ids[:5]
        }

    #Write all predictions to a JSON file
    with open(output_path, "w") as f:
        json.dump(results, f, indent=2)
  

In [212]:
if __name__ == "__main__":
    
    # get the training pairs and labels
    pairs, labels = build_training_pairs(train_data, evidence_data)
    # create the vocabulary from training pairs
    vocab = build_vocab(pairs)
    
    # split the 80% training datasets and 20% validation datasets
    full_dataset = ClaimEvidenceDataset(pairs, labels, vocab)
    train_size = int(0.8 * len(full_dataset))
    val_size = len(full_dataset) - train_size
    train_set, val_set = random_split(full_dataset, [train_size, val_size])
    # use dataloader to batch training datasets and validation datasets
    train_loader = DataLoader(train_set, batch_size=32, shuffle=True)
    val_loader = DataLoader(val_set, batch_size=32)

    # initialize the LSTM model
    model = LSTMClassifier(
        vocab_size = len(vocab),
        embedding_dim = 100,
        hidden_dim = 128,
        output_dim = 4,
        padding_idx = vocab["<PAD>"]
    )

    # select the device to train the model
    if torch.cuda.is_available():
        device_type = "cuda"
    else:
        device_type = "cpu"
    device = torch.device(device_type)
    train_model(model, train_loader, val_loader, num_epochs=5, device=device)

    # predict the dev set and write in json file
    predict_and_write_json(
        model = model,
        data_dict = dev_data,    
        evidence_dict = evidence_data,
        vocab = vocab,
        output_path = "dev_pred.json",
        device = device
    )

Epoch 1. Training Loss: 125.293. Validation Accuracy: 0.472
Epoch 2. Training Loss: 123.688. Validation Accuracy: 0.472
Epoch 3. Training Loss: 125.107. Validation Accuracy: 0.472
Epoch 4. Training Loss: 123.902. Validation Accuracy: 0.472
Epoch 5. Training Loss: 124.020. Validation Accuracy: 0.472


# Bidirectional LSTM Model

In [256]:
class LSTMClassifier(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, padding_idx):
        super().__init__()

        # Embedding layer
        self.embedding = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=embedding_dim,
            # ensures the padding token does not affect learning
            padding_idx=padding_idx  
        )

        # LSTM layer
        self.lstm = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_dim,

            # input shape will be (batch_size, seq_len, embedding_dim)
            batch_first=True, 
            # change the bidirectional is True
            bidirectional=True   
        )

        # Dropout layer: helps prevent overfitting
        self.dropout = nn.Dropout(p=0.3)

        # Fully connected layer: maps LSTM hidden states to output classes
        # Change the hidden states become 2, beacuse here is bidirectional
        self.fc = nn.Linear(in_features=hidden_dim * 2, out_features=output_dim)
        
    def forward(self, input_ids):
        
        # Convert token IDs to embeddings [B, T, E]
        embedded = self.embedding(input_ids)

        # Feed embeddings into the LSTM
        # hidden: [1, B, H] (we ignore the full output sequence and the cell state)
        _, (hidden, _) = self.lstm(embedded) 
        
        # hidden is bidirectional is [B, 2*H]
        hidden = torch.cat((hidden[0], hidden[1]), dim=1)

        # Apply dropout
        dropped = self.dropout(hidden)

        # Final linear layer
        output = self.fc(dropped) 

        return output

## Train Model

In [257]:
def build_training_pairs(train_claims, evidence_dict):
    #Mapping from textual labels to numeric labels for classification
    LABEL_MAP = {
        "SUPPORTS": 0,
        "REFUTES": 1,
        "NOT_ENOUGH_INFO": 2,
        "DISPUTED": 3
    }

    #Initialize lists to store (claim, evidence) pairs and labels
    pairs = []
    labels = []

    #Iterate over all claims in the dataset
    for cid, item in train_claims.items():
        #Skip claims without a label or without evidence references
        if "claim_label" not in item or "evidences" not in item:
            continue

        #Extract the claim text and convert its label to numeric
        claim_text = item["claim_text"]
        label = LABEL_MAP[item["claim_label"]]

        #For each associated evidence ID, retrieve the evidence text and form a training pair
        for eid in item["evidences"]:
            evidence_text = evidence_dict.get(eid, "")
            pairs.append((claim_text, evidence_text))
            labels.append(label)

    #Return the list of claim-evidence pairs and their labels
    return pairs, labels

In [258]:
# train the Pytorch model
def train_model(model, train_loader, val_loader, num_epochs, device):
    # initialise the the model
    model = model.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
    criterion = nn.CrossEntropyLoss()
    
    # start to train the epoch
    for epoch in range(num_epochs):
        model.train()
        total_loss = 0.0
        # loop the small batch of data from training dataset for training
        for batch_x, batch_y in train_loader:
            batch_x, batch_y = batch_x.to(device), batch_y.to(device)
            optimizer.zero_grad()
            logits = model(batch_x)
            loss = criterion(logits, batch_y)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        
        # Then evaluate the model with validation dataset
        model.eval()
        total_correct = 0
        total_count = 0
        with torch.no_grad():
            for batch_x, batch_y in val_loader:
                batch_x, batch_y = batch_x.to(device), batch_y.to(device)
                preds = model(batch_x).argmax(dim=1)
                total_correct += (preds == batch_y).sum().item()
                total_count += len(batch_y)

        print(f"Epoch {epoch+1}. Training Loss: {total_loss:.3f}. Validation Accuracy: {total_correct / total_count:.3f}")

In [259]:
if __name__ == "__main__":

    #create training pairs
    pairs, labels = build_training_pairs(train_data, evidence_data)

    #create vocabulary
    vocab = build_vocab(pairs)

    #create dataset + dataLoader
    full_dataset = ClaimEvidenceDataset(pairs, labels, vocab)
    train_size = int(0.8 * len(full_dataset))
    val_size = len(full_dataset) - train_size
    train_set, val_set = random_split(full_dataset, [train_size, val_size])

    train_loader = DataLoader(train_set, batch_size=32, shuffle=True)
    val_loader = DataLoader(val_set, batch_size=32)

    #initialize model
    model = LSTMClassifier(
        vocab_size=len(vocab),
        embedding_dim=100,
        hidden_dim=128,
        output_dim=4,
        padding_idx=vocab["<PAD>"]
    )

    #train model
    if torch.cuda.is_available():
        device_type = "cuda"
    else:
        device_type = "cpu"
    device = torch.device(device_type)
    train_model(model, train_loader, val_loader, num_epochs=5, device=device)

Epoch 1. Training Loss: 119.555. Validation Accuracy: 0.587
Epoch 2. Training Loss: 94.493. Validation Accuracy: 0.732
Epoch 3. Training Loss: 59.411. Validation Accuracy: 0.796
Epoch 4. Training Loss: 37.382. Validation Accuracy: 0.887
Epoch 5. Training Loss: 20.888. Validation Accuracy: 0.920


## Test Model

In [260]:
#Use TF-IDF and Cosine Similarity
def retrieve_top_k_evidence_ids(claim_text, evidence_dict, k=5):
 
    #Extract evidence IDs and their corresponding texts
    ev_ids = list(evidence_dict.keys())
    ev_texts = [evidence_dict[eid] for eid in ev_ids]

    #Vectorize claim and all evidences using TF-IDF
    all_texts = [claim_text] + ev_texts
    vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
    tfidf = vectorizer.fit_transform(all_texts)

    #Separate claim vector and evidence vectors
    claim_vec = tfidf[0]
    ev_vecs = tfidf[1:]  

    #Compute cosine similarity between claim and each evidence
    sims = cosine_similarity(claim_vec, ev_vecs).flatten()

    #Get indices of top-k highest similarity scores
    top_k_idx = sims.argsort()[-k:][::-1]

    return [ev_ids[i] for i in top_k_idx]


def predict_and_write_json(model, data_dict, evidence_dict, vocab, output_path, device):
    model.eval()
    model.to(device)

    # Mapping from numeric label to text label
    ID2LABEL = {0: "SUPPORTS", 1: "REFUTES", 2: "NOT_ENOUGH_INFO", 3: "DISPUTED"}
    results = {}

    for claim_id, item in data_dict.items():
        claim_text = item["claim_text"]

        #Retrieve top-k similar evidences using TF-IDF
        evidence_ids = retrieve_top_k_evidence_ids(claim_text, evidence_dict, k=5)

        #Connect the first top-3 evidence
        evidence_text = evidence_dict.get(evidence_ids[0], "")

        #Convert text to token IDs
        tokens = clean_and_tokenize_text(claim_text + " [SEP] " + evidence_text).split()
        ids = [vocab.get(tok, vocab["<UNK>"]) for tok in tokens]
        ids = ids[:100] + [vocab["<PAD>"]] * max(0, 100 - len(ids))

        input_tensor = torch.tensor([ids]).to(device)

        #Run the model to get prediction
        with torch.no_grad():
            logits = model(input_tensor)
            pred_id = logits.argmax(dim=1).item()

        #Store prediction result
        results[claim_id] = {
            "claim_label": ID2LABEL[pred_id],
            "evidences": evidence_ids[:5]
        }

    #Write all predictions to a JSON file
    with open(output_path, "w") as f:
        json.dump(results, f, indent=2)


In [261]:
if __name__ == "__main__":
    
    # get the training pairs and labels
    pairs, labels = build_training_pairs(train_data, evidence_data)
    # create the vocabulary from training pairs
    vocab = build_vocab(pairs)
    
    # split the 80% training datasets and 20% validation datasets
    full_dataset = ClaimEvidenceDataset(pairs, labels, vocab)
    train_size = int(0.8 * len(full_dataset))
    val_size = len(full_dataset) - train_size
    train_set, val_set = random_split(full_dataset, [train_size, val_size])
    # use dataloader to batch training datasets and validation datasets
    train_loader = DataLoader(train_set, batch_size=32, shuffle=True)
    val_loader = DataLoader(val_set, batch_size=32)

    # initialize the LSTM model
    model = LSTMClassifier(
        vocab_size = len(vocab),
        embedding_dim = 100,
        hidden_dim = 128,
        output_dim = 4,
        padding_idx = vocab["<PAD>"]
    )
    
    # select the device to train the model
    if torch.cuda.is_available():
        device_type = "cuda"
    else:
        device_type = "cpu"
    device = torch.device(device_type)
    train_model(model, train_loader, val_loader, num_epochs=5, device=device)

    # predict the dev set and write in json file
    predict_and_write_json(
        model = model,
        data_dict = dev_data,    
        evidence_dict = evidence_data,
        vocab = vocab,
        output_path = "dev_pred_bidirect.json",
        device = device
    )  

Epoch 1. Training Loss: 119.587. Validation Accuracy: 0.572
Epoch 2. Training Loss: 91.762. Validation Accuracy: 0.698
Epoch 3. Training Loss: 54.661. Validation Accuracy: 0.813
Epoch 4. Training Loss: 30.520. Validation Accuracy: 0.891
Epoch 5. Training Loss: 16.540. Validation Accuracy: 0.916
