In [1]:
import os
import re
import json
import pickle
import random
import string
import unicodedata
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, random_split
from transformers import AutoTokenizer, AutoModel
from pathlib import Path
from collections import Counter
import numpy as np
import pandas as pd
import nltk
from nltk import pos_tag, word_tokenize
from nltk.corpus import stopwords, wordnet
from nltk.stem import PorterStemmer, WordNetLemmatizer
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer

In [2]:
# Fetch the NLTK data packages (punkt / punkt_tab tokenizer data, the stopword
# list and WordNet). nltk.download() skips packages that are already current,
# as the log below shows. The calls are kept separate so the last one's return
# value remains the cell's displayed output.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /Users/duhaozhou/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/duhaozhou/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/duhaozhou/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/duhaozhou/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

## Part 1 : Exploratory Data Analysis (EDA session)

### (a). load data

In [3]:
def load_json_data(file_path):
    """Read a UTF-8 encoded JSON file and return the decoded Python object.

    Args:
        file_path: Anything ``open()`` accepts as a path to the JSON file.

    Returns:
        The object produced by decoding the file's JSON content.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        raw_text = handle.read()
    return json.loads(raw_text)

def load_all_data(data_dir='data'):
    """Load the four JSON files of the task from one directory.

    Args:
        data_dir: Directory that contains ``train-claims.json``,
            ``dev-claims.json``, ``test-claims-unlabelled.json`` and
            ``evidence.json``.

    Returns:
        A 4-tuple ``(train_claims, dev_claims, test_claims, evidences)`` with
        the decoded content of each file, in that order.
    """
    root = Path(data_dir)
    # Files are read in the same order as the returned tuple.
    file_names = (
        'train-claims.json',
        'dev-claims.json',
        'test-claims-unlabelled.json',
        'evidence.json',
    )
    train_claims, dev_claims, test_claims, evidences = (
        load_json_data(root / file_name) for file_name in file_names
    )
    return train_claims, dev_claims, test_claims, evidences
# Load every split once from the default 'data' directory; train_data, dev_data
# and evidence_data are reused by later cells.
train_data, dev_data, test_data, evidence_data = load_all_data()

### (b). Brief summary of the train and evidence datasets (max / min / mean / count)

In [4]:
def summarize_train(train_data):
    """Print count, length, evidence-count and label statistics for the claims.

    Args:
        train_data: Mapping whose values are records with the keys
            ``"claim_text"``, ``"evidences"`` and ``"claim_label"``.

    Lengths are ``len()`` of each ``claim_text`` value and evidence counts are
    ``len()`` of each ``evidences`` value. Nothing is returned; the report is
    written with ``print``.
    """
    records = list(train_data.values())
    text_sizes = [len(record["claim_text"]) for record in records]
    evidence_sizes = [len(record["evidences"]) for record in records]
    label_counts = Counter([record["claim_label"] for record in records])

    print(f"\nTrain claim count: {len(train_data)}")

    # Claim-text length statistics.
    print(f"Max claim length: {max(text_sizes)}")
    print(f"Min claim length: {min(text_sizes)}")
    print(f"Mean claim length: {np.mean(text_sizes)}")

    # Number of linked evidence ids per claim.
    print(f"Max evidence count per claim: {max(evidence_sizes)}")
    print(f"Min evidence count per claim: {min(evidence_sizes)}")
    print(f"Mean evidence count per claim: {np.mean(evidence_sizes)}")

    print(f"Label distribution: {label_counts}")

def summarize_evidence(evidence_data):
    """Print how many evidence entries exist and their max / min / mean length.

    Args:
        evidence_data: Mapping whose values are the evidence entries; the
            length of an entry is ``len()`` of its value.

    Nothing is returned; the report is written with ``print``.
    """
    passage_sizes = list(map(len, evidence_data.values()))
    print(f"\nTotal evidence paragraphs: {len(evidence_data)}")
    print(f"Max evidence length: {max(passage_sizes)}")
    print(f"Min evidence length: {min(passage_sizes)}")
    print(f"Mean evidence length: {np.mean(passage_sizes)}")

# Print the summary statistics for the training claims and the evidence corpus.
summarize_train(train_data)
summarize_evidence(evidence_data)


Train claim count: 1228
Max claim length: 332
Min claim length: 26
Mean claim length: 122.95521172638436
Max evidence count per claim: 5
Min evidence count per claim: 1
Mean evidence count per claim: 3.3566775244299674
Label distribution: Counter({'SUPPORTS': 519, 'NOT_ENOUGH_INFO': 386, 'REFUTES': 199, 'DISPUTED': 124})

Total evidence paragraphs: 1208827
Max evidence length: 3148
Min evidence length: 1
Mean evidence length: 119.51412319546138


# Use Hugging Face BERT Embedding & Transformer Encoder

## Build the Model

In [5]:
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device_type = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_type)

# Load the pretrained bert-base-uncased tokenizer and encoder from Hugging Face.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
bert_model = AutoModel.from_pretrained("bert-base-uncased").to(device)

# The encoder is used as a frozen feature extractor: no parameter receives
# gradients, and eval mode is switched on.
bert_model.requires_grad_(False)
bert_model.eval()

#Transformer Classifier
class Transformer(nn.Module):
    """Transformer encoder stack followed by a linear classification head.

    Args:
        input_dim: Feature size of each input position (``d_model``) and input
            size of the classifier.
        hidden_dim: Width of the feed-forward sub-layer in every encoder layer.
        num_classes: Number of output logits.
        num_layers: Number of stacked encoder layers.
        dropout: Dropout probability used inside the encoder layers.
    """

    def __init__(self, input_dim=768, hidden_dim=512, num_classes=4, num_layers=3, dropout=0.3):
        super().__init__()
        # Every encoder layer uses 4 attention heads and batch-first tensors.
        layer_config = dict(
            d_model=input_dim,
            nhead=4,
            dim_feedforward=hidden_dim,
            dropout=dropout,
            batch_first=True,
        )
        self.encoder = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(**layer_config),
            num_layers=num_layers,
        )
        self.classifier = nn.Linear(input_dim, num_classes)

    def forward(self, cls_embed):
        """Encode a batch-first input and classify its first position.

        Args:
            cls_embed: Tensor laid out as (batch, sequence, input_dim); the
                embedding helper in this notebook produces a sequence length
                of 1.

        Returns:
            Logits of shape (batch, num_classes).
        """
        encoded = self.encoder(cls_embed)
        # Only position 0 of the encoded sequence feeds the classifier.
        first_position = encoded[:, 0]
        return self.classifier(first_position)

#Dataset
class BERTDataset(Dataset):
    """Dataset of (claim, evidence) text pairs with integer class labels.

    Args:
        pairs: Indexable collection of ``(claim, evidence)`` 2-item entries.
        labels: Indexable collection of integer labels aligned with ``pairs``.
    """

    def __init__(self, pairs, labels):
        self.pairs = pairs
        self.labels = labels

    def __len__(self):
        """Return the number of stored pairs."""
        return len(self.pairs)

    def __getitem__(self, idx):
        """Return one sample as ``{"text": str, "label": int64 tensor}``.

        The text is the claim and the evidence joined by the literal
        `` [SEP] `` marker.
        """
        claim, evidence = self.pairs[idx]
        joined_text = " [SEP] ".join((claim, evidence))
        label_tensor = torch.tensor(self.labels[idx], dtype=torch.long)
        return {"text": joined_text, "label": label_tensor}

#Bert embedding
def get_cls_embedding(texts, tokenizer, model, device):
    """Embed a batch of texts and return the first-position hidden state.

    Args:
        texts: Batch of input strings handed to ``tokenizer``.
        tokenizer: Callable tokenizer; it is called with padding, truncation,
            ``max_length=128`` and ``return_tensors="pt"``.
        model: Encoder whose output exposes ``last_hidden_state``.
        device: Device the tokenized batch is moved to before the forward pass.

    Returns:
        Tensor of shape (batch, 1, hidden): position 0 of ``last_hidden_state``
        with a length-1 sequence axis re-inserted. Computed under
        ``torch.no_grad()``.
    """
    # Inputs are capped at 128 tokens; longer texts are truncated.
    tokenized = tokenizer(
        texts,
        padding=True,
        truncation=True,
        max_length=128,
        return_tensors="pt",
    )
    batch_inputs = tokenized.to(device)
    with torch.no_grad():
        hidden_states = model(**batch_inputs).last_hidden_state
        first_position = hidden_states.select(1, 0)
        return first_position.unsqueeze(1)

## Train Model

In [6]:
def train_model(model, bert, train_loader, val_loader, num_epochs, device, text_tokenizer=None):
    """Train the classifier on frozen BERT embeddings and report validation accuracy.

    Args:
        model: Classifier that maps the embedding tensor to class logits.
        bert: Encoder passed to ``get_cls_embedding`` to embed each batch.
        train_loader: Iterable of batches with the keys ``"text"`` and ``"label"``.
        val_loader: Iterable of batches with the same keys, used for accuracy.
        num_epochs: Number of passes over ``train_loader``.
        device: Device the classifier, labels and embeddings live on.
        text_tokenizer: Tokenizer used to embed the texts. Defaults to the
            notebook-level ``tokenizer`` so existing calls keep working.

    Returns:
        None. One line per epoch is printed with the summed training loss and
        the validation accuracy (``nan`` when ``val_loader`` yields no samples).
    """
    # Make the tokenizer dependency explicit instead of always reading a global.
    if text_tokenizer is None:
        text_tokenizer = tokenizer

    # put model on GPU or CPU
    model.to(device)
    # Adam with a learning rate of 1e-3
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
    criterion = nn.CrossEntropyLoss()

    for epoch in range(num_epochs):
        # ---- training pass ----
        model.train()
        total_loss = 0.0
        for batch in train_loader:
            texts = batch["text"]
            labels = batch["label"].to(device)
            # BERT embeddings are the classifier's input features
            cls_embed = get_cls_embedding(texts, text_tokenizer, bert, device)
            optimizer.zero_grad()
            logits = model(cls_embed)
            # compute classification loss
            loss = criterion(logits, labels)
            loss.backward()
            # update model parameters
            optimizer.step()
            # accumulate the per-batch loss (reported as a sum, not a mean)
            total_loss += loss.item()

        # ---- validation pass ----
        model.eval()
        total_correct = 0
        total_count = 0
        # no gradients are needed for evaluation
        with torch.no_grad():
            for batch in val_loader:
                texts = batch["text"]
                labels = batch["label"].to(device)
                cls_embed = get_cls_embedding(texts, text_tokenizer, bert, device)
                preds = model(cls_embed).argmax(dim=1)
                total_correct += (preds == labels).sum().item()
                total_count += len(labels)

        # An empty validation loader must not crash the run after a full
        # training epoch; report nan instead of dividing by zero.
        val_accuracy = total_correct / total_count if total_count else float("nan")

        print(f"Epoch {epoch+1}: Training Loss = {total_loss:.3f}, Validation Accuracy = {val_accuracy:.3f}")

## Test Model

In [7]:
#Use TF-IDF and Cosine Similarity
def retrieve_top_k_evidence_ids(claim_text, evidence_dict, k=5):
    """Return the ids of the ``k`` evidences most similar to a claim.

    Similarity is the cosine similarity between TF-IDF vectors of the claim
    and of every evidence text.

    Args:
        claim_text: The claim to search for.
        evidence_dict: Mapping from evidence id to evidence text.
        k: Maximum number of ids to return.

    Returns:
        A list of at most ``k`` evidence ids, ordered from most to least
        similar. An empty list when ``k <= 0`` or ``evidence_dict`` is empty.
    """
    # Guard the degenerate cases explicitly: ``sims.argsort()[-0:]`` would
    # select *every* evidence for k == 0, a negative k would yield an arbitrary
    # slice, and an empty corpus leaves nothing to compare against.
    if k <= 0 or not evidence_dict:
        return []

    #Extract evidence IDs and their corresponding texts
    ev_ids = list(evidence_dict.keys())
    ev_texts = [evidence_dict[eid] for eid in ev_ids]

    #Vectorize claim and all evidences using TF-IDF
    # NOTE: the vectorizer is refit on the claim plus the whole corpus on every
    # call, so the cost of one call grows with the size of ``evidence_dict``.
    all_texts = [claim_text] + ev_texts
    vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
    tfidf = vectorizer.fit_transform(all_texts)

    # Row 0 is the claim; the remaining rows line up with ``ev_ids``
    claim_vec = tfidf[0]
    ev_vecs = tfidf[1:]

    #Compute cosine similarity between claim and each evidence
    sims = cosine_similarity(claim_vec, ev_vecs).flatten()

    #Get indices of top-k highest similarity scores, best first
    top_k_idx = sims.argsort()[-k:][::-1]

    return [ev_ids[i] for i in top_k_idx]


def predict_and_write_json(model, data_dict, evidence_dict, output_path, device,
                           bert=None, text_tokenizer=None, top_k=3):
    """Predict a label for every claim and write the predictions as JSON.

    For each claim the ``top_k`` most similar evidences (TF-IDF cosine
    similarity) are retrieved, their texts are appended to the claim, the
    combined text is embedded, and the classifier's arg-max class is stored.

    Args:
        model: Trained classifier mapping the embedding tensor to class logits.
        data_dict: Mapping from claim id to a record with a ``"claim_text"`` key.
        evidence_dict: Mapping from evidence id to evidence text.
        output_path: Path of the JSON file to write.
        device: Device used for the classifier and the embeddings.
        bert: Encoder used for the embeddings. Defaults to the notebook-level
            ``bert_model``.
        text_tokenizer: Tokenizer used for the embeddings. Defaults to the
            notebook-level ``tokenizer``.
        top_k: Number of retrieved evidences attached to each prediction.

    Returns:
        None. The file at ``output_path`` maps every claim id to
        ``{"claim_label": <label name>, "evidences": [<evidence id>, ...]}``.
    """
    # Fall back to the notebook-level objects so five-argument calls still work.
    if bert is None:
        bert = bert_model
    if text_tokenizer is None:
        text_tokenizer = tokenizer

    model.eval()
    model.to(device)

    #Mapping from numeric label to text label (inverse of the label_map used
    #when the training pairs are built)
    ID2LABEL = {0: "SUPPORTS", 1: "REFUTES", 2: "NOT_ENOUGH_INFO", 3: "DISPUTED"}
    results = {}

    # Fit TF-IDF on the evidence corpus ONCE and reuse it for every claim.
    # Refitting the vectorizer over the whole corpus for each claim repeats
    # identical work per claim and dominates the runtime on a large corpus.
    ev_ids = list(evidence_dict.keys())
    use_retrieval = top_k > 0 and len(ev_ids) > 0 and len(data_dict) > 0
    if use_retrieval:
        vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
        ev_vecs = vectorizer.fit_transform([evidence_dict[eid] for eid in ev_ids])

    for claim_id, item in data_dict.items():
        claim_text = item["claim_text"]

        #Retrieve the top-k similar evidences, most similar first
        if use_retrieval:
            claim_vec = vectorizer.transform([claim_text])
            sims = cosine_similarity(claim_vec, ev_vecs).flatten()
            evidence_ids = [ev_ids[i] for i in sims.argsort()[-top_k:][::-1]]
        else:
            evidence_ids = []

        #Append the retrieved evidence texts to the claim
        evidence_texts = [evidence_dict[eid] for eid in evidence_ids]
        combined_text = claim_text + " [SEP] " + " ".join(evidence_texts)

        #Get the Bert's cls embedding
        cls_embed = get_cls_embedding([combined_text], text_tokenizer, bert, device)

        #Run the model to get prediction
        with torch.no_grad():
            logits = model(cls_embed)
            pred_id = logits.argmax(dim=1).item()

        #Store prediction result
        results[claim_id] = {
            "claim_label": ID2LABEL[pred_id],
            "evidences": evidence_ids
        }

    #Write all predictions to a JSON file (UTF-8, like the input files)
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(results, f, indent=2)


In [8]:
# get the claim and evidence pairs from training original datasets
# return the list of pairs and labels for training
def build_training_pairs(claims_dict, evidence_dict):
    """Expand labelled claims into (claim, evidence) pairs with integer labels.

    Args:
        claims_dict: Mapping whose values are records with the keys
            ``"claim_text"``, ``"claim_label"`` and ``"evidences"``.
        evidence_dict: Mapping from evidence id to evidence text.

    Returns:
        ``(pairs, labels)``: ``pairs`` is a list of ``(claim_text,
        evidence_text)`` tuples and ``labels`` the aligned list of class ids.
        Evidence ids that are missing from ``evidence_dict`` or map to an
        empty text are skipped.
    """
    label_to_id = {"SUPPORTS": 0,
                   "REFUTES": 1,
                   "NOT_ENOUGH_INFO": 2,
                   "DISPUTED": 3}
    pair_list = []
    label_list = []

    for record in claims_dict.values():
        claim_text = record["claim_text"]
        label_id = label_to_id[record["claim_label"]]

        # A claim without linked evidence ids contributes no pairs.
        for ev_id in record["evidences"] or ():
            passage = evidence_dict.get(ev_id, "")
            if not passage:
                continue
            # Every evidence of a claim inherits that claim's label.
            pair_list.append((claim_text, passage))
            label_list.append(label_id)

    return pair_list, label_list

# Build the (claim, evidence) pairs and integer labels and wrap them in a Dataset.
pairs, labels = build_training_pairs(train_data, evidence_data)
dataset = BERTDataset(pairs, labels)

# Seed PyTorch's global RNG before anything random happens in this cell, so the
# train/validation split, the classifier's initial weights and the shuffling
# order are repeatable across "Restart & Run All" (GPU kernels may still add
# small run-to-run differences).
SEED = 42
torch.manual_seed(SEED)

# Split the pairs 80% / 20% into training and validation sets.
# NOTE(review): the split is made per pair, and one claim yields one pair per
# linked evidence, so pairs of the same claim can land on both sides.
train_size = int(0.8 * len(dataset))
train_set, val_set = random_split(dataset, [train_size, len(dataset) - train_size])
train_loader = DataLoader(train_set, batch_size=16, shuffle=True)
val_loader = DataLoader(val_set, batch_size=16)

# Train the classifier, then predict labels for the dev set and write them to JSON.
# NOTE(review): the output file name says "bert_large", but the encoder loaded
# in this notebook is bert-base-uncased. Confirm the intended name.
model = Transformer()
train_model(model, bert_model, train_loader, val_loader, num_epochs=5, device=device)
predict_and_write_json(model, dev_data, evidence_data, "dev_pred_bert_large.json", device)


Epoch 1: Training Loss = 273.363, Validation Accuracy = 0.492
Epoch 2: Training Loss = 253.531, Validation Accuracy = 0.492
Epoch 3: Training Loss = 254.710, Validation Accuracy = 0.492
Epoch 4: Training Loss = 251.255, Validation Accuracy = 0.492
Epoch 5: Training Loss = 249.477, Validation Accuracy = 0.492
