## COMP90042 Project 2023 - Automated Fact Checking For Climate Science Claims

Author: Jiahao Chen

Student ID: 1118749

The script can run on Colab using the basic T4 GPU. If it runs out of CUDA memory, please restart the kernel and run all cells again. Previous progress is saved for fast recovery.

#### Install and import requirements

In [1]:
%%capture
!pip install torch torchvision transformers 
!pip install pandas numpy sklearn nltk
!pip install ipywidgets tqdm

In [2]:
%%capture
import os
import re
import json
import time
import random
from tqdm.notebook import tqdm
from collections import Counter

import nltk
from nltk.corpus import stopwords
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('stopwords')

import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

from transformers import BertTokenizer, BertModel
from transformers import DistilBertTokenizer, DistilBertModel

In [3]:
# If running on Colab, uncomment the following lines
# from google.colab import drive
# drive.mount('/content/drive')

In [4]:
# Global variables
# Paths, tokenizer and device shared by the cells below.

# Directory holding the claim and evidence JSON files
data_dir = "data/"
# Change if on Colab
# data_dir = "drive/MyDrive/data/"

# Directories for cached intermediate artefacts / model checkpoints and for predictions
outputs_path = "outputs/"
prediction_path = "prediction/"

train_path = f"{data_dir}train-claims.json"
dev_path = f"{data_dir}dev-claims.json"
evidence_path = f"{data_dir}evidence.json"

# Maximum token length used when encoding text for the transformer models
token_len = 256
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# CUDA device index (matches the "cuda:0" device selected below)
gpu = 0
# Falls back to CPU when no CUDA device is available
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [5]:
# Check if file or directory exists
def check_file(path):
    """Return True when `path` exists; otherwise print a notice and return False."""
    found = os.path.exists(path)
    if not found:
        print(f"{path} does not exist.")
    return found

def check_dir(path):
    """Create `path` (including parents) if it is missing and report what was done."""
    if os.path.exists(path):
        print(f"Directory already exists: {path}")
        return
    os.makedirs(path)
    print(f"Created directory: {path}")

# Make sure the data, cache and prediction directories exist before anything reads or writes them
check_dir(data_dir)
check_dir(outputs_path)
check_dir(prediction_path)

Directory already exists: data/
Directory already exists: outputs/
Directory already exists: prediction/


### Section I: Evidence Retrieval

In [6]:
# Load claims, related evidences and labels
# If label is False, return only claims and claim ids
def load_data(path, label=False):
    """Read a claims JSON file keyed by claim id.

    Args:
        path: path to a JSON object mapping claim id -> claim record.
        label: when True, also return each record's 'evidences' and 'claim_label'.

    Returns:
        (claim_ids, claim_texts) or, when `label` is True,
        (claim_ids, claim_texts, evidences, labels); all lists share the file's key order.
    """
    # Read as UTF-8 explicitly so the result does not depend on the platform default encoding
    with open(path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    claimid_list = list(data)
    claim_list = [data[claim_id]['claim_text'] for claim_id in claimid_list]
    if not label:
        return claimid_list, claim_list

    evidences_list = [data[claim_id]['evidences'] for claim_id in claimid_list]
    label_list = [data[claim_id]['claim_label'] for claim_id in claimid_list]
    return claimid_list, claim_list, evidences_list, label_list

# Load evidences
def load_evidence(path):
    """Read the evidence JSON file and return its values as a list, in file order."""
    # Read as UTF-8 explicitly so the result does not depend on the platform default encoding
    with open(path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    return list(data.values())

In [7]:
# Load the labelled train and dev claims, plus the full evidence corpus as a list of texts
train_claim_ids, train_claims, train_evidences, train_labels = load_data(train_path, label=True)
dev_claim_ids, dev_claims, dev_evidences, dev_labels = load_data(dev_path, label=True)
evidence_src = load_evidence(evidence_path)

#### Text preprocessing

In [8]:
# Clean texts using nltk
stop_words = set(stopwords.words('english'))
lemmatizer = nltk.stem.WordNetLemmatizer()

# Everything that is not a letter, digit or whitespace is stripped
clean_pattern = r'[^a-zA-Z0-9\s]+'

def text_clean(text):
    """Strip punctuation, lowercase, drop English stop words and lemmatize.

    Returns the remaining tokens joined by single spaces.
    """
    stripped = re.sub(clean_pattern, '', text)
    tokens = nltk.word_tokenize(stripped.lower())
    return ' '.join(lemmatizer.lemmatize(tok) for tok in tokens if tok not in stop_words)

# Clean all evidences
def evidence_clean(texts):
    """Clean every text with `text_clean` and cache the result as JSON.

    The cleaned list is written to `clean_evidence.json` under `outputs_path`;
    nothing is returned.
    """
    print("Cleaning texts ...")
    pbar = tqdm(total=len(texts), dynamic_ncols=True, miniters=10000)
    cleaned = []
    for raw in texts:
        cleaned.append(text_clean(raw))
        pbar.update(1)
    pbar.close()

    with open(outputs_path + 'clean_evidence.json', 'w') as f:
        json.dump(cleaned, f)
    print(f"Saved to {outputs_path}clean_evidence.json")

In [9]:
# Load cleaned evidences
# The cleaned corpus is cached on disk; it is only regenerated when the cache file is missing.
if not check_file(f"{outputs_path}clean_evidence.json"):
    evidence_clean(evidence_src) 
with open(outputs_path + 'clean_evidence.json', 'r') as f:
    clean_evidence_src = json.load(f)
    print(f"Loaded from {outputs_path}clean_evidence.json")

Loaded from outputs/clean_evidence.json


#### Jaccard similarity

In [10]:
# Compute jaccard similarity between two texts
def jaccard_similarity(s1, s2):
    """Jaccard similarity of the whitespace-separated token sets of two texts.

    Returns |A ∩ B| / |A ∪ B|, or 0.0 when neither text contains a token
    (cleaning can reduce a text to an empty string, which would otherwise
    raise ZeroDivisionError).
    """
    set1 = set(s1.split())
    set2 = set(s2.split())
    union = set1 | set2
    if not union:
        return 0.0
    return len(set1 & set2) / len(union)

# Use Jaccard similarity to filter evidences
def jaccard_filter(claim, k=100):
    """Return the k (evidence index, Jaccard score) pairs with the highest score, best first."""
    clean_claim = text_clean(claim)
    scored = [
        (idx, jaccard_similarity(clean_claim, ev_text))
        for idx, ev_text in enumerate(clean_evidence_src)
    ]
    scored.sort(key=lambda pair: pair[1], reverse=True)
    return scored[:k]

#### TF-IDF

In [11]:
# Perform TF-IDF on given texts
def tfidf_evidence(texts):
    """Fit a fresh TfidfVectorizer on `texts`; return (tf-idf matrix, fitted vectorizer)."""
    vectorizer = TfidfVectorizer()
    return vectorizer.fit_transform(texts), vectorizer

# Cosine similarity between TF-IDF vectors of claim and evidence
def tfidf_similarity(claim, ev_id):
    """Cosine similarity between `claim` (used as given) and the cleaned evidence at `ev_id`."""
    ev_text = clean_evidence_src[ev_id]
    vectors = ev_tfidf_vectorizer.transform([claim]), ev_tfidf_vectorizer.transform([ev_text])
    return cosine_similarity(*vectors).item()

# Use TF-IDF similarity to filter evidences
def tfidf_filter(claim, k=100):
    """Return the k evidences most similar to `claim` under TF-IDF cosine similarity.

    Args:
        claim: raw claim text (cleaned here with `text_clean`).
        k: number of evidences to return; k <= 0 yields an empty list.

    Returns:
        List of (evidence index, similarity) pairs ordered from most to least
        similar, matching the ordering of `jaccard_filter` so callers that read
        from the front of the list get the closest evidences first.
    """
    if k <= 0:
        return []
    claim = text_clean(claim)
    claim_vector = ev_tfidf_vectorizer.transform([claim])
    scores = cosine_similarity(claim_vector, ev_tfidf_vectors).ravel()
    # argsort is ascending: keep the last k positions and reverse them to get best-first
    top_k_indices = np.argsort(scores)[-k:][::-1]
    return list(zip(top_k_indices, scores[top_k_indices]))

In [12]:
# Perform TF-IDF on all evidences
# The vectorizer is fitted on the cleaned evidence corpus; both the matrix and the
# fitted vectorizer are kept as globals for the TF-IDF helper functions.
ev_tfidf_vectors, ev_tfidf_vectorizer = tfidf_evidence(clean_evidence_src)
print("TF-IDF vectors generated.")

TF-IDF vectors generated.


#### DistilBERT

In [13]:
# Pretrained DistilBERT tokenizer and encoder; the encoder is moved to the selected device
distilbert_tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
distilbert_model = DistilBertModel.from_pretrained('distilbert-base-uncased').to(device)

# Generate embeddings for evidences using DistilBERT
def get_text_embedding(evidences, model, tokenizer, batch_size=64, show=True):
    """Embed texts with `model`, taking the first-token vector of its first output.

    Args:
        evidences: list of texts to embed.
        model, tokenizer: encoder and matching tokenizer.
        batch_size: number of texts encoded per forward pass.
        show: display a progress bar when True.

    Returns:
        A numpy array with one row per input text.
    """
    model.eval()
    batches = [evidences[start:start + batch_size] for start in range(0, len(evidences), batch_size)]

    pbar = None
    if show:
        pbar = tqdm(total=len(batches), dynamic_ncols=True, miniters=1000)

    chunks = []
    for batch in batches:
        encoded = tokenizer.batch_encode_plus(batch, padding=True, truncation=True, max_length=token_len, return_tensors='pt')
        ids = encoded['input_ids'].to(device)
        mask = encoded['attention_mask'].to(device)

        # No gradients are needed for inference; keep only the first-token vector
        with torch.no_grad():
            first_token = model(ids, mask)[0][:, 0, :].cpu().numpy()

        chunks.append(first_token)
        if show:
            pbar.update(1)
    if show:
        pbar.close()

    return np.vstack(chunks)

In [14]:
# Load embeddings for all evidences
# Embeddings are computed from the raw (uncleaned) evidence texts and cached as a .npy file;
# they are only recomputed when the cache file is missing.
ev_embeddings = None
if not check_file(f"{outputs_path}ev_distilbert.npy"):
    print("Generating embeddings for evidences using DistilBERT ...")
    ev_embeddings = get_text_embedding(evidence_src, distilbert_model, distilbert_tokenizer)
    np.save(outputs_path + "ev_distilbert.npy", ev_embeddings)
else:
    ev_embeddings = np.load(outputs_path + "ev_distilbert.npy")
    print(f"Loaded from {outputs_path}ev_distilbert.npy")

Loaded from outputs/ev_distilbert.npy


In [15]:
# Filter evidences based on cosine similarity between DistilBERT embeddings of claim and evidence
def distilbert_filter(claim, k=100):
    """Return the k evidences whose DistilBERT embedding is closest to the claim's.

    Args:
        claim: raw claim text.
        k: number of evidences to return; k <= 0 yields an empty list.

    Returns:
        List of (evidence index, cosine similarity) pairs ordered from most to
        least similar, matching the ordering of `jaccard_filter`.
    """
    if k <= 0:
        return []
    claim_embedding = get_text_embedding([claim], distilbert_model, distilbert_tokenizer, show=False)
    scores = cosine_similarity(claim_embedding, ev_embeddings).ravel()
    # argsort is ascending: keep the last k positions and reverse them to get best-first
    top_k_indices = np.argsort(scores)[-k:][::-1]
    return list(zip(top_k_indices, scores[top_k_indices]))

#### Evaluation on filters (Jaccard, TF-IDF, DistilBERT)

In [16]:
# Evaluate performance of different filters on a claim
# k: number of evidences to be retrieved
def eval_filter(index, filter_func, k=100, show=True):
    """Count how many gold evidences of dev claim `index` appear in the filter's top k.

    Returns (found, missed). When `show` is True, each gold evidence and the
    totals are printed.
    """
    claim = dev_claims[index]
    truths = dev_evidences[index]
    retrieved = [pair[0] for pair in filter_func(claim, k)]

    found, missed = 0, 0
    for truth in truths:
        # Gold ids look like "evidence-<n>"; the numeric part starts at offset 9
        hit = int(truth[9:]) in retrieved
        if hit:
            found += 1
        else:
            missed += 1
        if show:
            print(f"In: {truth}" if hit else f"Out: {truth}")
    if show:
        print(f"In: {found}, Out: {missed}")
    return found, missed

# Evaluate performance of different filters on all claims in dev set
def eval_filter_dev(filter_func, k=100):
    """Run `eval_filter` over every dev claim and print the aggregated counts."""
    found, missed = 0, 0
    pbar = tqdm(total=len(dev_claims), dynamic_ncols=True)
    for idx, _ in enumerate(dev_claims):
        hits, misses = eval_filter(idx, filter_func, k, False)
        found += hits
        missed += misses
        pbar.update(1)
    pbar.close()
    print(f"In: {found}, Out: {missed}, Total: {found+missed}")

In [17]:
# eval_filter_dev(jaccard_filter, 100)

In [18]:
# eval_filter_dev(tfidf_filter, 100)

In [19]:
# eval_filter_dev(distilbert_filter, 100)

#### BERT for evidence classification

In [20]:
# Randomly sample n negative cases
# Largest valid evidence index (random.randint is inclusive on both ends)
num_evidences = len(evidence_src) - 1
def get_rand_negative_ids(evidence_ids, n):
    """Draw n distinct random evidence indices that are not in `evidence_ids`."""
    sampled = []
    while len(sampled) < n:
        candidate = random.randint(0, num_evidences)
        # Redraw when the candidate is a gold evidence or was already picked
        if candidate not in evidence_ids and candidate not in sampled:
            sampled.append(candidate)
    return sampled

# Sample n negative cases from evidences that have highest similarity with the claim
def get_hard_negative_ids(claim, evidence_ids, filter_func, n):
    """Pick up to n candidate evidences that are not gold evidences of the claim.

    Args:
        claim: claim text passed to `filter_func`.
        evidence_ids: indices of the claim's gold evidences (excluded from the result).
        filter_func: callable (claim, k) -> list of (evidence index, score) pairs.
        n: number of negatives wanted.

    Returns:
        Candidate indices in the order `filter_func` returned them, skipping gold
        evidences. Fewer than n are returned if the filter runs out of candidates.
    """
    if n <= 0:
        return []
    excluded = set(evidence_ids)
    # The pool is at least 50 candidates and always large enough to still contain
    # n negatives after every gold evidence has been skipped (a fixed pool of 50
    # could be exhausted, which raised IndexError).
    pool_size = max(50, n + len(excluded))

    res = []
    for candidate in filter_func(claim, pool_size):
        cand_id = candidate[0]
        if cand_id in excluded:
            continue
        res.append(cand_id)
        if len(res) == n:
            break
    return res

# Pair claim with evidences and labels
def pair_claim_evidence(claims, evs, hard=False, show=False):
    """Build a balanced (claim, evidence, label) table for evidence classification.

    For each claim, every gold evidence becomes a positive row (label 1) and the
    same number of negatives (label 0) is added, sampled either randomly or,
    when `hard` is True, from the TF-IDF filter's candidates.

    Returns a DataFrame with columns 'Claim', 'Evidence' and 'Label'.
    """
    claim_col, evidence_col, label_col = [], [], []

    def add_rows(claim, ids, label):
        # One row per evidence index, all sharing the same claim and label
        for ev_id in ids:
            claim_col.append(claim)
            evidence_col.append(evidence_src[ev_id])
            label_col.append(label)

    pbar = tqdm(total=len(claims), dynamic_ncols=True) if show else None
    for idx, claim in enumerate(claims):
        # Add positive cases; gold ids look like "evidence-<n>"
        gold_ids = [int(tag[9:]) for tag in evs[idx]]
        add_rows(claim, gold_ids, 1)

        # Add negative cases, as many as there are positives
        wanted = len(gold_ids)
        if hard:
            negative_ids = get_hard_negative_ids(claim, gold_ids, tfidf_filter, wanted)
        else:
            negative_ids = get_rand_negative_ids(gold_ids, wanted)
        add_rows(claim, negative_ids, 0)

        if show:
            pbar.update(1)
    if show:
        pbar.close()
    return pd.DataFrame({'Claim': claim_col, 'Evidence': evidence_col, 'Label': label_col})

In [21]:
# Create training and dev sets using random sampling
# NOTE(review): negatives are drawn with `random` and no seed is set in the visible cells,
# so these tables differ between runs — confirm whether that is intended.
train_evcls = pair_claim_evidence(train_claims, train_evidences)
dev_evcls = pair_claim_evidence(dev_claims, dev_evidences)

In [22]:
# Merge train and dev sets (random sampling)
# NOTE(review): an outer merge on every column behaves like a key-sorted union in which rows
# present in both frames are joined rather than stacked; if plain stacking is intended,
# pd.concat(..., ignore_index=True) would express it directly — confirm.
train_evcls_f = pd.merge(train_evcls, dev_evcls, on=['Claim', 'Evidence', 'Label'], how='outer')

In [23]:
# Create training and dev sets using hard negative sampling
# Each table is cached as CSV under outputs_path and only rebuilt when its file is missing.
train_evcls_hard = None
dev_evcls_hard = None
train_evcls_hard_data = "evcls_hard_train.csv"
dev_evcls_hard_data = "evcls_hard_dev.csv"

# Training table: build and save, or load from cache
if not check_file(f"{outputs_path}{train_evcls_hard_data}"):
    train_evcls_hard = pair_claim_evidence(train_claims, train_evidences, hard=True, show=True)
    train_evcls_hard.to_csv(f"{outputs_path}{train_evcls_hard_data}", index=False)
    print(f"Saved to {outputs_path}{train_evcls_hard_data}")
else:
    train_evcls_hard = pd.read_csv(f"{outputs_path}{train_evcls_hard_data}")

# Dev table: build and save, or load from cache
if not check_file(f"{outputs_path}{dev_evcls_hard_data}"):
    dev_evcls_hard = pair_claim_evidence(dev_claims, dev_evidences, hard=True, show=True)
    dev_evcls_hard.to_csv(f"{outputs_path}{dev_evcls_hard_data}", index=False)
    print(f"Saved to {outputs_path}{dev_evcls_hard_data}")
else:
    dev_evcls_hard = pd.read_csv(f"{outputs_path}{dev_evcls_hard_data}")

In [24]:
# Merge train and dev sets (hard negative sampling)
# NOTE(review): an outer merge on every column behaves like a key-sorted union in which rows
# present in both frames are joined rather than stacked; if plain stacking is intended,
# pd.concat(..., ignore_index=True) would express it directly — confirm.
train_evcls_hard_f = pd.merge(train_evcls_hard, dev_evcls_hard, on=['Claim', 'Evidence', 'Label'], how='outer')

In [25]:
# Convert data to BERT input format
def convert_input(claim, evidence, tokenizer, maxlen):
    """Encode an (evidence, claim) pair as fixed-length BERT inputs.

    The sequence is [CLS] evidence [SEP] claim [SEP], padded with [PAD] up to
    `maxlen`. When the pair is too long, tokens are dropped one at a time from
    the end of whichever side is currently longer (evidence on ties).

    Returns:
        (token ids, attention mask, segment ids) as 1-D tensors.
    """
    claim_tokens = tokenizer.tokenize(claim)
    evidence_tokens = tokenizer.tokenize(evidence)

    # Three positions are reserved for [CLS] and the two [SEP] markers
    budget = maxlen - 3
    while len(claim_tokens) + len(evidence_tokens) > budget:
        longer = claim_tokens if len(claim_tokens) > len(evidence_tokens) else evidence_tokens
        longer.pop()

    tokens = ['[CLS]', *evidence_tokens, '[SEP]', *claim_tokens, '[SEP]']
    tokens += ['[PAD]'] * (maxlen - len(tokens))

    # Padding positions are masked out
    attn_mask = [int(tok != '[PAD]') for tok in tokens]
    # Segment 0 covers [CLS] + evidence + first [SEP]; everything after it is segment 1
    first_segment_len = len(evidence_tokens) + 2
    seg_ids = [0] * first_segment_len + [1] * (maxlen - first_segment_len)
    tokens_ids = tokenizer.convert_tokens_to_ids(tokens)

    return torch.tensor(tokens_ids), torch.tensor(attn_mask), torch.tensor(seg_ids)

In [26]:
# Create dataset for evidence classification
class EVCLSDataset(Dataset):
    """Dataset over a DataFrame with 'Claim', 'Evidence' and 'Label' columns.

    Rows are looked up by label with `.loc`, so the frame is expected to carry
    a 0..n-1 index.
    """

    def __init__(self, datasrc, maxlen):
        self.data, self.tokenizer, self.maxlen = datasrc, bert_tokenizer, maxlen

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        """Return (token ids, attention mask, segment ids, label) for row `index`."""
        claim, evidence, label = (self.data.loc[index, col] for col in ('Claim', 'Evidence', 'Label'))
        encoded = convert_input(claim, evidence, self.tokenizer, self.maxlen)
        return (*encoded, label)

In [27]:
# Create training and dev data loader (random sampling)
# Both loaders use batches of 32 and reshuffle every epoch.
train_evcls_set = EVCLSDataset(train_evcls, token_len)
dev_evcls_set = EVCLSDataset(dev_evcls, token_len)
train_evcls_loader = DataLoader(train_evcls_set, batch_size = 32, shuffle=True)
dev_evcls_loader = DataLoader(dev_evcls_set, batch_size = 32, shuffle=True)

In [28]:
# Create training and dev data loader (hard negative sampling)
# Both loaders use batches of 32 and reshuffle every epoch.
train_evcls_hard_set = EVCLSDataset(train_evcls_hard, token_len)
dev_evcls_hard_set = EVCLSDataset(dev_evcls_hard, token_len)
train_evcls_hard_loader = DataLoader(train_evcls_hard_set, batch_size = 32, shuffle=True)
dev_evcls_hard_loader = DataLoader(dev_evcls_hard_set, batch_size = 32, shuffle=True)

In [29]:
# Create data loader for merged dataset (random sampling)
# Train + dev rows combined; there is no matching dev loader for this variant.
train_evcls_f_set = EVCLSDataset(train_evcls_f, token_len)
train_evcls_f_loader = DataLoader(train_evcls_f_set, batch_size = 32, shuffle=True)

In [30]:
# Create data loader for merged dataset (hard negative sampling)
# Train + dev rows combined; there is no matching dev loader for this variant.
train_evcls_hard_f_set = EVCLSDataset(train_evcls_hard_f, token_len)
train_evcls_hard_f_loader = DataLoader(train_evcls_hard_f_set, batch_size = 32, shuffle=True)

In [31]:
# Define the structure for model of evidence classification
class EvidenceClassifier(nn.Module):
    """BERT encoder followed by a linear layer producing one relevance logit per pair."""

    def __init__(self):
        super().__init__()
        self.bert_layer = BertModel.from_pretrained('bert-base-uncased')
        self.cls_layer = nn.Linear(768, 1)

    def forward(self, seq, attn_masks, seg_ids):
        """Return the logit computed from the first-token representation of each sequence."""
        encoded = self.bert_layer(seq, attention_mask = attn_masks, token_type_ids = seg_ids, return_dict=True)
        first_token = encoded.last_hidden_state[:, 0]
        return self.cls_layer(first_token)

In [32]:
# Compute accuracy for model of evidence classification
def acc_sigmoid(logits, labels):
    """Fraction of examples whose sigmoid(logit) > 0.5 decision equals the label."""
    hard_preds = (torch.sigmoid(logits.unsqueeze(-1)) > 0.5).long().squeeze()
    return (hard_preds == labels).float().mean()

# Evaluate model performance using dev set
# acc_func: accuracy function (sigmoid or softmax)
# cls_type: 0 for evidence classification, 1 for claim classification
def evaluate(model, criterion, devloader, acc_func, cls_type=0):
    """Compute the mean per-batch accuracy and loss of `model` over `devloader`.

    Args:
        model: module called as model(seq, attn_masks, seg_ids).
        criterion: loss; for cls_type 0 it receives squeezed logits and float
            labels, otherwise raw logits and labels.
        devloader: iterable of (seq, attn_masks, seg_ids, labels) batches.
        acc_func: callable (logits, labels) -> batch accuracy.
        cls_type: 0 for evidence classification, 1 for claim classification.

    Returns:
        (mean accuracy, mean loss), each averaged over the number of batches.
    """
    model.eval()
    # Send batches to wherever the model lives rather than a hard-coded CUDA index,
    # so evaluation also works when the model is on the CPU.
    target_device = next(model.parameters()).device

    mean_acc, mean_loss = 0, 0
    count = 0
    with torch.no_grad():
        for batch in devloader:
            seq, attn_masks, seg_ids, labels = (t.to(target_device) for t in batch)
            logits = model(seq, attn_masks, seg_ids)
            if cls_type == 0:
                loss = criterion(logits.squeeze(-1), labels.float())
            else:
                loss = criterion(logits, labels)
            mean_loss += loss.item()
            mean_acc += acc_func(logits, labels)
            count += 1
    return mean_acc / count, mean_loss / count

In [33]:
# Encapsulate the training process
# acc_func: accuracy function (sigmoid or softmax)
# cls_type: 0 for evidence classification, 1 for claim classification
# full: True for using the merged dataset
# Based on the code from week 7's workshop
def train(model, criterion, optimizer, train_loader, dev_loader, max_eps, acc_func, name, cls_type=0, full=False):
    """Train `model` for `max_eps` epochs and save its weights under `outputs_path`.

    Args:
        model: module called as model(seq, attn_masks, seg_ids).
        criterion: loss; for cls_type 0 it receives squeezed logits and float
            labels, otherwise raw logits and labels.
        optimizer: optimizer over the model's parameters.
        train_loader: iterable of (seq, attn_masks, seg_ids, labels) batches.
        dev_loader: dev batches used after each epoch; unused when `full` is True.
        max_eps: number of epochs.
        acc_func: callable (logits, labels) -> batch accuracy.
        name: checkpoint base name; weights go to "<outputs_path><name>.dat".
        cls_type: 0 for evidence classification, 1 for claim classification.
        full: when True there is no dev evaluation and the model is saved after
            every epoch; otherwise it is saved only when dev accuracy improves.
    """
    if full:
        print("Using merged dataset ...")

    # Send batches to wherever the model lives rather than a hard-coded CUDA index,
    # so training also works when the model is on the CPU.
    target_device = next(model.parameters()).device
    checkpoint_path = f"{outputs_path}{name}.dat"

    best_acc = 0
    st = time.time()
    for ep in range(max_eps):
        print(f"Epoch {ep} ...")
        # evaluate() switches the model to eval mode, so re-enable training mode each epoch
        model.train()
        for i, batch in enumerate(train_loader):
            optimizer.zero_grad()
            seq, attn_masks, seg_ids, labels = (t.to(target_device) for t in batch)
            logits = model(seq, attn_masks, seg_ids)
            if cls_type == 0:
                loss = criterion(logits.squeeze(-1), labels.float())
            else:
                loss = criterion(logits, labels)
            loss.backward()
            optimizer.step()

            # Report progress every 100 iterations; the timer covers the span since the last report
            if i % 100 == 0:
                acc = acc_func(logits, labels)
                print(f"Iteration {i} of epoch {ep} complete. Loss: {loss.item()}; Accuracy: {acc}; Time: {round((time.time() - st), 2)}s")
                st = time.time()

        if not full:
            # Evaluate the model using the dev set when the epoch is finished
            dev_acc, dev_loss = evaluate(model, criterion, dev_loader, acc_func, cls_type=cls_type)
            print(f"\nEpoch {ep} completed. Development Accuracy: {dev_acc}; Development Loss: {dev_loss}\n")
            # Save the model if accuracy is improved
            if dev_acc > best_acc:
                print(f"Best accuracy is improved from {best_acc} to {dev_acc}")
                best_acc = dev_acc
                torch.save(model.state_dict(), checkpoint_path)
                print(f"Model is saved to {checkpoint_path}\n")
        else:
            torch.save(model.state_dict(), checkpoint_path)
            print(f"Model is saved to {checkpoint_path}\n")

In [34]:
# Train: random sampling
# Training is skipped when a checkpoint with this name already exists in outputs_path.
evcls_name = "evcls"
if not check_file(f"{outputs_path}{evcls_name}.dat"):
    evcls_model = EvidenceClassifier()
    evcls_model.to(device)
    # Binary relevance target: one logit per pair
    evcls_criterion = nn.BCEWithLogitsLoss()
    evcls_optimizer = optim.Adam(evcls_model.parameters(), lr=2e-5)
    num_epoch = 2
    train(evcls_model, evcls_criterion, evcls_optimizer, train_evcls_loader, dev_evcls_loader, num_epoch, acc_sigmoid, evcls_name)
else:
    print(f"{outputs_path}{evcls_name}.dat exists")

outputs/evcls.dat exists


In [35]:
# Train: hard negative sampling
# Training is skipped when a checkpoint with this name already exists in outputs_path.
evcls_hard_name = "evcls_hard"
if not check_file(f"{outputs_path}{evcls_hard_name}.dat"):
    evcls_hard_model = EvidenceClassifier()
    evcls_hard_model.to(device)
    # Binary relevance target: one logit per pair
    evcls_hard_criterion = nn.BCEWithLogitsLoss()
    evcls_hard_optimizer = optim.Adam(evcls_hard_model.parameters(), lr=2e-5)
    num_epoch = 2
    train(evcls_hard_model, evcls_hard_criterion, evcls_hard_optimizer, train_evcls_hard_loader, dev_evcls_hard_loader, num_epoch, acc_sigmoid, evcls_hard_name)
else:
    print(f"{outputs_path}{evcls_hard_name}.dat exists")

outputs/evcls_hard.dat exists


In [36]:
# Train: random sampling (merged)
# Trains on train + dev combined, so no dev loader is passed (full=True) and the
# checkpoint is written after every epoch. Skipped when the checkpoint already exists.
evcls_f_name = "evcls_f"
if not check_file(f"{outputs_path}{evcls_f_name}.dat"):
    evcls_f_model = EvidenceClassifier()
    evcls_f_model.to(device)
    evcls_f_criterion = nn.BCEWithLogitsLoss()
    evcls_f_optimizer = optim.Adam(evcls_f_model.parameters(), lr=2e-5)
    num_epoch = 1
    train(evcls_f_model, evcls_f_criterion, evcls_f_optimizer, train_evcls_f_loader, None, num_epoch, acc_sigmoid, evcls_f_name, full=True)
else:
    print(f"{outputs_path}{evcls_f_name}.dat exists")

outputs/evcls_f.dat exists


In [37]:
# Train: hard negative sampling (merged)
# Trains on train + dev combined, so no dev loader is passed (full=True) and the
# checkpoint is written after every epoch. Skipped when the checkpoint already exists.
evcls_hard_f_name = "evcls_hard_f"
if not check_file(f"{outputs_path}{evcls_hard_f_name}.dat"):
    evcls_hard_f_model = EvidenceClassifier()
    evcls_hard_f_model.to(device)
    evcls_hard_f_criterion = nn.BCEWithLogitsLoss()
    evcls_hard_f_optimizer = optim.Adam(evcls_hard_f_model.parameters(), lr=2e-5)
    num_epoch = 1
    train(evcls_hard_f_model, evcls_hard_f_criterion, evcls_hard_f_optimizer, train_evcls_hard_f_loader, None, num_epoch, acc_sigmoid, evcls_hard_f_name, full=True)
else:
    print(f"{outputs_path}{evcls_hard_f_name}.dat exists")

outputs/evcls_hard_f.dat exists


In [38]:
# Load: random sampling
# Restore the saved weights into a fresh model and switch it to inference mode.
evcls_path = f"{outputs_path}{evcls_name}.dat"
evcls_model = EvidenceClassifier()
# map_location lets a checkpoint saved on a GPU be restored on a CPU-only machine
evcls_model.load_state_dict(torch.load(evcls_path, map_location=device))
evcls_model.to(device)
evcls_model.eval()
print(f"Loaded {evcls_path}")

Loaded outputs/evcls.dat


In [39]:
# Load: hard negative sampling
# Restore the saved weights into a fresh model and switch it to inference mode.
evcls_hard_path = f"{outputs_path}{evcls_hard_name}.dat"
evcls_hard_model = EvidenceClassifier()
# map_location lets a checkpoint saved on a GPU be restored on a CPU-only machine
evcls_hard_model.load_state_dict(torch.load(evcls_hard_path, map_location=device))
evcls_hard_model.to(device)
evcls_hard_model.eval()
print(f"Loaded {evcls_hard_path}")

Loaded outputs/evcls_hard.dat


In [40]:
# Load: random sampling (merged)
# Restore the saved weights into a fresh model and switch it to inference mode.
evcls_f_path = f"{outputs_path}{evcls_f_name}.dat"
evcls_f_model = EvidenceClassifier()
# map_location lets a checkpoint saved on a GPU be restored on a CPU-only machine
evcls_f_model.load_state_dict(torch.load(evcls_f_path, map_location=device))
evcls_f_model.to(device)
evcls_f_model.eval()
print(f"Loaded {evcls_f_path}")

Loaded outputs/evcls_f.dat


In [41]:
# Load: hard negative sampling (merged)
# Restore the saved weights into a fresh model and switch it to inference mode.
evcls_hard_f_path = f"{outputs_path}{evcls_hard_f_name}.dat"
evcls_hard_f_model = EvidenceClassifier()
# map_location lets a checkpoint saved on a GPU be restored on a CPU-only machine
evcls_hard_f_model.load_state_dict(torch.load(evcls_hard_f_path, map_location=device))
evcls_hard_f_model.to(device)
evcls_hard_f_model.eval()
print(f"Loaded {evcls_hard_f_path}")

Loaded outputs/evcls_hard_f.dat


#### Evidence retrieval

In [42]:
# Classify if the evidence is relevant to the claim
def evcls(claim, evidence, model, tokenizer):
    """Score one (claim, evidence) pair with an evidence classifier.

    Returns a 0-dim tensor holding sigmoid(logit) for the pair.
    """
    seq, attn_mask, seg_id = convert_input(claim, evidence, tokenizer, token_len)
    # Send the inputs to wherever the model lives rather than a hard-coded CUDA index,
    # so scoring also works when the model is on the CPU.
    target_device = next(model.parameters()).device
    # unsqueeze(0) adds the batch dimension the model expects
    seq = seq.unsqueeze(0).to(target_device)
    attn_mask = attn_mask.unsqueeze(0).to(target_device)
    seg_id = seg_id.unsqueeze(0).to(target_device)
    with torch.no_grad():
        return torch.sigmoid(model(seq, attn_mask, seg_id))[0][0]

# Retrieve evidences for a claim
# n: number of candidates to retrieve
# top: number of evidences to extract from candidates
# model: model to use (random sampling)
# hard_model: model to use (hard negative sampling)
def ev_retrieve(claim, model, hard_model, n=100, top=5):
    """Rank TF-IDF candidates with a weighted blend of five signals.

    Candidates come from `tfidf_filter`; each is scored as
    0.05*tfidf + 0.05*jaccard + 0.2*DistilBERT cosine + 0.4*model + 0.3*hard_model.

    Returns the `top` best (evidence index, score) pairs, highest score first.
    """
    # Extract evidences which have high similarity with the claim
    candidates = tfidf_filter(claim, n)
    claim_emd = get_text_embedding([claim], distilbert_model, distilbert_tokenizer, show=False)[0]
    claim_clean = text_clean(claim)

    def blended_score(ev_id, tfidf):
        ev = evidence_src[ev_id]
        ev_clean = clean_evidence_src[ev_id]
        jaccard = jaccard_similarity(claim_clean, ev_clean)
        distil_sim = cosine_similarity([claim_emd], [ev_embeddings[ev_id]]).item()
        pred = evcls(claim, ev, model=model, tokenizer=bert_tokenizer).item()
        hard_pred = evcls(claim, ev, model=hard_model, tokenizer=bert_tokenizer).item()
        return 0.05 * tfidf + 0.05 * jaccard + 0.2 * distil_sim + 0.4 * pred + 0.3 * hard_pred

    ranked = [(ev_id, blended_score(ev_id, tfidf)) for ev_id, tfidf in candidates]
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked[:top]

In [43]:
# Evaluate the performance of evidence retrieval on a claim in dev set
# show: True for printing stats
# full: True for using models trained on merged dataset
def eval_dev_evcls(index, n=100, top=5, show=True, full=False):
    """Retrieve evidences for dev claim `index` and count gold evidences found / missed.

    Returns (found, missed).
    """
    # Select model
    if full:
        model, hard_model = evcls_f_model, evcls_hard_f_model
    else:
        model, hard_model = evcls_model, evcls_hard_model

    claim = dev_claims[index]
    truths = dev_evidences[index]
    retrieved = [pair[0] for pair in ev_retrieve(claim, model, hard_model, n, top)]
    if show:
        print(retrieved)

    found, missed = 0, 0
    for truth in truths:
        # Gold ids look like "evidence-<n>"; the numeric part starts at offset 9
        hit = int(truth[9:]) in retrieved
        if hit:
            found += 1
        else:
            missed += 1
        if show:
            print(f"In: {truth}" if hit else f"Out: {truth}")
    if show:
        print(f"In: {found}, Out: {missed}")
    return found, missed

# Retrieve evidences for all claims from a dataset
# check: True for checking number of correct retrieved evidences
# full: True for using models trained on merged dataset
def ev_retrieve_src(datasrc, n=100, top=2, check=False, full=False):
    """Retrieve the `top` evidence indices for every claim in `datasrc`.

    When `check` is True the results are compared position by position against
    `dev_evidences`, and the found / missed totals are printed.

    Returns a list with one list of evidence indices per claim.
    """
    # Select model
    if full:
        model, hard_model = evcls_f_model, evcls_hard_f_model
    else:
        model, hard_model = evcls_model, evcls_hard_model

    evidences = []
    found, missed = 0, 0
    pbar = tqdm(total=len(datasrc), desc="Retrieving evidences", dynamic_ncols=True)
    for idx in range(len(datasrc)):
        retrieved = [pair[0] for pair in ev_retrieve(datasrc[idx], model, hard_model, n, top)]
        evidences.append(retrieved)
        if check:
            for truth in dev_evidences[idx]:
                if int(truth[9:]) in retrieved:
                    found += 1
                else:
                    missed += 1
        pbar.update(1)
    pbar.close()
    if check:
        print(f"Total: {found + missed}, In: {found}, Out: {missed}")
    return evidences

### Section II: Claim Classification

In [44]:
# Claim label <-> integer class id, in both directions
label2id = {name: idx for idx, name in enumerate(('SUPPORTS', 'REFUTES', 'NOT_ENOUGH_INFO', 'DISPUTED'))}
id2label = {idx: name for name, idx in label2id.items()}

#### BERT for claim classification

In [45]:
# Create data for claim classification
def create_claimcls_data(claims, evs, labels):
    """Build one (claim, evidence text, claim label) row per gold evidence of each claim.

    Returns a DataFrame with columns 'Claim', 'Evidence' and 'Label'.
    """
    rows = [
        # Gold ids look like "evidence-<n>"; the numeric part starts at offset 9
        (claim, evidence_src[int(tag[9:])], labels[idx])
        for idx, claim in enumerate(claims)
        for tag in evs[idx]
    ]
    return pd.DataFrame({
        'Claim': [row[0] for row in rows],
        'Evidence': [row[1] for row in rows],
        'Label': [row[2] for row in rows],
    })

In [46]:
# Create training and development datasets for claim classification
# Each gold evidence of a claim becomes one row carrying the claim's label.
train_claimcls = create_claimcls_data(train_claims, train_evidences, train_labels)
dev_claimcls = create_claimcls_data(dev_claims, dev_evidences, dev_labels)

In [47]:
# Merge training and development datasets for claim classification
# NOTE(review): an outer merge on every column behaves like a key-sorted union in which rows
# present in both frames are joined rather than stacked; if plain stacking is intended,
# pd.concat(..., ignore_index=True) would express it directly — confirm.
train_claimcls_f = pd.merge(train_claimcls, dev_claimcls, on=['Claim', 'Evidence', 'Label'], how='outer')

In [48]:
# Create dataset for claim classification
class ClaimCLSDataset(Dataset):
    """Dataset over a DataFrame with 'Claim', 'Evidence' and string 'Label' columns.

    Rows are looked up by label with `.loc`, so the frame is expected to carry
    a 0..n-1 index.
    """

    def __init__(self, data, maxlen):
        self.df, self.tokenizer, self.maxlen = data, bert_tokenizer, maxlen

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        """Return (token ids, attention mask, segment ids, label id tensor) for row `index`."""
        claim = self.df.loc[index, 'Claim']
        evidence = self.df.loc[index, 'Evidence']
        label_name = self.df.loc[index, 'Label']
        label = torch.tensor(label2id[label_name])

        encoded = convert_input(claim, evidence, self.tokenizer, self.maxlen)
        return (*encoded, label)

In [49]:
# Create training and development dataloaders for claim classification
# Both loaders use batches of 32 and reshuffle every epoch.
train_claimcls_set = ClaimCLSDataset(train_claimcls, token_len)
dev_claimcls_set = ClaimCLSDataset(dev_claimcls, token_len)
train_claimcls_loader = DataLoader(train_claimcls_set, batch_size = 32, shuffle=True)
dev_claimcls_loader = DataLoader(dev_claimcls_set, batch_size = 32, shuffle=True)

In [50]:
# Create dataloader for merged dataset
# Train + dev rows combined; there is no matching dev loader for this variant.
train_claimcls_f_set = ClaimCLSDataset(train_claimcls_f, token_len)
train_claimcls_f_loader = DataLoader(train_claimcls_f_set, batch_size = 32, shuffle=True)

In [51]:
# Define the structure of the model for Claim classification
class ClaimClassifier(nn.Module):
    """BERT encoder followed by a linear layer producing four class logits per pair."""

    def __init__(self):
        super().__init__()
        self.bert_layer = BertModel.from_pretrained('bert-base-uncased')
        self.cls_layer = nn.Linear(768, 4)

    def forward(self, seq, attn_masks, seg_ids):
        """Return the logits computed from the first-token representation of each sequence."""
        encoded = self.bert_layer(seq, attention_mask = attn_masks, token_type_ids = seg_ids, return_dict=True)
        first_token = encoded.last_hidden_state[:, 0]
        return self.cls_layer(first_token)

In [52]:
# Compute accuracy for model of claim classification
def acc_softmax(logits, labels):
    """Fraction of examples whose highest-probability class equals the label (Python float)."""
    pred_labels = torch.max(torch.softmax(logits, dim=1), dim=1).indices
    num_correct = torch.sum(pred_labels == labels).item()
    return num_correct / labels.size(0)

In [53]:
# Train
# Training is skipped when a checkpoint with this name already exists in outputs_path.
claimcls_name = "claimcls"
if not check_file(f"{outputs_path}{claimcls_name}.dat"):
    claimcls_model = ClaimClassifier()
    claimcls_model.to(device)
    # Multi-class target over the four claim labels
    claimcls_criterion = nn.CrossEntropyLoss()
    claimcls_optimizer = optim.Adam(claimcls_model.parameters(), lr=2e-5)
    num_epoch = 2
    train(claimcls_model, claimcls_criterion, claimcls_optimizer, train_claimcls_loader, dev_claimcls_loader, num_epoch, acc_softmax, claimcls_name, cls_type=1)
else:
    print(f"{outputs_path}{claimcls_name}.dat exists")

outputs/claimcls.dat exists


In [54]:
# Train on merged dataset
# Trains on train + dev combined, so no dev loader is passed (full=True) and the
# checkpoint is written after every epoch. Skipped when the checkpoint already exists.
claimcls_f_name = "claimcls_f"
if not check_file(f"{outputs_path}{claimcls_f_name}.dat"):
    claimcls_f_model = ClaimClassifier()
    claimcls_f_model.to(device)
    claimcls_f_criterion = nn.CrossEntropyLoss()
    claimcls_f_optimizer = optim.Adam(claimcls_f_model.parameters(), lr=2e-5)
    num_epoch = 1
    train(claimcls_f_model, claimcls_f_criterion, claimcls_f_optimizer, train_claimcls_f_loader, None, num_epoch, acc_softmax, claimcls_f_name, cls_type=1, full=True)
else:
    print(f"{outputs_path}{claimcls_f_name}.dat exists")

outputs/claimcls_f.dat exists


In [55]:
# Load model for claim classification
# Restore the saved weights into a fresh model and switch it to inference mode.
claimcls_path = f"{outputs_path}{claimcls_name}.dat"
claimcls_model = ClaimClassifier()
# map_location lets a checkpoint saved on a GPU be restored on a CPU-only machine
claimcls_model.load_state_dict(torch.load(claimcls_path, map_location=device))
claimcls_model.to(device)
claimcls_model.eval()
print(f"Loaded {claimcls_path}")

Loaded outputs/claimcls.dat


In [56]:
# Load model for claim classification (merged dataset)
# Restore the saved weights into a fresh model and switch it to inference mode.
claimcls_f_path = f"{outputs_path}{claimcls_f_name}.dat"
claimcls_f_model = ClaimClassifier()
# map_location lets a checkpoint saved on a GPU be restored on a CPU-only machine
claimcls_f_model.load_state_dict(torch.load(claimcls_f_path, map_location=device))
claimcls_f_model.to(device)
claimcls_f_model.eval()
print(f"Loaded {claimcls_f_path}")

Loaded outputs/claimcls_f.dat


#### Claim classification

In [57]:
# Classify a claim
def claimcls(claim, evidences, model, tokenizer):
    """Predict a claim's label id by majority vote over its retrieved evidences.

    Each evidence index in `evidences` is classified together with the claim.
    Votes for class 2 (NOT_ENOUGH_INFO) are ignored; if no other class receives
    a vote, 2 is returned. Otherwise the most common remaining class wins.
    """
    # Send the inputs to wherever the model lives rather than a hard-coded CUDA index,
    # so prediction also works when the model is on the CPU.
    target_device = next(model.parameters()).device

    res = []
    for ev in evidences:
        seq, attn_mask, seg_id = convert_input(claim, evidence_src[ev], tokenizer, token_len)
        # unsqueeze(0) adds the batch dimension the model expects
        seq = seq.unsqueeze(0).to(target_device)
        attn_mask = attn_mask.unsqueeze(0).to(target_device)
        seg_id = seg_id.unsqueeze(0).to(target_device)
        with torch.no_grad():
            logits = model(seq, attn_mask, seg_id)
            _, pred = torch.max(torch.softmax(logits, dim=1), dim=1)
            pred = pred.item()
            if pred != 2:
                res.append(pred)
    if len(res) == 0:
        return 2
    return Counter(res).most_common(1)[0][0]

In [58]:
# Evaluate performance on a claim in dev set
# show: True for printing prediction and truth
def eval_dev_claimcls(index, ev_retrieve, model, tokenizer, show=True):
    """Classify one dev claim and optionally print it next to its true label.

    Args:
        index: Position of the claim in the global ``dev_claims``/``dev_labels``.
        ev_retrieve: Per-claim evidence lists; ``ev_retrieve[index]`` is used.
        model: Classifier forwarded to ``claimcls``.
        tokenizer: Tokenizer forwarded to ``claimcls``.
        show: When true, print the predicted and true label ids.

    Returns:
        The predicted label id.
    """
    predicted = claimcls(dev_claims[index], ev_retrieve[index], model, tokenizer)
    expected = label2id[dev_labels[index]]
    if show:
        print(f"Pred: {predicted}, Truth: {expected}")
    return predicted

# Classify all claims in the dataset based on the retrieved evidences
# check: True for checking accuracy
def claimcls_src(claims, evs, model, tokenizer, check=False):
    """Classify every claim using the evidences retrieved for it.

    Args:
        claims: Sequence of claim texts.
        evs: Per-claim evidence lists, aligned with ``claims`` by position.
        model: Classifier forwarded to ``claimcls``.
        tokenizer: Tokenizer forwarded to ``claimcls``.
        check: When true, compare each prediction with the global
            ``dev_labels`` at the same position and print a summary. This is
            only meaningful when ``claims`` is the dev set.

    Returns:
        List of predicted label ids, one per claim.
    """
    preds = []
    correct = 0
    # The context manager closes the bar even if a prediction raises
    with tqdm(total=len(claims), desc="Predicting claims", dynamic_ncols=True) as pbar:
        for i, claim in enumerate(claims):
            pred = claimcls(claim, evs[i], model, tokenizer)
            preds.append(pred)
            if check and pred == label2id[dev_labels[i]]:
                correct += 1
            pbar.update(1)
    if check:
        total = len(preds)
        wrong = total - correct
        # Guard the empty case: with no claims the ratio is undefined, report 0.0
        accuracy = round(correct / total, 2) if total else 0.0
        print(f"Total: {total}, Correct: {correct}, Wrong: {wrong}, Accuracy: {accuracy}")
    return preds

### Section III: Evaluation & Prediction

In [59]:
# Format evidences for saving
def format_evidences(evidences):
    """Return a new list with each evidence key rendered as ``"evidence-<key>"``."""
    return [f"evidence-{key}" for key in evidences]

# Formatting and saving results
def save_results(claim_ids, claims, labels, evidences, filename):
    """Write predictions to ``filename`` as a JSON object keyed by claim id.

    Each entry holds ``claim_text``, ``claim_label`` (the label id mapped
    through the global ``id2label``) and ``evidences`` (formatted by
    ``format_evidences``).

    Args:
        claim_ids: Sequence of claim ids; drives the iteration.
        claims: Claim texts, aligned with ``claim_ids``.
        labels: Predicted label ids, aligned with ``claim_ids``.
        evidences: Per-claim evidence lists, aligned with ``claim_ids``.
        filename: Output path; missing parent directories are created.
    """
    data = {}
    for i, claim_id in enumerate(claim_ids):
        data[claim_id] = {
            "claim_text": claims[i],
            "claim_label": id2label[labels[i]],
            "evidences": format_evidences(evidences[i]),
        }

    # Create the output directory on demand so a fresh checkout does not fail
    out_dir = os.path.dirname(filename)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    with open(filename, "w", encoding="utf-8") as f:
        json.dump(data, f)

#### Evaluation on dev set

In [60]:
# Retrieve evidences for claims in dev set
# NOTE(review): the positional 150 and 5 are passed straight to ev_retrieve_src,
# which is defined earlier in the notebook; presumably the candidate-pool size
# and the number of evidences kept per claim -- confirm against its definition.
dev_ev_retrieve = ev_retrieve_src(dev_claims, 150, 5, check=True, full=False)

Retrieving evidences:   0%|          | 0/154 [00:00<?, ?it/s]

Total: 491, In: 120, Out: 371


In [61]:
# Classify claims in dev set; check=True also prints accuracy against dev_labels
dev_claimcls = claimcls_src(
    claims=dev_claims,
    evs=dev_ev_retrieve,
    model=claimcls_model,
    tokenizer=bert_tokenizer,
    check=True,
)

Predicting claims:   0%|          | 0/154 [00:00<?, ?it/s]

Total: 154, Correct: 71, Wrong: 83, Accuracy: 0.46


In [62]:
# Save the dev-set predictions and retrieved evidences as JSON
dev_pred_output = "dev-pred.json"
dev_save_path = prediction_path + dev_pred_output
save_results(
    claim_ids=dev_claim_ids,
    claims=dev_claims,
    labels=dev_claimcls,
    evidences=dev_ev_retrieve,
    filename=dev_save_path,
)
print(f"Saved to {dev_save_path}")

Saved to prediction/dev-pred.json


In [63]:
if check_file("eval.py"):
    !python eval.py --predictions data/dev-claims.json --groundtruth prediction/dev-pred.json

Evidence Retrieval F-score (F)    = 0.18772933415790563
Claim Classification Accuracy (A) = 0.461038961038961
Harmonic Mean of F and A          = 0.26681494091949504


#### Prediction on test set

In [64]:
# Load the unlabelled test claims (ids and texts) from the data directory
test_path = data_dir + "test-claims-unlabelled.json"
test_claim_ids, test_claims = load_data(test_path)

In [65]:
# Retrieve evidences for claims in test set
# NOTE(review): 150 and 5 are the same positional settings used for the dev set;
# their meaning is defined by ev_retrieve_src earlier in the notebook.
test_ev_retrieve = ev_retrieve_src(test_claims, 150, 5, full=False)
# Alternative (disabled): the same call with full=True -- presumably selects the
# retrieval model trained on the merged dataset; confirm against ev_retrieve_src.
# test_ev_retrieve = ev_retrieve_src(test_claims, 150, 5, full=True)

Retrieving evidences:   0%|          | 0/153 [00:00<?, ?it/s]

In [66]:
# Classify claims in test set (the test file is unlabelled, so no accuracy check)
test_claimcls = claimcls_src(
    claims=test_claims,
    evs=test_ev_retrieve,
    model=claimcls_model,
    tokenizer=bert_tokenizer,
)
# Alternative (disabled): classify with the model trained on the merged dataset
# test_claimcls = claimcls_src(test_claims, test_ev_retrieve, claimcls_f_model, bert_tokenizer)

Predicting claims:   0%|          | 0/153 [00:00<?, ?it/s]

In [67]:
# Save the test-set predictions and retrieved evidences as JSON
test_pred_output = "test-claims-predictions.json"
test_save_path = prediction_path + test_pred_output
save_results(
    claim_ids=test_claim_ids,
    claims=test_claims,
    labels=test_claimcls,
    evidences=test_ev_retrieve,
    filename=test_save_path,
)
print(f"Saved to {test_save_path}")

Saved to prediction/test-claims-predictions.json


In [68]:
def test_insight(index):
    print(f"Claim: {test_claims[index]}")
    for ev in test_ev_retrieve[index]:
        print(evidence_src[ev])
    print(f"Prediction: {id2label[test_claimcls[index]]}")

In [69]:
# Show the claim, retrieved evidences and prediction for the first test claim
test_insight(index=0)

Claim: The contribution of waste heat to the global climate is 0.028 W/m2.
Global forcing from waste heat was 0.028 W/m2 in 2005.
The global temperature increase since the beginning of the industrial period (taken as 1750) is about 0.8 °C (1.4 °F), and the radiative forcing due to CO 2 and other long-lived greenhouse gases – mainly methane, nitrous oxide, and chlorofluorocarbons – emitted since that time is about 2.6 W/m2.
Taking planetary heat uptake rate as the rate of ocean heat uptake estimated by the IPCC AR4 as 0.2 W/m2, yields a value for S of 2.1 °C (3.8 °F).
Without feedbacks the radiative forcing of approximately 3.7 W/m2, due to doubling CO 2 from the pre-industrial 280 ppm, would eventually result in roughly 1 °C global warming.
Solar irradiance is about 0.9 W/m2 brighter during solar maximum than during solar minimum, which correlated in measured average global temperature over the period 1959-2004.
Prediction: SUPPORTS


### End