In [4]:
#!pip install sentence-transformers nlp_function
import json
import numpy as np
from collections import Counter
from nltk.corpus import stopwords
from scipy.spatial.distance import cdist
from sentence_transformers import SentenceTransformer
#from nlp_function import pick_random_keys, stopwords_func, lower_processing
import random

In [5]:
def stopwords_func(stop_words, text_type, text_data):
    '''Remove stopwords from claim and evidence for reducing the computational consumption.

    The input mapping is modified in place and the same object is returned.

    Parameters:
        stop_words: container of lowercase stopwords (anything supporting `in`).
        text_type: "evidence" when the values of `text_data` are the sentences
            themselves; any other value selects the claim layout, where each
            value is a record holding its sentence under "claim_text".
        text_data: mapping to clean.

    Returns:
        `text_data`, with every sentence rebuilt from its non-stopword tokens.
    '''
    def strip_stopwords(sentence):
        # Tokens come from whitespace splitting; each one is compared in
        # lowercase but written back with its original casing.
        return " ".join(
            token for token in sentence.split() if token.lower() not in stop_words
        )

    if text_type != "evidence":
        for record in text_data.values():
            record["claim_text"] = strip_stopwords(record["claim_text"])
        return text_data

    # Only existing keys are reassigned, so iterating the dict while writing is safe.
    for key in text_data:
        text_data[key] = strip_stopwords(text_data[key])
    return text_data

def pick_random_keys(dictionary, excluded_keys, num_keys, rng=None):
    '''Pick random keys from the dictionary after excluding the specified key(s).

    Parameters:
        dictionary: mapping whose keys are the candidates.
        excluded_keys: a single key given as a string, or an iterable of keys,
            that must never be returned.
        num_keys: how many distinct keys to draw.
        rng: optional object with a `sample(population, k)` method (for example
            `random.Random(seed)`) used for a reproducible draw. When omitted,
            the module-level `random` generator is used.

    Returns:
        A list of `num_keys` distinct keys, none of them in `excluded_keys`.

    Raises:
        ValueError: from `sample` when `num_keys` is negative or larger than
            the number of keys left after exclusion.
    '''
    if isinstance(excluded_keys, str):
        # A bare string is one key. Testing `key in "some-string"` would be a
        # substring match and would wrongly exclude every key contained in it.
        excluded = {excluded_keys}
    else:
        try:
            # Set membership is O(1); a list would be rescanned for every key.
            excluded = set(excluded_keys)
        except TypeError:
            # Unhashable entries cannot go in a set; keep linear membership.
            excluded = list(excluded_keys)

    available_keys = [key for key in dictionary if key not in excluded]
    sampler = random if rng is None else rng
    return sampler.sample(available_keys, num_keys)

def lower_processing(data, text_type):
    '''Turn the text into its lowercase expression.

    The mapping is updated in place and the same object is returned.

    Parameters:
        data: mapping to normalise.
        text_type: "claim_text" when each value is a record whose sentence is
            stored under that field; any other value means the values of
            `data` are the sentences themselves.

    Returns:
        `data`, with the selected text lowercased.
    '''
    if text_type != "claim_text":
        # Flat layout: only existing keys are rewritten, so the dict size is
        # unchanged while it is being iterated.
        for key, text in data.items():
            data[key] = text.lower()
        return data

    # Record layout: only the text field is touched, other fields stay as-is.
    for record in data.values():
        record[text_type] = record[text_type].lower()
    return data

In [8]:
## Read in data
def _load_json(path):
    '''Load one JSON file, decoding it as UTF-8 regardless of the OS locale.'''
    with open(path, 'r', encoding='utf-8') as json_file:
        return json.load(json_file)

# Read in training data (claim)
tclaim_data = _load_json('dataset/train-claims.json')

# Read in development data (claim)
dclaim_data = _load_json('dataset/dev-claims.json')

# Read in test data (claim)
uclaim_data = _load_json('dataset/test-claims-unlabelled.json')

# Read in evidence data
evi_data = _load_json('dataset/evidence.json')

## Preprocessing - Lowercase operation of the case
# NOTE: both preprocessing helpers rewrite the loaded dictionaries in place, so
# the original casing and stopwords are not recoverable after this point.
tclaim_data = lower_processing(tclaim_data, "claim_text")
dclaim_data = lower_processing(dclaim_data, "claim_text")
uclaim_data = lower_processing(uclaim_data, "claim_text")
evi_data = lower_processing(evi_data, 'evidence')

# ## Remove stopwords from claims and evidence (optional)
stop_words = set(stopwords.words('english'))
tclaim_data = stopwords_func(stop_words, "claim", tclaim_data)
dclaim_data = stopwords_func(stop_words, "claim", dclaim_data)
uclaim_data = stopwords_func(stop_words, "claim", uclaim_data)
evi_data = stopwords_func(stop_words, "evidence", evi_data)

## Create claim-evidence pair based on training set
# Each pair is (claim text, evidence text, label); label 1 marks an evidence
# entry listed under the claim's "evidences".
train_pairs = [
    (claim["claim_text"], evi_data[evidence_key], 1)
    for claim in tclaim_data.values()
    for evidence_key in claim["evidences"]
]

## insert negative sample to the training set
# Seed the global generator used by pick_random_keys so that the negative
# samples are the same on every Restart & Run All.
NEGATIVE_SAMPLING_SEED = 42
random.seed(NEGATIVE_SAMPLING_SEED)
for claim in tclaim_data.values():
    excluded_keys = claim["evidences"]
    # Draw as many negatives as the claim has positives (label 0).
    random_keys = pick_random_keys(evi_data, excluded_keys, len(excluded_keys))
    train_pairs.extend(
        (claim["claim_text"], evi_data[evidence_key], 0)
        for evidence_key in random_keys
    )

In [9]:
# Obtain sentence list
# Training claims come first and test claims second, so the embedding matrix
# can be split back by position below.
train_sentences = [claim["claim_text"] for claim in tclaim_data.values()]
test_sentences = [claim["claim_text"] for claim in uclaim_data.values()]
sentence_dict = {"train": len(train_sentences), "test": len(test_sentences)}
sentence_list = train_sentences + test_sentences

# Load pre-trained SBERT model
model_name = 'distilbert-base-nli-mean-tokens'
model = SentenceTransformer(model_name)

# Embed sentences and obtain test set vectors
embeddings = model.encode(sentence_list)
train_matrix = embeddings[:sentence_dict["train"]]
test_matrix = embeddings[sentence_dict["train"]:]

# Capture the closest training instance (index) to the test set
# One cdist call yields the full (test, train) distance table; the row-wise
# argmin is, for each test claim, the position of its nearest training claim.
# This replaces one cdist call per test row.
distances = cdist(test_matrix, train_matrix, metric='euclidean')
test_train_index = distances.argmin(axis=1).tolist()

Downloading (…)925a9/.gitattributes: 100%|██████████| 690/690 [00:00<00:00, 230kB/s]
Downloading (…)_Pooling/config.json: 100%|██████████| 190/190 [00:00<00:00, 63.4kB/s]
Downloading (…)1a515925a9/README.md: 100%|██████████| 3.99k/3.99k [00:00<00:00, 1.93MB/s]
Downloading (…)515925a9/config.json: 100%|██████████| 550/550 [00:00<00:00, 183kB/s]
Downloading (…)ce_transformers.json: 100%|██████████| 122/122 [00:00<00:00, 60.2kB/s]
Downloading pytorch_model.bin: 100%|██████████| 265M/265M [00:24<00:00, 11.0MB/s] 
Downloading (…)nce_bert_config.json: 100%|██████████| 53.0/53.0 [00:00<00:00, 26.5kB/s]
Downloading (…)cial_tokens_map.json: 100%|██████████| 112/112 [00:00<00:00, 37.3kB/s]
Downloading (…)925a9/tokenizer.json: 100%|██████████| 466k/466k [00:00<00:00, 750kB/s]
Downloading (…)okenizer_config.json: 100%|██████████| 450/450 [00:00<00:00, 222kB/s]
Downloading (…)1a515925a9/vocab.txt: 100%|██████████| 232k/232k [00:00<00:00, 17.5MB/s]
Downloading (…)15925a9/modules.json: 100%|█████████

In [10]:
# 0-R Classification
# Every test claim receives the label that occurs most often in the training set.
label_list = [claim["claim_label"] for claim in tclaim_data.values()]
strings = label_list
counter = Counter(strings)
most_common = counter.most_common(1)
most_frequent_string, frequency = most_common[0]

# Assign label and evidence to the test set
# test_train_index[k] is the position, within the training keys, of the training
# claim matched to the k-th test claim; that claim's evidence list is reused.
train_key_list = list(tclaim_data)
for position, test_claim in enumerate(uclaim_data.values()):
    test_claim["claim_label"] = most_frequent_string
    nearest_train_key = train_key_list[test_train_index[position]]
    test_claim["evidences"] = tclaim_data[nearest_train_key]["evidences"]

# Save the test set result
file_path = 'dataset/test-claims-predictions.json'
with open(file_path, 'w') as predictions_file:
    json.dump(uclaim_data, predictions_file)