In [1]:
import pandas as pd
import json
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer


# spacy.cli.download("en_core_web_sm")
import spacy
nlp = spacy.load("en_core_web_sm")

import nltk
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()



In [67]:
# Read train claims
with open('../data/train-claims.json', 'r') as f:
    df_train = pd.DataFrame(json.load(f)).transpose()

# Read dev claims
with open('../data/dev-claims.json', 'r') as f:
    df_dev = pd.DataFrame(json.load(f)).transpose()

# Read evidence
with open('../data/evidence.json', 'r') as f:
    evidence = json.load(f)
ev_txt = [j for i,j in evidence.items()]

In [3]:
# NER texts
def extract_ner(text):
    doc = nlp(text)
    ner_tags = [entity.label_ for entity in doc.ents]
    return ner_tags

# POS texts
def extract_pos_tags(text):
    doc = nlp(text)
    pos_tags = [token.pos_ for token in doc]
    return pos_tags


In [5]:
import time
# Select columns to work on and retrieve tokenized and preprocesed vectors 
def feature_selection(feature, timer=False):
    if timer:
        st = time.time()
    # Set words to lower and tokenize
    tokenized = [word_tokenize(i.lower()) for i in feature]
    if timer:
        print(f'{time.time()-st:0.2f}')
    # Drop unknown characters (This may be modified depending model performance)
    tokenized = [' '.join([w for w in seq if w.isalpha()]) for seq in tokenized]
    if timer:
        print(f'{time.time()-st:0.2f}')
    # Stopwords
    tokenized = [' '.join([w for w in seq.split() if w not in stop_words]) for seq in tokenized]
    if timer:
        print(f'{time.time()-st:0.2f}')
    # Lemmatization
    tokenized = [' '.join([lemmatizer.lemmatize(w) for w in seq.split()]) for seq in tokenized]
    if timer:
        print(f'{time.time()-st:0.2f}')
    # Extract NER and POS tags
    # ner_tags = list(map(extract_ner,feature))
    # pos_tags = list(map(extract_pos_tags,feature))
    # Concatenate texts
    # concatenated_texts = []
    # for i in range(len(tokenized)):
    #     concatenated_texts.extend([tokenized[i].split() + ['[SEP]'] + ner_tags[i] + ['[SEP]'] + pos_tags[i]])
    return tokenized

In [71]:
from tqdm import tqdm
docs = list(nlp.pipe(ev_txt, disable=["tok2vec"]))
ner = []
for doc in tqdm(docs):
    ner.apend([(ent.text, ent.label_) for ent in doc.ents])

KeyboardInterrupt: 

In [6]:
df = df_dev.copy()
# Claim
claim = feature_selection(df['claim_text'])
# Evidence
evidences = feature_selection(ev_txt, True)

79.21
81.05
83.12
105.44


In [64]:
from tqdm import tqdm
import warnings
warnings.filterwarnings("ignore")
from scipy.sparse.linalg import norm
tvec = TfidfVectorizer()
# Timer
st  = time.time()
# Instanciate lists
top = []
ev = pd.DataFrame([i for i,j in evidence.items()], columns=['evidences'])
# Fit and Transform evidences
eviden = tvec.fit_transform(evidences)
print(f'{time.time()-st:0.2f}')
# Iterate claims and evaluate similarity
for i in tqdm(range(len(claim))):
    claims = tvec.transform([claim[i]])
    # Cos Similarity
    sim = (np.dot(claims, eviden.T)/(norm(claims)*norm(eviden))).toarray()
    # Get top 5
    df_ = ev.copy()
    df_['sim'] = sim.reshape(-1,1)
    top.append(df_.sort_values(['sim'], ascending=False)['evidences'][:3].values)
ds = df_dev.copy()
ds['top'] = top

7.53


100%|██████████| 154/154 [00:20<00:00,  7.46it/s]


In [65]:
f1 = []
for idx, row in ds.iterrows():
    pred = [1 if top in row['evidences'] else 0 for top in row['top']]
    TP = sum(pred)/len(row['evidences'])
    FP = len(row['top']) - sum(pred)
    FN = len(row['evidences']) - sum(pred)
    prec = TP/(TP+FP)
    rec = TP/(TP+FN)
    f1.append(2*prec*rec/(prec+rec+1e-10))
print(f'F1 Score: {np.mean(f1):0.4f}')

F1 Score: 0.0455


In [None]:
# Read dev claims
with open('../data/test-claims-unlabelled.json', 'r') as f:
    df_test = pd.DataFrame(json.load(f)).transpose()