# Sentence similarity - TF-IDF + POS + NER

This notebook holds the code for the final approach of the retreival part using a mix of TFID + NER + POS with a Cosine similarity distance measure.

In [1]:
import pandas as pd
import json
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
import warnings
warnings.filterwarnings("ignore")
from tqdm import tqdm
import spacy
nlp = spacy.load("en_core_web_sm")

In [2]:
# Getting stopwords
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
# stopwords = nlp.Defaults.stop_words
# [stop_words.add(w) for w in stopwords]
# len(stop_words)

# Preprocessing step
The focus of this section is to transform the inputed texts to a numeric representation trying to keep as much contextual representation as posible from the texts, for so, we decided to use NER and POS tagging over the texts.

In [3]:
# Read train claims
with open('../data/train-claims.json', 'r') as f:
    df_train = pd.DataFrame(json.load(f)).transpose()

# Read dev claims
with open('../data/dev-claims.json', 'r') as f:
    df_dev = pd.DataFrame(json.load(f)).transpose()

# Read evidence
with open('../data/evidence.json', 'r') as f:
    evidence = json.load(f)
ev_txt = [j for i,j in evidence.items()]

In [4]:
# NER and POS texts
def tag(text):
    doc = nlp(text)
    ner_tags = [entity.label_ for entity in doc.ents]
    pos_tags = [token.pos_ for token in doc]
    return ner_tags, pos_tags

# NER and POS texts
def NER(text):
    doc = nlp(text)
    ner_tags = [entity.label_ for entity in doc.ents]
    return ner_tags

def POS(text):
    doc = nlp(text)
    pos_tags = [token.pos_ for token in doc]
    return pos_tags

In [5]:
import time
import re
from nltk.tokenize import WordPunctTokenizer
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Select columns to work on and retrieve tokenized and preprocesed vectors 
def feature_selection(feature, full=False):
    # Set words to lower and tokenize
    tokenized = [WordPunctTokenizer().tokenize(i.lower()) for i in feature]
    # Drop unknown characters (This may be modified depending model performance)
    tokenized = [' '.join([w for w in seq if re.match(r'^.*$', w)]) for seq in tokenized]
    # Stopwords
    tokenized = [' '.join([w for w in seq.split() if w not in stop_words]) for seq in tokenized]
    # Lemmatization
    tokenized = [' '.join([WordNetLemmatizer().lemmatize(w) for w in seq.split()]) for seq in tokenized]
    if full:
        # Extract NER and POS tags
        ner_tags = list(map(NER,feature))
        pos_tags = list(map(POS,feature))
        # Concatenate texts
        concatenated_texts = []
        for i in range(len(tokenized)):
            concatenated_texts.append(tokenized[i].split() + [' | '] + ner_tags[i] + [' | '] + pos_tags[i])
        return concatenated_texts
    return tokenized

In [6]:
from time import time
ner, pos = [], []
st = time()

# Method to extract the contextual representation (NER and POS) from the evidence set
import os
# If the files does not exist in the ws folder extract the NER and POS files
if not os.path.exists('../data/NER.csv'):
    print(1)
    ### NOTE: This function runs in over 1:20hs
    for idx, txt in enumerate(ev_txt):
        if idx % 10000 == 0 :
            print(f"Evidence No. {idx}. \t Time: {time()-st:0.2f}")
        nr, ps = tag(txt)
        ner.append(nr)
        pos.append(ps)
    pd.Series(ner).to_csv('../data/NER.csv')
    pd.Series(pos).to_csv('../data/POS.csv')
# Read the files if they exist
else:
    ner = pd.read_csv('../data/NER.csv', index_col=0).iloc[:,0]
    ner = [[tag.strip("'") for tag in sent.strip("[]").split(", ")] for sent in ner]
    pos = pd.read_csv('../data/POS.csv', index_col=0).iloc[:,0]
    pos = [[tag.strip("'") for tag in sent.strip("[]").split(", ")] for sent in pos]

In [7]:
# transform the dev claims and evidences
df = df_dev.copy()
# Claim
claim = feature_selection(df['claim_text'], True)
claim = [' '.join(i) for i in claim]
# Evidence
evidences = feature_selection(ev_txt, False)
ev = []
for i in range(len(evidences)):
    ev.append(evidences[i].split() + [' | '] + ner[i] + [' | '] + pos[i])

# Sentence sililarity step
In this section we apply a TfidfVectorizer with different similarity measurements 

In [8]:
from Levenshtein import distance as levenshtein_distance
import time
from scipy.sparse.linalg import norm
tvec = TfidfVectorizer(token_pattern=r'\b\w+\b')
# Timer
st  = time.time()
# Instanciate lists
top = []
evs = pd.DataFrame([i for i,j in evidence.items()], columns=['evidences'])
# Fit and Transform evidences
eviden = tvec.fit_transform([' '.join(i) for i in ev])
print(f'{time.time()-st:0.2f}')
# Iterate claims and evaluate similarity
for i in tqdm(range(len(claim))):
    claims = tvec.transform([claim[i]])
    # Levenshtein Distance
    # sim = [levenshtein_distance(claim[i], ev[j]) for j in range(len(ev))]
    # Cos Similarity
    sim = (np.dot(claims, eviden.T)/(norm(claims)*norm(eviden))).toarray()
    # Jaccard Similarity
    # sim = (claims.multiply(eviden).sum(axis=1)) / (claims.sum(axis=1) + eviden.sum(axis=1) - claims.multiply(eviden).sum(axis=1))
    # Get top 5
    df_ = evs.copy()
    df_['sim'] = sim[0]
    top.append(df_.sort_values(['sim'], ascending=False)['evidences'][:3].values)
ds = df_dev.copy()
ds['top'] = top

19.79


100%|██████████| 154/154 [00:53<00:00,  2.87it/s]


In [9]:
f1 = []
for idx, row in ds.iterrows():
    pred = [1 if top in row['evidences'] else 0 for top in row['top']]
    TP = sum(pred)/len(row['evidences'])
    FP = len(row['top']) - sum(pred)
    FN = len(row['evidences']) - sum(pred)
    prec = TP/(TP+FP)
    rec = TP/(TP+FN)
    f1.append(2*prec*rec/(prec+rec+1e-10))
print(f'F1 Score: {np.mean(f1):0.4f}')
# F1 Score: 0.0705

F1 Score: 0.0685


# Prediction over test set

In [10]:
# Read dev claims
with open('../data/test-claims-unlabelled.json', 'r') as f:
    df_test = pd.DataFrame(json.load(f)).transpose()

In [11]:
df = df_test.copy()
# Claim
claim = feature_selection(df['claim_text'], True)
claim = [' '.join(i) for i in claim]

In [12]:
# Timer
st  = time.time()
# Instanciate lists
top = []
evs = pd.DataFrame([i for i,j in evidence.items()], columns=['evidences'])
# Fit and Transform evidences
eviden = tvec.fit_transform([' '.join(i) for i in ev])
print(f'{time.time()-st:0.2f}')
# Iterate claims and evaluate similarity
for i in tqdm(range(len(claim))):
    claims = tvec.transform([claim[i]])
    # Cos Similarity
    sim = (np.dot(claims, eviden.T)/(norm(claims)*norm(eviden))).toarray()
    # Get top 5
    df_ = evs.copy()
    df_['sim'] = sim.reshape(-1,1)
    top.append(df_.sort_values(['sim'], ascending=False)['evidences'][:5].values.tolist())
ds = df_test.copy()
ds['top'] = top

26.18


100%|██████████| 153/153 [00:50<00:00,  3.03it/s]


In [13]:
import random
labels = list(df_dev['claim_label'].unique())
# ds
ds['claim_label'] = [random.choice(labels) for i in range(ds.shape[0])]
ds = ds.iloc[:,[0,2,1]]
ds.columns = ['claim_text', 'claim_label', 'evidences']

In [14]:
ts_dct = ds.to_dict(orient='index')
with open('../Models/Test files/test-output.json', 'w') as f:
    json.dump(ts_dct, f)