In [None]:
import sys
import jsonlines
import os
import pprint
import time
import random
import re
import json
import math

# Shared pretty-printer used by later cells to display nested results.
pp = pprint.PrettyPrinter()
# Put the FEVEROUS source directory first on the import path, presumably so the
# `database` and `utils` packages imported below can be resolved.
# NOTE(review): absolute, machine-specific Windows path — confirm before running elsewhere.
sys.path.insert(0, "e:\\Documents\\NLP\\FEVER2021_SharedTask\\FEVEROUS\\src")

# Project root; the training-data and corpus paths are derived from it.
DIR_PATH = "e:\\Documents\\NLP\\FEVER2021_SharedTask\\"
# Training examples, read with jsonlines in a later cell.
TRAIN_DATA_PATH = os.path.join(DIR_PATH, 'data\\train.jsonl')

from database.feverous_db import FeverousDB
from utils.wiki_page import WikiPage

### Import data

In [None]:
# Open the FEVEROUS database; later cells use it to fetch page JSON and document ids.
# NOTE(review): hard-coded absolute path — confirm it exists on the machine running this notebook.
db = FeverousDB("C:/Databases/feverous_wikiv1.db")

#### Training data

In [None]:
# Read every record of the training file into memory, in file order.
train_data = []
with jsonlines.open(TRAIN_DATA_PATH) as reader:
    train_data.extend(reader)

In [None]:
# Use the first training example and its annotated evidence as a worked example.
train_example = train_data[0]
evidence = train_example['evidence']
# Page whose sentences are looked up below.
# NOTE(review): presumably the page the first example's evidence refers to — confirm.
PAGE_NAME = "Tammy Garcia"
page_json = db.get_doc_json(PAGE_NAME)
wiki_page = WikiPage(PAGE_NAME, page_json)

def get_sent_evidence(train_json):
    """Return the text of the evidence sentences of one training example.

    Every evidence set in ``train_json['evidence']`` lists element ids in its
    ``'content'`` field. The ``PAGE_NAME + "_"`` prefix is stripped from each id
    and the result is matched against the names of the sentences of the
    module-level ``wiki_page``; ids that match no sentence are ignored.

    Args:
        train_json: One training example; must have an ``'evidence'`` key
            holding a list of dicts that each have a ``'content'`` list of ids.

    Returns:
        The matched sentence contents in first-seen order, each followed by a
        single space; an empty string when nothing matches.
    """
    # Collect ids across all evidence sets, dropping repeats but keeping order.
    sent_ids = []
    seen = set()
    for evidence_set in train_json['evidence']:
        for element_id in evidence_set['content']:
            sent_id = element_id.replace(PAGE_NAME + "_", "")
            if sent_id not in seen:
                seen.add(sent_id)
                sent_ids.append(sent_id)

    # Index the page once; setdefault keeps the first sentence for a repeated name.
    content_by_name = {}
    for sent in wiki_page.get_sentences():
        content_by_name.setdefault(sent.name, sent.content)

    return "".join(
        content_by_name[sent_id] + " "
        for sent_id in sent_ids
        if sent_id in content_by_name
    )

print(get_sent_evidence(train_example))

In [None]:
# Time how long it takes to fetch every document id from the database.
start_time = time.time()
doc_ids = db.get_doc_ids()
print("Nr of docs: {} took {} seconds to fetch".format(len(doc_ids), time.time()-start_time))

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from glob import glob
from nltk.stem.porter import PorterStemmer
import numpy as np

# Directory that is globbed for the corpus `*.json` files.
CORPUS_PATH = DIR_PATH + 'data\\corpora\\'

# Single stemmer instance reused by stemming_tokenizer.
porter_stemmer = PorterStemmer()

def create_corpus():
    """Lazily yield every document stored in the corpus JSON files.

    Each ``*.json`` file directly under ``CORPUS_PATH`` is parsed as one JSON
    object and its values are yielded in the object's key order. The path of
    each file is printed when it is opened.
    """
    for corpus_file in glob(CORPUS_PATH + '*.json'):
        print("Opening file '{}'".format(corpus_file))
        with open(corpus_file, 'r') as handle:
            file_docs = json.load(handle)
            yield from file_docs.values()

def stemming_tokenizer(str_input):
    """Tokenize text into lower-cased, stemmed words.

    Every character other than an ASCII letter, digit or hyphen is replaced by
    a space; the text is then lower-cased, split on whitespace and each token
    is passed through the module-level ``porter_stemmer``.
    """
    cleaned = re.sub(r"[^A-Za-z0-9\-]", " ", str_input)
    stems = []
    for token in cleaned.lower().split():
        stems.append(porter_stemmer.stem(token))
    return stems

# Fit the TF-IDF model on the whole corpus and build the document-term matrix, timing the run.
start_time = time.time()
# Without stemming
# dtype=np.float32 is requested explicitly for the matrix values.
tfidfvectorizer = TfidfVectorizer(analyzer='word',stop_words='english',dtype=np.float32)
# With stemming
# tfidfvectorizer = TfidfVectorizer(stop_words='english', tokenizer=stemming_tokenizer)
# create_corpus() is a generator, so documents are streamed into the vectorizer.
corpus = create_corpus()
# Row i of tfidf_wm corresponds to the i-th document yielded by create_corpus().
tfidf_wm = tfidfvectorizer.fit_transform(corpus)
print("Creating TF-IDF matrix took {} seconds".format(time.time() - start_time))

In [None]:
from glob import glob
# Same expression as the CORPUS_PATH in the TF-IDF cell; repeated here, presumably so this cell can run on its own.
CORPUS_PATH = DIR_PATH + 'data\\corpora\\'

def create_doc_id_map():
    """Return the keys of all corpus JSON files as one flat list.

    Files are visited in the order returned by ``glob(CORPUS_PATH + '*.json')``
    and, within a file, keys are taken in the file's own order.
    """
    ids = []
    for corpus_file in glob(CORPUS_PATH + '*.json'):
        with open(corpus_file, 'r') as handle:
            # Iterating the parsed JSON object yields its keys.
            ids.extend(json.load(handle))
    return ids

# Later cells index this list with TF-IDF row numbers, so it only lines up with the
# matrix if the files are globbed in the same order as when the matrix was built.
doc_id_map = create_doc_id_map()

In [None]:
# Number of document ids collected.
len(doc_id_map)

In [None]:
# Check the dtype of the values stored in the TF-IDF matrix.
tfidf_wm.dtype

In [None]:
# Size in bytes of the container object itself; sys.getsizeof does not count the
# objects it refers to, so the id strings are not included.
sys.getsizeof(doc_id_map)

In [None]:
import pickle
# Persist the fitted vectorizer and the TF-IDF matrix to the working directory.
# The context managers make sure each file is flushed and closed once written.
with open("vectorizer-32bit.pickle", "wb") as vectorizer_file:
    pickle.dump(tfidfvectorizer, vectorizer_file)
with open("tfidf_wm-32bit.pickle", "wb") as matrix_file:
    pickle.dump(tfidf_wm, matrix_file)

In [None]:
import pickle
from sklearn.metrics.pairwise import cosine_similarity

# Reload the pickled vectorizer and TF-IDF matrix from the working directory.
# NOTE: pickle.load can execute arbitrary code — only load files you produced yourself.
with open("vectorizer-32bit.pickle", "rb") as vectorizer_file:
    tfidfvectorizer = pickle.load(vectorizer_file)
with open("tfidf_wm-32bit.pickle", "rb") as matrix_file:
    tfidf_wm = pickle.load(matrix_file)

In [None]:
# Load the stored document-id map from disk; this rebinds doc_id_map.
doc_id_map_path = DIR_PATH + 'data\\doc_id_map.json'
with open(doc_id_map_path, 'r') as handle:
    doc_id_map = json.load(handle)

In [None]:
from sklearn.random_projection import SparseRandomProjection

# Reduce the dimensionality of the TF-IDF matrix with a sparse random projection.
# random_state is fixed so the random projection matrix is the same on every run.
srp = SparseRandomProjection(random_state=0)
tfidf_wm_reduced = srp.fit_transform(tfidf_wm)
tfidf_wm_reduced.shape

In [None]:
# Shape of the full TF-IDF matrix, for comparison with the reduced one.
tfidf_wm.shape

In [None]:
# Score the first training claim against every row of the TF-IDF matrix.
test_query = train_data[0]['claim']
# transform() reuses the already-fitted vectorizer; the claim is wrapped in a list
# because the vectorizer works on a collection of documents.
query_tfidf = tfidfvectorizer.transform([test_query])
# Flatten the similarity result into a 1-D array.
cosine_similarities = cosine_similarity(query_tfidf, tfidf_wm).flatten()

In [None]:
# Five highest similarity scores, best first. argsort() is ascending, so its
# reversed tail holds the indices of the largest values; indexing with them
# leaves cosine_similarities itself in row order for the cells that follow.
cosine_similarities[cosine_similarities.argsort()[:-6:-1]]

In [None]:
# Row indices of the five highest-scoring documents, best match first
# (argsort() is ascending, so the last five are taken in reverse).
related_docs_indices = cosine_similarities.argsort()[:-6:-1]
related_docs_indices

In [None]:
# NOTE(review): 4244298 is a hard-coded row index, presumably copied from a
# previous run's related_docs_indices output — confirm it still matches.
doc_id_map[4244298]

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import trange

def get_top_docs(claim, top_n=5):
    """Return the ids of the corpus documents most similar to ``claim``.

    The claim is vectorised with the module-level ``tfidfvectorizer`` and
    compared to every row of ``tfidf_wm`` by cosine similarity; the winning
    row indices are translated to ids through ``doc_id_map``.

    Args:
        claim: Claim text to search for.
        top_n: Maximum number of ids to return (default 5); values below
            zero are treated as zero.

    Returns:
        Up to ``top_n`` document ids, best match first.
    """
    query_tfidf = tfidfvectorizer.transform([claim])
    cosine_similarities = cosine_similarity(query_tfidf, tfidf_wm).flatten()
    # argsort() is ascending: reverse it, then keep the leading top_n indices.
    related_docs_indices = cosine_similarities.argsort()[::-1][:max(top_n, 0)]
    return [doc_id_map[i] for i in related_docs_indices]

# Retrieve the top documents for the first TEST_SAMPLE training claims;
# trange shows a progress bar while iterating.
TEST_SAMPLE = 10
claim_top_docs = []
for i in trange(TEST_SAMPLE):
    claim = train_data[i]['claim']
    claim_top_docs.append(get_top_docs(claim))
    
print(claim_top_docs)

In [None]:
# Pretty-print the per-claim document lists for easier reading.
pp.pprint(claim_top_docs)

In [None]:
# Claim text of the second training example.
train_data[1]['claim']

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

def get_top_sents(doc_id, claim, top_n=5):
    """Return the sentences of one page that are most similar to ``claim``.

    A TF-IDF model is fitted on the page's own sentences, the claim is
    transformed with that model and the sentences are ranked by cosine
    similarity to it.

    Args:
        doc_id: Id passed to ``db.get_doc_json`` to fetch the page.
        claim: Claim text to compare against.
        top_n: Maximum number of sentences to return (default 5); values
            below zero are treated as zero.

    Returns:
        Up to ``top_n`` sentences, most similar first; an empty list when the
        page yields no sentences.
    """
    doc_json = db.get_doc_json(doc_id)
    # NOTE(review): extract_sents is not defined in the visible part of this
    # notebook — confirm it is available before this cell runs.
    sents = list(extract_sents(doc_json))
    if len(sents) == 0:
        # Nothing to rank, and the vectorizer cannot be fitted on no documents.
        return []

    sent_vectorizer = TfidfVectorizer(analyzer='word',stop_words='english')
    sent_wm = sent_vectorizer.fit_transform(sents)
    claim_tfidf = sent_vectorizer.transform([claim])
    cosine_similarities = cosine_similarity(claim_tfidf, sent_wm).flatten()
    # argsort() is ascending: reverse it, then keep the leading top_n indices.
    top_sents_indices = cosine_similarities.argsort()[::-1][:max(top_n, 0)]
    # Index directly so the result is in ranked order, as in get_top_docs.
    return [sents[i] for i in top_sents_indices]
    
# Try the sentence ranking on the first training claim and a fixed page id.
claim = train_data[0]['claim']
doc_id = 'Tammy Garcia'
print(get_top_sents(doc_id, claim))

In [None]:
# Display the current value of claim.
claim

In [None]:
# Fetch the raw page JSON for doc_id and pretty-print it for manual inspection.
doc_json = db.get_doc_json(doc_id)
pp.pprint(doc_json)

In [None]:
# Number of training examples loaded.
len(train_data)