In [1]:
import sys
import jsonlines
import os
import pprint
import time
import random
import re

pp = pprint.PrettyPrinter()
sys.path.insert(0, "e:\\Documents\\NLP\\FEVER2021_SharedTask\\FEVEROUS\\src")

DIR_PATH = "e:\\Documents\\NLP\\FEVER2021_SharedTask\\"
TRAIN_DATA_PATH = os.path.join(DIR_PATH, 'data\\train.jsonl')

from database.feverous_db import FeverousDB
from utils.wiki_page import WikiPage

### Import data

In [2]:
db = FeverousDB("C:/Databases/feverous_wikiv1.db")

#### Training data

In [26]:
train_data = []
with jsonlines.open(TRAIN_DATA_PATH) as reader:
    for i, doc in enumerate(reader):
        train_data.append(doc)

In [None]:
train_example = train_data[0]
evidence = train_example['evidence']
PAGE_NAME = "Tammy Garcia"
page_json = db.get_doc_json(PAGE_NAME)
wiki_page = WikiPage(PAGE_NAME, page_json)

def get_sent_evidence(train_json):    
    for e in evidence:
        sent_ids = []
        content = e['content']
        for c in content:
            sent_id = c.replace(PAGE_NAME + "_", "")
            sent_ids.append(sent_id)

    sentences = wiki_page.get_sentences()
    content_text = ""
    for sent_id in sent_ids:
        for sent in sentences:
            if sent.name == sent_id:
                content_text += sent.content + " "
                break

print(content_text)

In [3]:
start_time = time.time()
doc_ids = db.get_doc_ids()
print("Nr of docs: {} took {} seconds to fetch".format(len(doc_ids), time.time()-start_time))

Nr of docs: 5421406 took 12.55490756034851 seconds to fetch


In [None]:
page = WikiPage(doc_ids[1], example_json)
sents = [sent.content for sent in page.get_sentences()]
print(sents)

In [4]:
def replace_entities(sent):
    regex = r'\[\[([^\|]+)\|([^\]]+)\]\]'
    return re.sub(regex, '\\2', sent)
  
def remove_punctuation(sent):
    if sent[-1] == '.':
        return sent[:-1]
    else:
        return sent

def extract_sents(doc_json):
    page = WikiPage(doc_json['title'], doc_json)
    sents = [replace_entities(sent.content) for sent in page.get_sentences()]
    sents = [sent.lower() for sent in sents]
    # sents = [remove_punctuation(sent) for sent in sents]
    return sents

In [5]:
SAMPLE_SIZE = 100000
sample_doc_ids = random.sample(doc_ids, SAMPLE_SIZE)

In [None]:
json_docs = db.get_doc_jsons(sample_doc_ids[:10])

In [None]:
from tqdm import tqdm
from numba import jit, cuda
from timeit import default_timer as timer   

sample_docs = dict()

def create_sample_docs_cpu():
    for i in sample_doc_ids:
        doc = db.get_doc_json(i)
        sents = extract_sents(doc)
        doc_text = ' '.join(sents)
        sample_docs[i] = doc_text
        
@jit
def create_sample_docs_gpu():
    for i in sample_doc_ids:
        doc = db.get_doc_json(i)
        sents = extract_sents(doc)
        doc_text = ' '.join(sents)
        sample_docs[i] = doc_text

start_time = time.time()
create_sample_docs_cpu()
print("Creating {} sample docs on single thread: {} seconds".format(SAMPLE_SIZE, time.time()-start_time))    

# start = timer()
# create_sample_docs_gpu()
# print("with GPU:", timer()-start)

In [16]:
def create_sample_docs(ids):
    json_docs = db.get_doc_jsons(ids)
    curr_sample_docs = dict()
    for doc in json_docs:
        sents = extract_sents(doc)
        doc_text = ' '.join(sents)
        curr_sample_docs[doc['title']] = doc_text
    return curr_sample_docs

### Create sample docs using multiple threads

In [6]:
import concurrent.futures
import time


NR_OF_THREADS = 1

with concurrent.futures.ThreadPoolExecutor() as executor:
    thread_samples = int(SAMPLE_SIZE / NR_OF_THREADS)
    start_time = time.time()
    futures = []
    for i in range(NR_OF_THREADS):
        start = thread_samples*i
        ids = sample_doc_ids[start:start+thread_samples]
        futures.append(executor.submit(create_sample_docs, ids))

sample_docs = dict()

for f in futures:
    sample_docs.update(f.result())

print("Creating {} sample docs with {} threads: {} seconds".format(SAMPLE_SIZE, NR_OF_THREADS, time.time()-start_time))    

Creating 100000 sample docs with 1 threads: 100.90237498283386 seconds


### Create sample docs on a single thread

In [17]:
start_time = time.time()
sample_docs = create_sample_docs(sample_doc_ids)
print("Creating {} sample docs: {} seconds".format(SAMPLE_SIZE, time.time()-start_time))    

Creating 100000 sample docs: 58.31147503852844 seconds


In [7]:
import threading
threading.active_count()

5

In [20]:
list(sample_docs.keys())[:10]

['"Everything For the Country" Party',
 '"Ode-to-Napoleon" hexachord',
 '"Weird Al" Yankovic\'s Greatest Hits',
 "'77",
 "'A' Is for A-l-i-v-e",
 "'A'akapa",
 "'Abd al-Majid Nimer Zaghmout",
 "'Matšepo Ramakoae",
 "'Neath Canadian Skies",
 "'Round Springfield"]

In [None]:
for i, key in enumerate(sample_docs):
    if i >= 5:
        break
    print(sample_docs[key])

In [21]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

tfidfvectorizer = TfidfVectorizer(analyzer='word',stop_words='english')

start_time = time.time()
tfidf_wm = tfidfvectorizer.fit_transform(sample_docs.values())
print("--- %s seconds ---" % (time.time() - start_time))

# tfidf_tokens = tfidfvectorizer.get_feature_names()

# df_tfidfvect = pd.DataFrame(data=tfidf_wm.toarray(), columns=tfidf_tokens, index=sample_docs.keys())

--- 25.735156536102295 seconds ---


In [22]:
len(tfidfvectorizer.get_feature_names())

534896

In [23]:
tfidf_wm.shape

(100000, 534896)

In [25]:
sys.getsizeof(sample_docs)

5242968

In [28]:
train_data[0]['claim']

'Tammy Garcia was born in California but currently lives in Taos, she comes from a long line of Santa Clara Pueblo artists and her great-great-great grandmother Sara Fina Tafoya was a potter.'

In [29]:
from sklearn.metrics.pairwise import cosine_similarity

test_query = train_data[0]['claim']
query_tfidf = tfidfvectorizer.transform([test_query])
cosine_similarities = cosine_similarity(query_tfidf, tfidf_wm).flatten()
print(cosine_similarities)

[0.         0.         0.00693897 ... 0.         0.         0.        ]


In [38]:
related_docs_indices = cosine_similarities.argsort()[:-6:-1]
related_docs_indices

array([59183, 87540, 37748, 14786, 37745], dtype=int64)

In [45]:
list(sample_docs.keys())[59183]

'Margaret Tafoya'

In [46]:
sample_docs['Margaret Tafoya']

'maria margarita "margaret" tafoya (tewa name: corn blossom; august 13, 1904 – february 25, 2001) was the matriarch of santa clara pueblo potters. she was a recipient of a 1984 national heritage fellowship awarded by the national endowment for the arts, which is the united states government\'s highest honor in the folk and traditional arts. margaret was the daughter of sara fina (sometimes spelled serafina) guiterrez tafoya (1863–1949) and jose geronimo tafoya (1863–1955). she attended the santa clara pueblo elementary school, and then the santa fe indian school from 1915 to 1918. she had to drop out of high school to help her family during the flu pandemic of 1918. margaret learned the art of making pottery from her parents, and was particularly influenced by her mother. sara fina was considered the leading potter of santa clara in her day, as the master of making exceptionally large, finely polished blackware. she also occasionally made redware, micaceous clay storage jars and other 

In [36]:
tfidfvectorizer.inverse_transform(cosine_similarities.reshape(1,-1))

[array(['0000', '00005', '000110000', ..., 'cingulo', 'cingulopsidae',
        'cinhil'], dtype='<U85')]