In [1]:
import sys
import jsonlines
import os
import pprint
import time
import random
import re
import json
import math
import unicodedata
import numpy as np

pp = pprint.PrettyPrinter()
sys.path.insert(0, "e:\\Documents\\NLP\\FEVER2021_SharedTask\\FEVEROUS\\src")

DIR_PATH = "e:\\Documents\\NLP\\FEVER2021_SharedTask\\"
TRAIN_DATA_PATH = os.path.join(DIR_PATH, 'data\\train.jsonl')

In [2]:
train_data = []
with jsonlines.open(TRAIN_DATA_PATH) as reader:
    for i, doc in enumerate(reader):
        train_data.append(doc)

In [21]:
len(train_data)

37803

In [3]:
with open(DIR_PATH + 'data\\doc_id_map.json', 'r') as f:
    doc_id_map = json.loads(f.read())

In [9]:
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords

porter_stemmer = PorterStemmer()
s_words = set(stopwords.words('english'))

def stemming_tokenizer(str_input):
    words = re.sub(r"[^A-Za-z0-9\-]", " ", str_input).lower().split()
    words = [word for word in words if word not in s_words]    
    words = [porter_stemmer.stem(word) for word in words]
    return words

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from glob import glob

CORPUS_PATH = DIR_PATH + 'data\\corpora\\'

def create_corpus():
    file_paths = glob(CORPUS_PATH + '*.json')
    for f_path in file_paths:
        print("Opening file '{}'".format(f_path))
        with open(f_path, 'r') as f:
            docs = json.loads(f.read())
            for key in docs:
                yield docs[key]
                
start_time = time.time()
# Without stemming
# tfidfvectorizer = TfidfVectorizer(analyzer='word',stop_words='english',dtype=np.float32)
# corpus = create_corpus()
# tfidf_wm = tfidfvectorizer.fit_transform(corpus)

# With stemming
tfidfvectorizer_stem = TfidfVectorizer(tokenizer=stemming_tokenizer, dtype=np.float32, max_df=0.9, min_df=2)
corpus = create_corpus()
tfidf_wm_stem = tfidfvectorizer_stem.fit_transform(corpus)

print("Creating TF-IDF matrix with stemming took {} seconds".format(time.time() - start_time))

In [None]:
import pickle
pickle.dump(tfidfvectorizer_stem, open("models\\vectorizer-stemmed-32bit.pickle", "wb"))
pickle.dump(tfidf_wm_stem, open("models\\tfidf_wm_stemmed-32bit.pickle", "wb"))

In [10]:
import pickle
tfidfvectorizer_stem = pickle.load(open("models\\vectorizer-stemmed-32bit.pickle", "rb"))
tfidf_wm_stem = pickle.load(open("models\\tfidf_wm_stemmed-32bit.pickle", "rb"))

In [7]:
tfidf_wm_stem.shape

(5421406, 2634922)

In [12]:
from sklearn.metrics.pairwise import cosine_similarity

BATCH_SIZE = 100
NR_OF_QUERIES = len(train_data)
BATCHES = math.ceil(NR_OF_QUERIES / BATCH_SIZE)

related_docs = []

start_time = time.time()

for batch_nr in range(BATCHES):
    print("Processing batch {} of {}".format(batch_nr+1, BATCHES))

    start = batch_nr*BATCH_SIZE
    end = (batch_nr+1)*BATCH_SIZE
    if end > NR_OF_QUERIES:
        end = NR_OF_QUERIES
    
    test_queries = [train_data[i]['claim'] for i in range(start, end)]
    
    query_tfidf = tfidfvectorizer_stem.transform(test_queries)
    cosine_similarities = cosine_similarity(query_tfidf, tfidf_wm_stem)
    print("Calculating cosine similarity for batch {} took {} seconds".format(batch_nr+1, time.time() - start_time))
    
    for i in range(cosine_similarities.shape[0]):
        related_docs_indices = cosine_similarities[i].argsort()[:-6:-1]
        related_docs.append([doc_id_map[i] for i in related_docs_indices])

print("Total time for consine similarities {} seconds".format(time.time() - start_time))

Processing batch 1 of 379
Calculating cosine similarity for batch 1 took 68.17412304878235 seconds
Processing batch 2 of 379
Calculating cosine similarity for batch 2 took 178.51950335502625 seconds
Processing batch 3 of 379
Calculating cosine similarity for batch 3 took 289.9600524902344 seconds
Processing batch 4 of 379
Calculating cosine similarity for batch 4 took 399.17440009117126 seconds
Processing batch 5 of 379
Calculating cosine similarity for batch 5 took 506.9954288005829 seconds
Processing batch 6 of 379
Calculating cosine similarity for batch 6 took 614.521398305893 seconds
Processing batch 7 of 379
Calculating cosine similarity for batch 7 took 722.1443617343903 seconds
Processing batch 8 of 379
Calculating cosine similarity for batch 8 took 829.6983766555786 seconds
Processing batch 9 of 379
Calculating cosine similarity for batch 9 took 938.8413755893707 seconds
Processing batch 10 of 379
Calculating cosine similarity for batch 10 took 1047.8183770179749 seconds
Proces

Calculating cosine similarity for batch 82 took 8551.928621768951 seconds
Processing batch 83 of 379
Calculating cosine similarity for batch 83 took 8656.068167686462 seconds
Processing batch 84 of 379
Calculating cosine similarity for batch 84 took 8759.788167238235 seconds
Processing batch 85 of 379
Calculating cosine similarity for batch 85 took 8863.37613105774 seconds
Processing batch 86 of 379
Calculating cosine similarity for batch 86 took 8966.076133728027 seconds
Processing batch 87 of 379
Calculating cosine similarity for batch 87 took 9067.85616850853 seconds
Processing batch 88 of 379
Calculating cosine similarity for batch 88 took 9169.37794804573 seconds
Processing batch 89 of 379
Calculating cosine similarity for batch 89 took 9273.388909816742 seconds
Processing batch 90 of 379
Calculating cosine similarity for batch 90 took 9375.928944349289 seconds
Processing batch 91 of 379
Calculating cosine similarity for batch 91 took 9480.138945579529 seconds
Processing batch 92 

Processing batch 162 of 379
Calculating cosine similarity for batch 162 took 16794.94176721573 seconds
Processing batch 163 of 379
Calculating cosine similarity for batch 163 took 16896.941729307175 seconds
Processing batch 164 of 379
Calculating cosine similarity for batch 164 took 16997.470729589462 seconds
Processing batch 165 of 379
Calculating cosine similarity for batch 165 took 17097.95973277092 seconds
Processing batch 166 of 379
Calculating cosine similarity for batch 166 took 17199.670732736588 seconds
Processing batch 167 of 379
Calculating cosine similarity for batch 167 took 17301.267729520798 seconds
Processing batch 168 of 379
Calculating cosine similarity for batch 168 took 17404.784048080444 seconds
Processing batch 169 of 379
Calculating cosine similarity for batch 169 took 17509.553009986877 seconds
Processing batch 170 of 379
Calculating cosine similarity for batch 170 took 17612.732059001923 seconds
Processing batch 171 of 379
Calculating cosine similarity for batc

Calculating cosine similarity for batch 241 took 24957.110705137253 seconds
Processing batch 242 of 379
Calculating cosine similarity for batch 242 took 25058.968668699265 seconds
Processing batch 243 of 379
Calculating cosine similarity for batch 243 took 25163.392668247223 seconds
Processing batch 244 of 379
Calculating cosine similarity for batch 244 took 25265.059668302536 seconds
Processing batch 245 of 379
Calculating cosine similarity for batch 245 took 25367.076678276062 seconds
Processing batch 246 of 379
Calculating cosine similarity for batch 246 took 25470.95867872238 seconds
Processing batch 247 of 379
Calculating cosine similarity for batch 247 took 25572.76667904854 seconds
Processing batch 248 of 379
Calculating cosine similarity for batch 248 took 25675.796678066254 seconds
Processing batch 249 of 379
Calculating cosine similarity for batch 249 took 25778.25267791748 seconds
Processing batch 250 of 379
Calculating cosine similarity for batch 250 took 25879.204715013504

Processing batch 321 of 379
Calculating cosine similarity for batch 321 took 33120.904341220856 seconds
Processing batch 322 of 379
Calculating cosine similarity for batch 322 took 33226.25633907318 seconds
Processing batch 323 of 379
Calculating cosine similarity for batch 323 took 33329.874341487885 seconds
Processing batch 324 of 379
Calculating cosine similarity for batch 324 took 33435.5233399868 seconds
Processing batch 325 of 379
Calculating cosine similarity for batch 325 took 33540.23257660866 seconds
Processing batch 326 of 379
Calculating cosine similarity for batch 326 took 33644.53358006477 seconds
Processing batch 327 of 379
Calculating cosine similarity for batch 327 took 33749.27457737923 seconds
Processing batch 328 of 379
Calculating cosine similarity for batch 328 took 33854.75461268425 seconds
Processing batch 329 of 379
Calculating cosine similarity for batch 329 took 33958.87761306763 seconds
Processing batch 330 of 379
Calculating cosine similarity for batch 330 

In [15]:
len(related_docs)

37803

In [17]:
pp.pprint(related_docs[:5])

[['Tammy Garcia',
  'LuAnn Tafoya',
  'Margaret Tafoya',
  'Paul Speckled Rock',
  'Tafoya'],
 ['Travis Hafner',
  'Hafner',
  'Hafner Manufacturing Company',
  'Paul Travis',
  'Robert Hafner'],
 ['Sarkis Mazmanian',
  'Immunity (medical)',
  'Immune system',
  'Immunization',
  'Immune privilege'],
 ['Kiko Insa',
  'INSA (disambiguation)',
  'Kiko',
  'Kikos',
  "List of FIFA Women's World Cup own goals"],
 ['Ridwan',
  'Yağısıyan',
  'Battle of Iconium (1190)',
  'Janah ad-Dawla',
  'Siege of Aleppo (1124)']]


In [19]:
with open('data\\top_5_documents.json', 'w', encoding='utf8') as f:
    f.write(json.dumps(related_docs, indent=2))

In [None]:
test_queries

In [20]:
pp.pprint(train_data[4])

{'annotator_operations': [{'operation': 'start', 'time': '0', 'value': 'start'},
                          {'operation': 'Now on',
                           'time': '0.99',
                           'value': '?search='},
                          {'operation': 'search',
                           'time': '5.328',
                           'value': 'yagisiyan'},
                          {'operation': 'Now on',
                           'time': '6.466',
                           'value': 'Yağısıyan'},
                          {'operation': 'Highlighting',
                           'time': '14.347',
                           'value': ' Yağısıyan_sentence_16'},
                          {'operation': 'Highlighting',
                           'time': '42.924',
                           'value': ' Yağısıyan_sentence_15'},
                          {'operation': 'Highlighting',
                           'time': '55.656',
                           'value': ' Yağısıyan_sentence_20'

### Calculate document retrieval accuracy

The accuracy is calculated as the number of train samples where all of the documents needed to verify the claim is retrieved. This means that if the claim needs information from two docs but the retriver only gets one of them, this is considered as a failure. 

In [65]:
def get_evidence_docs(doc_json):
    doc_names = []
    for evidence_content in doc_json['evidence'][0]['content']:
        doc_name = evidence_content.split('_')[0]
        if doc_name not in doc_names:
            doc_names.append(doc_name)
    return doc_names

def calculate_accuracy(related_docs, print_examples=False):
    nr_of_correct_samples = 0
    accuracy = 0
    for i in range(len(train_data)):
        evidence_docs = get_evidence_docs(train_data[i])
        nr_of_correct_samples += 1
        for doc in evidence_docs:
            match = False
            for rel_doc in related_docs[i]:
                if unicodedata.normalize('NFC', rel_doc) == unicodedata.normalize('NFC', doc):
                    match = True
            if not match:
                if i < 40 and print_examples:
                    print()
                    print("Claim: " + train_data[i]['claim'])
                    print("Evidence docs: {}".format(evidence_docs))
                    print("Related docs: {}".format(related_docs[i]))
                nr_of_correct_samples -= 1
                break
        accuracy = (nr_of_correct_samples/len(train_data))*100
    return accuracy

In [66]:
accuracy = calculate_accuracy(related_docs)
print()
print("The accuracy is {}%".format(accuracy))


The accuracy is 56.156918763061135%


In [29]:
print('Yağısıyan' == 'Yağısıyan')

False


In [36]:
print('Yağısıyan'.encode() == 'Yağısıyan'.encode())

False


In [37]:
print('Yağısıyan'.encode())
print('Yağısıyan'.encode())

b'Ya\xc4\x9f\xc4\xb1s\xc4\xb1yan'
b'Yag\xcc\x86\xc4\xb1s\xc4\xb1yan'


In [41]:
import unicodedata
print(unicodedata.normalize('NFC', 'Yağısıyan').encode())
print(unicodedata.normalize('NFC', 'Yağısıyan').encode())
print(unicodedata.normalize('NFD', 'Yağısıyan').encode())
print(unicodedata.normalize('NFD', 'Yağısıyan').encode())
print(unicodedata.normalize('NFC', 'Yağısıyan').encode() == unicodedata.normalize('NFC', 'Yağısıyan').encode())

b'Ya\xc4\x9f\xc4\xb1s\xc4\xb1yan'
b'Ya\xc4\x9f\xc4\xb1s\xc4\xb1yan'
b'Yag\xcc\x86\xc4\xb1s\xc4\xb1yan'
b'Yag\xcc\x86\xc4\xb1s\xc4\xb1yan'
True


### Create matching matrix for document title

In [46]:
from sklearn.feature_extraction.text import TfidfVectorizer
from glob import glob

CORPUS_PATH = DIR_PATH + 'data\\corpora\\'

def create_corpus():
    file_paths = glob(CORPUS_PATH + '*.json')
    for f_path in file_paths:
        print("Opening file '{}'".format(f_path))
        with open(f_path, 'r') as f:
            docs = json.loads(f.read())
            for key in docs:
                yield key
                
start_time = time.time()
title_vectorizer = TfidfVectorizer(analyzer='word',stop_words='english',dtype=np.float32, ngram_range=(2,2))
corpus = create_corpus()
title_wm = title_vectorizer.fit_transform(corpus)

# With stemming
# tfidfvectorizer_stem = TfidfVectorizer(tokenizer=stemming_tokenizer, dtype=np.float32, max_df=0.9, min_df=2)
# corpus = create_corpus()
# tfidf_wm_stem = tfidfvectorizer_stem.fit_transform(corpus)

print("Creating TF-IDF matrix for titles took {} seconds".format(time.time() - start_time))

Opening file 'e:\Documents\NLP\FEVER2021_SharedTask\data\corpora\corpora_1.json'
Opening file 'e:\Documents\NLP\FEVER2021_SharedTask\data\corpora\corpora_10.json'
Opening file 'e:\Documents\NLP\FEVER2021_SharedTask\data\corpora\corpora_11.json'
Opening file 'e:\Documents\NLP\FEVER2021_SharedTask\data\corpora\corpora_12.json'
Opening file 'e:\Documents\NLP\FEVER2021_SharedTask\data\corpora\corpora_13.json'
Opening file 'e:\Documents\NLP\FEVER2021_SharedTask\data\corpora\corpora_14.json'
Opening file 'e:\Documents\NLP\FEVER2021_SharedTask\data\corpora\corpora_15.json'
Opening file 'e:\Documents\NLP\FEVER2021_SharedTask\data\corpora\corpora_16.json'
Opening file 'e:\Documents\NLP\FEVER2021_SharedTask\data\corpora\corpora_17.json'
Opening file 'e:\Documents\NLP\FEVER2021_SharedTask\data\corpora\corpora_18.json'
Opening file 'e:\Documents\NLP\FEVER2021_SharedTask\data\corpora\corpora_19.json'
Opening file 'e:\Documents\NLP\FEVER2021_SharedTask\data\corpora\corpora_2.json'
Opening file 'e:\D

In [47]:
title_wm.shape

(5421406, 4727577)

### Calculate cosine similarities for the titles

In [50]:
from sklearn.metrics.pairwise import cosine_similarity

BATCH_SIZE = 300
NR_OF_QUERIES = len(train_data)
BATCHES = math.ceil(NR_OF_QUERIES / BATCH_SIZE)

related_titles = []

start_time = time.time()

for batch_nr in range(BATCHES):
    print("Processing batch {} of {}".format(batch_nr+1, BATCHES))

    start = batch_nr*BATCH_SIZE
    end = (batch_nr+1)*BATCH_SIZE
    if end > NR_OF_QUERIES:
        end = NR_OF_QUERIES
    
    test_queries = [train_data[i]['claim'] for i in range(start, end)]
    
    query_tfidf = title_vectorizer.transform(test_queries)
    cosine_similarities = cosine_similarity(query_tfidf, title_wm)
    del query_tfidf
    print("Calculating cosine similarity for batch {} took {} seconds".format(batch_nr+1, time.time() - start_time))
    
    for i in range(cosine_similarities.shape[0]):
        related_titles_indices = cosine_similarities[i].argsort()[:-6:-1]
        related_titles.append([doc_id_map[i] for i in related_titles_indices])

print("Total time for consine similarities {} seconds".format(time.time() - start_time))

Processing batch 1 of 127
Calculating cosine similarity for batch 1 took 0.8023359775543213 seconds
Processing batch 2 of 127
Calculating cosine similarity for batch 2 took 41.915050983428955 seconds
Processing batch 3 of 127
Calculating cosine similarity for batch 3 took 82.8300530910492 seconds
Processing batch 4 of 127
Calculating cosine similarity for batch 4 took 124.56505060195923 seconds
Processing batch 5 of 127
Calculating cosine similarity for batch 5 took 165.62814712524414 seconds
Processing batch 6 of 127
Calculating cosine similarity for batch 6 took 205.16681027412415 seconds
Processing batch 7 of 127
Calculating cosine similarity for batch 7 took 246.33493065834045 seconds
Processing batch 8 of 127
Calculating cosine similarity for batch 8 took 287.99285101890564 seconds
Processing batch 9 of 127
Calculating cosine similarity for batch 9 took 327.91326427459717 seconds
Processing batch 10 of 127
Calculating cosine similarity for batch 10 took 367.8580377101898 seconds
P

Processing batch 82 of 127
Calculating cosine similarity for batch 82 took 3265.077572584152 seconds
Processing batch 83 of 127
Calculating cosine similarity for batch 83 took 3304.3936195373535 seconds
Processing batch 84 of 127
Calculating cosine similarity for batch 84 took 3343.894789457321 seconds
Processing batch 85 of 127
Calculating cosine similarity for batch 85 took 3383.28315448761 seconds
Processing batch 86 of 127
Calculating cosine similarity for batch 86 took 3422.6026146411896 seconds
Processing batch 87 of 127
Calculating cosine similarity for batch 87 took 3462.1526503562927 seconds
Processing batch 88 of 127
Calculating cosine similarity for batch 88 took 3501.4366166591644 seconds
Processing batch 89 of 127
Calculating cosine similarity for batch 89 took 3540.845613718033 seconds
Processing batch 90 of 127
Calculating cosine similarity for batch 90 took 3580.2013235092163 seconds
Processing batch 91 of 127
Calculating cosine similarity for batch 91 took 3619.4077770

In [51]:
len(related_titles)

37803

In [59]:
merged_docs = [list(set(x + y)) for x, y in zip(related_docs, related_titles)]
merged_docs[0]

['Santa Clara Pueblo v. Martinez',
 "It's a Great, Great World",
 'LuAnn Tafoya',
 'Paul Speckled Rock',
 'Tafoya',
 'Margaret Tafoya',
 'Great Great Great',
 'Santa Clara Pueblo, New Mexico',
 'Tammy Garcia']

In [69]:
merged_docs[0]

['Santa Clara Pueblo v. Martinez',
 "It's a Great, Great World",
 'LuAnn Tafoya',
 'Paul Speckled Rock',
 'Tafoya',
 'Margaret Tafoya',
 'Great Great Great',
 'Santa Clara Pueblo, New Mexico',
 'Tammy Garcia']

In [63]:
accuracy = calculate_accuracy(merged_docs)
print()
print("The accuracy is {}%".format(accuracy))


Claim: In 2005 it was the 105th season for the Cleveland Indians who finished in second with 207 team total home runs and Travis Hafner had 33 of them.
Evidence docs: ['2005 Cleveland Indians season']
Related docs: ['Home runs per nine innings', 'Paul Travis', 'Hafner Manufacturing Company', 'Home runs per hit', 'Hafner', 'Cleveland Indians', 'Travis Hafner', 'Robert Hafner', 'Home runs allowed']

Claim: Zsuzsanna Ury ranked 33rd for Hungary at the 2012 Winter Youth Olympics in the giant slalom, which involves skiing between sets of poles spaced at a distance from each other.
Evidence docs: ["Alpine skiing at the 2012 Winter Youth Olympics – Girls' combined"]
Related docs: ['Muhammad Karim (skier)', 'India at the 2012 Winter Youth Olympics', 'Australia at the 2012 Winter Youth Olympics', '2012 Winter Youth Olympics', 'Alexandra Tilley', 'Strahinja Stanišić', 'Maria Kirkova', 'Ireland at the 2012 Winter Youth Olympics', 'Hungary at the 2012 Winter Youth Olympics']

Claim: After leavi

### Get the sentences from the documents

In [67]:
from utils.wiki_page import WikiPage

def replace_entities(sent):
    regex = r'\[\[([^\|]+)\|([^\]]+)\]\]'
    return re.sub(regex, '\\2', sent)

def extract_sents(doc_json):
    page = WikiPage(doc_json['title'], doc_json)
    sents = [replace_entities(sent.content) for sent in page.get_sentences()]
    sents = [sent.lower() for sent in sents]
    return sents

In [125]:
def extract_table_text(table):
    cell_ids = table.get_ids()
    table_rows = []
    for i, cell_id in enumerate(cell_ids):
        if 'table_caption' in cell_id:
            continue
        cell_id_list = cell_id.split('_')
        row = int(cell_id_list[-2])
        if len(table_rows) < row+1:
            table_rows.append(replace_entities(table.get_cell_content(cell_id)))
        else:
            table_rows[row] += ' ' + replace_entities(table.get_cell_content(cell_id))
    return table_rows

def extract_tables(doc_json):
    page = WikiPage(doc_json['title'], doc_json)
    tables = page.get_tables()
    tables_content = []
    for table in tables:
        table_rows = extract_table_text(table)
        tables_content.append(table_rows)
    return tables_content

[['Type Public',
  'Traded as ',
  'Industry Loyalty marketing services',
  'Founded April 22, 1983',
  'Headquarters Columbus, Ohio, U.S.',
  'Area served USA, Canada',
  'Key people Ralph Andretta (President & CEO)',
  'Products Air Miles',
  'Revenue US$7.719 billion (2017)',
  'Operating income US$1.646 billion (2017)',
  'Net income US$789 million (2017)',
  'Total assets US$30.685 billion (2017)',
  'Total equity US$1.855 billion (2017)',
  'Number of employees ~20,000 (December 2017)',
  'Subsidiaries LoyaltyOne\nAlliance Data Retail Services',
  'Website ']]


In [None]:
from database.feverous_db import FeverousDB
db = FeverousDB("C:/Databases/feverous_wikiv1.db")

In [134]:
doc_json = db.get_doc_json("Albert Johnson Walker")
tables_content = extract_tables(doc_json)
pp.pprint(tables_content)

[['Albert Johnson Walker Albert Johnson Walker',
  'Born (1946-08-09) August 9, 1946 (age 74)\nParis, Ontario, Canada',
  'Other names David W. Davis, Ronald Joseph Platt',
  'Occupation Financial planner, Mortgage broker',
  'Criminal status In prison',
  'Spouse(s) Barbara Walker',
  'Children 4',
  'Criminal charge Murder, Theft, Fraud',
  'Penalty Life imprisonment, 4 Years']]


In [137]:
def expand_table_id(table_id):
    split_id = table_id.split('_')
    doc_json = db.get_doc_json(split_id[0])
    page = WikiPage(doc_json['title'], doc_json)
    tables = page.get_tables()
    result = []
    for i, table in enumerate(tables):
        cell_ids = table.get_ids()
        for cell_id in cell_ids:
            if not 'cell' in cell_id:
                continue
            splitted_cell_id = cell_id.split('_')
            row = int(splitted_cell_id[-2])
            if 'table_{}_{}'.format(i, row) in table_id:
                result.append('{}_{}'.format(doc_json['title'], cell_id))
    return result

def get_top_sents(doc_ids, claim, nr_of_sents=5):
    sent_ids = []
    table_ids = []
    all_sents = []
    all_table_rows = []
    for doc_id in doc_ids:
        doc_json = db.get_doc_json(doc_id)
        sents = extract_sents(doc_json)
        for i in range(len(sents)):
            sent_ids.append('{}_sentence_{}'.format(doc_json['title'], i))
        all_sents += sents
        
        tables_content = extract_tables(doc_json)
        for i, table_content in enumerate(tables_content):
            for j in range(len(table_content)):
                table_ids.append('{}_table_{}_{}'.format(doc_json['title'], i, j))
            all_table_rows += table_content

    sent_vectorizer = TfidfVectorizer(analyzer='word',stop_words='english',ngram_range=(1,3))
    sent_wm = sent_vectorizer.fit_transform(all_sents + all_table_rows)
    claim_tfidf = sent_vectorizer.transform([claim])
    cosine_similarities = cosine_similarity(claim_tfidf, sent_wm).flatten()
    top_sents_indices = cosine_similarities.argsort()[:-nr_of_sents-1:-1]
    top_sents = [sent for i, sent in enumerate(sent_ids + table_ids) if i in top_sents_indices]
    for sent in top_sents:
        if 'table' in sent:
            top_sents += expand_table_id(sent)
    top_sents = [sent for sent in top_sents if 'table' not in sent]
    top_sents = list(set(top_sents))
    return top_sents
    

index = 5
claim = train_data[index]['claim']
top_sents = get_top_sents(merged_docs[index], claim)
        
print(top_sents)
print(expand_table_id('Albert Johnson Walker_table_0_3'))

['Albert Johnson Walker_header_cell_0_3_0', 'Albert Johnson Walker_header_cell_0_1_0', 'Albert Johnson Walker_sentence_0', 'Albert Johnson Walker_cell_0_1_1', 'Albert Johnson Walker_header_cell_0_0_0', 'Albert Johnson_sentence_0', 'Albert Johnson Walker_cell_0_3_1']
['Albert Johnson Walker_header_cell_0_3_0', 'Albert Johnson Walker_cell_0_3_1']


In [145]:
def calculate_score():
    nr_of_correct_samples = 0
    sum_precision = 0
    sum_recall = 0
    test_sample = train_data[:100]
    counter = 0
    for i in range(len(test_sample)):
        evidence_sents = train_data[i]['evidence'][0]['content']
        related_sents = get_top_sents(merged_docs[i], train_data[i]['claim'],nr_of_sents=10)
        nr_of_correct_sents = 0
        for sent in evidence_sents:
            for rel_sent in related_sents:
                if unicodedata.normalize('NFC', rel_sent) == unicodedata.normalize('NFC', sent):
                    nr_of_correct_sents += 1
        precision = (nr_of_correct_sents/len(related_sents))*100
        recall = (nr_of_correct_sents/len(evidence_sents))*100
        sum_precision += precision
        sum_recall += recall
        
        if counter < 10 and recall < 30:
            print("Retrieved sentences: {}".format(related_sents))
            print("Correct sentences: {}".format(evidence_sents))
            print("Precision for nr {}: {}".format(i, precision))
            print("Recall for nr {}: {}".format(i, recall))
            print()
            counter += 1

    avg_precision = sum_precision/len(test_sample)
    avg_recall = sum_recall/len(test_sample)
        
    return avg_precision, avg_recall

avg_precision, avg_recall = calculate_score()
print()
print("Precision: {}".format(avg_precision))
print("Recall: {}".format(avg_recall))

Retrieved sentences: ['Home runs per nine innings_sentence_0', 'Home runs allowed_sentence_0', 'Travis Hafner_header_cell_0_0_0', 'Cleveland Indians_sentence_243', 'Cleveland Indians_sentence_332', 'Cleveland Indians_header_cell_0_0_0', 'Travis Hafner_header_cell_0_7_0', 'Travis Hafner_sentence_27', 'Travis Hafner_cell_0_7_1', 'Cleveland Indians_sentence_307', 'Cleveland Indians_sentence_160']
Correct sentences: ['2005 Cleveland Indians season_sentence_0', '2005 Cleveland Indians season_cell_0_5_1', '2005 Cleveland Indians season_cell_3_16_0', '2005 Cleveland Indians season_cell_3_16_7', '2005 Cleveland Indians season_cell_3_34_7', '2005 Cleveland Indians season_cell_3_34_0', '2005 Cleveland Indians season_sentence_3']
Precision for nr 1: 0.0
Recall for nr 1: 0.0

Retrieved sentences: ['Kiko Insa_cell_2_2_2', 'Goals Soccer Centres_cell_0_0_1', 'Kiko_sentence_0', "List of FIFA Women's World Cup own goals_sentence_3", 'Goals Soccer Centers_cell_0_0_1', 'Kiko Insa_cell_2_2_0', 'Goals Socc


Precision: 12.118092438883046
Recall: 48.40979344729345


In [153]:
pp.pprint(train_data[21])

{'annotator_operations': [{'operation': 'start', 'time': '0', 'value': 'start'},
                          {'operation': 'Now on',
                           'time': '0.943',
                           'value': '?search='},
                          {'operation': 'search',
                           'time': '29.919',
                           'value': '2016 Australian federal election'},
                          {'operation': 'Now on',
                           'time': '31.565',
                           'value': '2016 Australian federal election'},
                          {'operation': 'hyperlink',
                           'time': '77.994',
                           'value': 'Candidates of the 2016 Australian federal '
                                    'election'},
                          {'operation': 'Now on',
                           'time': '78.894',
                           'value': 'Candidates of the 2016 Australian federal '
                                    

In [156]:
doc_json = db.get_doc_json("Kiko Insa")
page = WikiPage("Kiko Insa", doc_json)
tables = page.get_tables()
cell_ids = tables[1].get_ids()
pp.pprint(cell_ids)

['header_cell_1_0_0',
 'header_cell_1_0_1',
 'header_cell_1_0_2',
 'header_cell_1_0_2',
 'header_cell_1_0_2',
 'header_cell_1_0_5',
 'header_cell_1_0_5',
 'header_cell_1_0_7',
 'header_cell_1_0_7',
 'header_cell_1_0_9',
 'header_cell_1_0_9',
 'header_cell_1_0_11',
 'header_cell_1_0_11',
 'header_cell_1_0_0',
 'header_cell_1_0_1',
 'header_cell_1_1_0',
 'header_cell_1_1_1',
 'header_cell_1_1_2',
 'header_cell_1_1_3',
 'header_cell_1_1_4',
 'header_cell_1_1_5',
 'header_cell_1_1_6',
 'header_cell_1_1_7',
 'header_cell_1_1_8',
 'header_cell_1_1_9',
 'header_cell_1_1_10',
 'cell_1_2_0',
 'cell_1_2_1',
 'cell_1_2_2',
 'cell_1_2_3',
 'cell_1_2_4',
 'cell_1_2_5',
 'cell_1_2_6',
 'cell_1_2_7',
 'cell_1_2_8',
 'cell_1_2_9',
 'cell_1_2_9',
 'cell_1_2_11',
 'cell_1_2_12',
 'cell_1_2_0',
 'header_cell_1_3_0',
 'header_cell_1_3_0',
 'header_cell_1_3_2',
 'header_cell_1_3_3',
 'header_cell_1_3_4',
 'header_cell_1_3_5',
 'header_cell_1_3_6',
 'header_cell_1_3_7',
 'header_cell_1_3_8',
 'header_cell_1

In [157]:
tables_content = extract_tables(doc_json)
pp.pprint(tables_content)

[['Personal information Personal information Personal information Personal '
  'information',
  'Full name Francisco Javier Insa Bohigues Francisco Javier Insa Bohigues '
  'Francisco Javier Insa Bohigues',
  'Date of birth (1988-01-25) 25 January 1988 (age 32) (1988-01-25) 25 January '
  '1988 (age 32) (1988-01-25) 25 January 1988 (age 32)',
  'Place of birth Alicante, Spain Alicante, Spain Alicante, Spain',
  'Height 1.88 m (6 ft 2 in) 1.88 m (6 ft 2 in) 1.88 m (6 ft 2 in)',
  'Playing position(s) Centre-back Centre-back Centre-back',
  'Club information Club information Club information Club information',
  "Current team Johor Darul Ta'zim Johor Darul Ta'zim Johor Darul Ta'zim",
  'Number 30 30 30',
  'Youth career Youth career Youth career Youth career',
  ' Albacete Albacete Albacete',
  'Senior career* Senior career* Senior career* Senior career*',
  'Years Team Apps (Gls)',
  '2005–2006 Albacete B 3 (0)',
  '2006–2007 Real Murcia B 8 (0)',
  '2007–2008 Alcoyano 13 (0)',
  '2008 

In [165]:
def create_markdown_style_table(table):
    cell_ids = table.get_ids()
    last_col_nr = 0
    table_text = ""
    for i, cell_id in enumerate(cell_ids):
        if not 'cell' in cell_id:
            continue
        splitted_cell_id = cell_id.split('_')
        col_nr = int(splitted_cell_id[-1])
        if i == 0:
            table_text += table.get_cell_content(cell_id)
        elif col_nr < last_col_nr:
            # New row
            table_text += "\n"
            table_text += table.get_cell_content(cell_id)
        else:
            table_text += " | " + table.get_cell_content(cell_id)
            
        last_col_nr = col_nr

    return replace_entities(table_text)

print(create_markdown_style_table(tables[1]))

Club | Club | Season | League | League | League | Cup | Cup | League Cup | League Cup | Continental | Continental | Total | Total
Club | Season
Division | Apps | Goals | Apps | Goals | Apps | Goals | Apps | Goals | Apps | Goals
Bali United | 2016 | Indonesia Soccer Championship A | 9 | 2 | 5 | 0 | 0 | 0 | – | – | 14 | 2
Bali United | Total | Total | 9 | 2 | 5 | 0 | 0 | 0 | – | – | 14 | 2
Pahang | 2017 | Malaysia Super League | 11 | 0 | 4 | 0 | 3 | 0 | – | – | 18 | 0
Pahang | Total | Total | 11 | 0 | 4 | 0 | 3 | 0 | – | – | 18 | 0
Bangkok Glass | 2018 | Thai League 1 | 0 | 0 | 0 | 0 | 0 | 0 | – | – | 0 | 0
Bangkok Glass | Total | Total | 0 | 0 | 0 | 0 | 0 | 0 | – | – | 0 | 0
Career Total | Career Total | Career Total | 0 | 0 | 0 | 0 | 0 | 0 | – | – | 0 | 0
