In [168]:
from csv import DictReader

DATASET_TARGET = 'test'
DATASET_USE_SAMPLE = False

with open('gap-%s.tsv' % DATASET_TARGET, newline='', encoding='utf-8') as csvfile:
    dataset_reader = DictReader(csvfile, delimiter='\t')
    dataset = list(dataset_reader)

    if DATASET_USE_SAMPLE:
        dataset = dataset[:10]

In [169]:
from nltk.parse.corenlp import CoreNLPServer, CoreNLPDependencyParser

server = CoreNLPServer(
    "./corenlp/stanford-corenlp-4.0.0.jar",
    "./corenlp/stanford-corenlp-4.0.0-models.jar"
)
server.start()

print("Started CoreNLP Server Successfully")

parser = CoreNLPDependencyParser(url=server.url)

Started CoreNLP Server Successfully


In [170]:
from difflib import SequenceMatcher
from nltk import word_tokenize
from nltk.tokenize import TreebankWordTokenizer
from nltk.tokenize.punkt import PunktSentenceTokenizer
from tqdm.auto import tqdm

# Match leaves of the tree with index in data
# using index of word_tokenize
# Also, change nonterminals from "word" to "word/index in words"
def label_words(sent, start_index_sent, graph, word_index):
    node_values = sorted(
        [value for value in graph.nodes.values() if value['word'] is not None],
        key=lambda item: item['word']
    )
    leave_addresses = [node['address'] for node in node_values]
    leaves = [node['word'] for node in node_values]
    
    word_boundaries = list(word_tokenizer.span_tokenize(sent))
    words = [sent[start:end] for (start, end) in word_boundaries]
    
    word_boundaries, words = zip(*sorted(
        zip(word_boundaries, words),
        key=lambda item: item[1]
    ))
    
    matcher = SequenceMatcher(None, leaves, words, autojunk=False)
    
    i = word_index
    word_results = []
    
    for a_idx_start, b_idx_start, size in matcher.get_matching_blocks():
        for j in range(size):
            a_idx = a_idx_start + j
            b_idx = b_idx_start + j

            graph.nodes[leave_addresses[a_idx]]['word'] = "%s/%d" % (leaves[a_idx], i)

            start, end = word_boundaries[b_idx]
            word_results.append(("%s/%d" % (words[b_idx], i), start + start_index_sent, end + start_index_sent))
            i += 1
    
    word_results = sorted(
        word_results,
        key=lambda item: item[1]
    )
    
    return tree, word_results, word_index + len(leaves)


# 274, "Cheryl Cassidy" to ["Cheryl/39", "Cassidy/40"]
def find_location(location, name, words):
    location_end = location + len(name)
    results = []
    
    for (word, start, end) in words:
        if start <= location < end:
            results.append(word)
            continue
        
        if start < location_end <= end:
            results.append(word)
            continue
        
        if location < start and end < location_end:
            results.append(word)
            continue
    
    return results


# Tokenize sentences, parse dependency of the sentence
# and save it to the data_preprocessed
data_preprocessed = []
sent_tokenizer = PunktSentenceTokenizer()
word_tokenizer = TreebankWordTokenizer()
for data in tqdm(dataset):
    data_text = data['Text']
    word_index = 0
    sent_boundaries = sent_tokenizer.span_tokenize(data_text)
    sent_trees = []
    sent_words = []
    
    for (start, end) in sent_boundaries:
        sent = data_text[start:end]
        tree = None
        try:
            tree = parser.parse_one(word_tokenize(sent))

        except:
            # In case of timeout or no tree
            continue
        
        # Label words with its index in order to distinguish a specific word used at a specific position
        # For example, I like her and her friends, is labeld as
        # I/1 like/2 her/3 and/4 her/5 friends/6
        # so I can distinguish her/3 and her/5
        tree, words, word_index = label_words(sent, start, tree, word_index)
        sent_trees.append(tree)
        sent_words.append(words)
    
    sent_words_flatten = sum(sent_words, [])
    
    # Find which word does Offset point.
    # (ex: (her, 8) points her/3 as 8-th character of "I like her and her friends" points her/3)
    pron_words = find_location(int(data['Pronoun-offset']), data['Pronoun'], sent_words_flatten)
    a_words = find_location(int(data['A-offset']), data['A'], sent_words_flatten)
    b_words = find_location(int(data['B-offset']), data['B'], sent_words_flatten)
    sent_words_noidx = [[word for word, start, end in words] for words in sent_words]
    data_preprocessed.append({
        'trees': sent_trees,
        'words': sent_words_noidx,
        'pron': pron_words,
        'A': a_words,
        'B': b_words
    })


HBox(children=(FloatProgress(value=0.0, max=2000.0), HTML(value='')))




In [118]:
from nltk import NaiveBayesClassifier
from nltk.corpus import names as names_corpus
from random import random

def predict_gender(name):
    return name_gender_classifier.classify(
        name_feature(name)
    )

# Generate a feature, which will be used in Naive bayesian classifier, from name
def name_feature(name):
    return {
        'first': name[0],
        'suffix1': name[-1],
        'suffix2': name[-2]
    }

name_gender_dataset = sorted(
    (
        [(name, 'male') for name in names_corpus.words('male.txt')] +
        [(name, 'female') for name in names_corpus.words('female.txt')]
    ), key = lambda k: random()
)
feature_gender_dataset = [
    (name_feature(name), gender) for name, gender in name_gender_dataset
]

name_gender_classifier = NaiveBayesClassifier.train(feature_gender_dataset)


In [152]:
# Find sentence which includes the word
def find_sent_from_word(sent_words, word):
    return [i for i, words in enumerate(sent_words) if word in words][0]


# Find triplets, which includes the word, from dependency graph
def query(querying, words):
    queried_features = []
    for feature in querying:
        (word, word_pos), rel, (target, target_pos) = feature
        if word in words or target in words:
            queried_features.append(feature)
    
    return queried_features


# Determine a pronoun refers to the target word.
def is_same(sent_trees, sent_words, pron_words, target_words):
    if len(pron_words) == 0 or len(target_words) == 0:
        return False
    
    pron_sentence = 0
    target_sentence = 0
    
    try:
        pron_sentence = find_sent_from_word(sent_words, pron_words[0])
        target_sentence = find_sent_from_word(sent_words, target_words[0])
    
    except:
        tqdm.write("Cannot find sent with target from data... skipping!")
        return False
    
    querying = list(sent_trees[pron_sentence].triples()) + \
        list(sent_trees[target_sentence].triples())
    
    words = pron_words + target_words
    features = query(querying, words)
    
    # Pronoun cannot refer to word which is not mentioned before
    if pron_sentence < target_sentence:
        return False
    
    # Predict by depdency when two words are in the same sentences
    if pron_sentence == target_sentence:
        for ((word, _), rel, (target, __)) in features:
            if 'subj' in rel:
                queried_features = query(querying, [word])
                obj = []
                for ((qw, wpos), qr, (qt, tpos)) in queried_features:
                    # Objective & Subjective -> NOT SAME
                    if 'obj' in qr and qt in words:
                        return False
                    
                    # Complement & Subjective -> SAME
                    if 'comp' in qr and qt in words:
                        return True
    
    # Predict by gender
    pron_words_norm = [word.split('/')[0].lower() for word in pron_words]
    pron_gender = 'female' if ('her', 'she') in pron_words_norm else 'male'

    if pron_gender != predict_gender(' '.join(target_words)):
        return False
    
    # If subject of previous sentence of pronoun is target
    if pron_sentence == target_sentence + 1:
        for ((word, _), rel, (target, __)) in query(
            list(sent_trees[target_sentence].triples()),
            target_words
        ):
            if 'subj' in rel:
                return True
    
    # When it is not sure
    return None


same_count = 0
results = []
for data in data_preprocessed:
    sent_trees, sent_words, pron_words, a_words, b_words = (
        data['trees'], data['words'], data['pron'], data['A'], data['B']
    )
    
    a_same = is_same(sent_trees, sent_words, pron_words, a_words)
    b_same = is_same(sent_trees, sent_words, pron_words, b_words)
    
    if a_same is None and b_same is not None:
        results.append((not b_same, b_same))
        continue
    
    if a_same is not None and b_same is None:
        results.append((a_same, not a_same))
        continue
    
    if a_same is None and b_same is None:
        if len(a_words) == 0 or len(b_words) == 0 or len(pron_words) == 0:
            results.append((True, True))
            continue
        
        # When it was unable to determine the pronoun refers to another word,
        # flag it as uncertain and return the near word.
        pron_sentence = find_sent_from_word(sent_words, pron_words[0])
        a_sentence = find_sent_from_word(sent_words, a_words[0])
        b_sentence = find_sent_from_word(sent_words, b_words[0])
        
        a_dist = pron_sentence - a_sentence
        b_dist = pron_sentence - b_sentence
        
        results.append((None, False, True) if a_dist > b_dist else (None, True, False))
        continue
    
    results.append((a_same, b_same))

In [137]:
from csv import writer as CSVWriter

# Write the result
with open ('./results.tsv', 'w', encoding='utf-8') as csvfile:
    writer = CSVWriter(csvfile, delimiter='\t')
    
    for i, result in enumerate(results):
        confident = result[0]
        if confident is None:
            a_coref, b_coref = result[1:]
        
        else:
            a_coref, b_coref = result
        
        writer.writerow((
            dataset[i]['ID'],
            a_coref,
            b_coref
        ))

In [166]:
from urllib.parse import unquote
from wikipediaapi import Wikipedia

wiki = Wikipedia('en')
wiki_crawled = {}

# Finds uncertain results and crawl it from wikipedia
for i, result in enumerate(tqdm(results)):
    confident = result[0]
    if confident is not None:
        continue
    
    wiki_url = unquote(dataset[i]['URL'].replace('http://en.wikipedia.org/wiki/', ''))
    wiki_text = wiki.page(wiki_url).summary
    wiki_crawled[i] = wiki_text


HBox(children=(FloatProgress(value=0.0, max=2000.0), HTML(value='')))




In [142]:
"""
import json
with open('./wikidump.txt', 'w', encoding='utf-8') as f:
    f.write(json.dumps(wiki_crawled))
"""

In [164]:
def predict_with_wiki(sent_trees, sent_words, pron_words, target_words, wiki_text):
    target_normalized = ' '.join([t_word.split('/')[0] for t_word in target_words])
    
    # If A or B occur in wikipedia,
    if target_normalized not in wiki_text:
        return None, False
    
    # Preprocess the summary text
    target_idx = wiki_text.index(target_normalized)
    word_index = -10000
    wiki_boundaries = sent_tokenizer.span_tokenize(wiki_text)
    wiki_trees = []
    wiki_words = []
    
    for (start, end) in wiki_boundaries:
        sent = wiki_text[start:end]
        tree = None
        try:
            tree = parser.parse_one(word_tokenize(sent))

        except:
            # In case of timeout or no tree
            continue
        
        tree, words, word_index = label_words(sent, start, tree, word_index)
        wiki_trees.append(tree)
        wiki_words.append(words)
    
    wiki_words_flatten = sum(wiki_words, [])
    wiki_target_words = find_location(target_idx, target_normalized, wiki_words_flatten)
    
    # Combine the summary text with original text
    total_sents = [[word for word, start, end in words] for words in wiki_words] + sent_words
    total_trees = wiki_trees + sent_trees
    pron_sentence = 0
    target_sentence = 0
    
    # Find two words, pronoun and (A or B)
    try:
        pron_sentence = find_sent_from_word(total_sents, pron_words[0])
        target_sentence = find_sent_from_word(total_sents, wiki_target_words[0])
    
    except:
        tqdm.write("Cannot find sent with target from data... skipping!")
        return None, True
    
    # Remain only two sentences, which includes the pronoun or (A or B)
    # Then, do the same steps
    return is_same(
        [total_trees[target_sentence], total_trees[pron_sentence]],
        [total_sents[target_sentence], total_sents[pron_sentence]],
        pron_words,
        wiki_target_words
    ), True


for i, wiki_text in tqdm(wiki_crawled.items()):
    data = data_preprocessed[i]
    sent_trees, sent_words, pron_words, a_words, b_words = (
        data['trees'], data['words'], data['pron'], data['A'], data['B']
    )
    
    a_same, a_inwiki = predict_with_wiki(sent_trees, sent_words, pron_words, a_words, wiki_text)
    b_same, b_inwiki = predict_with_wiki(sent_trees, sent_words, pron_words, a_words, wiki_text)
        
    # if a_same is None and b_same is None:
    #     a_same = True if a_inwiki else None
    #     b_same = True if b_inwiki else None
    
    if a_same is None and b_same is None:
        continue

    if a_same is None and b_same is not None:
        results[i] = ((not b_same, b_same))
        continue
    
    if a_same is not None and b_same is None:
        results[i] = ((a_same, not a_same))
        continue
    
    results[i] = ((a_same, b_same))


HBox(children=(FloatProgress(value=0.0, max=1224.0), HTML(value='')))

Cannot find sent with target from data... skipping!
Cannot find sent with target from data... skipping!



In [167]:
# Write the result
with open ('./results_page.tsv', 'w', encoding='utf-8') as csvfile:
    writer = CSVWriter(csvfile, delimiter='\t')
    
    for i, result in enumerate(results):
        confident = result[0]
        if confident is None:
            a_coref, b_coref = result[1:]
        
        else:
            a_coref, b_coref = result
        
        writer.writerow((
            dataset[i]['ID'],
            a_coref,
            b_coref
        ))

In [None]:
server.stop()