In [1]:
import collections
import os

In [2]:
import gensim.models
import nltk.tokenize
import numpy as np
import pandas as pd
from pattern import en
import random
from sklearn.feature_extraction.text import TfidfVectorizer

In [3]:
import import_ipynb
import aux.relation_extraction
import aux.nlp

importing Jupyter notebook from /Users/YK/mt/project/aux/relation_extraction.ipynb
importing Jupyter notebook from /Users/YK/mt/project/aux/nlp.ipynb


In [4]:
STATEMENTS_DIR = "/Users/YK/mt/project/statements_5/"
RACE_PART = "test/middle"
RACE_DIR = "/Users/YK/mt/RACE"
PARSED_RACE_DIR = "/Users/YK/mt/parsed/race"

### Finding the Most Important Words (Not Used)

In [5]:
file_names = os.listdir(
    os.path.join(
        PARSED_RACE_DIR, RACE_PART
    )
)

In [6]:
def read_file(input_directory, file_name):
    text, _, _ = aux.relation_extraction.load_relations(
        os.path.join(input_directory, file_name)
    )
    return text 

In [7]:
texts = [
    read_file(
        os.path.join(PARSED_RACE_DIR, RACE_PART), 
        file_name
    )
    for file_name in file_names[:10]
]

In [8]:
tfidf_vectorizer = TfidfVectorizer(
    input="content",
    lowercase=False, 
    stop_words="english", 
    norm=None
)

In [9]:
tfidf = tfidf_vectorizer.fit_transform(texts)

In [10]:
a = np.argsort(tfidf.todense(), axis=1)

In [11]:
def get_tfidf(tfidf, tfidf_vectorizer, i, word):
    return float(tfidf[i, tfidf_vectorizer.get_feature_names().index(word)])

In [12]:
def get_most_important_words(tfidf_vectorizer, a, i):
    return np.array(
        tfidf_vectorizer.get_feature_names()
    )[np.array(a[i, -10:]).flatten()[::-1]]    

In [13]:
get_most_important_words(tfidf_vectorizer, a, 0)

array(['Easter', 'beach', 'chocolate', 'Australia', 'hunt', 'eggs',
       'bilbies', 'animals', 'tents', 'camping'], dtype='<U13')

In [14]:
get_most_important_words(tfidf_vectorizer, a, 1)

array(['Swift', 'US', 'does', 'laughed', 'speak', 'understand', 'way',
       'people', 'square', 'speech'], dtype='<U13')

### Paraphrasing

In [5]:
synonyms_df = pd.read_csv(
    "../filtered_edited_synonyms.csv"
)

In [6]:
synonyms_map = {
    row["word"]: row["synonyms"].split(", ") for _, row in synonyms_df.iterrows()
}

In [7]:
middle_vocabulary_df = pd.read_csv("../middle_vocabulary.csv", usecols=[0, 1])
middle_vocabulary_df.columns=["word", "pos"]
middle_vocabulary = set([w.lower() for w in middle_vocabulary_df.word])

In [8]:
exclude_df = pd.read_csv("../words_to_exclude.csv", header=None)

words_to_exclude = []
for i in range(4):
    words_to_exclude.extend(
        [line.split()[0].lower() for line in exclude_df.loc[~exclude_df[i].isnull(), i]]
    )

In [9]:
def make_past(verb):
    return en.conjugate(
        verb, 
        tense = en.PAST,        # INFINITIVE, PRESENT, PAST, FUTURE
        person = 1,              # 1, 2, 3 or None
        number = en.SINGULAR,       # SG, PL
        mood = en.INDICATIVE,     # INDICATIVE, IMPERATIVE, CONDITIONAL, SUBJUNCTIVE
        negated = False,          # True or False
        parse = True
    )


def make_progressive(verb):
    return en.conjugate(
        verb, 
        tense = en.PRESENT,        # INFINITIVE, PRESENT, PAST, FUTURE
        person = 1,              # 1, 2, 3 or None
        number = en.SINGULAR,       # SG, PL
        mood = en.INDICATIVE,     # INDICATIVE, IMPERATIVE, CONDITIONAL, SUBJUNCTIVE
        aspect = en.PROGRESSIVE,
        negated = False,          # True or False
        parse = True
    ) 


def make_perfect(verb):
    return en.conjugate(
        verb, 
        tense = en.PAST,        # INFINITIVE, PRESENT, PAST, FUTURE
        person = 1,              # 1, 2, 3 or None
        number = en.SINGULAR,       # SG, PL
        mood = en.INDICATIVE,     # INDICATIVE, IMPERATIVE, CONDITIONAL, SUBJUNCTIVE
        aspect = en.PARTICIPLE,
        negated = False,          # True or False
        parse = True
    )


def make_third_person(verb):
    return en.conjugate(
        verb, 
        tense = en.PRESENT,        # INFINITIVE, PRESENT, PAST, FUTURE
        person = 3,              # 1, 2, 3 or None
        number = en.SINGULAR,       # SG, PL
        mood = en.INDICATIVE,     # INDICATIVE, IMPERATIVE, CONDITIONAL, SUBJUNCTIVE
        negated = False,          # True or False
        parse = True
    )


def is_past(verb):
    return make_past(verb) == verb


def is_progressive(verb):
    return make_progressive(verb) == verb


def is_perfect(verb):
    return make_perfect(verb) == verb


def is_third_person(verb):
    return make_third_person(verb) == verb


def is_verb(pos):
    return pos[:1] == "V"


def is_plural(word):
    return en.singularize(word) != word


def is_noun(pos):
    return pos == "NN"

In [10]:
def join_tokens(tokens):
    if len(tokens) == 0:
        return ""
    else:
        tokens_with_space = [tokens[0]]
        prev_word = tokens[0]
        for token in tokens[1:]:
            if (
                token not in {".", ",", "!", "?", ";", ":", ")"} 
                    and token[:1] != "'"
                    and prev_word != "("
            ):
                tokens_with_space.append(" ")
            tokens_with_space.append(token)
            prev_word = token
        return aux.nlp.fix_quotes("".join(tokens_with_space))
    
    
def get_form(substitute, original_word, pos):
    if is_verb(pos):
        if is_past(original_word):
            return make_past(substitute)
        elif is_progressive(original_word):
            return make_progressive(substitute)
        elif is_perfect(original_word):
            return make_perfect(substitute)
        elif is_third_person(original_word):
            return make_third_person(substitute)
        else:
            return substitute
    elif is_noun(pos):
        if is_plural(original_word):
            return en.pluralize(substitute)
        else:
            return substitute
    else:
        return substitute
    

def is_in_vocabulary(synset, vocabulary):
    return synset[0] in vocabulary


def get_hypernym(token, pos, lemmatized_token, middle_vocabulary):
    if is_verb(pos) or is_noun(pos) or pos == "ADV" or pos == "ADJ":
        synsets = en.wordnet.synsets(lemmatized_token, pos=pos)
        for synset in synsets:
            if is_in_vocabulary(synset, middle_vocabulary):
                for hypernym in synsets[0].hypernyms():
                    if is_in_vocabulary(hypernym, middle_vocabulary):
                        return get_form(hypernym[0], token, pos)
    return None
    

def get_synonym(token, pos, lemmatized_token, synonyms_map):
    if lemmatized_token in synonyms_map:
        synonym = np.random.choice(synonyms_map[lemmatized_token], 1)[0]
        return get_form(synonym, token, pos)
    else:
        return None
    
    
def paraphrase(
    sentence, 
    synonyms_map, 
    middle_vocabulary, 
    synonym_stats, 
    hypernym_stats, 
    words_to_exclude,
    use_hypernyms=True
):
    tokens, pos = zip(*en.tag(sentence))
    tokens = list(tokens)
    substituted = False
    for i in range(len(tokens)):
        lemmatized_token = en.lemma(tokens[i])
        if not lemmatized_token in words_to_exclude:
            synonym = get_synonym( # trying to substitute with a synonym first
                tokens[i], pos[i], lemmatized_token, synonyms_map
            )
            if synonym is not None: 
                synonym_stats[lemmatized_token] += 1
                tokens[i] = synonym
                substituted = True
            else: # no synonym found, trying to substitute with a hypernym
                if use_hypernyms:
                    hypernym = get_hypernym(
                        tokens[i], pos[i], lemmatized_token, middle_vocabulary
                    )
                    if hypernym is not None:
                        hypernym_stats[lemmatized_token] += 1
                        tokens[i] = hypernym
                        substituted = True
    return join_tokens(tokens), substituted

In [11]:
paraphrase(
    "He gained access to the file.", 
    synonyms_map, 
    middle_vocabulary,
    collections.Counter(),
    collections.Counter(),
    words_to_exclude
)

('He obtained rights to the record.', True)

In [19]:
alternatives_df = pd.read_excel(
    os.path.join(
        STATEMENTS_DIR, 
        f"prefiltered_alternatives_test-middle_9bed806f.xlsx"
    ),
    index=False
)

In [20]:
def paraphrase_statements(statements, synonyms_map, words_to_exclude, use_hypernyms=True):
    paraphrased_statements = []
    substitution_indicators = []
    synonym_stats = collections.Counter()
    hypernym_stats = collections.Counter()
    for statement in statements:
        paraphrased_statement, substituted = paraphrase(
            statement, 
            synonyms_map, 
            middle_vocabulary, 
            synonym_stats, 
            hypernym_stats, 
            words_to_exclude,
            use_hypernyms
        )
        paraphrased_statements.append(paraphrased_statement)
        substitution_indicators.append(substituted)
    return paraphrased_statements, substitution_indicators, synonym_stats, hypernym_stats

In [21]:
def add_paraphrased_statement_in_place(
    df, column, synonyms_map, words_to_exclude, use_hypernyms=True
):
    paraphrased_statements, substitution_indicators, synonym_stats, hypernym_stats = \
        paraphrase_statements(
            df[column], synonyms_map, words_to_exclude, use_hypernyms
        )
    prefix = "synonym" if not use_hypernyms else "synonym_and_hypernym"
    df[f"{prefix}_paraphrased_{column}"] = paraphrased_statements
    df[f"was_{column}_{prefix}_paraphrased"] = substitution_indicators
    return synonym_stats, hypernym_stats

In [22]:
add_paraphrased_statement_in_place(
    alternatives_df, "true_statement", synonyms_map, words_to_exclude, False
)
add_paraphrased_statement_in_place(
    alternatives_df, "alternative_statement", synonyms_map, words_to_exclude, False
);

In [23]:
synonym_stats__true_statements, hypernym_stats__true_statements \
    = add_paraphrased_statement_in_place(
        alternatives_df, "true_statement", synonyms_map, words_to_exclude
    )
synonym_stats__alternatives, hypernym_stats__alternatives = \
    add_paraphrased_statement_in_place(
        alternatives_df, "alternative_statement", synonyms_map, words_to_exclude
    )

In [24]:
synonym_stats__true_statements.most_common()

[('say', 65),
 ('all', 56),
 ('very', 54),
 ('good', 33),
 ('learn', 27),
 ('away', 27),
 ('big', 25),
 ('buy', 22),
 ('love', 20),
 ('father', 19),
 ('little', 19),
 ('new', 17),
 ('house', 16),
 ('job', 16),
 ('important', 15),
 ('idea', 15),
 ('kind', 14),
 ('great', 14),
 ('part', 14),
 ('earth', 13),
 ('story', 13),
 ('begin', 11),
 ('popular', 11),
 ('usually', 10),
 ('taxi', 10),
 ('writer', 10),
 ('place', 9),
 ('woman', 9),
 ('fly', 7),
 ('old', 7),
 ('easy', 7),
 ('decide', 7),
 ('problem', 7),
 ('hate', 7),
 ('receive', 6),
 ('quiet', 6),
 ('gift', 6),
 ('child', 5),
 ('round', 5),
 ('answer', 4),
 ('word', 4),
 ('beautiful', 4),
 ('happy', 4),
 ('cold', 3),
 ('explain', 3),
 ('arrive', 3),
 ('sure', 3),
 ('quite', 3),
 ('street', 3),
 ('strange', 3),
 ('fat', 3),
 ('wonderful', 2),
 ('sad', 2),
 ('right', 2),
 ('dark', 2),
 ('shop', 2),
 ('wrong', 2),
 ('nearly', 2),
 ('bad', 2),
 ('dear', 2),
 ('trouble', 2),
 ('kill', 2),
 ('angry', 2),
 ('trip', 2),
 ('raise', 2),
 ('tru

In [25]:
hypernym_stats__true_statements.most_common()

[('fact', 180),
 ('time', 33),
 ('air', 18),
 ('area', 18),
 ('book', 17),
 ('man', 17),
 ('jump', 15),
 ('read', 14),
 ('tiger', 14),
 ('team', 13),
 ('boy', 13),
 ('walk', 13),
 ('number', 12),
 ('girl', 12),
 ('way', 12),
 ('banana', 12),
 ('adult', 12),
 ('party', 11),
 ('age', 10),
 ('mom', 10),
 ('friend', 9),
 ('seem', 9),
 ('example', 9),
 ('family', 9),
 ('dance', 9),
 ('math', 9),
 ('study', 8),
 ('letter', 8),
 ('information', 8),
 ('collect', 8),
 ('speed', 8),
 ('play', 8),
 ('exam', 7),
 ('colour', 7),
 ('sleep', 7),
 ('gentleman', 7),
 ('plastic', 6),
 ('library', 6),
 ('present', 6),
 ('birthday', 6),
 ('contain', 6),
 ('dinner', 6),
 ('factory', 6),
 ('repair', 6),
 ('c', 6),
 ('lunch', 6),
 ('drive', 6),
 ('entrance', 6),
 ('today', 6),
 ('phrase', 6),
 ('exercise', 6),
 ('housework', 6),
 ('winter', 6),
 ('consult', 6),
 ('category', 6),
 ('thing', 5),
 ('afternoon', 5),
 ('save', 5),
 ('send', 5),
 ('one', 5),
 ('power', 5),
 ('classroom', 4),
 ('produce', 4),
 ('ro

In [26]:
sum(
    alternatives_df.loc[:, "was_true_statement_synonym_and_hypernym_paraphrased"]
) / len(alternatives_df)

0.9323671497584541

In [31]:
alternatives_df.columns

Index(['text_no', 'true_statement', 'nuclei_hash', 'alternative_statement',
       'relation_type', 'position', 'distance_words', 'distance_sentences',
       'sn_length', 'sn_length_relative_difference', 'jaccard_distance',
       'edit_distance', 'rule', 'reason', 'synonym_paraphrased_true_statement',
       'was_true_statement_synonym_paraphrased',
       'synonym_paraphrased_alternative_statement',
       'was_alternative_statement_synonym_paraphrased',
       'synonym_and_hypernym_paraphrased_true_statement',
       'was_true_statement_synonym_and_hypernym_paraphrased',
       'synonym_and_hypernym_paraphrased_alternative_statement',
       'was_alternative_statement_synonym_and_hypernym_paraphrased'],
      dtype='object')

In [32]:
output_df = alternatives_df[
    [
        "text_no",
        "true_statement",
        "nuclei_hash",
        "synonym_paraphrased_true_statement",
#         "synonym_and_hypernym_paraphrased_true_statement",
        "alternative_statement",
        "synonym_paraphrased_alternative_statement",
#         "synonym_and_hypernym_paraphrased_alternative_statement",
        "was_true_statement_synonym_paraphrased",
        "was_alternative_statement_synonym_paraphrased",
#         "was_true_statement_synonym_and_hypernym_paraphrased",
#         "was_alternative_statement_synonym_and_hypernym_paraphrased"
        "reason",
        "position"
    ]
]

In [28]:
output_df.iloc[0]["true_statement"]

'The plastic leg Jeff can run. Moreover, Jeff made a plan with his friends who had plastic legs.'

In [29]:
output_df.iloc[0]["synonym_and_hypernym_paraphrased_true_statement"]

'The solid leg Jeff can run. Moreover, Jeff made a plan with his friends who had plastic legs.'

In [33]:
output_df.to_excel(
    os.path.join(
        STATEMENTS_DIR, 
        f"paraphrased_{RACE_PART.replace('/', '-')}_{random.randint(0, 2**32):x}.xlsx"
    ),
    index=False
)

### Word2Vec (Not Used)

In [15]:
class Word2VecModel:
    W2V_MODEL_DIR = "../w2v_models"
    
    def __init__(self, model_name):
        self.model_name = model_name
        self.model = gensim.models.KeyedVectors.load_word2vec_format(
            os.path.join(Word2VecModel.W2V_MODEL_DIR, f"{model_name}.txt")
        )

In [16]:
w2v_nolg_100d = Word2VecModel("enwiki_20180420_nolg_100d")

In [41]:
w2v_nolg_100d.model.most_similar("way")

[('ENTITY/Stour_and_Orwell_Walk', 0.732032835483551),
 ('path', 0.7227527499198914),
 ('führerweg', 0.7152807712554932),
 ('hopefully', 0.7044017314910889),
 ('ENTITY/WP:EL', 0.7037861347198486),
 ('ENTITY/WP:DR', 0.7026253938674927),
 ('jitler', 0.7024155855178833),
 ('cenzar', 0.7017824649810791),
 ('torpel', 0.7011029720306396),
 ('ENTITY/WP:Wikipedia', 0.7005131244659424)]

In [42]:
w2v_nolg_100d.model.most_similar("people")

[('persons', 0.7992599010467529),
 ('residents', 0.7535831928253174),
 ('ENTITY/People', 0.7503395080566406),
 ('helsinkians', 0.7265914678573608),
 ('citizens', 0.7153019905090332),
 ('natives', 0.6979749798774719),
 ('youths', 0.6972818374633789),
 ('inhabitants', 0.694731593132019),
 ('tangbotens', 0.6896241903305054),
 ('americans', 0.6892567873001099)]

In [43]:
w2v_nolg_100d.model.most_similar("chocolate")

[('ENTITY/Chocolate', 0.9106836318969727),
 ('caramel', 0.8976747989654541),
 ('ENTITY/Caramel', 0.8792448043823242),
 ('ENTITY/White_chocolate', 0.8668451309204102),
 ('ENTITY/Peanut_butter', 0.8533239364624023),
 ('ENTITY/Types_of_chocolate', 0.8502905368804932),
 ('chocolates', 0.8490653038024902),
 ('ENTITY/Types_of_chocolate#Milk_chocolate', 0.8471919298171997),
 ('candy', 0.844744086265564),
 ('cream', 0.842796266078949)]

In [46]:
w2v_nolg_100d.model.most_similar("cancel", topn=20)

[('canceling', 0.8277071714401245),
 ('cancelling', 0.8080153465270996),
 ('postpone', 0.7998612523078918),
 ('reschedule', 0.7844353914260864),
 ('suspend', 0.7557451725006104),
 ('rescind', 0.7517332434654236),
 ('cancellation', 0.7479344010353088),
 ('cancels', 0.7384265661239624),
 ('announce', 0.7238617539405823),
 ('reconsider', 0.7106267213821411),
 ('renew', 0.7019257545471191),
 ('discontinue', 0.6998554468154907),
 ('postponing', 0.6976670026779175),
 ('postponement', 0.6904169917106628),
 ('ENTITY/Postpone_to_a_certain_time', 0.6869893670082092),
 ('announcing', 0.679280161857605),
 ('reinstate', 0.6781035661697388),
 ('revoke', 0.6780909299850464),
 ('rescheduling', 0.6767849922180176),
 ('renegotiate', 0.6732800602912903)]