In [1]:
import collections
import os

In [289]:
import gensim.models
import nltk.tokenize
import numpy as np
import pandas as pd
from pattern import en
import random
from sklearn.feature_extraction.text import TfidfVectorizer

In [4]:
import import_ipynb
import aux.relation_extraction
import aux.nlp

In [112]:
STATEMENTS_DIR = "/Users/YK/mt/project/statements_3/"
RACE_PART = "train/middle"
RACE_DIR = "/Users/YK/mt/RACE"
PARSED_RACE_DIR = "/Users/YK/mt/parsed/race"

### Finding the Most Important Words (Not Used)

In [5]:
file_names = os.listdir(
    os.path.join(
        PARSED_RACE_DIR, RACE_PART
    )
)

In [6]:
def read_file(input_directory, file_name):
    text, _, _ = aux.relation_extraction.load_relations(
        os.path.join(input_directory, file_name)
    )
    return text 

In [7]:
texts = [
    read_file(
        os.path.join(PARSED_RACE_DIR, RACE_PART), 
        file_name
    )
    for file_name in file_names[:10]
]

In [8]:
tfidf_vectorizer = TfidfVectorizer(
    input="content",
    lowercase=False, 
    stop_words="english", 
    norm=None
)

In [9]:
tfidf = tfidf_vectorizer.fit_transform(texts)

In [10]:
a = np.argsort(tfidf.todense(), axis=1)

In [11]:
def get_tfidf(tfidf, tfidf_vectorizer, i, word):
    return float(tfidf[i, tfidf_vectorizer.get_feature_names().index(word)])

In [12]:
def get_most_important_words(tfidf_vectorizer, a, i):
    return np.array(
        tfidf_vectorizer.get_feature_names()
    )[np.array(a[i, -10:]).flatten()[::-1]]    

In [13]:
get_most_important_words(tfidf_vectorizer, a, 0)

array(['Easter', 'beach', 'chocolate', 'Australia', 'hunt', 'eggs',
       'bilbies', 'animals', 'tents', 'camping'], dtype='<U13')

In [14]:
get_most_important_words(tfidf_vectorizer, a, 1)

array(['Swift', 'US', 'does', 'laughed', 'speak', 'understand', 'way',
       'people', 'square', 'speech'], dtype='<U13')

### Paraphrasing

In [249]:
synonyms_df = pd.read_csv(
    "../filtered_edited_synonyms.csv"
)

In [251]:
synonyms_map = {
    row["word"]: row["synonyms"].split(", ") for _, row in synonyms_df.iterrows()
}

In [287]:
middle_vocabulary_df = pd.read_csv("../middle_vocabulary.csv", usecols=[0, 1])
middle_vocabulary_df.columns=["word", "pos"]
middle_vocabulary = set([w.lower() for w in middle_vocabulary_df.word])

In [354]:
def make_past(verb):
    return en.conjugate(
        verb, 
        tense = en.PAST,        # INFINITIVE, PRESENT, PAST, FUTURE
        person = 1,              # 1, 2, 3 or None
        number = en.SINGULAR,       # SG, PL
        mood = en.INDICATIVE,     # INDICATIVE, IMPERATIVE, CONDITIONAL, SUBJUNCTIVE
        negated = False,          # True or False
        parse = True
    )


def make_progressive(verb):
    return en.conjugate(
        verb, 
        tense = en.PRESENT,        # INFINITIVE, PRESENT, PAST, FUTURE
        person = 1,              # 1, 2, 3 or None
        number = en.SINGULAR,       # SG, PL
        mood = en.INDICATIVE,     # INDICATIVE, IMPERATIVE, CONDITIONAL, SUBJUNCTIVE
        aspect = en.PROGRESSIVE,
        negated = False,          # True or False
        parse = True
    ) 


def make_perfect(verb):
    return en.conjugate(
        verb, 
        tense = en.PAST,        # INFINITIVE, PRESENT, PAST, FUTURE
        person = 1,              # 1, 2, 3 or None
        number = en.SINGULAR,       # SG, PL
        mood = en.INDICATIVE,     # INDICATIVE, IMPERATIVE, CONDITIONAL, SUBJUNCTIVE
        aspect = en.PARTICIPLE,
        negated = False,          # True or False
        parse = True
    )


def make_third_person(verb):
    return en.conjugate(
        verb, 
        tense = en.PRESENT,        # INFINITIVE, PRESENT, PAST, FUTURE
        person = 3,              # 1, 2, 3 or None
        number = en.SINGULAR,       # SG, PL
        mood = en.INDICATIVE,     # INDICATIVE, IMPERATIVE, CONDITIONAL, SUBJUNCTIVE
        negated = False,          # True or False
        parse = True
    )


def is_past(verb):
    return make_past(verb) == verb


def is_progressive(verb):
    return make_progressive(verb) == verb


def is_perfect(verb):
    return make_perfect(verb) == verb


def is_third_person(verb):
    return make_third_person(verb) == verb


def is_verb(pos):
    return pos[:1] == "V"


def is_plural(word):
    return en.singularize(word) != word


def is_noun(pos):
    return pos == "NN"

In [373]:
def join_tokens(tokens):
    if len(tokens) == 0:
        return ""
    else:
        tokens_with_space = [tokens[0]]
        prev_word = tokens[0]
        for token in tokens[1:]:
            if (
                token not in {".", ",", "!", "?", ";", ":", ")"} 
                    and token[:1] != "'"
                    and prev_word != "("
            ):
                tokens_with_space.append(" ")
            tokens_with_space.append(token)
            prev_word = token
        return "".join(tokens_with_space)
    
    
def get_form(substitute, original_word, pos):
    if is_verb(pos):
        if is_past(original_word):
            return make_past(substitute)
        elif is_progressive(original_word):
            return make_progressive(substitute)
        elif is_perfect(original_word):
            return make_perfect(substitute)
        elif is_third_person(original_word):
            return make_third_person(substitute)
        else:
            return substitute
    elif is_noun(pos):
        if is_plural(original_word):
            return en.pluralize(substitute)
        else:
            return substitute
    else:
        return substitute
    

def is_in_vocabulary(synset, vocabulary):
    return synset[0] in vocabulary


def get_hypernym(token, pos, lemmatized_token, middle_vocabulary):
    if is_verb(pos) or is_noun(pos) or pos == "ADV" or pos == "ADJ":
        synsets = en.wordnet.synsets(lemmatized_token, pos=pos)
        for synset in synsets:
            if is_in_vocabulary(synset, middle_vocabulary):
                for hypernym in synsets[0].hypernyms():
                    if is_in_vocabulary(hypernym, middle_vocabulary):
                        return get_form(hypernym[0], token, pos)
    return None
    

def get_synonym(token, pos, lemmatized_token, synonyms_map):
    if lemmatized_token in synonyms_map:
        synonym = np.random.choice(synonyms_map[lemmatized_token], 1)[0]
        return get_form(synonym, token, pos)
    else:
        return None
    
    
def paraphrase(sentence, synonyms_map, middle_vocabulary, synonym_stats, hypernym_stats):
    tokens, pos = zip(*en.tag(sentence))
    tokens = list(tokens)
    substituted = False
    for i in range(len(tokens)):
        lemmatized_token = lemma(tokens[i])
        synonym = get_synonym( # trying to substitute with a synonym first
            tokens[i], pos[i], lemmatized_token, synonyms_map
        )
        if synonym is not None: 
            synonym_stats[lemmatized_token] += 1
            tokens[i] = synonym
            substituted = True
        else: # no synonym found, trying to substitute with a hypernym
            hypernym = get_hypernym(
                tokens[i], pos[i], lemmatized_token, middle_vocabulary
            )
            if hypernym is not None:
                hypernym_stats[lemmatized_token] += 1
                tokens[i] = hypernym
                substituted = True
    return join_tokens(tokens), substituted

In [377]:
paraphrase(
    "He gained access to the file.", 
    synonyms_map, 
    middle_vocabulary,
    collections.Counter(),
    collections.Counter()
)

('He obtained rights to the record.', True)

In [387]:
alternatives_df = pd.read_excel(
    os.path.join(
        STATEMENTS_DIR, 
        f"alternatives_train-middle_68aa2429.xlsx"
    ),
    index=False
)

In [388]:
def paraphrase_statements(statements, synonyms_map):
    paraphrased_statements = []
    substitution_indicators = []
    synonym_stats = collections.Counter()
    hypernym_stats = collections.Counter()
    for statement in statements:
        paraphrased_statement, substituted = paraphrase(
            statement, synonyms_map, middle_vocabulary, synonym_stats, hypernym_stats
        )
        paraphrased_statements.append(paraphrased_statement)
        substitution_indicators.append(substituted)
    return paraphrased_statements, substitution_indicators, synonym_stats, hypernym_stats

In [389]:
def add_paraphrased_statement_in_place(df, column, synonyms_map):
    paraphrased_statements, substitution_indicators, synonym_stats, hypernym_stats = \
        paraphrase_statements(
            df[column], synonyms_map
        )
    df[f"paraphrased_{column}"] = paraphrased_statements
    df[f"was_{column}_paraphrased"] = substitution_indicators
    return synonym_stats, hypernym_stats

In [390]:
synonym_stats__true_statements, hypernym_stats__true_statements \
    = add_paraphrased_statement_in_place(
        alternatives_df, "true_statement", synonyms_map
    )
synonym_stats__alternatives, hypernym_stats__alternatives = \
    add_paraphrased_statement_in_place(
        alternatives_df, "alternative_statement", synonyms_map
    )

In [391]:
synonym_stats__true_statements.most_common()

[('say', 2100),
 ('very', 1530),
 ('all', 1324),
 ('good', 883),
 ('new', 670),
 ('old', 591),
 ('learn', 563),
 ('father', 510),
 ('big', 473),
 ('begin', 460),
 ('buy', 439),
 ('kind', 428),
 ('place', 421),
 ('little', 401),
 ('love', 387),
 ('important', 355),
 ('house', 352),
 ('away', 351),
 ('problem', 350),
 ('shop', 342),
 ('happy', 329),
 ('usually', 315),
 ('word', 273),
 ('popular', 263),
 ('enjoy', 259),
 ('job', 255),
 ('great', 243),
 ('answer', 241),
 ('woman', 237),
 ('bad', 236),
 ('part', 232),
 ('right', 227),
 ('decide', 214),
 ('fly', 209),
 ('beautiful', 205),
 ('idea', 205),
 ('street', 183),
 ('easy', 170),
 ('angry', 169),
 ('story', 169),
 ('earth', 155),
 ('hurt', 153),
 ('difficult', 151),
 ('cry', 147),
 ('child', 141),
 ('sure', 134),
 ('fast', 127),
 ('cold', 113),
 ('trip', 102),
 ('sad', 98),
 ('arrive', 96),
 ('strange', 94),
 ('wrong', 91),
 ('explain', 88),
 ('gift', 85),
 ('quite', 84),
 ('wonderful', 78),
 ('kill', 77),
 ('fresh', 74),
 ('dark', 7

In [392]:
hypernym_stats__true_statements.most_common()

[('fact', 5550),
 ('time', 1207),
 ('come', 1029),
 ('take', 975),
 ('help', 831),
 ('man', 751),
 ('ask', 673),
 ('tell', 670),
 ('mother', 657),
 ('play', 583),
 ('way', 553),
 ('water', 499),
 ('call', 491),
 ('try', 488),
 ('read', 459),
 ('eat', 453),
 ('live', 436),
 ('talk', 409),
 ('family', 389),
 ('walk', 370),
 ('put', 354),
 ('boy', 350),
 ('stay', 304),
 ('show', 303),
 ('grow', 291),
 ('work', 275),
 ('girl', 240),
 ('turn', 237),
 ('fall', 233),
 ('teach', 219),
 ('room', 219),
 ('seem', 212),
 ('stand', 207),
 ('friend', 193),
 ('thing', 192),
 ('one', 185),
 ('book', 183),
 ('visit', 183),
 ('class', 183),
 ('air', 178),
 ('computer', 175),
 ('believe', 171),
 ('paper', 171),
 ('sleep', 162),
 ('birthday', 160),
 ('number', 148),
 ('choose', 144),
 ('bring', 137),
 ('party', 137),
 ('exercise', 136),
 ('team', 136),
 ('dance', 136),
 ('finish', 135),
 ('send', 127),
 ('today', 126),
 ('lunch', 125),
 ('baby', 121),
 ('language', 121),
 ('care', 120),
 ('pick', 120),
 (

In [393]:
sum(
    alternatives_df.loc[:, "was_true_statement_paraphrased"]
) / len(alternatives_df)

0.936298913334404

In [394]:
output_df = alternatives_df[
    [
        "text_no",
        "true_statement",
        "paraphrased_true_statement",
        "alternative_statement",
        "paraphrased_alternative_statement",
        "was_true_statement_paraphrased",
        "was_alternative_statement_paraphrased"
    ]
]

In [395]:
output_df.iloc[0]["true_statement"]

"Drink the water that has not been boiled because many people think boiled water is safe and good to people's health."

In [396]:
output_df.iloc[0]["paraphrased_true_statement"]

"Drink the liquid that has not been boiled because many people think boiled liquid is safe and satisfactory to people's health."

In [397]:
output_df.to_excel(
    os.path.join(
        STATEMENTS_DIR, 
        f"paraphrased_{RACE_PART.replace('/', '-')}_{random.randint(0, 2**32):x}.xlsx"
    ),
    index=False
)

### Word2Vec (Not Used)

In [15]:
class Word2VecModel:
    W2V_MODEL_DIR = "../w2v_models"
    
    def __init__(self, model_name):
        self.model_name = model_name
        self.model = gensim.models.KeyedVectors.load_word2vec_format(
            os.path.join(Word2VecModel.W2V_MODEL_DIR, f"{model_name}.txt")
        )

In [16]:
w2v_nolg_100d = Word2VecModel("enwiki_20180420_nolg_100d")

In [41]:
w2v_nolg_100d.model.most_similar("way")

[('ENTITY/Stour_and_Orwell_Walk', 0.732032835483551),
 ('path', 0.7227527499198914),
 ('führerweg', 0.7152807712554932),
 ('hopefully', 0.7044017314910889),
 ('ENTITY/WP:EL', 0.7037861347198486),
 ('ENTITY/WP:DR', 0.7026253938674927),
 ('jitler', 0.7024155855178833),
 ('cenzar', 0.7017824649810791),
 ('torpel', 0.7011029720306396),
 ('ENTITY/WP:Wikipedia', 0.7005131244659424)]

In [42]:
w2v_nolg_100d.model.most_similar("people")

[('persons', 0.7992599010467529),
 ('residents', 0.7535831928253174),
 ('ENTITY/People', 0.7503395080566406),
 ('helsinkians', 0.7265914678573608),
 ('citizens', 0.7153019905090332),
 ('natives', 0.6979749798774719),
 ('youths', 0.6972818374633789),
 ('inhabitants', 0.694731593132019),
 ('tangbotens', 0.6896241903305054),
 ('americans', 0.6892567873001099)]

In [43]:
w2v_nolg_100d.model.most_similar("chocolate")

[('ENTITY/Chocolate', 0.9106836318969727),
 ('caramel', 0.8976747989654541),
 ('ENTITY/Caramel', 0.8792448043823242),
 ('ENTITY/White_chocolate', 0.8668451309204102),
 ('ENTITY/Peanut_butter', 0.8533239364624023),
 ('ENTITY/Types_of_chocolate', 0.8502905368804932),
 ('chocolates', 0.8490653038024902),
 ('ENTITY/Types_of_chocolate#Milk_chocolate', 0.8471919298171997),
 ('candy', 0.844744086265564),
 ('cream', 0.842796266078949)]

In [46]:
w2v_nolg_100d.model.most_similar("cancel", topn=20)

[('canceling', 0.8277071714401245),
 ('cancelling', 0.8080153465270996),
 ('postpone', 0.7998612523078918),
 ('reschedule', 0.7844353914260864),
 ('suspend', 0.7557451725006104),
 ('rescind', 0.7517332434654236),
 ('cancellation', 0.7479344010353088),
 ('cancels', 0.7384265661239624),
 ('announce', 0.7238617539405823),
 ('reconsider', 0.7106267213821411),
 ('renew', 0.7019257545471191),
 ('discontinue', 0.6998554468154907),
 ('postponing', 0.6976670026779175),
 ('postponement', 0.6904169917106628),
 ('ENTITY/Postpone_to_a_certain_time', 0.6869893670082092),
 ('announcing', 0.679280161857605),
 ('reinstate', 0.6781035661697388),
 ('revoke', 0.6780909299850464),
 ('rescheduling', 0.6767849922180176),
 ('renegotiate', 0.6732800602912903)]