In [1]:
from nltk.corpus import gutenberg

# Corpus whose sentences are read (via `.sents()`) by the preprocessing step.
using_corpus = gutenberg

In [2]:
from nltk.corpus import wordnet as wn
import re

# Matches one or more ASCII letters. It is applied with `.match`, which anchors
# at the start of the string, so a token passes when it *begins* with a letter.
word_regex = re.compile('[a-zA-Z]+')

In [3]:
import nltk
from tqdm.autonotebook import tqdm

# 1. Preprocessing dataset
# =====
# Walk the POS-tagged sentences and fill four {key => frequency} tables:
#     adverbs:          adverbs seen directly before a predicate (adverb + predicate)
#     predicates:       predicates seen directly after an adverb (adverb + predicate)
#     adv_any_bigrams:  (adverb, next word) pairs, whatever the next word is
#     adv_pred_bigrams: (adverb, predicate) pairs only
# An "adverb" is a token tagged RB/RBR/RBS; a "predicate" is a token tagged as a
# verb (VB*) or an adjective (JJ*).

adverbs = {}
predicates = {}
adv_any_bigrams = {}
adv_pred_bigrams = {}

# Fraction of the corpus sentences to process (1 = all of them)
sentences_portion = 1
sentences = using_corpus.sents()
sentences = sentences[:int(len(sentences) * sentences_portion)]

# Lower-case every token and drop the ones word_regex does not match
sentences = [
    [token.lower() for token in raw_sentence if word_regex.match(token)]
    for raw_sentence in sentences
]

for sent in tqdm(sentences):
    tagged = nltk.pos_tag(sent)

    # Pair each token with its successor; the final token has none and is skipped
    for (word, pos), (next_word, next_pos) in zip(tagged, tagged[1:]):
        if pos not in ('RB', 'RBR', 'RBS'):
            continue

        bigram = (word, next_word)
        adv_any_bigrams[bigram] = adv_any_bigrams.get(bigram, 0) + 1

        # Only adverb + (verb | adjective) pairs feed the remaining three tables
        if next_pos in ('VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'JJ', 'JJR', 'JJS'):
            adverbs[word] = adverbs.get(word, 0) + 1
            predicates[next_word] = predicates.get(next_word, 0) + 1
            adv_pred_bigrams[bigram] = adv_pred_bigrams.get(bigram, 0) + 1

  


HBox(children=(FloatProgress(value=0.0, max=98552.0), HTML(value='')))




In [4]:
import numpy as np

adv_bi_sim_memoization = {}
def adverb_bigram_similarity(adv1, adv2):
    """Cosine similarity between the next-word profiles of two adverbs.

    Each adverb is described by a vector with one component per distinct word
    that follows either adverb in the global ``adv_any_bigrams`` table. A
    component is 1 when the (adverb, word) bigram is present and 0 otherwise;
    the bigram's stored frequency is not used as a weight.

    Results are cached in ``adv_bi_sim_memoization`` under both argument
    orders, because the measure is symmetric.

    Args:
        adv1: First adverb.
        adv2: Second adverb.

    Returns:
        1 when both arguments are equal. Otherwise the cosine similarity of
        the two vectors, or 0.0 when either adverb has no bigram in
        ``adv_any_bigrams`` (the cosine is undefined there and the plain
        formula would yield NaN).
    """
    if adv1 == adv2:
        return 1

    key = (adv1, adv2)
    if key in adv_bi_sim_memoization:
        return adv_bi_sim_memoization[key]

    # Distinct words following each adverb
    adv1_next_words = set()
    adv2_next_words = set()
    for adv, next_word in adv_any_bigrams:
        if adv == adv1:
            adv1_next_words.add(next_word)
        elif adv == adv2:
            adv2_next_words.add(next_word)

    if not adv1_next_words or not adv2_next_words:
        # A zero vector has no direction: report "not similar" instead of 0/0 = NaN,
        # which would silently pass any `similarity < threshold` rejection test.
        similarity = 0.0
    else:
        # Shared, sorted vocabulary keeps the two vectors aligned component-wise
        vocabulary = sorted(adv1_next_words | adv2_next_words)
        adv1_value_vec = np.asarray([
            1 if word in adv1_next_words else 0 for word in vocabulary
        ])
        adv2_value_vec = np.asarray([
            1 if word in adv2_next_words else 0 for word in vocabulary
        ])

        similarity = np.dot(adv1_value_vec, adv2_value_vec) / \
            (np.linalg.norm(adv1_value_vec) * np.linalg.norm(adv2_value_vec))

    adv_bi_sim_memoization[key] = similarity
    adv_bi_sim_memoization[(adv2, adv1)] = similarity

    return similarity


In [26]:
from nltk.sentiment import vader

# 2. Finding intensifier
# =====
# An adverb is kept as an intensifier when it
#   * is not a negation word (not, never, ...) and
#   * is not in the hand-written exclusion list (only, always, now, as, ...) and
#   * has per-predicate frequencies whose dispersion exceeds variance_threshold and
#   * has a next-word profile similar enough to that of 'very'.
#
# The last criterion did not work well in practice, so its threshold is set low.

bigrams = {}
intensifiers = {}
not_intensifiers = ['that', 'back', 'only', 'always', 'now', 'as', 'also']

variance_threshold = 10.0
similarity_threshold = 0.05
adverb_variance = {}
adverb_mean = {}
adverb_count = {}

# Newer NLTK releases keep the VADER word lists on VaderConstants instead of
# at module level; accept either layout.
negate_words = getattr(vader, 'NEGATE', None)
if negate_words is None:
    negate_words = vader.VaderConstants.NEGATE

# Number of distinct predicates each adverb precedes
for adverb, _predicate in adv_pred_bigrams:
    adverb_count[adverb] = adverb_count.get(adverb, 0) + 1

# Mean frequency of an adverb's (adverb, predicate) bigrams
for adverb, predicate_count in adverb_count.items():
    adverb_mean[adverb] = adverbs[adverb] / predicate_count

# Sum of squared deviations from that mean. It is deliberately NOT divided by
# the predicate count: variance_threshold is expressed on this un-normalised scale.
for (adverb, _predicate), frequency in adv_pred_bigrams.items():
    deviation = frequency - adverb_mean[adverb]
    adverb_variance[adverb] = adverb_variance.get(adverb, 0) + deviation ** 2

# Pass 1: decide which adverbs are intensifiers (cheap filters first, so the
# similarity is only computed for adverbs that survive them).
for adverb, variance in adverb_variance.items():
    if adverb in negate_words or adverb in not_intensifiers:
        continue

    if variance <= variance_threshold:
        continue

    similarity = adverb_bigram_similarity('very', adverb)
    # Written as `not (a >= b)` so that an undefined (NaN) similarity is rejected too
    if not (similarity >= similarity_threshold):
        continue

    intensifiers[adverb] = 0

# Pass 2: total frequency per intensifier and the bigrams it takes part in.
# (Previously this reused `frequency`/`bigram` left over from an earlier loop,
# so every intensifier received the same stale value.)
for bigram, frequency in adv_pred_bigrams.items():
    adverb = bigram[0]
    if adverb in intensifiers:
        intensifiers[adverb] += frequency
        bigrams[bigram] = frequency

print(len(intensifiers), list(intensifiers.keys())[:50])

45 ['very', 'most', 'highly', 'too', 'however', 'so', 'well', 'almost', 'really', 'yet', 'ever', 'particularly', 'more', 'quite', 'certainly', 'therefore', 'perfectly', 'even', 'rather', 'then', 'truly', 'often', 'still', 'there', 'just', 'away', 'totally', 'pretty', 'better', 'altogether', 'remarkably', 'less', 'here', 'again', 'exceedingly', 'down', 'thus', 'once', 'equally', 'extremely', 'together', 'somewhat', 'o', 'ere', 'jolly']


In [27]:
def find_similar(word):
    """Collect the WordNet lemma names of every non-noun synset of ``word``.

    Args:
        word: The word to look up with ``wn.synsets``.

    Returns:
        An empty list when ``word`` has no synsets at all; otherwise a set of
        lemma names gathered from the synsets whose ``pos()`` is not ``'n'``
        (the set may be empty if every synset is a noun).
    """
    found_synsets = wn.synsets(word)
    if not found_synsets:
        return []

    return {
        lemma
        for candidate in found_synsets
        if candidate.pos() != 'n'
        for lemma in candidate.lemma_names()
    }

In [28]:
# 3. Finding similar words for predicates
# =====
# For every predicate, look up its WordNet lemma names (via find_similar) and
# keep the ones that are themselves known predicates, excluding the word itself.
# Predicates without any such match get no entry.

predicate_similar = {}

for predicate in predicates:
    matches = [
        candidate for candidate in find_similar(predicate)
        if candidate != predicate and candidate in predicates
    ]

    if matches:
        predicate_similar[predicate] = matches

In [29]:
from nltk.tokenize import word_tokenize

results = []

# 4. Finding triples
# =====
# The triples (predicate, intensifier, similar_word) are found from WordNet
# definitions:
#
# 1) For each predicate, take its similar words
# 2) For each similar word, fetch its synsets from WordNet
# 3) Tokenize each synset definition; ignore definitions that do not mention
#    the predicate
# 4) Take the first intensifier appearing in each remaining definition
# 5) Among those candidates, keep the one whose next-word profile is most
#    similar to that of 'very' and add the triple to the result list

for predicate in tqdm(predicates):
    if predicate not in predicate_similar:
        continue

    for similar_word in predicate_similar[predicate]:
        containing_intensifiers = []

        for synset in wn.synsets(similar_word):
            tokens = word_tokenize(synset.definition())

            if predicate not in tokens:
                continue

            # Only the first intensifier of a definition is considered
            first_intensifier = next(
                (token for token in tokens if token in intensifiers),
                None
            )
            if first_intensifier is not None:
                containing_intensifiers.append(first_intensifier)

        # (similarity, intensifier) of the best candidate so far; a candidate
        # needs a similarity strictly above 0 to be selected
        highest_intensifier = (0, None)
        for intensifier in containing_intensifiers:
            very_similarity = adverb_bigram_similarity('very', intensifier)
            if very_similarity > highest_intensifier[0]:
                highest_intensifier = (very_similarity, intensifier)

        if highest_intensifier[1] is None:
            continue

        # Record the selected best candidate, not the loop variable, which
        # would merely be the last candidate examined.
        results.append(
            (predicate, highest_intensifier[1], similar_word)
        )

HBox(children=(FloatProgress(value=0.0, max=7618.0), HTML(value='')))




In [31]:
# 5. Saving the results as CSV
# =====
# Save the found triples in a csv file

import csv

# Persist the first 50 triples, one CSV row per (predicate, intensifier, similar_word)
with open('results.csv', 'w', encoding='utf-8', newline='') as csvfile:
    csv.writer(csvfile).writerows(results[:50])

import random
# Preview 50 triples in random order: sorting by a random key shuffles a copy
# of the list and leaves `results` itself untouched.
print(sorted(results, key = lambda x: random.random())[:50])

[('pass', 'down', 'return'), ('progress', 'better', 'advance'), ('awake', 'there', 'wake'), ('poor', 'very', 'miserable'), ('spring', 'away', 'bound'), ('close', 'so', 'tight'), ('restore', 'together', 'repair'), ('require', 'just', 'ask'), ('humble', 'even', 'modest'), ('whole', 'together', 'solid'), ('drop', 'less', 'fell'), ('regarded', 'well', 'regard'), ('esteemed', 'well', 'respect'), ('sink', 'down', 'subside'), ('right', 'most', 'good'), ('down', 'down', 'depressed'), ('go', 'away', 'depart'), ('pressing', 'more', 'compress'), ('running', 'then', 'draw'), ('maintain', 'often', 'hold'), ('calm', 'still', 'tranquillize'), ('force', 'away', 'push'), ('sink', 'down', 'settle'), ('restore', 'together', 'fix'), ('improbable', 'too', 'marvellous'), ('exhausted', 'extremely', 'fatigued'), ('work', 'rather', 'play'), ('expects', 'so', 'bear'), ('require', 'just', 'demand'), ('ascended', 'better', 'rise'), ('soften', 'less', 'moderate'), ('turning', 'away', 'turn'), ('disappear', 'less',

In [11]:
# Appendix 1. Old approach
# =====
# Find the intensifier that most often comes together with a given similar word of the predicate

"""
words = set([ word for word in using_corpus.words() if word_regex.match(word) ])
print(len(words))

int_freq_by_pred = {}

for bigram, frequency in bigrams.items():
    intensifier, predicate = bigram
    if predicate not in int_freq_by_pred:
        int_freq_by_pred[predicate] = {}
    
    int_freq_by_pred[predicate][intensifier] = frequency

for predicate, int_freq in int_freq_by_pred.items():
    int_freq_by_pred[predicate] = sorted(
        int_freq.items(),
        key = lambda x: x[1]
    )

frequency_threshold = 20
result = []
for pred, similar_preds in predicate_similar.items():
    for similar_pred in similar_preds:
        if similar_pred not in int_freq_by_pred:
            continue
        
        if frequency_threshold < int_freq_by_pred[similar_pred][0][1]:
            continue
        
        result.append(
            (pred, int_freq_by_pred[similar_pred][0][0], similar_pred)
        )

print(result[:50])
len(result)
"""

'\nint_freq_by_pred = {}\n\nfor bigram, frequency in bigrams.items():\n    intensifier, predicate = bigram\n    if predicate not in int_freq_by_pred:\n        int_freq_by_pred[predicate] = {}\n    \n    int_freq_by_pred[predicate][intensifier] = frequency\n\nfor predicate, int_freq in int_freq_by_pred.items():\n    int_freq_by_pred[predicate] = sorted(\n        int_freq.items(),\n        key = lambda x: x[1]\n    )\n\nfrequency_threshold = 20\nresult = []\nfor pred, similar_preds in predicate_similar.items():\n    for similar_pred in similar_preds:\n        if similar_pred not in int_freq_by_pred:\n            continue\n        \n        if frequency_threshold < int_freq_by_pred[similar_pred][0][1]:\n            continue\n        \n        result.append(\n            (pred, int_freq_by_pred[similar_pred][0][0], similar_pred)\n        )\n\nprint(result[:50])\nlen(result)\n'

In [12]:
# Appendix 2. Approach using word2vec
# ====
# This approach uses word2vec embeddings.
# Words whose vectors are highly similar to that of "very" are treated as intensifiers.
# Then find the predicates that fulfil this condition:
#   * (its similar word's vector + the intensifier's vector) is similar to the original predicate's vector

"""
from nltk.corpus import gutenberg

using_corpus = gutenberg

from nltk.corpus import wordnet as wn
import re

word_regex = re.compile('[a-zA-Z]+')

def find_similar(word):
    synsets = wn.synsets(word)
    if len(synsets) < 1:
        return []

    synonyms = set()
    for synset in synsets:
        if synset.pos() == 'n':
            continue
            
        synonyms.update(synset.lemma_names())

    return synonyms

import nltk
from tqdm.notebook import tqdm

adverbs = {}
predicates = {}
adv_pred_bigrams = {}

sentences_portion = .3
sentences = using_corpus.sents()
sentences = sentences[:int(len(sentences) * sentences_portion)]
sentences = [
    [ word.lower() for word in words if word_regex.match(word) ] for words in sentences
]

for sent in tqdm(sentences):
    tokens = nltk.pos_tag(sent)
    for index, (word, pos) in enumerate(tokens):
        if pos not in ('RB', 'RBR', 'RBS'):
            continue
        
        if len(tokens) <= index + 1:
            continue
        
        next_word, next_pos = tokens[index + 1]
        
        if next_pos not in ('VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'JJ', 'JJR', 'JJS'):
            continue
        
        if word not in adverbs:
            adverbs[word] = 0
        
        adverbs[word] += 1
        
        if next_word not in predicates:
            predicates[next_word] = 0
        
        predicates[next_word] += 1
        
        bigram = (word, next_word)
        if bigram not in adv_pred_bigrams:
            adv_pred_bigrams[bigram] = 0
        
        adv_pred_bigrams[bigram] += 1

from gensim.models import word2vec

model = word2vec.Word2Vec(
    sentences, 
    workers=4,
    size=50,
    min_count=10,
    window=10,
    sample=1e-3
)

bigrams = {}
intensifiers = {}

distance_threshold = 0.3
general_frequency_threshold = 20.0

### Filtering by very-similarity ###
for bigram, frequency in adv_pred_bigrams.items():
    adverb, predicate = bigram
    
    if adverb not in model.wv:
        continue
    
    if adverbs[adverb] / frequency <= general_frequency_threshold:
        continue
    
    if model.wv.distance(adverb, 'very') > distance_threshold:
        continue
        
    if adverb not in intensifiers:
        intensifiers[adverb] = 0
        
    intensifiers[adverb] += frequency
    bigrams[bigram] = frequency

print(len(intensifiers), list(intensifiers.keys())[:50])

predicate_similar = {}

for predicate in predicates:
    similars = find_similar(predicate)
    similar_predicates = [
        similar for similar in similars
        if similar in predicates and similar != predicate
    ]
    
    if len(similar_predicates) > 0:
        predicate_similar[predicate] = similar_predicates

print(predicate_similar)

int_freq_by_pred = {}

for bigram, frequency in bigrams.items():
    intensifier, predicate = bigram
    if predicate not in int_freq_by_pred:
        int_freq_by_pred[predicate] = {}
    
    int_freq_by_pred[predicate][intensifier] = frequency

for predicate, int_freq in int_freq_by_pred.items():
    int_freq_by_pred[predicate] = sorted(
        int_freq.items(),
        key = lambda x: x[1]
    )


from numpy import dot
from numpy.linalg import norm

pred_similarity_threshold = 0.7
frequency_threshold = 20
result = []

for pred, similar_preds in predicate_similar.items():
    if pred not in model.wv:
        continue
    
    original_vector = model.wv[pred]
    
    for similar_pred in similar_preds:
        if similar_pred not in int_freq_by_pred:
            continue
        
        if similar_pred not in model.wv:
            continue
        
        for intensifier, frequency in int_freq_by_pred[similar_pred]:
            if frequency_threshold < frequency:
                continue
            
            combined_word = model.wv[similar_pred] + model.wv[intensifier]
            cosine_similarity = dot(original_vector, combined_word) / \
                (norm(original_vector) * norm(combined_word))
            
            if cosine_similarity < pred_similarity_threshold:
                continue
        
            result.append(
                (pred, '%s %s' % (intensifier, similar_pred))
            )

print(result[:100])
len(result)
"""

"\nfrom gensim.models import word2vec\n\nmodel = word2vec.Word2Vec(\n    sentences, \n    workers=4,\n    size=50,\n    min_count=10,\n    window=10,\n    sample=1e-3\n)\n\nbigrams = {}\nintensifiers = {}\n\ndistance_threshold = 0.3\ngeneral_frequency_threshold = 20.0\n\n### Filtering by very-similarity ###\nfor bigram, frequency in adv_pred_bigrams.items():\n    adverb, predicate = bigram\n    \n    if adverb not in model.wv:\n        continue\n    \n    if adverbs[adverb] / frequency <= general_frequency_threshold:\n        continue\n    \n    if model.wv.distance(adverb, 'very') > distance_threshold:\n        continue\n        \n    if adverb not in intensifiers:\n        intensifiers[adverb] = 0\n        \n    intensifiers[adverb] += frequency\n    bigrams[bigram] = frequency\n\nprint(len(intensifiers), list(intensifiers.keys())[:50])\n\npredicate_similar = {}\n\nfor predicate in predicates:\n    similars = find_similar(predicate)\n    similar_predicates = [\n        similar for si