In [18]:
from nltk.corpus import brown, sentiwordnet as swn
from nltk.corpus.reader.tagged import TaggedCorpusReader
from tqdm.autonotebook import tqdm

import csv
import nltk

In [49]:
# Preprocess. Preprocessing corpus
# =====
# * Get some portion of sentences from corpus (default: all sentences)
# * Filter words which are not alphabetio
# * Make words to the lower case
# * Get POS tags of sentences if corpus is not already tagged

print("==== Preprocess: Preprocessing corpus ====")

using_corpus = brown
sentences_portion = 1

corpus_tagged = isinstance(using_corpus, TaggedCorpusReader)

sentences = using_corpus.tagged_sents() if corpus_tagged else using_corpus.sents()
sentences = sentences[:int(len(sentences) * sentences_portion)]

if corpus_tagged:
    sentences = [
        [
            (word.lower(), tag) for word, tag in words if word.isalpha()
        ] for words in tqdm(sentences)
    ]

else:
    sentences = [
        [
            (word.lower(), tag) for word, tag in nltk.pos_tag(words) if word.isalpha()
        ] for words in tqdm(sentences)
    ]

==== Preprocess: Preprocessing corpus ====


HBox(children=(FloatProgress(value=0.0, max=57340.0), HTML(value='')))




In [50]:
# 0. Extracting Lexical Items
# =====
# * Find adverb + predicate pairs from corpus

print("==== Stage 0: Extracting Lexical Items ====")

adverbs = {}
adverbs_all = {}
predicates = {}
predicates_all = {}
bigrams = {}

for tokens in tqdm(sentences):
    for index, (word, pos) in enumerate(tokens):
        # Count adjectives
        if pos in ('JJ', 'JJR', 'JJS'):
            if word not in predicates_all:
                predicates_all[word] = 0
            
            predicates_all[word] += 1
            continue
        
        if pos != 'RB':
            continue
        
        if len(tokens) <= index + 1:
            continue
        
        # Count adverbs
        if word not in adverbs_all:
            adverbs_all[word] = 0
        
        adverbs_all[word] += 1
        
        next_word, next_pos = tokens[index + 1]
        
        # Not using ('RB', 'RBR', 'RBS') as it contains some collocations like nearly as, ...
        if next_pos not in ('JJ', 'JJR', 'JJS'):
            continue
        
        # Count adverbs with adjectives
        if word not in adverbs:
            adverbs[word] = 0
        
        adverbs[word] += 1
        
        # Count adjectives with adverb
        if next_word not in predicates:
            predicates[next_word] = 0
        
        predicates[next_word] += 1
        
        # Count (adverb, adjective) bigrams
        bigram = (word, next_word)
        if bigram not in bigrams:
            bigrams[bigram] = 0
        
        bigrams[bigram] += 1

print("Adverbs: %d" % len(adverbs))
print("Predicates: %d" % len(predicates))
print("Bigrams: %d" % len(bigrams))

==== Stage 0: Extracting Lexical Items ====


HBox(children=(FloatProgress(value=0.0, max=57340.0), HTML(value='')))


Adverbs: 436
Predicates: 858
Bigrams: 1495


In [43]:
# 1. Filtering rare adverbs
# =====
# * Filter some adverbs which doesn't appear too much
#   as it is hard to determine how restrictive it is

print("==== Stage 1: Filter rare adverbs ====")

appear_threshold = 2 # int(len(sentences) * 0.0003)

# Filtering rare adverbs
rare_adverbs = {}
def filter_adverbs(adverb, appear_count):
    if appear_count < appear_threshold:
        rare_adverbs[adverb] = appear_count
        return False
    
    return True

adverbs = {
    adverb: appear_count
    for adverb, appear_count in adverbs.items()
    if filter_adverbs(adverb, appear_count)
}

# Filtering bigrams & predicates by rare adverbs
def filter_bigrams(bigram, appear_count):
    adverb, predicate = bigram
    if adverb not in rare_adverbs:
        return True
    
    # Invalidate predicates
    predicates[predicate] -= appear_count
    return False

bigrams = {
    bigram: appear_count
    for bigram, appear_count in bigrams.items()
    if filter_bigrams(bigram, appear_count)
}

predicates = {
    predicate: appear_count
    for predicate, appear_count in predicates.items()
    if appear_count > 0
}

print("%d adverbs have been invalidated" % len(rare_adverbs))
print("%d adverbs left" % len(adverbs))

==== Stage 1: Filter rare adverbs ====
225 adverbs have been invalidated
211 adverbs left


In [44]:
# 2. Get restrictiveness of adverbs
# =====
# * Find how restrictive an adverb is by getting max frequency of a certain verb / adjective
#   coming with the adverb among the whole count of verb / adjective
#
# (Restrictiveness is higher if the adverb comes mostly with a certain verb / adjective)


print("==== Stage 2: Get restrictiveness of adverbs ====")

adverb_max_collocation = {}
for (adverb, predicate), appear_count in bigrams.items():
    appear_percentage = appear_count / (adverbs_all[adverb]) # (adverbs[adverb])
    
    if (adverb not in adverb_max_collocation) or \
        (adverb_max_collocation[adverb][1] < appear_percentage):
            
        adverb_max_collocation[adverb] = (predicate, appear_percentage)

print(sorted(
    adverb_max_collocation.items(),
    key=lambda x: x[1][1],
    reverse = True
)[:10])

==== Stage 2: Get restrictiveness of adverbs ====
[('pitifully', ('small', 1.0)), ('childishly', ('merry', 1.0)), ('psychically', ('blind', 1.0)), ('inversely', ('proportional', 0.6)), ('extraordinarily', ('patient', 0.5)), ('demonstrably', ('incapable', 0.5)), ('ethically', ('acceptable', 0.5)), ('healthily', ('individual', 0.5)), ('delightfully', ('refreshing', 0.5)), ('grotesquely', ('unshaven', 0.5))]


In [45]:
# 3. Filtering objective adverbs
# =====
# * Filter objective adverbs, which min of obj_score of sentiwordnet is more than 0.9
#   as there are many non-intensifiers like adverbs of frequency(usually, sometimes, ...),
#   which usually have high obj_score

print("==== Stage 3: Filtering objective adverbs ====")

object_adverbs = []
passed_count = 0
for adverb, (predicate, appear_percentage) in adverb_max_collocation.items():
    senti_synsets = list(swn.senti_synsets(adverb))
    
    min_obj = None
    for senti_synset in senti_synsets:
        if min_obj is None or senti_synset.obj_score() < min_obj:
            min_obj = senti_synset.obj_score()
    
    if min_obj is None:
        passed_count += 1
        continue
    
    if min_obj > 0.9:
        object_adverbs.append(adverb)

print("%d adverbs have been filtered" % len(object_adverbs))
print("%d adverbs have been passed" % passed_count)
print(object_adverbs)

==== Stage 3: Filtering objective adverbs ====
74 adverbs have been filtered
1 adverbs have been passed
['almost', 'probably', 'suddenly', 'also', 'increasingly', 'literally', 'potentially', 'possibly', 'uniquely', 'pitifully', 'usually', 'sometimes', 'sufficiently', 'heretofore', 'mostly', 'there', 'frequently', 'financially', 'relatively', 'primarily', 'militarily', 'simultaneously', 'widely', 'ethically', 'partly', 'notably', 'perhaps', 'economically', 'partially', 'ever', 'deeply', 'once', 'merely', 'continuously', 'therefore', 'thus', 'purely', 'previously', 'indeed', 'so', 'hitherto', 'proportionately', 'progressively', 'formerly', 'especially', 'currently', 'supposedly', 'rapidly', 'personally', 'occasionally', 'culturally', 'again', 'recently', 'specially', 'socially', 'moreover', 'unusually', 'here', 'vaguely', 'politically', 'admittedly', 'ultimately', 'similarly', 'internationally', 'characteristically', 'largely', 'distinctly', 'eventually', 'normally', 'semantically', 'chi

In [46]:
# 4. Filtering the results
# =====
# * Filter object_adverb containing sets from collocations
# * Sort collocations by its restrictiveness

print("==== Stage 4: Filtering the results ====")

result = sorted(
    [
        (key, value, appear_percentage)
        for key, (value, appear_percentage) in adverb_max_collocation.items()
        if key not in object_adverbs
    ],
    key=lambda x: x[2],
    reverse = True
)[:100]


==== Stage 4: Filtering the results ====


In [47]:
# 5. Saving the results
# =====
# * Save the results from stage 4 as a csv and print it out

print("==== Stage 5: Saving the results ====")

with open('results.csv', 'w', encoding='utf-8', newline='') as csvfile:
    writer = csv.writer(csvfile)
    
    for index, (key, value, _) in enumerate(result):
        writer.writerow((key, value))
        print("%d %s %s" % (index, key, value))


==== Stage 5: Saving the results ====
0 inversely proportional
1 extraordinarily patient
2 demonstrably incapable
3 healthily individual
4 delightfully refreshing
5 grotesquely unshaven
6 academically talented
7 noticeably longer
8 startlingly bright
9 amazingly light
10 blissfully unaware
11 empirically successful
12 extremely conservative
13 professedly benevolent
14 inherently incapable
15 conspicuously absent
16 strikingly effective
17 commercially available
18 intrinsically evil
19 enormously long
20 cruelly distant
21 mutually exclusive
22 mentally ill
23 intellectually sterile
24 sexually selfish
25 artistically successful
26 immensely fertile
27 hopelessly inadequate
28 endlessly enchanting
29 warmly melodious
30 uniformly excellent
31 strangely disquieting
32 bitterly cold
33 perfectly conceivable
34 morally proper
35 positively indecent
36 curiously sleepy
37 oddly sibilant
38 upward mobile
39 technically original
40 remarkably elaborate
41 vividly real
42 stiffly motionless
