In [1]:
from sqlite_utils import fetch_all
import nltk
from nltk.tokenize import word_tokenize
from nltk import WordNetLemmatizer
from nltk import wordnet as wn
from collections import Counter, defaultdict

In [2]:
# Load the two speakers' texts; fetch_all() must return exactly two values to unpack here.
# NOTE(review): assumes the values come back in (machiavelli, montesquieu) order — confirm in sqlite_utils.
machiavelli, montesquieu = fetch_all()

In [3]:
# Number of entries loaded for Machiavelli.
len(machiavelli)

434

In [4]:
# Preview the first entry in lower case, the form in which entries are tokenized.
machiavelli[0].lower()

'on the borders of this desert clime, one has told me, i will encounter the shadow of the great montesquieu. is this him who is before me?'

#### Processing:

1. Tokenize each sentence in lower-cased text
2. Lemmatize each word in each sentence (results were not satisfying, so...)
3. Simplify lemmas by finding their most common synsets

In [5]:
# Step 1: lower-case every entry and split it into word and punctuation tokens.
machia_tokens = [
    word_tokenize(entry.lower())
    for entry in machiavelli
]
# Peek at the first eight tokens of the first entry.
machia_tokens[0][:8]

['on', 'the', 'borders', 'of', 'this', 'desert', 'clime', ',']

In [6]:
# Step 2: run every token through the WordNet lemmatizer, entry by entry.
# NOTE(review): lemmatize() is called without a part-of-speech argument. The joined
# output of machia_lemmas[0] shows "has" -> "ha" and "told" left unchanged, so verbs
# are not handled well at this stage — consider passing POS tags.
wnl = WordNetLemmatizer()
machia_lemmas = [list(map(wnl.lemmatize, tokens)) for tokens in machia_tokens]
# Peek at the first eight lemmas of the third entry.
machia_lemmas[2][:8]

['is', 'this', 'the', 'philosopher', 'or', 'the', 'statesman', 'who']

In [7]:
def synset_lemmatizer(word: str, max_unchanged_len: int = 3) -> str:
    """
    Simplify a word by replacing it with the most common name among its WordNet synsets.

    Args:
        word: The token to simplify.
        max_unchanged_len: Words with at most this many characters are returned
            unmodified. Defaults to 3.

    Returns:
        The selected synset name, or ``word`` itself when it is short or when
        WordNet returns no synsets for it.

    Selection rule:
        Every synset name is cut at its first "." and the resulting names are
        counted. The most frequent name wins. If the two most frequent names
        have the same count, the shorter of the two is returned (the
        first-ranked one when both have the same length).

    Examples (as recorded by the original author):
        in: "hello", out: "hello"
        in: "distanced", out: "distance"
        in: "spaces", out: "space"
        in: "told", out: "tell"
    It's not perfect:
        in: "comprehend", out: "grok"
    """
    # don't modify small words
    if len(word) <= max_unchanged_len:
        return word

    # get synsets and keep only the part of each name before the first "."
    names = [synset.name().split(".")[0] for synset in wn.wordnet.synsets(word)]

    # if there are no synsets, return the word as it is
    # (explicit check instead of catching IndexError, so unrelated errors are not hidden)
    if not names:
        return word

    # at most two ("name", count) tuples, most frequent first
    ranked = Counter(names).most_common(2)
    best_name, best_count = ranked[0]

    # on a frequency tie between the top two, prefer the strictly shorter name
    if len(ranked) == 2:
        other_name, other_count = ranked[1]
        if other_count == best_count and len(other_name) < len(best_name):
            return other_name

    return best_name

In [8]:
# Step 3: pass every lemma through the synset-based simplifier, entry by entry.
machia_lemmas2 = [list(map(synset_lemmatizer, lemmas)) for lemmas in machia_lemmas]
# Peek at the first eight simplified words of the sixth entry.
machia_lemmas2[5][:8]

['this', 'is', 'what', 'deceive', 'you', ',', 'montesquieu', ',']

In [9]:
# First entry after plain WordNet lemmatization, re-joined with spaces for reading.
" ".join(machia_lemmas[0])

'on the border of this desert clime , one ha told me , i will encounter the shadow of the great montesquieu . is this him who is before me ?'

In [10]:
# First entry after synset simplification, re-joined with spaces for reading.
" ".join(machia_lemmas2[0])

'on the border of this desert climate , one ha tell me , i will meet the shadow of the great montesquieu . is this him who is ahead me ?'

In [11]:
# The unprocessed first entry, for reference.
machiavelli[0]

'On the borders of this desert clime, one has told me, I will encounter the shadow of the great Montesquieu. Is this him who is before me?'

Later in the project, we might need to revert this process by de-simplifying the words. Although more methods will be explored, a word mapping that keeps track of every change made so far will probably be useful.

However, as seen below, two different words can end up with the same simplified form (and one would overwrite the other if we used a plain dictionary). Instead, we can use a `collections.defaultdict(list)`, which solves the issue without much additional strain.

In [12]:
# Naive reverse mapping (simplified word -> original token) built as a plain dict:
# when several tokens share a simplified form, only the last one seen is kept.
word_mappings_ = dict(
    zip(
        (w for sent in machia_lemmas2 for w in sent),  # keys: simplified words
        (w for sent in machia_tokens for w in sent),   # values: original tokens
    )
)

# Does the dict have as many keys as there are distinct original tokens?
len(word_mappings_) == len({w for sent in machia_tokens for w in sent})

False

In [13]:
# Reverse mapping that can hold several values per key: missing keys start as an empty list.
word_mappings = defaultdict(list)

In [14]:
# Sanity check: display the mapping before anything is recorded in it.
word_mappings

defaultdict(list, {})

In [15]:
# Walk original tokens and simplified words in parallel, recording each
# original token once under its simplified form.
for old, new in zip(
    (w for sent in machia_tokens for w in sent),
    (w for sent in machia_lemmas2 for w in sent),
):
    # skip originals already recorded for this simplified word
    if old in word_mappings[new]:
        continue
    word_mappings[new].append(old)

In [16]:
# Inspect the filled mapping: simplified word -> list of original tokens.
word_mappings

defaultdict(list,
            {'on': ['on'],
             'the': ['the'],
             'border': ['borders', 'border'],
             'of': ['of'],
             'this': ['this'],
             'desert': ['desert'],
             'climate': ['clime'],
             ',': [','],
             'one': ['one', 'ones', 'unity'],
             'ha': ['has'],
             'tell': ['told', 'tell', 'telling', 'tells'],
             'me': ['me'],
             'i': ['i'],
             'will': ['will'],
             'meet': ['encounter',
              'meet',
              'meetings',
              'encounters',
              'meeting'],
             'shadow': ['shadow', 'shadows'],
             'great': ['great', 'greatest', 'greater'],
             'montesquieu': ['montesquieu'],
             '.': ['.'],
             'is': ['is'],
             'him': ['him'],
             'who': ['who'],
             'ahead': ['before', 'ahead'],
             '?': ['?'],
             'among': ['among'],
             'ce