In [172]:
from nltk import ngrams, word_tokenize, pos_tag, FreqDist
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from itertools import zip_longest, filterfalse
from functools import reduce
from enum import Enum

In [173]:
class caps(Enum):
    """Capitalisation category assigned to a token by ``nlp.wordCase``."""
    NoCaps = 0    # token contains no uppercase letters
    FirstCap = 1  # exactly one uppercase letter, and it is the first character
    MixedCap = 2  # some uppercase letters, but neither of the two patterns above or below
    AllCap = 3    # uppercase-letter count equals the token length

class nlp:
    """Thin helper around NLTK for tokenizing, tagging, lemmatizing and
    extracting simple surface features (suffixes, n-grams, letter case).

    Class attributes:
        defaultSuffixes: suffix strings read from ``suffixes.txt`` (one per
            line, trailing whitespace removed, blank lines skipped).
        defaultSuffixTable: dict mapping every default suffix to ``0``.
        lemma: shared ``WordNetLemmatizer`` instance.
    """

    # Read the suffix list once, at class-definition time. The ``with`` block
    # guarantees the file handle is closed. Blank lines are skipped because an
    # empty suffix would match every token in ``longestSuffixAWordEndsWith``.
    with open("suffixes.txt") as _suffixFile:
        defaultSuffixes = [line.rstrip() for line in _suffixFile if line.strip()]
    del _suffixFile  # do not leave the closed handle behind as a class attribute

    defaultSuffixTable = dict.fromkeys(defaultSuffixes, 0)
    lemma = WordNetLemmatizer()

    def treebankToWordnetPOS(self, treebankPosTag):
        """Map a Penn Treebank tag to a WordNet POS constant.

        Only the first character of the tag is inspected. Any tag that does
        not start with J, V, N or R (including an empty tag) maps to
        ``wordnet.NOUN``.
        """
        firstLetterToWordnet = {
            'J': wordnet.ADJ,
            'V': wordnet.VERB,
            'N': wordnet.NOUN,
            'R': wordnet.ADV,
        }
        # Slicing instead of indexing so an empty tag falls back to NOUN
        # rather than raising IndexError.
        return firstLetterToWordnet.get(treebankPosTag[:1], wordnet.NOUN)

    def tokenize(self, text):
        """Tokenize ``text`` twice with ``word_tokenize``.

        Returns:
            A 2-tuple ``(lowercased_tokens, original_case_tokens)``.
        """
        return (word_tokenize(text.lower()), word_tokenize(text))

    def pos(self, tokens):
        """Return ``pos_tag(tokens)``: a list of ``(word, tag)`` pairs."""
        return pos_tag(tokens)

    def posTagOnly(self, tagged):
        """Return just the tags from an iterable of ``(word, tag)`` pairs."""
        return [tag for _word, tag in tagged]

    def processPosTagsAndLemmatize(self, word, pos):
        """Lemmatize ``word`` using the WordNet POS derived from treebank tag ``pos``."""
        return self.lemma.lemmatize(word, self.treebankToWordnetPOS(pos))

    def lemmatize(self, posTokens):
        """Lemmatize every ``(word, treebank_tag)`` pair in ``posTokens``.

        The pairs are taken from the ``posTokens`` argument itself, not from
        any notebook-level variable.
        """
        return [self.processPosTagsAndLemmatize(word, tag) for word, tag in posTokens]

    def ngram(self, tokens, n):
        """Return ``ngrams(tokens, n)`` (an iterator of n-tuples)."""
        return ngrams(tokens, n)

    def ngramFreq(self, grams):
        """Return a ``FreqDist`` counting the items of ``grams``."""
        return FreqDist(grams)

    def longestSuffixAWordEndsWith(self, tokens):
        """For each token ending with at least one default suffix, return the
        longest such suffix.

        Tokens that match no suffix are omitted, so the result may be shorter
        than ``tokens``.
        """
        longest = []
        for word in tokens:
            matches = [suffix for suffix in self.defaultSuffixes if word.endswith(suffix)]
            if matches:
                longest.append(max(matches, key=len))
        return longest

    def upperLowerLen(self, tokens):
        """Return ``(uppercase_count, lowercase_count, length, token)`` per token.

        Pass original-case tokens (``tokenize(text)[1]``); lowercased tokens
        yield an uppercase count of zero for every token.
        """
        return [
            (
                sum(1 for letter in token if letter.isupper()),
                sum(1 for letter in token if letter.islower()),
                len(token),
                token,
            )
            for token in tokens
        ]

    def capLetterFreq(self, ull):
        """Return total uppercase letters divided by total lowercase letters.

        Args:
            ull: iterable of tuples whose first two items are the uppercase
                and lowercase counts (as produced by ``upperLowerLen``).

        Returns:
            The ratio as a float; ``0.0`` when there are no letters at all and
            ``float('inf')`` when there are uppercase but no lowercase letters.
        """
        upperTotal = 0
        lowerTotal = 0
        # Single pass so one-shot iterables (generators, map objects) also work.
        for entry in ull:
            upperTotal += entry[0]
            lowerTotal += entry[1]
        if lowerTotal == 0:
            return float("inf") if upperTotal else 0.0
        return upperTotal / lowerTotal

    def wordCase(self, upper, lower, length, token):
        """Classify a token's capitalisation as a ``caps`` member.

        Checks are ordered: no uppercase -> ``NoCaps``; uppercase count equals
        length -> ``AllCap`` (so a single capital letter is ``AllCap``); one
        uppercase letter in first position -> ``FirstCap``; otherwise
        ``MixedCap``. ``lower`` is accepted for signature compatibility with
        ``upperLowerLen`` tuples but is not used.
        """
        if upper == 0:
            return caps.NoCaps
        if upper == length:
            return caps.AllCap
        if upper == 1 and token[0].isupper():
            return caps.FirstCap
        return caps.MixedCap

## Tokenize and Tag

In [174]:
# Sample passage analysed by the rest of the notebook.
# NOTE(review): several sentences are joined without a space after the full stop
# (e.g. "encabulator.Now"), and the lemmatizer output below shows fused tokens such
# as 'encabulator.now' — confirm whether that is intended.
text = """For A number of years now, work has been proceeding IN order to bring perfection to the crudely conceived idea of a transmission that would not only supply inverse reactive current for use in unilateral phase detractors, but would also be capable of automatically synchronizing cardinal grammeters. Such an instrument is the turbo encabulator.Now basically the only new principle involved is that instead of power being generated by the relative motion of conductors and fluxes, it is produced by the modial interaction of magneto-reluctance and capacitive diractance.The original machine had a base plate of pre-famulated amulite surmounted by a malleable logarithmic casing in such a way that the two spurving bearings were in a direct line with the panametric fan. The latter consisted simply of six hydrocoptic marzlevanes, so fitted to the ambifacient lunar waneshaft that side fumbling was effectively prevented.The main winding was of the normal lotus-o-delta type placed in panendermic semi-boloid slots of the stator, every seventh conductor being connected by a non-reversible tremie pipe to the differential girdle spring on the "up" end of the grammeters.The turbo-encabulator has now reached a high level of development, and it’s being successfully used in the operation of novertrunnions. Moreover, whenever a forescent skor motion is required, it may also be employed in conjunction with a drawn reciprocation dingle arm, to reduce sinusoidal repleneration."""

n = nlp()
# tokenize() returns a pair: tokens[0] is the tokenization of the lowercased text,
# tokens[1] is the tokenization of the text in its original case.
tokens = n.tokenize(text)
# POS-tag the lowercased tokens; `tagged` is a list of (word, tag) pairs.
tagged = n.pos(tokens[0])

## Lemmatize

In [175]:
# Lemmatize the POS-tagged lowercase tokens; each treebank tag is first mapped to a
# WordNet part of speech (unrecognised tags are treated as nouns).
n.lemmatize(tagged)

['for',
 'a',
 'number',
 'of',
 'year',
 'now',
 ',',
 'work',
 'have',
 'be',
 'proceed',
 'in',
 'order',
 'to',
 'bring',
 'perfection',
 'to',
 'the',
 'crudely',
 'conceived',
 'idea',
 'of',
 'a',
 'transmission',
 'that',
 'would',
 'not',
 'only',
 'supply',
 'inverse',
 'reactive',
 'current',
 'for',
 'use',
 'in',
 'unilateral',
 'phase',
 'detractor',
 ',',
 'but',
 'would',
 'also',
 'be',
 'capable',
 'of',
 'automatically',
 'synchronize',
 'cardinal',
 'grammeters',
 '.',
 'such',
 'an',
 'instrument',
 'be',
 'the',
 'turbo',
 'encabulator.now',
 'basically',
 'the',
 'only',
 'new',
 'principle',
 'involve',
 'be',
 'that',
 'instead',
 'of',
 'power',
 'be',
 'generate',
 'by',
 'the',
 'relative',
 'motion',
 'of',
 'conductor',
 'and',
 'flux',
 ',',
 'it',
 'be',
 'produce',
 'by',
 'the',
 'modial',
 'interaction',
 'of',
 'magneto-reluctance',
 'and',
 'capacitive',
 'diractance.the',
 'original',
 'machine',
 'have',
 'a',
 'base',
 'plate',
 'of',
 'pre-famul

## Process Suffixes

In [176]:
# For every lowercase token that ends with at least one suffix from the default list,
# show the longest matching suffix; tokens with no match are omitted from the result.
n.longestSuffixAWordEndsWith(tokens[0])

['or',
 'er',
 's',
 's',
 'en',
 'ing',
 'er',
 'ing',
 'tion',
 'ly',
 'ed',
 'sion',
 'ly',
 'ly',
 'ive',
 'ent',
 'or',
 'al',
 's',
 'able',
 'ly',
 'ing',
 'al',
 's',
 'an',
 'ment',
 's',
 'ly',
 'ly',
 'ed',
 's',
 'er',
 'ing',
 'ed',
 'y',
 'ative',
 'tion',
 's',
 's',
 's',
 'ed',
 'y',
 'ial',
 'tion',
 'ance',
 'ive',
 'al',
 'ine',
 'ate',
 'ed',
 'ite',
 'ed',
 'y',
 'able',
 'ic',
 'ing',
 'y',
 'ing',
 's',
 'ine',
 'th',
 'ic',
 'an',
 'er',
 'ed',
 'ly',
 'ic',
 's',
 'ed',
 'ent',
 'ar',
 'ling',
 's',
 'ly',
 'ing',
 's',
 'al',
 'ed',
 'ic',
 'oid',
 's',
 'or',
 'y',
 'th',
 'or',
 'ing',
 'ed',
 'y',
 'ible',
 'ial',
 'ing',
 'or',
 's',
 'ed',
 'ment',
 's',
 'ing',
 'ly',
 'ed',
 'ation',
 's',
 'er',
 'er',
 'ent',
 'or',
 'tion',
 's',
 'ed',
 'y',
 'ed',
 'tion',
 'th',
 'ation',
 'al',
 'ation']

## Create N-Grams

In [177]:
# Frequency of each token trigram over the lowercase tokens.
n.ngramFreq(n.ngram(tokens[0], 3))

FreqDist({("''", 'end', 'of'): 1,
          (',', 'and', 'it’s'): 1,
          (',', 'but', 'would'): 1,
          (',', 'every', 'seventh'): 1,
          (',', 'it', 'is'): 1,
          (',', 'it', 'may'): 1,
          (',', 'so', 'fitted'): 1,
          (',', 'to', 'reduce'): 1,
          (',', 'whenever', 'a'): 1,
          (',', 'work', 'has'): 1,
          ('.', 'moreover', ','): 1,
          ('.', 'such', 'an'): 1,
          ('.', 'the', 'latter'): 1,
          ('``', 'up', "''"): 1,
          ('a', 'base', 'plate'): 1,
          ('a', 'direct', 'line'): 1,
          ('a', 'drawn', 'reciprocation'): 1,
          ('a', 'forescent', 'skor'): 1,
          ('a', 'high', 'level'): 1,
          ('a', 'malleable', 'logarithmic'): 1,
          ('a', 'non-reversible', 'tremie'): 1,
          ('a', 'number', 'of'): 1,
          ('a', 'transmission', 'that'): 1,
          ('a', 'way', 'that'): 1,
          ('also', 'be', 'capable'): 1,
          ('also', 'be', 'employed'): 1,
          ('am


## POS Grams

In [178]:
# Keep only the POS tags, then count how often each tag trigram occurs.
posOnly = n.posTagOnly(tagged)
n.ngramFreq(n.ngram(posOnly, 3))


FreqDist({("''", 'NN', 'IN'): 1,
          (',', 'CC', 'MD'): 1,
          (',', 'CC', 'VB'): 1,
          (',', 'DT', 'JJ'): 1,
          (',', 'NN', 'VBZ'): 1,
          (',', 'PRP', 'MD'): 1,
          (',', 'PRP', 'VBZ'): 1,
          (',', 'RB', 'VBN'): 1,
          (',', 'TO', 'VB'): 1,
          (',', 'WRB', 'DT'): 1,
          ('.', 'DT', 'JJ'): 1,
          ('.', 'NN', ','): 1,
          ('.', 'PDT', 'DT'): 1,
          ('CC', 'JJ', 'NN'): 1,
          ('CC', 'MD', 'RB'): 1,
          ('CC', 'NNS', ','): 1,
          ('CC', 'VB', 'VBG'): 1,
          ('CD', 'JJ', 'NNS'): 2,
          ('DT', 'CD', 'JJ'): 1,
          ('DT', 'JJ', 'JJ'): 3,
          ('DT', 'JJ', 'NN'): 9,
          ('DT', 'JJ', 'VBD'): 1,
          ('DT', 'NN', ','): 1,
          ('DT', 'NN', 'IN'): 3,
          ('DT', 'NN', 'NN'): 4,
          ('DT', 'NN', 'VBP'): 1,
          ('DT', 'NN', 'VBZ'): 1,
          ('DT', 'NN', 'WDT'): 1,
          ('DT', 'RB', 'JJ'): 1,
          ('DT', '``', 'RB'): 1,
          (

## Uppercase/Lowercase

In [179]:
# Count upper/lowercase letters on the ORIGINAL-case tokens (tokens[1]).
# tokens[0] is the tokenization of the lowercased text, so it has no capitals to count.
ull = n.upperLowerLen(tokens[1])

In [180]:
# Ratio of uppercase letters to lowercase letters, summed over all tokens in `ull`.
n.capLetterFreq(ull)

0.009075907590759076

In [181]:
# Classify each (upper, lower, length, token) tuple into a `caps` category.
cases = [n.wordCase(*entry) for entry in ull]

In [182]:
# Frequency of each trigram of capitalisation categories.
n.ngramFreq(n.ngram(cases, 3))

FreqDist({(<caps.NoCaps: 0>, <caps.AllCap: 3>, <caps.NoCaps: 0>): 1,
          (<caps.NoCaps: 0>, <caps.NoCaps: 0>, <caps.FirstCap: 1>): 3,
          (<caps.NoCaps: 0>, <caps.NoCaps: 0>, <caps.AllCap: 3>): 1,
          (<caps.NoCaps: 0>, <caps.NoCaps: 0>, <caps.NoCaps: 0>): 210,
          (<caps.NoCaps: 0>, <caps.NoCaps: 0>, <caps.MixedCap: 2>): 4,
          (<caps.FirstCap: 1>, <caps.AllCap: 3>, <caps.NoCaps: 0>): 1,
          (<caps.FirstCap: 1>, <caps.NoCaps: 0>, <caps.NoCaps: 0>): 3,
          (<caps.MixedCap: 2>, <caps.NoCaps: 0>, <caps.NoCaps: 0>): 4,
          (<caps.AllCap: 3>, <caps.NoCaps: 0>, <caps.NoCaps: 0>): 2,
          (<caps.NoCaps: 0>, <caps.MixedCap: 2>, <caps.NoCaps: 0>): 4,
          (<caps.NoCaps: 0>, <caps.FirstCap: 1>, <caps.NoCaps: 0>): 3})