In [227]:
from nltk import ngrams, word_tokenize, pos_tag, FreqDist
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from itertools import zip_longest
from functools import reduce

In [233]:
# Sample passage analysed by every cell of this notebook.
# The space after four sentence-ending periods had been lost ("encabulator.Now",
# "diractance.The", "prevented.The", "grammeters.The"); without it the two
# neighbouring words stay fused into a single token, which skews the n-gram,
# POS and suffix statistics. The spaces are restored here.
# NOTE(review): the odd capitals in "For A number" and "IN order" are kept as-is;
# presumably they are deliberate input for the upper/lower-case ratios - confirm.
text = """For A number of years now, work has been proceeding IN order to bring perfection to the crudely conceived idea of a transmission that would not only supply inverse reactive current for use in unilateral phase detractors, but would also be capable of automatically synchronizing cardinal grammeters. Such an instrument is the turbo encabulator. Now basically the only new principle involved is that instead of power being generated by the relative motion of conductors and fluxes, it is produced by the modial interaction of magneto-reluctance and capacitive diractance. The original machine had a base plate of pre-famulated amulite surmounted by a malleable logarithmic casing in such a way that the two spurving bearings were in a direct line with the panametric fan. The latter consisted simply of six hydrocoptic marzlevanes, so fitted to the ambifacient lunar waneshaft that side fumbling was effectively prevented. The main winding was of the normal lotus-o-delta type placed in panendermic semi-boloid slots of the stator, every seventh conductor being connected by a non-reversible tremie pipe to the differential girdle spring on the "up" end of the grammeters. The turbo-encabulator has now reached a high level of development, and it’s being successfully used in the operation of novertrunnions. Moreover, whenever a forescent skor motion is required, it may also be employed in conjunction with a drawn reciprocation dingle arm, to reduce sinusoidal repleneration."""

## Tokenize and Tag

In [234]:
# Case-preserving token stream; the all-caps word ratio at the end of the
# notebook is computed from it.
tokensOriginalCase = word_tokenize(text)

# All other statistics work on the lower-cased text: tokenize it, then attach
# a POS tag to each token, giving a list of (token, tag) pairs.
lower = text.lower()
tokens = word_tokenize(lower)
tagged = pos_tag(tokens)

## Lemmatize

In [235]:
# Single lemmatizer instance, reused for every token in the lemmatisation loop of this cell.
wnl = WordNetLemmatizer()

def treebankToWordnetPOS(treebank_tag):
    """Map a Penn Treebank POS tag onto the corresponding WordNet POS constant.

    Only the leading letter of the tag is significant: J -> adjective,
    V -> verb, N -> noun, R -> adverb. Any other tag falls back to noun.

    Args:
        treebank_tag: tag string as found in the (token, tag) pairs from pos_tag.

    Returns:
        One of wordnet.ADJ, wordnet.VERB, wordnet.NOUN or wordnet.ADV.
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wordnet_pos in prefix_to_pos:
        if treebank_tag.startswith(prefix):
            return wordnet_pos
    # NOUN is the default type if no POS is given for lemmatize
    return wordnet.NOUN
    
# Lemmatise every token, passing its POS tag (translated to WordNet's tag set)
# so the lemmatizer can pick the right root form.
# A comprehension keeps the loop variables out of the kernel namespace; the
# original loop left a global `pos` (a single tag string) behind, the same name
# the POS Grams cell later uses for the full list of tags.
lemma = [wnl.lemmatize(token, treebankToWordnetPOS(tag)) for token, tag in tagged]

## Load Suffix Table

In [236]:
# Build the suffix -> count table from suffixes.txt (one suffix per line,
# surrounding whitespace stripped); every count starts at 0.
suffixTable = {}
with open("suffixes.txt") as f:
    for line in f:
        suffix = line.strip()
        # Skip blank lines: they would register "" as a suffix, and since every
        # word ends with "", that entry would be counted once per token.
        if suffix:
            suffixTable[suffix] = 0

### Process Suffixes

In [237]:
# Start from zero on every run. The counts live in suffixTable, which is
# created in another cell, so without this reset re-running the cell would
# add the same matches on top of the previous totals.
for suffix in suffixTable:
    suffixTable[suffix] = 0

# Record every (token, suffix) match: bump the suffix's count and keep the
# suffix itself, in token order, in `suffixes`.
suffixes = []
for word in tokens:
    for suffix in suffixTable:
        if word.endswith(suffix):
            suffixTable[suffix] += 1
            suffixes.append(suffix)

## Create N-Grams

In [238]:
# Word n-grams (n = 1, 2, 3) over the lower-cased token stream, materialised
# as lists of tuples. list() replaces the hand-written append loops and does
# not leave a stray `gram` loop variable in the kernel namespace.
uni = list(ngrams(tokens, 1))
bi = list(ngrams(tokens, 2))
tri = list(ngrams(tokens, 3))

### Gram Frequency

In [239]:
# Frequency distribution of each word n-gram list.
uniDist = FreqDist(uni)
biDist = FreqDist(bi)
triDist = FreqDist(tri)

# The original loops evaluated a bare `gram, count` tuple on each iteration and
# discarded it, so the cell displayed nothing. Print the most frequent entries
# of each distribution instead; 10 keeps the output short.
for label, dist in (("unigrams", uniDist), ("bigrams", biDist), ("trigrams", triDist)):
    print(label, dist.most_common(10))

## POS Grams

In [240]:
# Sequence of POS tags, in token order (the token half of each pair is dropped).
pos = [tag for _, tag in tagged]

# POS-tag n-grams for n = 1..5, materialised as lists of tuples. list()
# replaces five copy-pasted append loops and leaves no `gram` loop variable
# behind in the kernel namespace.
uniPOS = list(ngrams(pos, 1))
biPOS = list(ngrams(pos, 2))
triPOS = list(ngrams(pos, 3))
quadPOS = list(ngrams(pos, 4))
pentPOS = list(ngrams(pos, 5))

### POS Gram Frequency

In [241]:
# Frequency distribution of each POS n-gram list.
uniPOSdist = FreqDist(uniPOS)
biPOSdist = FreqDist(biPOS)
triPOSdist = FreqDist(triPOS)
quadPOSdist = FreqDist(quadPOS)
pentPOSdist = FreqDist(pentPOS)

# The original loops evaluated a bare `gram, count` tuple on each iteration and
# discarded it, so the cell displayed nothing. Print the most frequent entries
# of each distribution instead; 10 keeps the output short.
for label, dist in (
    ("POS unigrams", uniPOSdist),
    ("POS bigrams", biPOSdist),
    ("POS trigrams", triPOSdist),
    ("POS 4-grams", quadPOSdist),
    ("POS 5-grams", pentPOSdist),
):
    print(label, dist.most_common(10))

## Uppercase/Lowercase Ratio

In [246]:
# Number of upper-case characters divided by the number of lower-case
# characters in the original (not lower-cased) text.
capsFreq = sum(map(str.isupper, text)) / sum(map(str.islower, text))

In [247]:
# One flag per case-preserved token: 1 when every character of the token is
# upper-case, 0 otherwise (so punctuation and digit tokens score 0).
wordCase = []
for word in tokensOriginalCase:
    wordCase.append(int(all(map(str.isupper, word))))

# Share of tokens written entirely in capitals.
allcapsFreq = sum(wordCase) / len(wordCase)