# Chapter 3

In [2]:
import nltk, re, pprint
from nltk import word_tokenize, ToktokTokenizer, wordpunct_tokenize, sent_tokenize, WhitespaceTokenizer

In [3]:
from urllib import request
url = "http://www.gutenberg.org/files/2554/2554-0.txt"
response = request.urlopen(url)
raw = response.read().decode('utf8')

tokens = word_tokenize(raw)
print(len(tokens), tokens[:9])

257058 ['\ufeffThe', 'Project', 'Gutenberg', 'eBook', 'of', 'Crime', 'and', 'Punishment', ',']


In [4]:
text = nltk.Text(tokens)
print(text)
print(text.collocations())

<Text: ﻿The Project Gutenberg eBook of Crime and Punishment...>
Katerina Ivanovna; Pyotr Petrovitch; Pulcheria Alexandrovna; Avdotya
Romanovna; Rodion Romanovitch; Marfa Petrovna; Sofya Semyonovna; old
woman; Project Gutenberg-tm; Porfiry Petrovitch; Amalia Ivanovna;
great deal; young man; Nikodim Fomitch; Project Gutenberg; Ilya
Petrovitch; Andrey Semyonovitch; Hay Market; Dmitri Prokofitch; Good
heavens
None


In [5]:
print(raw.find("PART I"), raw.rfind("You"))

5575 1171580


# The effect of case folding on tokens

In [6]:

raw_lowered = raw.lower()

tokens = word_tokenize(raw)
print("word_tokenize non-case folded tokens: " + str(len(tokens)))
tokens = word_tokenize(raw_lowered)
print("word_tokenize case folded tokens: " + str(len(tokens)))
print()

toktok = ToktokTokenizer()
tokens = toktok.tokenize(text=raw, return_str=True)
print("ToktokTokenizer non-case folded tokens: " + str(len(tokens)))
tokens = toktok.tokenize(text=raw_lowered, return_str=True)
print("ToktokTokenizer case folded tokens: " + str(len(tokens)))
print()

tokens = wordpunct_tokenize(raw)
print("wordpunct_tokenize non-case folded tokens: " + str(len(tokens)))
tokens = wordpunct_tokenize(raw_lowered)
print("wordpunct_tokenize case folded tokens: " + str(len(tokens)))
print()

tokens = sent_tokenize(raw)
print("sent_tokenize non-case folded tokens: " + str(len(tokens)))
tokens = sent_tokenize(raw_lowered)
print("sent_tokenize case folded tokens: " + str(len(tokens)))
print()

tokens = list(WhitespaceTokenizer().span_tokenize(raw))
print("WhitespaceTokenizer non-case folded tokens: " + str(len(tokens)))
tokens = list(WhitespaceTokenizer().span_tokenize(raw_lowered))
print("WhitespaceTokenizer case folded tokens: " + str(len(tokens)))
print()



word_tokenize non-case folded tokens: 257058
word_tokenize case folded tokens: 257033

ToktokTokenizer non-case folded tokens: 1217441
ToktokTokenizer case folded tokens: 1217441

wordpunct_tokenize non-case folded tokens: 255819
wordpunct_tokenize case folded tokens: 255819

sent_tokenize non-case folded tokens: 12060
sent_tokenize case folded tokens: 11726

WhitespaceTokenizer non-case folded tokens: 206551
WhitespaceTokenizer case folded tokens: 206551



# Stemming & lemmatization

In [10]:
porter = nltk.PorterStemmer()
lancaster = nltk.LancasterStemmer()

tokens = set(word_tokenize(raw))
tokens_set = set(tokens)
print("word_tokenize token amount: " + str(len(tokens)))
print("word_tokenize token set amount: " + str(len(tokens_set)))
stemmed_tokens = [porter.stem(t) for t in tokens]
print("porter stemmer token amount: " + str(len(stemmed_tokens)))
stemmed_tokens = [lancaster.stem(t) for t in tokens]
print("lancaster stemmer token amount: " + str(len(stemmed_tokens)))
print()

toktok = ToktokTokenizer()
tokens = toktok.tokenize(text=raw, return_str=True)
tokens_set = set(tokens)
print("ToktokTokenizer token amount: " + str(len(tokens)))
print("ToktokTokenizer token set amount: " + str(len(tokens_set)))
stemmed_tokens = [porter.stem(t) for t in tokens]
print("porter stemmer token amount: " + str(len(stemmed_tokens)))
stemmed_tokens = [lancaster.stem(t) for t in tokens]
print("lancaster stemmer token amount: " + str(len(stemmed_tokens)))
print()

word_tokenize token amount: 11516
word_tokenize token set amount: 11516
porter stemmer token amount: 11516
lancaster stemmer token amount: 11516

ToktokTokenizer token amount: 1217441
ToktokTokenizer token set amount: 101
porter stemmer token amount: 1217441
lancaster stemmer token amount: 1217441



In [13]:
sentences = nltk.sent_tokenize(raw)
print(sentences[1])
print(sentences[2])

You may copy it, give it away or re-use it under the terms
of the Project Gutenberg License included with this eBook or online at
www.gutenberg.org.
If you are not located in the United States, you
will have to check the laws of the country where you are located before
using this eBook.


# N Grams

In [16]:
from nltk.util import ngrams
unigrams = ngrams(raw[:1000].split(), 1)
for item in unigrams:
    print(item)

('\ufeffThe',)
('Project',)
('Gutenberg',)
('eBook',)
('of',)
('Crime',)
('and',)
('Punishment,',)
('by',)
('Fyodor',)
('Dostoevsky',)
('This',)
('eBook',)
('is',)
('for',)
('the',)
('use',)
('of',)
('anyone',)
('anywhere',)
('in',)
('the',)
('United',)
('States',)
('and',)
('most',)
('other',)
('parts',)
('of',)
('the',)
('world',)
('at',)
('no',)
('cost',)
('and',)
('with',)
('almost',)
('no',)
('restrictions',)
('whatsoever.',)
('You',)
('may',)
('copy',)
('it,',)
('give',)
('it',)
('away',)
('or',)
('re-use',)
('it',)
('under',)
('the',)
('terms',)
('of',)
('the',)
('Project',)
('Gutenberg',)
('License',)
('included',)
('with',)
('this',)
('eBook',)
('or',)
('online',)
('at',)
('www.gutenberg.org.',)
('If',)
('you',)
('are',)
('not',)
('located',)
('in',)
('the',)
('United',)
('States,',)
('you',)
('will',)
('have',)
('to',)
('check',)
('the',)
('laws',)
('of',)
('the',)
('country',)
('where',)
('you',)
('are',)
('located',)
('before',)
('using',)
('this',)
('eBook.',)
('Title:',)


In [17]:
bigrams = ngrams(raw[:1000].split(), 2)
for item in bigrams:
    print(item)

('\ufeffThe', 'Project')
('Project', 'Gutenberg')
('Gutenberg', 'eBook')
('eBook', 'of')
('of', 'Crime')
('Crime', 'and')
('and', 'Punishment,')
('Punishment,', 'by')
('by', 'Fyodor')
('Fyodor', 'Dostoevsky')
('Dostoevsky', 'This')
('This', 'eBook')
('eBook', 'is')
('is', 'for')
('for', 'the')
('the', 'use')
('use', 'of')
('of', 'anyone')
('anyone', 'anywhere')
('anywhere', 'in')
('in', 'the')
('the', 'United')
('United', 'States')
('States', 'and')
('and', 'most')
('most', 'other')
('other', 'parts')
('parts', 'of')
('of', 'the')
('the', 'world')
('world', 'at')
('at', 'no')
('no', 'cost')
('cost', 'and')
('and', 'with')
('with', 'almost')
('almost', 'no')
('no', 'restrictions')
('restrictions', 'whatsoever.')
('whatsoever.', 'You')
('You', 'may')
('may', 'copy')
('copy', 'it,')
('it,', 'give')
('give', 'it')
('it', 'away')
('away', 'or')
('or', 're-use')
('re-use', 'it')
('it', 'under')
('under', 'the')
('the', 'terms')
('terms', 'of')
('of', 'the')
('the', 'Project')
('Project', 'Gu

In [18]:
trigrams = ngrams(raw[:1000].split(), 3)
for item in trigrams:
    print(item)

('\ufeffThe', 'Project', 'Gutenberg')
('Project', 'Gutenberg', 'eBook')
('Gutenberg', 'eBook', 'of')
('eBook', 'of', 'Crime')
('of', 'Crime', 'and')
('Crime', 'and', 'Punishment,')
('and', 'Punishment,', 'by')
('Punishment,', 'by', 'Fyodor')
('by', 'Fyodor', 'Dostoevsky')
('Fyodor', 'Dostoevsky', 'This')
('Dostoevsky', 'This', 'eBook')
('This', 'eBook', 'is')
('eBook', 'is', 'for')
('is', 'for', 'the')
('for', 'the', 'use')
('the', 'use', 'of')
('use', 'of', 'anyone')
('of', 'anyone', 'anywhere')
('anyone', 'anywhere', 'in')
('anywhere', 'in', 'the')
('in', 'the', 'United')
('the', 'United', 'States')
('United', 'States', 'and')
('States', 'and', 'most')
('and', 'most', 'other')
('most', 'other', 'parts')
('other', 'parts', 'of')
('parts', 'of', 'the')
('of', 'the', 'world')
('the', 'world', 'at')
('world', 'at', 'no')
('at', 'no', 'cost')
('no', 'cost', 'and')
('cost', 'and', 'with')
('and', 'with', 'almost')
('with', 'almost', 'no')
('almost', 'no', 'restrictions')
('no', 'restrictio

In [68]:
from nltk.corpus import gutenberg
from nltk.text import Text
import regex as re
import math

def get_tfidf(word, k):
    fileids = gutenberg.fileids()
    pattern = re.compile(r'\b{}\b'.format(word), re.I) # case insensitive
    doc_count = 0
    tf = []
    
    print("Calculating tf...")
    for fileid in fileids:
        fd = nltk.FreqDist()
        text = gutenberg.raw(fileid)
        text = word_tokenize(text)
        count = 0
        found = False
        for text_word in text:
            if text_word == word:
                count += 1
                found = True
        tf.append(count/len(text))
        if (found):
            doc_count += 1
    
    print("Calculating idf and sorting...")

    idf = math.log(len(fileids)/doc_count)
    if idf == 0:
        print("the term appears in every document, breaking")
        return None

    doc_tf_idf = {}
    for index, fileid in enumerate(fileids):
        doc_tf_idf[fileid] = tf[index]/idf

    final = sorted(doc_tf_idf.items(), key=lambda item: item[1])
    return final[-k:]


print(get_tfidf("tea", 3))


Calculating tf...


KeyboardInterrupt: 