In [1]:
import os
import datetime
import multiprocessing

import numpy as np
import pandas as pd
import matplotlib as plt

import nltk
import spacy 
from spacy.tokenizer import Tokenizer
from spacy.pipeline import Sentencizer

## Loading the book

__License__: https://www.gutenberg.org/policy/license.html

In [2]:
with open('Pride and Prejudice - Jane Austen Chapter 1 to 20.txt') as f:
    book = f.read()
    print(book[0:200])

Chapter 1

      It is a truth universally acknowledged, that a single man in
      possession of a good fortune, must be in want of a wife.

      However little known the feelings or views of such a


In [3]:
book_content_list = book.split()
new_book = " ".join(book_content_list)
print(new_book[0:200])

Chapter 1 It is a truth universally acknowledged, that a single man in possession of a good fortune, must be in want of a wife. However little known the feelings or views of such a man may be on his f


In [4]:
len(new_book)

199920

# NLTK analysis

### Tokenize words

In [5]:
nltk_token_words_time = []

for i in range(0,10):
    
    # Start timer
    start = datetime.datetime.now()

    # tokenize words
    word_tokens = nltk.word_tokenize(new_book)
    
    # Stop timer
    finish = datetime.datetime.now()
    
    # Compute time for operation
    nltk_token_words_time.append((finish - start).total_seconds())

print('mean time: ' + str(np.mean(nltk_token_words_time)))
nltk_token_words_time

mean time: 0.1948823


[0.202879,
 0.184694,
 0.17732,
 0.176843,
 0.213453,
 0.207826,
 0.198524,
 0.188604,
 0.19957,
 0.19911]

In [6]:
nltk.word_tokenize(new_book)[5]

'truth'

### Tokenize sentences

In [7]:
nltk_token_sent_time = []

for i in range(0,10):
    
    # Start timer
    start = datetime.datetime.now()

    # tokenize sentences
    sentence_tokens = nltk.tokenize.sent_tokenize(new_book)
    
    # Stop timer
    finish = datetime.datetime.now()
    
    # Compute time for operation
    nltk_token_sent_time.append((finish - start).total_seconds())

print('mean time: ' + str(np.mean(nltk_token_sent_time)))
nltk_token_sent_time

mean time: 0.05494140000000001


[0.060867,
 0.055356,
 0.054919,
 0.054774,
 0.054322,
 0.061668,
 0.059307,
 0.051871,
 0.048654,
 0.047676]

In [8]:
nltk.tokenize.sent_tokenize(new_book)[3]

'“But it is,” returned she; “for Mrs. Long has just been here, and she told me all about it.” Mr. Bennet made no answer.'

### POS tagging

In [9]:
nltk_pos_time = []

for i in range(0,10):
    
    # Start timer
    start = datetime.datetime.now()

    # tag words
    tags = nltk.pos_tag(nltk.tokenize.word_tokenize(book))
    
    # Stop timer
    finish = datetime.datetime.now()
    
    # Compute time for operation
    nltk_pos_time.append((finish - start).total_seconds())

print('mean time: ' + str(np.mean(nltk_pos_time)))
nltk_pos_time

mean time: 1.4022461


[1.508282,
 1.451872,
 1.574116,
 1.449399,
 1.496267,
 1.323999,
 1.329226,
 1.305203,
 1.30432,
 1.279777]

In [10]:
nltk.pos_tag(nltk.tokenize.word_tokenize(book))[3]

('is', 'VBZ')

### Stemming

In [11]:
nltk_stemming_time = []
lemmatizer = nltk.stem.wordnet.WordNetLemmatizer()

for i in range(0,10):
    
    # Start timer
    start = datetime.datetime.now()

    # stemming
    word_tokens = nltk.word_tokenize(new_book)
    stems = [lemmatizer.lemmatize(w) for w in word_tokens]
    
    # Stop timer
    finish = datetime.datetime.now()
    
    # Compute time for operation
    nltk_stemming_time.append((finish - start).total_seconds())

print('mean time: ' + str(np.mean(nltk_stemming_time)))
nltk_stemming_time

mean time: 0.44251510000000005


[1.63918,
 0.339941,
 0.321635,
 0.300169,
 0.315072,
 0.305255,
 0.301121,
 0.312424,
 0.293688,
 0.296666]

In [12]:
print('before: ' + word_tokens[32])
print('after: ' +[lemmatizer.lemmatize(w) for w in word_tokens][32])

before: feelings
after: feeling


# Spacy

https://spacy.io/usage/processing-pipelines

Use the following command to find which pipes are activated: 'nlp.config'

In [13]:
nlp=spacy.load("en_core_web_sm")

### Tokenize words

In [14]:
spacy_token_words_time = []
tokenizer = Tokenizer(nlp.vocab)

for i in range(0,10):
    
    # Start timer
    start = datetime.datetime.now()

    # tokenize words
    tokens = tokenizer(new_book)
    words = []
    for token in tokens:
        words.append(token)

    # Stop timer
    finish = datetime.datetime.now()
    
    # Compute time for operation
    spacy_token_words_time.append((finish - start).total_seconds())

print('mean time: ' + str(np.mean(spacy_token_words_time)))
spacy_token_words_time

mean time: 0.0551747


[0.226859,
 0.01637,
 0.015665,
 0.015225,
 0.107003,
 0.015578,
 0.016011,
 0.015645,
 0.108461,
 0.01493]

In [15]:
words = []
for token in tokens:
    words.append(token)
words[5]

truth

### Tokenize sentences

In [16]:
nlp_sent = spacy.load("en_core_web_sm",disable=['tok2vec',
                                           'tagger',
                                           'parser',
                                           'senter',
                                           'attribute_ruler',
                                           'lemmatizer',
                                           'ner'])
nlp_sent.add_pipe('sentencizer')

<spacy.pipeline.sentencizer.Sentencizer at 0x7f95f7d5acc0>

In [17]:
spacy_token_sent_time = []

for i in range(0,10):
    
    # Start timer
    start = datetime.datetime.now()

    # tokenize sentences
    doc = nlp_sent(new_book)
    sent = []
    for sentence in doc.sents:
        sent.append(sentence)    
        
    # Stop timer
    finish = datetime.datetime.now()
    
    # Compute time for operation
    spacy_token_sent_time.append((finish - start).total_seconds())

print('mean time: ' + str(np.mean(spacy_token_sent_time)))
spacy_token_sent_time

mean time: 0.2916965


[0.362631,
 0.282622,
 0.285327,
 0.286404,
 0.281837,
 0.279676,
 0.268161,
 0.306057,
 0.277944,
 0.286306]

In [18]:
doc = nlp_sent(new_book)
sent = []
for sentence in doc.sents:
    sent.append(sentence)   
    
sent[5]

Mr. Bennet made no answer. “

### POS tagging

In [19]:
nlp_tag = spacy.load("en_core_web_sm",exclude=[
                                               #'tokenizer',
                                               #'sentencizer'
                                               'parser',
                                               'senter',
                                               'lemmatizer',
                                               'ner'])

In [None]:
spacy_pos_time = []

for i in range(0,10):
    
    # Start timer
    start = datetime.datetime.now()

    # tag words
    tags = [(w.pos_) for w in nlp_tag(new_book)]
    
    # Stop timer
    finish = datetime.datetime.now()
    
    # Compute time for operation
    spacy_pos_time.append((finish - start).total_seconds())

print('mean time: ' + str(np.mean(spacy_pos_time)))
spacy_pos_time

In [None]:
tags = [(w, w.pos_) for w in nlp_tag(new_book)]
tags

### Stemming

In [None]:
nlp_stem = spacy.load("en_core_web_sm",disable=[
                                           'parser',
                                           'senter',
                                           'ner'])

In [None]:
spacy_stemming_time = []

for i in range(0,10):
    
    # Start timer
    start = datetime.datetime.now()

    # stemming
    [w.lemma_ for w in nlp_stem(book)]
    
    # Stop timer
    finish = datetime.datetime.now()
    
    # Compute time for operation
    spacy_stemming_time.append((finish - start).total_seconds())

print('mean time: ' + str(np.mean(spacy_stemming_time)))
spacy_stemming_time

In [None]:
[[w,w.lemma_] for w in nlp_stem(new_book)][7]

In [None]:
[[w,w.lemma_] for w in nlp_stem(new_book)][32]

# Multiprocessing

In [None]:
nlp_single=spacy.load("en_core_web_sm")

with open('Pride and Prejudice - Jane Austen Chapter 1 to 20.txt') as f:
    book1 = f.read()

# Start timer
start = datetime.datetime.now()

# Operation
docs = nlp_single(book1)

# Stop timer
finish = datetime.datetime.now()

# Compute time for operation
print((finish - start).total_seconds())

In [None]:
nlp_multi=spacy.load("en_core_web_sm")

with open('Pride and Prejudice - Jane Austen Chapter 1 to 20.txt') as f:
    book2 = f.read()

# Start timer
start = datetime.datetime.now()

# Operation
docs = nlp_multi.pipe(book2, n_process=8)

# Stop timer
finish = datetime.datetime.now()

# Compute time for operation
print((finish - start).total_seconds())

# Results df

In [None]:
df = pd.DataFrame(np.array([t for t in (np.mean(nltk_token_words_time), np.mean(spacy_token_words_time),
                                        np.mean(nltk_token_sent_time), np.mean(spacy_token_sent_time),
                                        np.mean(nltk_pos_time), np.mean(spacy_pos_time),
                                        nltk_stemming_time[0], np.mean(spacy_stemming_time))]).reshape((4, 2)),
             index=["Word token", "Sentence token", "POS tagging", "Stemming"],
             columns=["NLTK", "Spacy"]
            )

df