In [1]:
import os
import datetime
import multiprocessing

import numpy as np
import pandas as pd
import matplotlib as plt

import nltk
import spacy 
from spacy.tokenizer import Tokenizer
from spacy.pipeline import Sentencizer

## Loading the book

__License__: https://www.gutenberg.org/policy/license.html

In [2]:
with open('Pride and Prejudice - Jane Austen Chapter 1 to 20.txt') as f:
    book = f.read()
    print(book)

Chapter 1

      It is a truth universally acknowledged, that a single man in
      possession of a good fortune, must be in want of a wife.

      However little known the feelings or views of such a man may be
      on his first entering a neighbourhood, this truth is so well
      fixed in the minds of the surrounding families, that he is
      considered as the rightful property of some one or other of their
      daughters.

      “My dear Mr. Bennet,” said his lady to him one day, “have you
      heard that Netherfield Park is let at last?”

      Mr. Bennet replied that he had not.

      “But it is,” returned she; “for Mrs. Long has just been here, and
      she told me all about it.”

      Mr. Bennet made no answer.

      “Do not you want to know who has taken it?” cried his wife
      impatiently.

      “_You_ want to tell me, and I have no objection to hearing it.”

      This was invitation enough.

      “Why, my dear, you must know, Mrs. Long says that Netherfield is
 

In [3]:
book_content_list = book.split()
new_book = " ".join(book_content_list)
print(new_book)

Chapter 1 It is a truth universally acknowledged, that a single man in possession of a good fortune, must be in want of a wife. However little known the feelings or views of such a man may be on his first entering a neighbourhood, this truth is so well fixed in the minds of the surrounding families, that he is considered as the rightful property of some one or other of their daughters. “My dear Mr. Bennet,” said his lady to him one day, “have you heard that Netherfield Park is let at last?” Mr. Bennet replied that he had not. “But it is,” returned she; “for Mrs. Long has just been here, and she told me all about it.” Mr. Bennet made no answer. “Do not you want to know who has taken it?” cried his wife impatiently. “_You_ want to tell me, and I have no objection to hearing it.” This was invitation enough. “Why, my dear, you must know, Mrs. Long says that Netherfield is taken by a young man of large fortune from the north of England; that he came down on Monday in a chaise and four to se

In [28]:
len(new_book)

199920

# NLTK analysis

### Tokenize words

In [4]:
nltk_token_words_time = []

for i in range(0,10):
    
    # Start timer
    start = datetime.datetime.now()

    # tokenize words
    word_tokens = nltk.word_tokenize(new_book)
    
    # Stop timer
    finish = datetime.datetime.now()
    
    # Compute time for operation
    nltk_token_words_time.append((finish - start).total_seconds())

print('mean time: ' + str(np.mean(nltk_token_words_time)))
nltk_token_words_time

mean time: 0.2097773


[0.230353,
 0.219209,
 0.221305,
 0.215728,
 0.19878,
 0.195666,
 0.207996,
 0.205524,
 0.201866,
 0.201346]

In [5]:
nltk.word_tokenize(new_book)[5]

'truth'

### Tokenize sentences

In [6]:
nltk_token_sent_time = []

for i in range(0,10):
    
    # Start timer
    start = datetime.datetime.now()

    # tokenize sentences
    sentence_tokens = nltk.tokenize.sent_tokenize(new_book)
    
    # Stop timer
    finish = datetime.datetime.now()
    
    # Compute time for operation
    nltk_token_sent_time.append((finish - start).total_seconds())

print('mean time: ' + str(np.mean(nltk_token_sent_time)))
nltk_token_sent_time

mean time: 0.0528502


[0.063649,
 0.053437,
 0.051524,
 0.05036,
 0.053446,
 0.050253,
 0.054617,
 0.049641,
 0.049399,
 0.052176]

In [7]:
nltk.tokenize.sent_tokenize(new_book)[3]

'“But it is,” returned she; “for Mrs. Long has just been here, and she told me all about it.” Mr. Bennet made no answer.'

### POS tagging

In [8]:
nltk_pos_time = []

for i in range(0,10):
    
    # Start timer
    start = datetime.datetime.now()

    # tag words
    tags = nltk.pos_tag(nltk.tokenize.word_tokenize(book))
    
    # Stop timer
    finish = datetime.datetime.now()
    
    # Compute time for operation
    nltk_pos_time.append((finish - start).total_seconds())

print('mean time: ' + str(np.mean(nltk_pos_time)))
nltk_pos_time

mean time: 1.6045985000000003


[1.656468,
 1.558848,
 1.52993,
 1.578537,
 1.581752,
 1.480958,
 1.66525,
 1.765612,
 1.650739,
 1.577891]

In [9]:
nltk.pos_tag(nltk.tokenize.word_tokenize(book))[3]

('is', 'VBZ')

### Stemming

In [10]:
nltk_stemming_time = []
lemmatizer = nltk.stem.wordnet.WordNetLemmatizer()

for i in range(0,10):
    
    # Start timer
    start = datetime.datetime.now()

    # stemming
    word_tokens = nltk.word_tokenize(new_book)
    stems = [lemmatizer.lemmatize(w) for w in word_tokens]
    
    # Stop timer
    finish = datetime.datetime.now()
    
    # Compute time for operation
    nltk_stemming_time.append((finish - start).total_seconds())

print('mean time: ' + str(np.mean(nltk_stemming_time)))
nltk_stemming_time

mean time: 0.4340769


[1.56573,
 0.316633,
 0.313845,
 0.301427,
 0.3063,
 0.313441,
 0.313131,
 0.312702,
 0.307508,
 0.290052]

In [11]:
print('before: ' + word_tokens[32])
print('after: ' +[lemmatizer.lemmatize(w) for w in word_tokens][32])

before: feelings
after: feeling


# Spacy

https://spacy.io/usage/processing-pipelines

Use the following command to find which pipes are activated: 'nlp.config'

In [12]:
nlp=spacy.load("en_core_web_sm")

### Tokenize words

In [13]:
spacy_token_words_time = []
tokenizer = Tokenizer(nlp.vocab)

for i in range(0,10):
    
    # Start timer
    start = datetime.datetime.now()

    # tokenize words
    tokens = tokenizer(new_book)
    words = []
    for token in tokens:
        words.append(token)

    # Stop timer
    finish = datetime.datetime.now()
    
    # Compute time for operation
    spacy_token_words_time.append((finish - start).total_seconds())

print('mean time: ' + str(np.mean(spacy_token_words_time)))
spacy_token_words_time

mean time: 0.0565929


[0.233735,
 0.017503,
 0.015317,
 0.016582,
 0.109682,
 0.015802,
 0.015615,
 0.015842,
 0.110455,
 0.015396]

In [14]:
words = []
for token in tokens:
    words.append(token)
words[5]

truth

### Tokenize sentences

In [15]:
nlp_sent = spacy.load("en_core_web_sm",disable=['tok2vec',
                                           'tagger',
                                           'parser',
                                           'senter',
                                           'attribute_ruler',
                                           'lemmatizer',
                                           'ner'])
nlp_sent.add_pipe('sentencizer')

<spacy.pipeline.sentencizer.Sentencizer at 0x7faf9582b300>

In [16]:
spacy_token_sent_time = []

for i in range(0,10):
    
    # Start timer
    start = datetime.datetime.now()

    # tokenize sentences
    doc = nlp_sent(new_book)
    sent = []
    for sentence in doc.sents:
        sent.append(sentence)    
        
    # Stop timer
    finish = datetime.datetime.now()
    
    # Compute time for operation
    spacy_token_sent_time.append((finish - start).total_seconds())

print('mean time: ' + str(np.mean(spacy_token_sent_time)))
spacy_token_sent_time

mean time: 0.2917318


[0.367783,
 0.289505,
 0.287211,
 0.289545,
 0.284176,
 0.290448,
 0.277058,
 0.277849,
 0.270133,
 0.28361]

In [39]:
doc = nlp_sent(new_book)
sent = []
for sentence in doc.sents:
    sent.append(sentence)   
    
sent[5]

Mr. Bennet made no answer. “

### POS tagging

In [18]:
nlp_tag = spacy.load("en_core_web_sm",exclude=[
                                               #'tokenizer',
                                               #'sentencizer'
                                               'parser',
                                               'senter',
                                               'lemmatizer',
                                               'ner'])

In [19]:
spacy_pos_time = []

for i in range(0,10):
    
    # Start timer
    start = datetime.datetime.now()

    # tag words
    tags = [(w.pos_) for w in nlp_tag(new_book)]
    
    # Stop timer
    finish = datetime.datetime.now()
    
    # Compute time for operation
    spacy_pos_time.append((finish - start).total_seconds())

print('mean time: ' + str(np.mean(spacy_pos_time)))
spacy_pos_time

mean time: 2.0550692


[2.152311,
 2.053978,
 2.039074,
 2.006386,
 2.17481,
 2.066651,
 2.033796,
 2.027939,
 2.032287,
 1.96346]

In [20]:
tags = [(w, w.pos_) for w in nlp_tag(new_book)]
tags

[(Chapter, 'NOUN'),
 (1, 'NUM'),
 (It, 'PRON'),
 (is, 'VERB'),
 (a, 'DET'),
 (truth, 'NOUN'),
 (universally, 'ADV'),
 (acknowledged, 'VERB'),
 (,, 'PUNCT'),
 (that, 'SCONJ'),
 (a, 'DET'),
 (single, 'ADJ'),
 (man, 'NOUN'),
 (in, 'ADP'),
 (possession, 'NOUN'),
 (of, 'ADP'),
 (a, 'DET'),
 (good, 'ADJ'),
 (fortune, 'NOUN'),
 (,, 'PUNCT'),
 (must, 'AUX'),
 (be, 'VERB'),
 (in, 'ADP'),
 (want, 'NOUN'),
 (of, 'ADP'),
 (a, 'DET'),
 (wife, 'NOUN'),
 (., 'PUNCT'),
 (However, 'ADV'),
 (little, 'ADV'),
 (known, 'VERB'),
 (the, 'DET'),
 (feelings, 'NOUN'),
 (or, 'CCONJ'),
 (views, 'NOUN'),
 (of, 'ADP'),
 (such, 'DET'),
 (a, 'DET'),
 (man, 'NOUN'),
 (may, 'AUX'),
 (be, 'VERB'),
 (on, 'ADP'),
 (his, 'PRON'),
 (first, 'ADJ'),
 (entering, 'VERB'),
 (a, 'DET'),
 (neighbourhood, 'NOUN'),
 (,, 'PUNCT'),
 (this, 'DET'),
 (truth, 'NOUN'),
 (is, 'VERB'),
 (so, 'ADV'),
 (well, 'ADV'),
 (fixed, 'VERB'),
 (in, 'ADP'),
 (the, 'DET'),
 (minds, 'NOUN'),
 (of, 'ADP'),
 (the, 'DET'),
 (surrounding, 'VERB'),
 (familie

### Stemming

In [21]:
nlp_stem = spacy.load("en_core_web_sm",disable=[
                                           'parser',
                                           'senter',
                                           'ner'])

In [22]:
spacy_stemming_time = []

for i in range(0,10):
    
    # Start timer
    start = datetime.datetime.now()

    # stemming
    [w.lemma_ for w in nlp_stem(book)]
    
    # Stop timer
    finish = datetime.datetime.now()
    
    # Compute time for operation
    spacy_stemming_time.append((finish - start).total_seconds())

print('mean time: ' + str(np.mean(spacy_stemming_time)))
spacy_stemming_time

mean time: 2.1329236000000003


[2.472914,
 2.067989,
 2.060613,
 2.118507,
 2.127596,
 2.095536,
 2.046723,
 2.067771,
 2.168949,
 2.102638]

In [23]:
[[w,w.lemma_] for w in nlp_stem(new_book)][7]

[acknowledged, 'acknowledge']

In [24]:
[[w,w.lemma_] for w in nlp_stem(new_book)][32]

[feelings, 'feeling']

# Multiprocessing

In [25]:
nlp_single=spacy.load("en_core_web_sm")

with open('Pride and Prejudice - Jane Austen Chapter 1 to 20.txt') as f:
    book1 = f.read()

# Start timer
start = datetime.datetime.now()

# Operation
docs = nlp_single(book1)

# Stop timer
finish = datetime.datetime.now()

# Compute time for operation
print((finish - start).total_seconds())

9.691942


In [26]:
nlp_multi=spacy.load("en_core_web_sm")

with open('Pride and Prejudice - Jane Austen Chapter 1 to 20.txt') as f:
    book2 = f.read()

# Start timer
start = datetime.datetime.now()

# Operation
docs = nlp_multi.pipe(book2, n_process=8)

# Stop timer
finish = datetime.datetime.now()

# Compute time for operation
print((finish - start).total_seconds())

0.00093


# Results df

In [42]:
df = pd.DataFrame(np.array([t for t in (np.mean(nltk_token_words_time), np.mean(spacy_token_words_time),
                                        np.mean(nltk_token_sent_time), np.mean(spacy_token_sent_time),
                                        np.mean(nltk_pos_time), np.mean(spacy_pos_time),
                                        nltk_stemming_time[0], np.mean(spacy_stemming_time))]).reshape((4, 2)),
             index=["Word token", "Sentence token", "POS tagging", "Stemming"],
             columns=["NLTK", "Spacy"]
            )

df

Unnamed: 0,NLTK,Spacy
Word token,0.209777,0.056593
Sentence token,0.05285,0.291732
POS tagging,1.604599,2.055069
Stemming,1.56573,2.132924
