# NLTK vs spaCy

In [1]:
import time

## 01 Tokenization (Sentence)

In [2]:
# Read the whole sample corpus into memory. The context manager closes the
# handle as soon as the read finishes instead of leaving it open for the rest
# of the kernel session.
with open("data/sample_text.txt", "r", encoding="utf8") as file:
    text = file.read()

### NLTK

In [3]:
from nltk import sent_tokenize, word_tokenize

In [4]:
# Benchmark NLTK sentence splitting. time.clock() was removed in Python 3.8;
# time.perf_counter() is the high-resolution timer intended for measuring
# elapsed intervals like this one.
start_time = time.perf_counter()

sent_tokens = sent_tokenize(text)
sentences = list(sent_tokens)  # materialise the result, as the spaCy cell does

print(time.perf_counter() - start_time, "seconds")

0.29363155000312174 seconds


### spaCy

In [5]:
import spacy
# Build a lightweight pipeline for sentence splitting: the parser, tagger and
# NER components are switched off and a sentencizer is added in their place.
# The model is named explicitly ('en_core_web_sm', as the later cells do)
# rather than through the 'en' shortcut link, which only resolves if that
# link was created on this machine.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'tagger', 'ner'])
# NOTE(review): create_pipe + add_pipe(component) is the spaCy 2.x calling
# convention; spaCy 3.x expects nlp.add_pipe('sentencizer') instead — confirm
# which version this notebook targets.
nlp.add_pipe(nlp.create_pipe('sentencizer'))

In [6]:
# Benchmark spaCy sentence splitting. time.clock() was removed in Python 3.8;
# time.perf_counter() is the replacement for measuring elapsed time.
start_time = time.perf_counter()

doc = nlp(text)
sentences = list(doc.sents)  # iterate doc.sents inside the timed region

print(time.perf_counter() - start_time, "seconds")

0.2420803837097818 seconds


## 02 Tokenization (Words)

In [7]:
# Benchmark NLTK word tokenisation. time.clock() was removed in Python 3.8;
# time.perf_counter() is the replacement for measuring elapsed time.
start_time = time.perf_counter()

word_tokens = word_tokenize(text)
words = list(word_tokens)  # plain list of token strings

print(time.perf_counter() - start_time, "seconds")

0.004767292983252069 seconds


In [8]:
# Benchmark spaCy word tokenisation. time.clock() was removed in Python 3.8;
# time.perf_counter() is the replacement for measuring elapsed time.
start_time = time.perf_counter()

doc = nlp(text)
words = [token.text for token in doc]  # token strings, comparable to NLTK's output

print(time.perf_counter() - start_time, "seconds")

0.014553756915953286 seconds


## 03 POS Tagging

In [9]:
from nltk import pos_tag

### NLTK

In [10]:
# Benchmark NLTK POS tagging; tokenisation is part of the timed region because
# pos_tag works on the token list. time.clock() was removed in Python 3.8, so
# time.perf_counter() is used for the elapsed-time measurement.
start_time = time.perf_counter()

word_tokens = word_tokenize(text)
pos_tag(word_tokens)  # result is discarded; only the elapsed time is reported

print(time.perf_counter() - start_time, "seconds")

0.3744629953207994 seconds


### spaCy

In [13]:
# This section measures POS tagging, so the tagger must stay enabled; only the
# components the benchmark does not need are switched off. The model is named
# explicitly ('en_core_web_sm', as the later cells do) instead of relying on
# the 'en' shortcut link.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

In [14]:
# Time spaCy's POS tagging the same way the NLTK cell is timed so the two
# numbers can be compared: run the pipeline, then read the tag off each token.
# NOTE(review): token.tag_ is only populated when the loaded pipeline includes
# the tagger — check the spacy.load(...) call in the cell above.
start_time = time.perf_counter()

doc = nlp(text)
pos_tags = [(token.text, token.tag_) for token in doc]

print(time.perf_counter() - start_time, "seconds")

## 04 spaCy Feature Tour

In [None]:
# Load the complete small English pipeline and print, for every token of a
# sample sentence, its text, lemma, coarse and fine POS tags, dependency
# label, shape, and the is_alpha / is_stop flags.
nlp = spacy.load('en_core_web_sm')
doc = nlp(u'Apple is looking at buying U.K. startup for $1 billion')

for tok in doc:
    attributes = (
        tok.text, tok.lemma_, tok.pos_, tok.tag_, tok.dep_,
        tok.shape_, tok.is_alpha, tok.is_stop,
    )
    print(*attributes)

In [None]:
# Named-entity demo. Reuses the `nlp` pipeline already loaded from
# 'en_core_web_sm' in the previous cell instead of loading the model again.
doc = nlp(u'Apple is looking at buying Colombo based startup for $1 billion')

# For each detected entity print its text, start/end character offsets and label.
for ent in doc.ents:
    print(ent.text, ent.start_char, ent.end_char, ent.label_)

In [None]:
from spacy import displacy
 
# Render the entities found in a sample sentence inline in the notebook.
shares_sentence = 'I just bought 2 shares of Apple at 9 a.m. because the stock went up 30% in just 2 days according to the WSJ'
doc = nlp(shares_sentence)
displacy.render(doc, style='ent', jupyter=True)

In [None]:
# Second entity-rendering example, using a sentence that mentions both
# "Apples" and "Apple Inc.".
apples_sentence = 'I am Lahiru and I just bought 2 Apples at 9 a.m. from the Apple Inc. before the current stock went up by 1 billion $'
doc = nlp(apples_sentence)
displacy.render(doc, style='ent', jupyter=True)

In [None]:
# Process a two-sentence string and print each sentence span on its own line.
two_sentences = "These are apples. These are oranges."
doc = nlp(two_sentences)
for sentence_span in doc.sents:
    print(sentence_span)

In [None]:
from spacy import displacy
 
# Draw the dependency tree for a headline; the 'distance' option is passed
# through to displacy unchanged.
dep_options = {'distance': 80}
doc = nlp('Wall Street Journal just published a piece on crypto currencies')
displacy.render(doc, style='dep', jupyter=True, options=dep_options)

In [None]:
# List every noun chunk of the headline with its label and root token text.
headline = "Wall Street Journal just published an interesting piece on crypto currencies"
doc = nlp(headline)
for noun_chunk in doc.noun_chunks:
    chunk_fields = (noun_chunk.text, noun_chunk.label_, noun_chunk.root.text)
    print(*chunk_fields)

In [None]:
# String <-> hash round trip through the vocabulary's string store. Reuses the
# `nlp` pipeline loaded earlier in this section rather than reloading the
# model, and feeds the hash returned by the first lookup into the second one
# instead of repeating it as a hard-coded literal.
doc = nlp(u'I love coffee')
coffee_hash = doc.vocab.strings[u'coffee']
print(coffee_hash)  # 3197928453018144401
print(doc.vocab.strings[coffee_hash])

In [None]:
# Look up the vocabulary entry (lexeme) for every word of a short sentence and
# print its attributes. Reuses the `nlp` pipeline loaded earlier in this
# section rather than reloading the model.
doc = nlp('I love coffee')
for word in doc:
    lexeme = doc.vocab[word.text]
    print(lexeme.text, lexeme.orth, lexeme.shape_, lexeme.prefix_, lexeme.suffix_,
          lexeme.is_alpha, lexeme.is_digit, lexeme.is_title, lexeme.lang_)

In [None]:
# Compare one target sentence against three candidates and print each score.
# NOTE(review): the most recent spacy.load in this notebook is
# 'en_core_web_sm'; spaCy's documentation says the small models ship without
# word vectors, so similarity may warn and be less reliable — consider a
# md/lg model and confirm.
target = nlp("Cats are beautiful animals.")

doc1 = nlp("Dogs are awesome.")
doc2 = nlp("Some gorgeous creatures are felines.")
doc3 = nlp("Dolphins are swimming mammals.")

# Scores noted in the original notebook (presumably from an earlier run), in
# order: 0.8901765218466683, 0.9115828449161616, 0.782295675287610
for candidate in (doc1, doc2, doc3):
    print(target.similarity(candidate))

In [None]:
# For each noun chunk print its text, its root token, the root's dependency
# label and the text of the root's head. Reuses the `nlp` pipeline loaded
# earlier in this section rather than reloading the model.
doc = nlp(u"Autonomous cars shift insurance liability toward manufacturers")
for chunk in doc.noun_chunks:
    print(chunk.text, chunk.root.text, chunk.root.dep_,
          chunk.root.head.text)