# NLTK vs spaCy

In [1]:
import time

## Tokenization

In [None]:
# Read the whole sample corpus into `text`. The context manager closes the
# file handle as soon as the read finishes instead of leaving it open.
with open("data/sample_text.txt", "r", encoding="utf8") as file:
    text = file.read()

### NLTK

In [None]:
from nltk import sent_tokenize, word_tokenize

In [None]:
# Time NLTK word tokenization of the sample text.
# time.clock() was removed in Python 3.8; time.perf_counter() is the
# high-resolution clock meant for measuring short intervals.
start_time = time.perf_counter()

word_tokens = word_tokenize(text)
words = list(word_tokens)

print(time.perf_counter() - start_time, "seconds")

### spaCy

In [None]:
import spacy
# Load the pipeline registered as 'en' with the parser, tagger and ner
# components disabled, then list the component names that remain.
nlp = spacy.load(
    "en",
    disable=["parser", "tagger", "ner"],
)
print(nlp.pipe_names)

In [None]:
# Time spaCy tokenization of the sample text.
# time.clock() was removed in Python 3.8; time.perf_counter() is the
# high-resolution clock meant for measuring short intervals.
start_time = time.perf_counter()

doc = nlp(text)
words = [token.text for token in doc]

print(time.perf_counter() - start_time, "seconds")


In [None]:
# Time NLTK sentence segmentation of the sample text, then print the result.
# perf_counter() is monotonic, so the measured interval cannot be skewed by
# a system clock adjustment the way a time.time() difference can.
start_time = time.perf_counter()

sentences = sent_tokenize(text)
# Holds sentences, not words; the original variable name is kept.
words = list(sentences)

print("--- %s seconds ---" % (time.perf_counter() - start_time))

for sentence in words:
    print(sentence)

In [None]:
# Time spaCy sentence segmentation of the sample text, then print the result.

# Pipeline setup runs before the timer starts so that only the segmentation
# itself is measured. The membership check keeps a re-run of this cell from
# trying to register 'sentencizer' a second time on the same pipeline.
if 'sentencizer' not in nlp.pipe_names:
    nlp.add_pipe(nlp.create_pipe('sentencizer'))

# perf_counter() is monotonic, unlike a time.time() difference.
start_time = time.perf_counter()

doc = nlp(text)
# Holds sentence spans, not words; the original variable name is kept.
words = list(doc.sents)

print("--- %s seconds ---" % (time.perf_counter() - start_time))
for sentence in words:
    print(sentence)

In [None]:
# Print each token's text alongside its `pos_` and `dep_` attributes for a
# two-sentence example.
nlp = spacy.load("en_core_web_sm")
doc = nlp(
    "Apple is looking at buying U.K. startup for $1 billion. "
    "And people still wonder why?"
)
for tok in doc:
    print(tok.text, tok.pos_, tok.dep_)

In [None]:
nlp = spacy.load("en_core_web_sm")
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")

# One output line per token with these attributes, in order:
# text, lemma_, pos_, tag_, dep_, shape_, is_alpha, is_stop.
for tok in doc:
    row = (
        tok.text, tok.lemma_, tok.pos_, tok.tag_, tok.dep_,
        tok.shape_, tok.is_alpha, tok.is_stop,
    )
    print(*row)

In [None]:
nlp = spacy.load("en_core_web_sm")
doc = nlp("Apple is looking at buying Colombo based startup for $1 billion")

# For every entry in doc.ents: its text, start/end character offsets, label.
for entity in doc.ents:
    print(entity.text, entity.start_char, entity.end_char, entity.label_)

In [None]:
from spacy import displacy
 
# Render the sample sentence with displaCy's 'ent' style (jupyter=True).
doc = nlp(
    "I just bought 2 shares of Apple at 9 a.m. because the stock went up 30% "
    "in just 2 days according to the WSJ"
)
displacy.render(doc, jupyter=True, style="ent")

In [None]:
# Render a second sample sentence with displaCy's 'ent' style (jupyter=True).
doc = nlp(
    "I am Lahiru and I just bought 2 Apples at 9 a.m. from the Apple Inc. "
    "before the current stock went up by 1 billion $"
)
displacy.render(doc, jupyter=True, style="ent")

In [None]:
# Print every span yielded by doc.sents for a short example, one per line.
doc = nlp('These are apples. These are oranges.')
for sentence_span in doc.sents:
    print(sentence_span)

In [None]:
from spacy import displacy
 
# Render the sample headline with displaCy's 'dep' style (jupyter=True),
# passing a 'distance' option of 80.
doc = nlp("Wall Street Journal just published a piece on crypto currencies")
render_options = {"distance": 80}
displacy.render(doc, style="dep", jupyter=True, options=render_options)

In [None]:
# For each entry of doc.noun_chunks print its text, its label and the text
# of its root token.
doc = nlp(
    "Wall Street Journal just published an interesting piece "
    "on crypto currencies"
)
for noun_chunk in doc.noun_chunks:
    print(noun_chunk.text, noun_chunk.label_, noun_chunk.root.text)

In [None]:
nlp = spacy.load("en_core_web_sm")
doc = nlp("I love coffee")

# Index the vocabulary's string store in both directions: first by the string
# 'coffee', then by the integer the original notebook recorded for it.
string_store = doc.vocab.strings
print(string_store["coffee"])  # 3197928453018144401
print(string_store[3197928453018144401])

In [None]:
nlp = spacy.load("en_core_web_sm")
doc = nlp("I love coffee")

# Look each token's text up in doc.vocab and print these attributes of the
# resulting entry, in order: text, orth, shape_, prefix_, suffix_, is_alpha,
# is_digit, is_title, lang_.
for tok in doc:
    lexeme = doc.vocab[tok.text]
    print(
        lexeme.text, lexeme.orth, lexeme.shape_, lexeme.prefix_,
        lexeme.suffix_, lexeme.is_alpha, lexeme.is_digit, lexeme.is_title,
        lexeme.lang_,
    )

In [None]:
# Compare one target sentence against three other sentences with
# Doc.similarity and print each score.
# NOTE(review): `nlp` is whichever pipeline was loaded most recently; confirm
# it is the one that produced the scores noted below before relying on them.
target = nlp("Cats are beautiful animals.")

doc1 = nlp("Dogs are awesome.")
doc2 = nlp("Some gorgeous creatures are felines.")
doc3 = nlp("Dolphins are swimming mammals.")

# Scores recorded in the original notebook, in print order:
#   0.8901765218466683, 0.9115828449161616, 0.782295675287610
for candidate in (doc1, doc2, doc3):
    print(target.similarity(candidate))

In [None]:
nlp = spacy.load("en_core_web_sm")
doc = nlp("Autonomous cars shift insurance liability toward manufacturers")

# For each entry of doc.noun_chunks print its text, then its root token's
# text, `dep_` value and head text.
for noun_chunk in doc.noun_chunks:
    root = noun_chunk.root
    print(noun_chunk.text, root.text, root.dep_, root.head.text)