# Setup
Run the following commands in a shell before executing the notebook:
- `python -m spacy download en`
- `python -m spacy download en_core_web_md`
- `python -m spacy download parser`
- `python -m spacy download glove`



In [1]:
import spacy

In [2]:
# Load the `en_core_web_md` model once; the cells below all call this `nlp` object.
model_name = 'en_core_web_md'
nlp = spacy.load(model_name)

In [3]:
# Tokenisation check: run the pipeline over a short string and echo the Doc.
tokenize_text = u"this's spacy tokenize test"
doc1 = nlp(tokenize_text)
print(doc1)


this's spacy tokenize test


In [4]:
# Iterating over the Doc yields its tokens; print one per line.
for tok in doc1:
    print(tok)

this
's
spacy
tokenize
test


### Sentence Tokenize Test or Sentence Segmentation Test:


In [5]:
# Four short sentences ending in '.', '!' and '?' for the sentence splitter.
sentence_text = (
    u"this is spacy sentence tokenize test. "
    u"this is second sent! "
    u"is this the third sent? "
    u"final test."
)
doc2 = nlp(sentence_text)

In [6]:
# `doc2.sents` iterates over the sentences found in the text; print each one.
for sentence in doc2.sents:
    print(sentence)

this is spacy sentence tokenize test.
this is second sent!
is this the third sent?
final test.


### Lemmatize Test:

In [7]:
# Sample text with inflected forms (is/are, books, others) for the lemma demo.
lemma_input = u"this is spacy lemmatize testing. programming books are more better than others"
doc3 = nlp(lemma_input)

In [8]:
# For each token show its text, the integer lemma ID and the lemma string.
for tok in doc3:
    lemma_id, lemma_str = tok.lemma, tok.lemma_
    print(tok, lemma_id, lemma_str)

this 530 this
is 522 be
spacy 173815 spacy
lemmatize 1484778 lemmatize
testing 2933 testing
. 453 .
programming 3441 programming
books 1045 book
are 522 be
more 563 more
better 649 better
than 589 than
others 598 other


### POS Tagging Test:

Background reading on part-of-speech tags:

- http://www.winwaed.com/blog/2011/11/08/part-of-speech-tags/
- http://www.clips.ua.ac.be/pages/mbsp-tags

In [9]:
 doc4 = nlp(u"This is pos tagger test for spacy pos tagger")

In [10]:
# Show each token with its POS tag as an integer ID and as a string label.
for tok in doc4:
    pos_id, pos_label = tok.pos, tok.pos_
    print(tok, pos_id, pos_label)

This 88 DET
is 98 VERB
pos 82 ADJ
tagger 90 NOUN
test 90 NOUN
for 83 ADP
spacy 90 NOUN
pos 90 NOUN
tagger 90 NOUN


### Named Entity Recognizer (NER) Test:

In [11]:
# Sentence mentioning a person, an organisation and a place for the NER demo.
ner_text = u"Rami Eid is studying at Stony Brook University in New York"
doc5 = nlp(ner_text)

In [12]:
# List each recognised entity with its label as an integer ID and as a string.
for entity in doc5.ents:
    label_id, label_name = entity.label, entity.label_
    print(entity, label_id, label_name)

Rami Eid 377 PERSON
Stony Brook University 380 ORG
New York 381 GPE


### Noun Chunk Test:

In [13]:
# Sentence whose noun chunks are extracted in the next cell.
chunk_text = (
    u"Natural language processing (NLP) deals with the application "
    u"of computational models to text or speech data."
)
doc6 = nlp(chunk_text)

In [14]:
# Print every noun chunk of `doc6`.
# The loop variable is deliberately not called `np`: a cell-level loop variable
# stays in the kernel namespace after the loop, and `np` is the conventional
# alias for numpy, so reusing it invites confusing name clashes later on.
for chunk in doc6.noun_chunks:
    print(chunk)

Natural language processing (NLP) deals
the application
computational models
text
speech
data


### Word Vectors Test:

In [15]:
doc7 = nlp(u"Apples and oranges are similar. Boots and hippos aren't.")
# Pick the four tokens to compare by position in the Doc.
apples, oranges, boots, hippos = (doc7[idx] for idx in (0, 2, 6, 8))
# Print the similarity score of each pair, in the same order as before.
for left, right in ((apples, oranges), (boots, hippos)):
    print(left.similarity(right))

0.77809414836
0.038474555379


### Multi-threaded generator

In [17]:
texts = [u'One document.', u'...', u'Lots of documents']
# Index of the last document to check; the loop stops once it is reached.
last_index = 100
# .pipe streams input, and produces streaming output.
# Cycle through `texts` using its actual length (instead of a hard-coded 3) so
# the cell keeps working when documents are added to or removed from the list.
iter_texts = (texts[i % len(texts)] for i in range(100000000))
# NOTE(review): `n_threads` and `doc.is_parsed` belong to older spaCy APIs —
# confirm they are still accepted by the installed spaCy version.
for i, doc in enumerate(nlp.pipe(iter_texts, batch_size=50, n_threads=4)):
    assert doc.is_parsed
    if i == last_index:
        break

In [18]:
# Show the token bound to `apples` in the word-vectors cell; a bare final
# expression is echoed as the cell's output.
apples

Apples