# Setup
- python -m spacy download en
- python -m spacy download en_core_web_md
- python -m spacy download parser
- python -m spacy download glove



In [None]:
import spacy
import numpy as np
from collections import defaultdict

In [None]:
nlp = spacy.load('en_core_web_md')

In [None]:
doc1 = nlp(u"this's spacy tokenize test")
print(doc1)

In [None]:
doc1[0].rank #word index

In [None]:
[tok.text for tok in doc1]

In [6]:
for token in doc1:
    print(token)

this
's
spacy
tokenize
test


### Sentence Tokenize Test or Sentence Segmentation Test:


In [7]:
doc2 = nlp(u"this is spacy sentence tokenize test. this is second sent! is this the third sent? final test.")

In [8]:
for sent in doc2.sents:
    print(sent)

this is spacy sentence tokenize test.
this is second sent!
is this the third sent?
final test.


### Lemmatize Test:

In [9]:
doc3 = nlp(u"this is spacy lemmatize testing. programming books are more better than others")

In [10]:
for token in doc3:
    print(token, token.lemma, token.lemma_)

this 530 this
is 522 be
spacy 173815 spacy
lemmatize 1484778 lemmatize
testing 2933 testing
. 453 .
programming 3441 programming
books 1045 book
are 522 be
more 563 more
better 649 better
than 589 than
others 598 other


### Pos Tagging Test:

http://www.winwaed.com/blog/2011/11/08/part-of-speech-tags/  
http://www.clips.ua.ac.be/pages/mbsp-tags  
https://www.analyticsvidhya.com/blog/2017/04/natural-language-processing-made-easy-using-spacy-%E2%80%8Bin-python/

In [11]:
doc4 = nlp(u"This is pos tagger test for spacy pos tagger")

In [12]:
for token in doc4:
    print(token, token.pos, token.pos_)

This 88 DET
is 98 VERB
pos 82 ADJ
tagger 90 NOUN
test 90 NOUN
for 83 ADP
spacy 90 NOUN
pos 90 NOUN
tagger 90 NOUN


### Named Entity Recognizer (NER) Test:

In [13]:
doc5 = nlp(u"Rami Eid is studying at Stony Brook University in New York")

In [14]:
for ent in doc5.ents:
    print(ent, ent.label, ent.label_)

Rami Eid 377 PERSON
Stony Brook University 380 ORG
New York 381 GPE


### Noun Chunk Test:

In [15]:
doc6 = nlp(u"Natural language processing (NLP) deals with the application of computational models to text or speech data.")

In [26]:
for noun in doc6.noun_chunks:
    print(noun)

Natural language processing (NLP) deals
the application
computational models
text
speech
data


### Word Vectors Test:

In [17]:
doc7 = nlp(u"Apples and oranges are similar. Boots and hippos aren't.")
apples = doc7[0]
print(apples)
oranges = doc7[2]
boots = doc7[6]
hippos = doc7[8]
print(apples.similarity(oranges))
print(boots.similarity(hippos))

Apples
0.77809414836
0.038474555379


### Multi-threaded generator

In [18]:
texts = [u'One document.', u'...', u'Lots of documents']
# .pipe streams input, and produces streaming output
iter_texts = (texts[i % 3] for i in range(100000000))
for i, doc in enumerate(nlp.pipe(iter_texts, batch_size=50, n_threads=4)):
    assert doc.is_parsed
    if i == 100:
        break

### Deeplearning
https://spacy.io/docs/usage/deep-learning

In [34]:
test_sent = 'Let us see what comes for MACHINE, machine and an outtttoffword'

In [35]:
test_sent_parsed = nlp(test_sent)
test_sent_tok = [tok for tok in test_sent_parsed]

In [28]:
def get_spacy_embedding_matrix(nlp):
    vocab = nlp.vocab
    max_rank = max(lex.rank for lex in vocab if lex.has_vector)
    vectors = np.ndarray((max_rank + 1, vocab.vectors_length), dtype='float32')
    for lex in vocab:
        if lex.has_vector:
            vectors[lex.rank] = lex.vector
    return vectors

In [86]:
def get_word_and_vector(index, nlp):
     return (nlp.vocab[index].text, nlp.vocab[index].vector)

In [87]:
[get_word_and_vector(i, nlp) for i in range(10)]

[('', array([ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.,  0.,  0., 

In [63]:
def spacy_word2vec(word, nlp):
    lex = nlp(word)
    if lex.has_vector:
        return lex.vector
    else:
        return nlp.vocab[0].vector #return all zeros for Out of vocab
    

In [67]:
spacy_word2vec('out', nlp)

array([  4.96499985e-02,   5.29359989e-02,  -1.97679996e-01,
         7.42449984e-02,   1.59140006e-01,   1.20999999e-02,
        -2.99290001e-01,   1.89219993e-02,   5.11710010e-02,
         2.51090002e+00,  -1.52899995e-01,   2.05960006e-01,
         1.25239998e-01,  -3.28860015e-01,  -3.43309999e-01,
        -1.52260005e-01,  -1.91770002e-01,   8.24530005e-01,
        -4.68430012e-01,   7.05149993e-02,   1.49629995e-01,
         4.56800014e-02,   7.76799978e-04,   1.80930004e-01,
        -7.81830028e-02,   8.67969997e-04,   6.80819973e-02,
        -1.10560000e-01,   7.20390007e-02,  -6.18350029e-01,
         1.24059998e-05,   1.79120004e-01,  -1.98819995e-01,
         4.12929989e-02,   2.91409999e-01,  -1.22970000e-01,
         7.59010017e-02,   2.95679986e-01,  -2.19359994e-03,
         1.19340001e-02,  -1.85090005e-01,   3.61110009e-02,
        -1.82980001e-01,  -3.82050008e-01,   2.73400009e-01,
         1.95230007e-01,  -4.38049994e-02,   5.28290011e-02,
        -1.59729999e-02,

### A small data indexer

In [70]:
out_of_word = 0
padding = 0
word_to_index = {}
index_to_word= {}
word_to_index[nlp.vocab[0].text] = 0
for i, token in enumerate(set(test_sent_tok),1):
    word_to_index[token.text] = i

print(word_to_index)

for word, i in word_to_index.items():
    index_to_word[i] = word

print(index_to_word)

print(word_to_index['Let'])
print(index_to_word[8])

{'': 0, 'see': 1, 'what': 2, 'comes': 3, 'for': 4, 'machine': 5, 'MACHINE': 6, 'and': 7, 'Let': 8, 'us': 9}
{0: '', 1: 'see', 2: 'what', 3: 'comes', 4: 'for', 5: 'machine', 6: 'MACHINE', 7: 'and', 8: 'Let', 9: 'us'}
8
Let


In [75]:
def get_embedding_matrix(index_to_word, nlp):
    vocab_size = len(index_to_word)
    vectors = np.ndarray((vocab_size, nlp.vocab.vectors_length), dtype='float32')
    for i, word in index_to_word.items():
        vectors[i] = spacy_word2vec(word, nlp)
    return vectors
    

In [77]:
embeddings = get_embedding_matrix(index_to_word, nlp)

embeddings.shape

(10, 300)

### Lets see the index value and their values

In [None]:
for i in range(1000): print(i, nlp.vocab[i].text, nlp.vocab[i].has_vector)

In [None]:
x = nlp('This is a tessssst')
[w.is_oov for w in x]

In [None]:
test_sent_parsed[6], test_sent_parsed[6].rank, '----->Glove Vector', test_sent_parsed[6].vector

In [None]:
test_sent_parsed[8], test_sent_parsed[8].rank, '----->Glove Vector', test_sent_parsed[8].vector

In [None]:
def token_to_index(tokens, max_length):
    Xs = []
    for i, token in enumerate(tokens[:max_length]):
        Xs.append(token.rank if token.has_vector else 0)
    return Xs

In [None]:
test_sent_tok

In [None]:
token_to_index(test_sent_tok, 10)