# Introduction Natrual Lanuage Processing (NLP) using Spacy

In [13]:
import spacy 
import numpy as np
nlp = spacy.load("en_core_web_md") 
doc = nlp("The big grey dog ate all of the chocolate, but fortunately he wasn't sick!")

## Tokenization

In [14]:
doc.text.split() 

['The',
 'big',
 'grey',
 'dog',
 'ate',
 'all',
 'of',
 'the',
 'chocolate,',
 'but',
 'fortunately',
 'he',
 "wasn't",
 'sick!']

In [15]:
[token.orth_ for token in doc] 

['The',
 'big',
 'grey',
 'dog',
 'ate',
 'all',
 'of',
 'the',
 'chocolate',
 ',',
 'but',
 'fortunately',
 'he',
 'was',
 "n't",
 'sick',
 '!']

In [16]:
[token.orth_ for token in doc if not token.is_punct | token.is_stop] 

['The',
 'big',
 'grey',
 'dog',
 'ate',
 'all',
 'of',
 'the',
 'chocolate',
 'but',
 'fortunately',
 'he',
 'was',
 "n't",
 'sick']

## Lemmatization

In [17]:
practice = "practice practiced practicing" 
nlp_practice = nlp(practice) 
[word.lemma_ for word in nlp_practice] 
['practice', 'practice', 'practice']

['practice', 'practice', 'practice']

## Entity recognization

In [18]:
wiki_obama = """Barack Obama is an American politician who served as ...: the 44th President of the United States from 2009 to 2017. He is the first ...: African American to have served as president, ...: as well as the first born outside the contiguous United States.""" 
nlp_obama = nlp(wiki_obama)
[(i, i.label_) for i in nlp_obama.ents] 

[(Barack Obama, 'PERSON'),
 (American, 'NORP'),
 (44th, 'ORDINAL'),
 (the United States, 'GPE'),
 (2009, 'DATE'),
 (2017, 'CARDINAL'),
 (first, 'ORDINAL'),
 (African American, 'NORP'),
 (first, 'ORDINAL'),
 (United States, 'GPE')]

## Part of speech (POS) tagging

In [19]:
doc2 = nlp("Yesterday Germany won the world cup again now the sixth time in a row.") 
pos_tags = [(i, i.tag_) for i in doc2]
pos_tags 

[(Yesterday, 'NN'),
 (Germany, 'NNP'),
 (won, 'VBD'),
 (the, 'DT'),
 (world, 'NN'),
 (cup, 'NN'),
 (again, 'RB'),
 (now, 'RB'),
 (the, 'DT'),
 (sixth, 'JJ'),
 (time, 'NN'),
 (in, 'IN'),
 (a, 'DT'),
 (row, 'NN'),
 (., '.')]

# Similarity between vectors

To measure how similar two words are, we need a way to measure the degree of similarity between two embedding vectors for the two words. Given two vectors $u$ and $v$, cosine similarity is defined as follows: 

$$\text{CosineSimilarity(u, v)} = \frac {u . v} {||u||_2 ||v||_2} = cos(\theta) \tag{1}$$

where $u.v$ is the dot product (or inner product) of two vectors, $||u||_2$ is the norm (or length) of the vector $u$, and $\theta$ is the angle between $u$ and $v$. This similarity depends on the angle between $u$ and $v$. If $u$ and $v$ are very similar, their cosine similarity will be close to 1; if they are dissimilar, the cosine similarity will take a smaller value. 

**Reminder**: The norm of $u$ is defined as $ ||u||_2 = \sqrt{\sum_{i=1}^{n} u_i^2}$

In [21]:
def cos_sim(vec1,vec2):
    return np.dot(vec1.vector, vec2.vector) / (vec1.vector_norm * vec2.vector_norm)

def cos_sim2(vec1,vec2):
    return np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2))

# => already builtin vector.similarity(othervector)

# king and queen
king = nlp.vocab['king']
man = nlp.vocab['man']
woman = nlp.vocab['woman']
queen = nlp.vocab['queen']

# countries
germany = nlp.vocab['germany']
italy = nlp.vocab['italy']
usa = nlp.vocab['usa']

#cities
berlin = nlp.vocab['berlin']
newyork = nlp.vocab['new york city']
rome = nlp.vocab['rome']
milano = nlp.vocab['milano']
munich = nlp.vocab['munich']

# food
pizza = nlp.vocab['pizza']
pasta = nlp.vocab['pasta']
burger = nlp.vocab['burger']


print("king and queen: {}".format(king.similarity(queen)))
print("italy and berlin: {}".format(italy.similarity(berlin)))
print("italy and rome: {}".format(italy.similarity(rome)))
print("italy and milano: {}".format(italy.similarity(milano)))
print("italy and munich: {}".format(italy.similarity(munich)))

print("italy and pasta: {}".format(italy.similarity(pasta)))
print("italy and pizza: {}".format(italy.similarity(pizza)))
print("italy and burger: {}".format(italy.similarity(burger)))


print(cos_sim2(germany.vector-berlin.vector,italy.vector-rome.vector))
print(cos_sim2(germany.vector-berlin.vector,italy.vector-milano.vector))
print(cos_sim2(germany.vector-berlin.vector,italy.vector-munich.vector))
print(cos_sim2(germany.vector-berlin.vector,italy.vector-munich.vector))

king and queen: 0.7252610325813293
italy and berlin: 0.5053220987319946
italy and rome: 0.722023069858551
italy and milano: 0.5679553151130676
italy and munich: 0.5211115479469299
italy and pasta: 0.334966242313385
italy and pizza: 0.34625598788261414
italy and burger: 0.1485363394021988
0.365557
0.4164321
0.33112225
0.33112225


# Synonym finder

In [22]:
def most_similar(word):
    queries = [w for w in word.vocab if w.is_lower == word.is_lower and w.prob >= -15]
    by_similarity = sorted(queries, key=lambda w: word.similarity(w), reverse=True)
    return by_similarity[:10]
 
[w.lower_ for w in most_similar(nlp.vocab['dog'])]

['kennel',
 'dog',
 'canine',
 'hound',
 'canines',
 'dogs',
 'puppy',
 'poodle',
 'terrier',
 'husky']

# Word analogy
In the word analogy task, we complete the sentence <font color='brown'>"*a* is to *b* as *c* is to **____**"</font>. An example is <font color='brown'> '*man* is to *woman* as *king* is to *queen*' </font>. In detail, we are trying to find a word *d*, such that the associated word vectors $e_a, e_b, e_c, e_d$ are related in the following manner: $e_b - e_a \approx e_d - e_c$. We will measure the similarity between $e_b - e_a$ and $e_d - e_c$ using cosine similarity. 

In [24]:
# a is to be as c is to d
# find d

a = nlp("germany")
b = nlp("soccer")
c = nlp("usa")

# b - a = d - c

def most_similar():
    queries = [w for w in nlp.vocab]
    by_similarity = sorted(queries, key=lambda w: cos_sim2(b.vector-a.vector, c.vector - w.vector), reverse=True)
    return by_similarity[:10]

[w.orth_ for w in most_similar()]

  """


['germany',
 'Germany',
 'GERMANY',
 'Rhine',
 'Salzburg',
 'lech',
 'livigno',
 'deutschland',
 'Styria',
 'Innsbruck']