# Introduction to NLP


## Installing Libraries
* spaCy https://spacy.io/


In [None]:
!pip install spacy --upgrade

In [None]:
import spacy
# display the version of the spaCy package that was imported
spacy.__version__

In [None]:
# download the `en_core_web_sm` English pipeline that is loaded further down
!python -m spacy download en_core_web_sm

# POS (part-of-speech)
* POS (part of speech): the grammatical category of a word, e.g. noun, adjective, verb
* important to find named entities
* Tokens: https://spacy.io/api/annotation#pos-tagging

Tags: https://ashutoshtripathi.com/2020/04/13/parts-of-speech-tagging-and-dependency-parsing-using-spacy-nlp/

In [None]:
import en_core_web_sm
from spacy import displacy
from spacy.lang.en.stop_words import STOP_WORDS

# load the `en_core_web_sm` English pipeline; `nlp` is then called on raw text
# in the cells below to obtain an analysed document
nlp = spacy.load('en_core_web_sm')

In [None]:
# text to be processed by the spaCy pipeline
document = nlp('I am Learning natural language processing. The course is in India.')

# visit every token of the document and show it next to its
# part-of-speech label (pos_)
for token in document:
  print(f"{token.text} {token.pos_}")

## Legend

- lemma: "root" of the word
- pos: part-of-speech  
- tag: morphological information (present, future, past)
- dep: syntactic dependency
- shape: lowercase, uppercase
- alpha: if it consists only of alphabetic characters
- stop: if it is a stop word

In [None]:
# print one pipe-separated row per token, in the column order given in the
# legend: text, pos, lemma, tag, dep, shape, alpha, stop
for token in document:
  fields = (
    token.text, token.pos_, token.lemma_, token.tag_,
    token.dep_, token.shape_, token.is_alpha, token.is_stop,
  )
  # the trailing space reproduces the row layout exactly
  s = " | ".join(str(field) for field in fields) + " "

  print(s)

In [None]:

# map the part-of-speech labels of interest to the prefix printed for them
labels = {'PROPN': 'proper noun:', 'VERB': 'verb:'}

for token in document:
  # a token carries a single pos_ value, so at most one label applies
  if token.pos_ in labels:
    print(labels[token.pos_], token.text)

# Lemmatization and stemming

- Lemmatization: meaning of the word based on the dictionary (morphological analysis) - extract the base word
- Stemming: extract the root of the word

* Lemmatization is preferred because it extracts meaningful words through morphological analysis, whereas that meaning can be lost in stemming

In [None]:
# show every token of the document next to its lemma
for token in document:
  print(f"{token.text} | {token.lemma_}")

In [None]:
# several inflected forms of two verbs, to see what their lemmas look like
document2 = nlp('learn learning watch watching watched')

# the last expression of the cell: the lemma of each word, in order
[word.lemma_ for word in document2]

## Lemmatization X stemming


In [None]:
import nltk

# NLTK's Porter stemmer, used below for comparison with spaCy's lemmas
stemmer = nltk.stem.PorterStemmer()

# stem a single word; the cell displays the result
stemmer.stem('learning')

In [None]:
# one row per token: original text | spaCy lemma | Porter stem
for token in document:
  stem = stemmer.stem(token.text)
  print(f"{token.text} | {token.lemma_} | {stem}")

# Named-entity recognition (NER)

- List of tags: https://towardsdatascience.com/named-entity-recognition-ner-using-spacy-nlp-part-4-28da2ece57c6

* Find and classify entities in text, such as
  * people
  * location
  * money
  * numbers
* Can be used to know the subjects in the spoken language

Labels -
 * GPE : geopolitical entity (countries, cities, states)
 * ORG : organization
 * DATE : date
 * MONEY : money

In [33]:
# sample sentence for the NER demo; it mentions a company, places, a year and an amount of money
text = 'cisco is a US company on networking, security and collaboration tech. It is located in Bangalore and revenue in 2020 was approximatly 500 billion dollars.'


In [None]:
# run the pipeline on the sample text
document = nlp(text)

# list every recognised entity together with its label
for entity in document.ents:
  print(f"{entity.text} | {entity.label_}")

In [None]:
from spacy import displacy

# draw the document with its entities highlighted (style='ent'), rendered for Jupyter
displacy.render(document, style='ent', jupyter=True)

In [None]:
text = 'Bill Gates was born in Seattle on 1955-10-28 and is the founder of Microsoft'
document = nlp(text)

# print only the entities that were labelled as people
for entity in document.ents:
  if entity.label_ != 'PERSON':
    continue
  print(entity.text)

# then show all entities of the sentence, highlighted for Jupyter
displacy.render(document, style='ent', jupyter=True)

# Stopwords
- Words that appear very often and don't help to understand the context of the document

example : it

In [None]:
# STOP_WORDS comes from spacy.lang.en.stop_words (imported in an earlier cell)
# print the whole collection of stop words
print(STOP_WORDS)

# membership test for a single word
print('it' in STOP_WORDS)

# number of stop words in the collection
print(len(STOP_WORDS))

# the stop-word flag as exposed on the vocabulary entry for 'it'
print(nlp.vocab['it'].is_stop)

In [None]:
document = nlp('I am Learning natural language processing. The course is in India.')

# Partition the tokens in a single pass. `token.is_stop` reads the stop-word
# flag directly from the token, so no extra `nlp.vocab[...]` lookup is needed
# (the attribute-table cell earlier in the notebook reads the flag the same way).
stop_words = []
other_words = []
for token in document:
  if token.is_stop:
    stop_words.append(token.text)
  else:
    other_words.append(token.text)

print('stop words - ')
for word in stop_words:
  print(word)

print("----------------------------------")

print('not stop words - ')
for word in other_words:
  print(word)

# Dependency parsing

- Parent-child relation



## Example 1 : find relation between London and Paris in text

In [65]:
document = nlp('book a ticket from London to Paris')

# for every entity labelled GPE keep its first token
# (entity.start is the index of that token in the document)
loc: list = [
  document[entity.start]
  for entity in document.ents
  if entity.label_ == 'GPE'
]

print(loc)

# the ancestors of a token show how it is attached to the rest of the sentence
origin = loc[0]
print(list(origin.ancestors))

dest = loc[1]
print(list(dest.ancestors))

# check ancestry: is token 0 an ancestor of token 2?
document[0].is_ancestor(document[2])

[London, Paris]


## Example 2

In [None]:
document = nlp('Book a table for the restaurant and a taxi to the hotel')

# print the attribute row of every noun in the sentence
for token in document:
  if token.pos_ != 'NOUN':
    continue
  print(
    f"{token.text} | {token.pos_} | {token.lemma_} | {token.tag_} | "
    f"{token.dep_} | {token.shape_} | {token.is_alpha} | {token.is_stop} "
  )

# tokens picked by position in the sentence above:
# the things to book and the places they refer to
tasks = (document[2], document[8])
locations = (document[5], document[11])

print(tasks, locations)

# show the chain of ancestors of each location
for local in locations:
  print("------------", local)
  for obj in local.ancestors:
    print(obj)

In [None]:

# for each location, report the first of its ancestors that is one of the tasks
for local in locations:
  obj = next((ancestor for ancestor in local.ancestors if ancestor in tasks), None)
  if obj is not None:
    print(f'Reservation of {obj} to the {local}')

In [None]:
# tokens that hang directly below document[5] in the dependency tree
list(document[5].children)

## Example 3

In [None]:
from spacy import displacy
# same sentence as in Example 2
document = nlp('Book a table for the restaurant and a taxi to the hotel')

# visualize the dependency relations (style='dep'); the 'distance' option is set to 90
displacy.render(document, style='dep', jupyter=True, options={'distance': 90})

In [None]:
# ancestors of document[2] ("table" in the sentence above)
list(document[2].ancestors)

In [None]:
# children of document[2] ("table" in the sentence above)
list(document[2].children)

## Example 4

In [None]:
document = nlp('What places can we visit in London and stay in Paris?')
locations: list = []
actions: list = []

# sort tokens into the two lists: verbs are actions, proper nouns are locations
buckets = {'VERB': actions, 'PROPN': locations}
for token in document:
  if token.pos_ in buckets:
    buckets[token.pos_].append(token)

print(f"--------actions: {actions} \n")
print(f"--------locations: {locations} \n")

# pair each location with the first of its ancestors that is an action
for local in locations:
  action = next((ancestor for ancestor in local.ancestors if ancestor in actions), None)
  if action is not None:
    print(f"{local} to {action}")

In [None]:
# draw the dependency relations of the sentence above, rendered for Jupyter
displacy.render(document, style='dep', jupyter=True, options={'distance': 90})

# Similarity between words and sentences

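- spaCy's word vectors have been based on the GloVe algorithm (Global Vectors for Word Representation); note that the small `en_core_web_sm` pipeline loaded above ships without static word vectors, so its similarity scores are only rough estimates (use `en_core_web_md` or `en_core_web_lg` for real vectors)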
- Original paper: https://nlp.stanford.edu/pubs/glove.pdf

## Example 1

In [None]:
# three single-word documents to compare
w1 = nlp('hello')
w2 = nlp('hi')
w3 = nlp('or')

# print the similarity score of each pair, in this order:
# hello/hi, hi/hello, hello/or, hi/or
for left, right in ((w1, w2), (w2, w1), (w1, w3), (w2, w3)):
  print(left.similarity(right))

In [104]:
# a question, a sentence on the same topic, and an unrelated question
text1 = nlp('When will the new movie be released?')
text2 = nlp('The new movie will be released next month')
text3 = nlp('What color is the car?')

# compare the first sentence with each of the other two
for other in (text2, text3):
  print(text1.similarity(other))

0.701367333985553
0.4782758141062681


  print(text1.similarity(text2))
  print(text1.similarity(text3))


## Example 2

In [106]:
text = nlp('cat dog horse person')

# compare every token with every token (itself included) and report the
# score as a percentage
# NOTE: the loop variables reuse the names text1/text2 of the previous example
for text1 in text:
  for text2 in text:
    similarity = text1.similarity(text2) * 100
    print(f'{text1} is {similarity}% similar to {text2}')


cat is 100.0% similar to cat
cat is 55.56725263595581% similar to dog
cat is 49.9476432800293% similar to horse
cat is 19.96726244688034% similar to person
dog is 55.56725263595581% similar to cat
dog is 100.0% similar to dog
dog is 66.69515371322632% similar to horse
dog is 35.0044310092926% similar to person
horse is 49.9476432800293% similar to cat
horse is 66.69515371322632% similar to dog
horse is 100.0% similar to horse
horse is 28.581640124320984% similar to person
person is 19.96726244688034% similar to cat
person is 35.0044310092926% similar to dog
person is 28.581640124320984% similar to horse
person is 100.0% similar to person


  similarity = text1.similarity(text2) * 100
