# Natural Language Processing: summarization, search, representation, and similarity

In [None]:
!pip install spacy

In [None]:
import spacy
spacy.__version__

In [None]:
!python -m spacy download en_core_web_sm

In [None]:
# used for webscraping
!pip install goose3

In [None]:
# summarization lib
!pip install sumy

## Imports

In [19]:
import en_core_web_sm
import nltk
import matplotlib.pyplot as plt

from spacy.matcher import PhraseMatcher

from spacy import displacy
from goose3 import Goose
from wordcloud import WordCloud

from IPython.core.display import HTML

from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.sum_basic import SumBasicSummarizer

# ML lib
from sklearn.feature_extraction.text import CountVectorizer

# using nltk
nltk.download('punkt')

nlp = spacy.load('en_core_web_sm')
# goose3 helps in extract text from websites
g = Goose()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


## Loading texts from the Internet

In [None]:
url = 'https://en.wikipedia.org/wiki/Natural_language_processing'
# get data from url
article = g.extract(url)

In [None]:
article.title

In [None]:
# get the cleaned text from the site, with all CSS removed
article.cleaned_text

In [None]:
article.authors

In [None]:
# get all links in the site
article.links

# Named entity recognition

Tags: https://ashutoshtripathi.com/2020/04/13/parts-of-speech-tagging-and-dependency-parsing-using-spacy-nlp/

In [None]:
document = nlp(article.cleaned_text)

for token in document:
  print(token.text,'-',token.pos_)

In [None]:
displacy.render(document, style = 'ent', jupyter=True)

In [None]:
for entity in document.ents:
  if entity.label_ == 'PERSON':
    print(entity.text)

## Most frequent words

In [None]:
# using nltk
tokens = nltk.tokenize.word_tokenize(article.cleaned_text)


In [None]:
# tokens
len(tokens)

In [None]:
frequncy = nltk.FreqDist(tokens)

# get the top 50 most frequent tokens
most_common = frequncy.most_common(50)

most_common

# Word cloud

In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt

cloud = WordCloud()
cloud = cloud.generate(article.cleaned_text)

plt.figure(figsize=(15,15))
plt.axis('off')
plt.imshow(cloud);

# Preprocessing the texts (frequency and word cloud)

In [None]:
def preprocess(s: str) -> str:
  """Normalize raw text into a space-separated string of content words.

  The text is lower-cased, the characters '.', '[' and ']' are removed, and
  the result is tokenized with the notebook-level spaCy pipeline `nlp`.
  Stop words, number-like tokens, punctuation, whitespace tokens and
  single-character tokens are dropped.

  Args:
    s: Raw text to clean.

  Returns:
    The texts of the surviving tokens joined by single spaces
    (a string, not a list).
  """
  # one translate pass replaces the three chained .replace() calls
  s = s.lower().translate(str.maketrans('', '', '.[]'))

  # remove stop words, punctuation, spaces, numbers and one-character tokens
  kept = [
    token.text
    for token in nlp(s)
    if not (token.is_stop or token.like_num or token.is_punct or token.is_space or len(token) == 1)
  ]

  return ' '.join(kept)

In [None]:
preprocess('TesT NlP it the process 1 1213 ! . ,      d ')

In [None]:
article_cleaned = preprocess(article.cleaned_text)
# article_cleaned

len(article.cleaned_text),len(article_cleaned)

# tokenize and get most frequent words
tokens = nltk.tokenize.word_tokenize(article_cleaned)
frequncy = nltk.FreqDist(tokens)

# get the top 50 most frequent tokens
most_common = frequncy.most_common(50)

most_common

In [None]:
cloud = WordCloud()
cloud = cloud.generate(article_cleaned)

plt.figure(figsize=(15,15))
plt.axis('off')
plt.imshow(cloud);

# Text summarization

- sumy library: https://pypi.org/project/sumy/

Steps to follow -
* **Preprocessing of text**-
  remove stop words, punctuation, spaces, and numbers

* **Calculate Word frequency** -
  get frequency of each word in preprocessed text

* **Define Weight for word frequency** -
  calculate the weight for each word
  
  Weight formula:
  ```
  number of times the word appears / highest frequency value

  or

  current frequency / highest frequency
  ```
* **Sentence tokenization** -
  split the original paragraph into multiple sentences (based on `.`),
  NOTE - stop words are not considered

* **Score for the sentence** -
  use the calculated word weights and add them up based on the words' appearance in the sentence

* **Order the sentence** -
  * order the sentences based on the score calculated,
  * for this we also need to define how many sentences to be selected.
  * larger the paragraph we need to select more sentences

* **Generate Summary**


In [None]:
url = 'https://en.wikipedia.org/wiki/Natural_language_processing'
# get data from url
article = g.extract(url)

In [None]:
article.cleaned_text

In [None]:
# use nltk to tokenize sentences in article
original_sentences: list = []

for sentence in nltk.sent_tokenize(article.cleaned_text):
  original_sentences.append(sentence)

In [None]:
# use sumy to score the sentences and select the best ones
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.sum_basic import SumBasicSummarizer

from IPython.core.display import HTML

# create parser
parser = PlaintextParser.from_string(article.cleaned_text, Tokenizer('english'))
# get the SumBasic summarizer
summarizer = SumBasicSummarizer()
# create summary with the best 50% of the sentences; integer division hands
# the summarizer a whole sentence count instead of a float such as 40.5
summary_size = len(original_sentences) // 2
summary = summarizer(parser.document, summary_size)

# keep the selected sentences as plain strings so they can be compared
# against the nltk-tokenized sentences
best_sentences: list = [str(sentence) for sentence in summary]

In [None]:
import html

from IPython.core.display import HTML

# Render the whole article with the sentences picked for the summary highlighted.
display(HTML(f'<h2>Summary - {html.escape(str(article.title))}</h2>'))

# a set gives constant-time membership checks instead of scanning the list
# once per article sentence
highlighted = set(best_sentences)

# the text is scraped from the web, so escape it before embedding it in HTML;
# otherwise characters such as '<' or '&' would be interpreted as markup
text = ''.join(
  f' <mark>{html.escape(sentence)}</mark>' if sentence in highlighted else f' {html.escape(sentence)}'
  for sentence in original_sentences
)
display(HTML(text))

# Key word search


In [None]:
string = 'Natural language processing (NLP) is a subfield of linguistics, computer science, and artificial intelligence concerned with the interactions between computers and human language, in particular how to program computers to process and analyze large amounts of natural language data.'

In [None]:
from spacy.matcher import PhraseMatcher

# terms to look for; each one is run through the spaCy pipeline so it can be
# used as a matcher pattern
search_string = ['artificial', 'computer']
token_list: list = [nlp(term) for term in search_string]

## Basic word search

In [None]:
# create spacy matcher
matcher = PhraseMatcher(nlp.vocab)

# spaCy v3 call form: the patterns are passed as a single list in the second
# argument (the older `add(key, None, *docs)` form is the legacy v2 style)
matcher.add('SEARCH', token_list)

document = nlp(string)
# each match is a (match_id, start_token, end_token) tuple
matches = matcher(document)

matches

[(8661325627334373315, 12, 13), (8661325627334373315, 16, 17)]

In [None]:
document[12:13], document[16:17]

In [None]:
# 5 words before computer and 5 words after computer
document[12-5:13+5]

In [None]:
# 5 words before artificial and 5 words after artificial
document[16-5:17+5]

## Word search in Wikipedia document

In [None]:
from IPython.core.display import HTML

# Show every keyword hit in the article together with its surrounding context.
number_of_words = 50  # context window, in tokens, on each side of a hit
search_string_html = ' '.join(search_string)

display(HTML(f'<h1>{search_string_html.upper()}</h1>'))

document = nlp(article.cleaned_text)
matches = matcher(document)

display(HTML(f"""<p><strong>Number of matches: </strong>{len(matches)}</p>"""))

snippets = []
for _, match_start, match_end in matches:
  # clamp so a hit near the beginning does not produce a negative slice index
  start = max(match_start - number_of_words, 0)

  # the matched span's own text is the search term that was hit; reading it
  # directly avoids comparing float similarity scores against 1.0
  search_text = document[match_start:match_end].text
  context = str(document[start:match_end + number_of_words])
  snippets.append(context.replace(search_text, f"<mark>{search_text}</mark>") + "<br /><br />")

marked_text = ''.join(snippets)
display(HTML(f"""<blockquote>{marked_text}</blockquote>"""))


# Models for text representation
Here we convert text to a numerical representation that computers can understand.

For complex chat bot, sentiment analysis, searching for similar document we need to convert words into numbers.

* Bag Of Words
* TF - IDF

[Difference between BoW and TF-IDF](https://mayurji.github.io/blog/2021/09/20/Tf-Idf#:~:text=Unlike%2C%20bag%2Dof%2Dwords,documents%20this%20word%20appears%20in.&text=N%20is%20the%20total%20number,known%20as%20inverse%20document%20frequency.)

# Bag of words
A simple way to represent sentences/words.
In NLP we need to represent words numerically, because computers and algorithms work with numbers.

For complex chat bot, sentiment analysis, searching for similar document we need to convert words into numbers.


consider example sentences -
1. This is the first document
2. this document is the second document
3. And this is the third one
4. Is this the first document?

| Sentence# | and | document | first | is | one | second | the | third | this |
|----|-----|----------|-------|----|-----|--------|-----|-------|------|
| 1 | 0 | 1 | 1 | 1 | 0 | 0 | 1 | 0 | 1 |
| 2 | 0 | 2 | 0 | 1 | 0 | 1 | 1 | 0 | 1 |
| 3 | 1 | 0 | 0 | 1 | 1 | 0 | 1 | 1 | 1 |
| 4 | 0 | 1 | 1 | 1 | 0 | 0 | 1 | 0 | 1 |

- identify unique words in each sentences
- fill the table with the frequency in sentences
- now this is simple to represent the bag of word format which is a 2D matrix


## Drawback of bag of words representation
* We only count the number of times each unique word appears in a single sentence.
* This representation is not very good, as frequent words dominate the document and may not carry much information about the context.

Example a single word may appear 100 times and other may appear less often.
* Higher weight is given to the word that appear most often.
* Less weight is given to word that appear less often.
* Other problem is longer sentences will have greater weight than shorter sentences.

In [None]:
# ML lib
from sklearn.feature_extraction.text import CountVectorizer
import matplotlib.pyplot as plt

sentences = ['This is the first document.',
             'This document is the second document.',
             'And this is the third one.',
             'Is this the first document?']

# create vectorizer
vectorizer = CountVectorizer()
x = vectorizer.fit_transform(sentences)

# represent in table
header = vectorizer.get_feature_names_out()

ax = plt.gca()
ax.get_xaxis().set_visible(False)
ax.get_yaxis().set_visible(False)

plt.box(on=None)
the_table = plt.table(colLabels=header,cellText=x.toarray(),loc='center');
the_table.scale(2, 2.5)

## Vectorize Wikipedia document

In [None]:
# ML lib
from sklearn.feature_extraction.text import CountVectorizer

url = 'https://en.wikipedia.org/wiki/Natural_language_processing'
# get data from url
article = g.extract(url)

# use nltk to split the article into a list of sentences
original_sentences: list = list(nltk.sent_tokenize(article.cleaned_text))

# vectorize wiki: one row per sentence, one column per vocabulary word
vectorizer = CountVectorizer()
x_sentences = vectorizer.fit_transform(original_sentences)

print(vectorizer.get_feature_names_out())

# print the counts for the article sentences (x_sentences), not a matrix
# left over in `x` from an earlier cell
print(x_sentences.toarray())

# TF-IDF (Term Frequency - Inverse Document Frequency)

* Created to overcome the limitations of Bag of words algorithm.
* It considers the frequency of words across all sentences of the document, whereas in bag of words the frequency is considered for only one sentence.

* Calculating TF is also called normalization: all terms are considered equally important.
* In IDF we increase weight of less frequent words and decrease weight of most frequent words.

TF Formula -
```
TF = Number of times term T appears in document / total number of terms in document
```
IDF Formula -
```
IDF = 1 + log(Total number of documents / number of documents in which term T appears)
```
Calculate TF-IDF -
```
  TF * IDF
```

<br>

consider previous example sentences -
1. This is the first document
2. this document is the second document
3. And this is the third one
4. Is this the first document ?

## Calculate TF (Normalization)

Steps -
* create table similar to bag of words modeling
* get number of tokens/terms in each sentences
* use the TF formula to fill the table of TF

<br>

**Bag of words representation** -

| Sentence# | and | document | first | is | one | second | the | third | this |
|----|-----|----------|-------|----|-----|--------|-----|-------|------|
| 1 | - | 1 | 1 | 1 | - | - | 1 | - | 1 |
| 2 | - | 2 | - | 1 | - | 1 | 1 | - | 1 |
| 3 | 1 | - | - | 1 | 1 | - | 1 | 1 | 1 |
| 4 | - | 1 | 1 | 1 | - | - | 1 | - | 1 |

**Number of token/ term in sentences** -

Sentence 1 - 5 ,
Sentence 2 - 6 ,
Sentence 3 - 6 ,
Sentence 4 - 5

**Apply TF formula to table**
```
TF = Number of times term T appears in document / total number of terms in document
```
| Sentence# | and | document | first | is | one | second | the | third | this |
|----|-----|----------|-------|----|-----|--------|-----|-------|------|
| 1 | - | 0.20 | 0.20 | 0.20 | - | - | 0.20 | - | 0.20 |
| 2 | - | 0.33 | - | 0.16 | - | 0.16 | 0.16 | - | 0.16 |
| 3 | 0.16 | - | - | 0.16 | 0.16 | - | 0.16 | 0.16 | 0.16 |
| 4 | - | 0.20 | 0.20 | 0.20 | - | - | 0.20 | - | 0.20 |


## Calculate IDF

Steps -
* Get total number of sentences in document.
* Get the term T which we need to calculate.
* Get the sentences in which term T appear.
* apply formula -
```
IDF = 1 + log(Total number of document / number of documents term T appeared)
```

Example: consider the term "document" in the above sentences.
* it appears in sentences #1, #2, #4
* so the term "document" appears in a total of 3 sentences,
* **NOTE - even though the term "document" appears 2 times in sentence #2, it is counted once, as we are interested in the number of sentences in which the term appears and not in how many times it appears in each sentence**
* apply formula -
```
  IDF = 1 + log(4/3)
  IDF = 1.28
```

## Calculate TF-IDF value

to calculate TF-IDF -
```
  TF * IDF
```

Let's create the table for only two terms for now, but in reality it needs to be done for all words

| term | Sentence #1 | Sentence #2 | Sentence #3 | Sentence #4 |
|------|-------------|-------------|-------------|-------------|
| document| 0.20 x 1.28 = 0.25 | 0.33 x 1.28 = 0.42 | 0 | 0.20 x 1.28 = 0.25 |
| first   | 0.20 x 1.69 = 0.33 | 0                  | 0 | 0.20 x 1.69 = 0.33 |

In [None]:
# ML lib
from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.pyplot as plt

sentences = ['This is the first document.',
             'This document is the second document.',
             'And this is the third one.',
             'Is this the first document?']

vectorizer = TfidfVectorizer()
x = vectorizer.fit_transform(sentences)

# vectorizer.idf_

# represent in table
header = vectorizer.get_feature_names_out()
vectorizedValue = x.toarray()

ax = plt.gca()
ax.get_xaxis().set_visible(False)
ax.get_yaxis().set_visible(False)

plt.box(on=None)
the_table = plt.table(colLabels=header,cellText=vectorizedValue,loc='center');
the_table.scale(3, 3.5)

## Vectorize Wikipedia document

In [None]:
# ML lib
from sklearn.feature_extraction.text import TfidfVectorizer

url = 'https://en.wikipedia.org/wiki/Natural_language_processing'
# get data from url
article = g.extract(url)

# use nltk to tokenize sentences in article
original_sentences: list = []

for sentence in nltk.sent_tokenize(article.cleaned_text):
  original_sentences.append(sentence)

vectorizer = TfidfVectorizer()
# we need to preprocess text for better results
x_sentences = vectorizer.fit_transform(original_sentences)

print(vectorizer.get_feature_names_out())
print(vectorizer.idf_)
print(x_sentences.toarray())

# Cosine similarity

- Link: https://en.wikipedia.org/wiki/Cosine_similarity
- Step by step calculation: https://janav.wordpress.com/2013/10/27/tf-idf-and-cosine-similarity/

In [21]:
# ML lib
from sklearn.feature_extraction.text import TfidfVectorizer

url = 'https://en.wikipedia.org/wiki/Natural_language_processing'
# get data from url
article = g.extract(url)

# use nltk to tokenize sentences in article
original_sentences: list = []

for sentence in nltk.sent_tokenize(article.cleaned_text):
  original_sentences.append(sentence)

vectorizer = TfidfVectorizer()
# we need to preprocess text for better results
x_sentences = vectorizer.fit_transform(original_sentences)

x_sentences.shape

(80, 848)

In [28]:
import numpy as np

# get the first three sentences from wiki
x_test_similarity = x_sentences[0:3]
x_test_similarity: np.ndarray = x_test_similarity.toarray()

# concatenate sentence 1 to the end of the array to have an extra copy
x_test_similarity = np.concatenate((x_test_similarity,x_test_similarity[0].reshape(1,-1)), axis=0)

print(x_test_similarity)
print(x_test_similarity.shape)



In [None]:
# Similarity between Sentence 1 and 2
from sklearn.metrics.pairwise import cosine_similarity

# calculate similarity b/w sentence 1 and sentence 2
# reshape to get matrix from vector
s = cosine_similarity(x_test_similarity[0].reshape(1,-1), x_test_similarity[1].reshape(1,-1))

print('similarity between sentences 1 and 2 \n',original_sentences[0],'\n',original_sentences[1])
print('is equal to')
print(s)

In [None]:
# Similarity between Sentence 2 and 3
from sklearn.metrics.pairwise import cosine_similarity

# calculate similarity b/w sentence 2 and sentence 3 (rows 1 and 2)
# reshape to get matrix from vector
s = cosine_similarity(x_test_similarity[1].reshape(1,-1), x_test_similarity[2].reshape(1,-1))

# the label names the sentences that are actually compared above
print('similarity between sentences 2 and 3 \n',original_sentences[1],'\n',original_sentences[2])
print('is equal to')
print(s)

In [None]:
# Similarity between Sentence 1 and 4 (4 is same as 1)
from sklearn.metrics.pairwise import cosine_similarity

# calculate similarity b/w sentence 1 and row 4, the appended copy of sentence 1
# reshape to get matrix from vector
s = cosine_similarity(x_test_similarity[0].reshape(1,-1), x_test_similarity[3].reshape(1,-1))

# row 3 of x_test_similarity is a copy of row 0, so the text that belongs to
# it is original_sentences[0]; original_sentences[3] is a different sentence
# that is not part of this comparison
print('similarity between sentences 1 and 4 \n {} \n {}'.format(original_sentences[0],original_sentences[0]))
print('is equal to')
print(s)

In [None]:
# Similarity between Sentence 1 and all

cosine_similarity(x_test_similarity[0].reshape(1,-1), x_test_similarity)

# Simulating a chatbot

In [66]:
import random
# ML lib
from sklearn.feature_extraction.text import TfidfVectorizer


# words that count as a greeting from the user, and the replies to pick from
welcome_words_inputs = ("hello","hi","hey")
welcome_words_outputs = ('hey', 'hello', 'hi', 'how are you?', 'welcome', 'how are you doing?')

class Chatbot:
  """Retrieval chatbot that answers with sentences from a Wikipedia article.

  On construction the Natural language processing article is downloaded and
  split into sentences. A question is answered by returning the article
  sentence whose TF-IDF vector has the highest cosine similarity to the
  question.
  """

  def __init__(self):
    """Download the article and store its sentences in `original_sentences`."""
    url = 'https://en.wikipedia.org/wiki/Natural_language_processing'
    # get data from url
    article = g.extract(url)

    # use nltk to split the article into sentences
    self.original_sentences: list = list(nltk.sent_tokenize(article.cleaned_text))

  def preprocess(self, s: str) -> str:
    """Normalize text into a space-separated string of content words.

    The text is lower-cased, the characters '.', '[' and ']' are removed, and
    stop words, number-like tokens, punctuation, whitespace tokens and
    single-character tokens are dropped.

    Args:
      s: Raw text to clean.

    Returns:
      The texts of the surviving tokens joined by single spaces.
    """
    # one translate pass replaces the three chained .replace() calls
    s = s.lower().translate(str.maketrans('', '', '.[]'))

    kept = [
      token.text
      for token in nlp(s)
      if not (token.is_stop or token.like_num or token.is_punct or token.is_space or len(token) == 1)
    ]

    return ' '.join(kept)

  def welcom_message(self, text: str) -> 'str | None':
    """Return a random greeting if `text` contains a greeting word.

    Args:
      text: The user's message; it is split on whitespace and each word is
        compared case-insensitively with `welcome_words_inputs`.

    Returns:
      A random entry of `welcome_words_outputs`, or None when no word of
      `text` is a greeting.
    """
    if any(word.lower() in welcome_words_inputs for word in text.split()):
      return random.choice(welcome_words_outputs)
    return None

  def answer(self, question: str, threshold=0.3):
    """Return the article sentence that is most similar to `question`.

    Args:
      question: The user's question.
      threshold: Minimum cosine similarity the best sentence must reach.

    Returns:
      The best matching original (uncleaned) sentence, or
      'Sorry no answer was found!' when the best similarity is below
      `threshold` or there are no sentences to search.
    """
    no_answer = 'Sorry no answer was found!'
    if not self.original_sentences:
      return no_answer

    # clean the wiki document sentences
    cleaned_sentences = [self.preprocess(sentence) for sentence in self.original_sentences]

    # the cleaned question goes last so it is vectorized with the same
    # vocabulary and idf weights as the article sentences
    cleaned_sentences.append(self.preprocess(question))

    # vectorize
    tfidf = TfidfVectorizer()
    x_sentences = tfidf.fit_transform(cleaned_sentences)

    # compare the question (last row) with the article rows only; leaving the
    # question's own row out means the best index always refers to a sentence
    similarity = cosine_similarity(x_sentences[-1], x_sentences[:-1])[0]
    sentence_index = int(similarity.argmax())

    if similarity[sentence_index] < threshold:
      return no_answer

    # read from the instance, not from a notebook-level variable of the same name
    return self.original_sentences[sentence_index]


In [None]:
c = Chatbot()

c.answer("what is natural language processing")

In [None]:
c = Chatbot()

c.answer("who is alan turing")

In [None]:
c = Chatbot()

c.answer("what is NLP",0.1)

In [None]:
# Interactive loop: greet on greeting words, otherwise answer from the article.
# Typing 'quit' ends the conversation.
c = Chatbot()
print('Hello! I am a chatbot and I will answer your questions about natural language processing')

while True:
  user_text = input()

  if user_text == 'quit':
    print('Chatbot: Bye! I will see you soon')
    break

  # call welcom_message once: it picks a random greeting, so calling it again
  # for printing would use a different result than the one that was checked
  greeting = c.welcom_message(user_text)
  if greeting is not None:
    print('Chatbot: ' + greeting)
  else:
    print('Chatbot:')
    print(c.answer(user_text))

