In [None]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
from IPython.display import Image

# Text Analytics

After going through these materials, you will be able to use spaCy or other libraries for:

- execution of selected NLP use cases,
- preprocessing of unstructured texts,
- transformation of preprocessed texts into their structured vector representation.

First we import the spaCy library:

In [None]:
import numpy as np
import pandas as pd
import spacy
from spacy import displacy

In [None]:
!python -m spacy download en_core_web_md

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting en_core_web_md==2.2.5
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-2.2.5/en_core_web_md-2.2.5.tar.gz (96.4 MB)
[K     |████████████████████████████████| 96.4 MB 3.4 MB/s 
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_md')


## Word2Vec

word embedding and semantic similarity

In [None]:
from gensim.models import Word2Vec 
import gensim.downloader as api
# Load the pre-trained 'word2vec-google-news-300' vectors via gensim's downloader API.
v2w_model = api.load('word2vec-google-news-300')

In [None]:
King_word2vec_embedding=v2w_model['king']
King_word2vec_embedding

array([ 1.25976562e-01,  2.97851562e-02,  8.60595703e-03,  1.39648438e-01,
       -2.56347656e-02, -3.61328125e-02,  1.11816406e-01, -1.98242188e-01,
        5.12695312e-02,  3.63281250e-01, -2.42187500e-01, -3.02734375e-01,
       -1.77734375e-01, -2.49023438e-02, -1.67968750e-01, -1.69921875e-01,
        3.46679688e-02,  5.21850586e-03,  4.63867188e-02,  1.28906250e-01,
        1.36718750e-01,  1.12792969e-01,  5.95703125e-02,  1.36718750e-01,
        1.01074219e-01, -1.76757812e-01, -2.51953125e-01,  5.98144531e-02,
        3.41796875e-01, -3.11279297e-02,  1.04492188e-01,  6.17675781e-02,
        1.24511719e-01,  4.00390625e-01, -3.22265625e-01,  8.39843750e-02,
        3.90625000e-02,  5.85937500e-03,  7.03125000e-02,  1.72851562e-01,
        1.38671875e-01, -2.31445312e-01,  2.83203125e-01,  1.42578125e-01,
        3.41796875e-01, -2.39257812e-02, -1.09863281e-01,  3.32031250e-02,
       -5.46875000e-02,  1.53198242e-02, -1.62109375e-01,  1.58203125e-01,
       -2.59765625e-01,  

In [None]:
from sklearn.metrics.pairwise import cosine_similarity,cosine_distances

queen_word2vec_embedding=v2w_model['queen']
cosine_similarity(King_word2vec_embedding.reshape(1,-1),queen_word2vec_embedding.reshape(1,-1))

array([[0.6510957]], dtype=float32)

In [None]:
man_word2vec_embedding=v2w_model['man']
cosine_similarity(King_word2vec_embedding.reshape(1,-1),man_word2vec_embedding.reshape(1,-1))

array([[0.22942673]], dtype=float32)

In [None]:
woman_word2vec_embedding=v2w_model['woman']
cosine_similarity(King_word2vec_embedding.reshape(1,-1),woman_word2vec_embedding.reshape(1,-1))

array([[0.12847973]], dtype=float32)

How close is King − Man + Woman to Queen?

In [None]:
x_word2vec_embedding=v2w_model['king']-v2w_model['man']+v2w_model['woman']
cosine_similarity(x_word2vec_embedding.reshape(1,-1),queen_word2vec_embedding.reshape(1,-1))

array([[0.7300518]], dtype=float32)

## spaCy language model

The spaCy library is built on trained language models. A language model is the result of training on an annotated corpus of documents in a particular language.

Language models differ in:
- the range of data on which they were trained,
- layers/methods that can be used when loading a document.

### Load  the model
The pre-trained language model is loaded after the library is imported with the load command.

In [None]:
#nlp = spacy.load("en_core_web_sm")
import en_core_web_md
nlp = en_core_web_md.load()
print(nlp)

<spacy.lang.en.English object at 0x7f3f13e2dd90>


### Language Processing Pipelines
The basic building block of a language model is the *language processing pipeline*, which defines the steps applied to unstructured text during processing. A default trained pipeline typically includes the following steps (the components of the loaded model are listed by `nlp.pipe_names` below):


Each pipeline component returns the processed Doc, which is then
passed on to the next component. A spaCy pipeline can be modified and additional steps can be added to it (see the section Language detection).

In [None]:
nlp.pipe_names

['tagger', 'parser', 'ner']

## Text preprocessing and feature extraction

Example of using:
- spaCy for text preprocessing,
- models for feature extraction.

The obtained structured vector representations of the original unstructured documents have the following properties:

- appropriately represent the contents of the original unstructured text documents,
- are suitable for analysis or to drive machine learning (ML) algorithms.

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import Binarizer

## Text preprocessing

In [None]:
def preprocess_text(unstructured_text, verbose=True):
    """Reduce a raw text to a space-separated string of its significant lemmas.

    The text is run through the notebook-level spaCy pipeline ``nlp``.
    Punctuation, whitespace and stop-word tokens are dropped, and every
    remaining token is replaced by its lemma.

    Args:
        unstructured_text: Raw input string.
        verbose: When True (the default), print the input text, the list of
            kept lemmas and the joined result.

    Returns:
        The kept lemmas joined by single spaces.
    """
    if verbose:
        print(unstructured_text)

    # Keep the Doc under its own name instead of rebinding the str parameter.
    doc = nlp(unstructured_text)

    # lemmatization of significant tokens of text
    lemmatized_tokenized_text = [
        token.lemma_
        for token in doc
        if not (token.is_punct or token.is_space or token.is_stop)
    ]
    if verbose:
        print(lemmatized_tokenized_text)

    # joining tokens into stream
    processed_text = ' '.join(lemmatized_tokenized_text)
    if verbose:
        print(processed_text)

    return processed_text

Extraction of features/terms from unstructured reviews:

In [None]:
rewiews = ['This movie is very scarier and long', 
           'This movie is not scary and is slow', 
           'This movie is spooky and good and good']

preprocessed_rewiews = [preprocess_text(r) for r in rewiews]
preprocessed_rewiews

This movie is very scarier and long
['movie', 'scary', 'long']
movie scary long
This movie is not scary and is slow
['movie', 'scary', 'slow']
movie scary slow
This movie is spooky and good and good
['movie', 'spooky', 'good', 'good']
movie spooky good good


['movie scary long', 'movie scary slow', 'movie spooky good good']

## Feature extraction

We want to represent each text document with a fixed structured numeric vector. The procedure of feature extraction depends on the selected model:

- Binary vectorizer
- Bag of Words (BoW) Model
- Term Frequency-Inverse Document Frequency (TF-IDF)

- pre-trained model BERT

### Binary vectorizer

The weight in the vector of the given document expresses the fact whether the given term from the dictionary appears in the list of terms of the given document or not.

In [None]:
bv = CountVectorizer(binary=True)
features = bv.fit_transform(preprocessed_rewiews)

# Show the learned vocabulary. get_feature_names() was deprecated in
# scikit-learn 1.0 and removed in 1.2; get_feature_names_out() replaces it.
list(bv.get_feature_names_out())
print(features)
print(type(features))

['good', 'long', 'movie', 'scary', 'slow', 'spooky']

  (0, 2)	1
  (0, 3)	1
  (0, 1)	1
  (1, 2)	1
  (1, 3)	1
  (1, 4)	1
  (2, 2)	1
  (2, 5)	1
  (2, 0)	1
<class 'scipy.sparse.csr.csr_matrix'>


Occurrences of features/terms from vocabulary in the list of features/terms of the given document:

In [None]:
# Look up the vocabulary and the dense matrix once, then show one
# term -> weight mapping per document.
# get_feature_names_out() replaces get_feature_names(), which was removed in scikit-learn 1.2.
bv_terms = bv.get_feature_names_out()
bv_matrix = features.toarray()
dict(zip(bv_terms, bv_matrix[0]))
dict(zip(bv_terms, bv_matrix[1]))
dict(zip(bv_terms, bv_matrix[2]))



{'good': 0, 'long': 1, 'movie': 1, 'scary': 1, 'slow': 0, 'spooky': 0}

{'good': 0, 'long': 0, 'movie': 1, 'scary': 1, 'slow': 1, 'spooky': 0}

{'good': 1, 'long': 0, 'movie': 1, 'scary': 0, 'slow': 0, 'spooky': 1}

Structured vector representation of the three documents in the feature matrix. This matrix can already be fed as input to DM/ML algorithms:

In [None]:
features.toarray()

array([[0, 1, 1, 1, 0, 0],
       [0, 0, 1, 1, 1, 0],
       [1, 0, 1, 0, 0, 1]])

### Bag of Words (BoW) Model

The weight in the vector of a given document expresses the number of occurrences of the given feature/term from vocabulary in the list of features/terms of the given document.

In [None]:
bow = CountVectorizer()
features = bow.fit_transform(preprocessed_rewiews)

# Show the learned vocabulary. get_feature_names() was deprecated in
# scikit-learn 1.0 and removed in 1.2; get_feature_names_out() replaces it.
list(bow.get_feature_names_out())



['good', 'long', 'movie', 'scary', 'slow', 'spooky']

Occurrences of features/terms from vocabulary in the list of features/terms of the given document:

In [None]:

# Look up the vocabulary and the dense matrix once, then show one
# term -> count mapping per document.
# get_feature_names_out() replaces get_feature_names(), which was removed in scikit-learn 1.2.
bow_terms = bow.get_feature_names_out()
bow_matrix = features.toarray()
dict(zip(bow_terms, bow_matrix[0]))
dict(zip(bow_terms, bow_matrix[1]))
dict(zip(bow_terms, bow_matrix[2]))



{'good': 0, 'long': 1, 'movie': 1, 'scary': 1, 'slow': 0, 'spooky': 0}

{'good': 0, 'long': 0, 'movie': 1, 'scary': 1, 'slow': 1, 'spooky': 0}

{'good': 2, 'long': 0, 'movie': 1, 'scary': 0, 'slow': 0, 'spooky': 1}

Structured vector representation of the three documents in the feature matrix. This matrix can already be fed as input to DM/ML algorithms:

In [None]:
features.toarray()

array([[0, 1, 1, 1, 0, 0],
       [0, 0, 1, 1, 1, 0],
       [2, 0, 1, 0, 0, 1]])

###  Term Frequency-Inverse Document Frequency (TF-IDF)

Unlike the BoW model, it represents a more sophisticated approach to creating vector representations of lists of features/terms of the original documents.

The weight in the vector of a given document expresses the weight of individual feature/term from vocabulary in the document, in the context of all documents.

During the calculation of the vector weights of a given document, this approach does not take into account only the given document (individual list of features/terms), but takes into account the entire document base (all lists of features/terms).

Approach intuition:
- if the given feature/term occurs in the given document, but also in all others, then the weight of the given feature/term will be negligible in the given document
- if the given feature/term occurs in the given document and in no other, then the weight of the given feature/term will be significant in the given document

In [None]:
tfidf = TfidfVectorizer()
features = tfidf.fit_transform(preprocessed_rewiews)

# Show the learned vocabulary. get_feature_names() was deprecated in
# scikit-learn 1.0 and removed in 1.2; get_feature_names_out() replaces it.
list(tfidf.get_feature_names_out())



['good', 'long', 'movie', 'scary', 'slow', 'spooky']

TF-IDF weights of the features/terms from the vocabulary in each document:

In [None]:
# Look up the vocabulary and the dense matrix once, then show one
# term -> TF-IDF weight mapping per document.
# get_feature_names_out() replaces get_feature_names(), which was removed in scikit-learn 1.2.
tfidf_terms = tfidf.get_feature_names_out()
tfidf_matrix = features.toarray()
dict(zip(tfidf_terms, tfidf_matrix[0]))
dict(zip(tfidf_terms, tfidf_matrix[1]))
dict(zip(tfidf_terms, tfidf_matrix[2]))



{'good': 0.0,
 'long': 0.7203334490549893,
 'movie': 0.4254405389711991,
 'scary': 0.5478321549274363,
 'slow': 0.0,
 'spooky': 0.0}

{'good': 0.0,
 'long': 0.0,
 'movie': 0.4254405389711991,
 'scary': 0.5478321549274363,
 'slow': 0.7203334490549893,
 'spooky': 0.0}

{'good': 0.864770177579381,
 'long': 0.0,
 'movie': 0.25537359879528915,
 'scary': 0.0,
 'slow': 0.0,
 'spooky': 0.4323850887896905}

Structured vector representation of the three documents in the feature matrix. This matrix can already be fed as input to DM/ML algorithms:

In [None]:
features.toarray()

array([[0.        , 0.72033345, 0.42544054, 0.54783215, 0.        ,
        0.        ],
       [0.        , 0.        , 0.42544054, 0.54783215, 0.72033345,
        0.        ],
       [0.86477018, 0.        , 0.2553736 , 0.        , 0.        ,
        0.43238509]])

### BERT

For the purpose of feature extraction, we will now use the pre-trained BERT model. It works as a transformer encoder, performing both word and sentence / document embedding.

In [None]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import torch
import transformers as ppb
import warnings
warnings.filterwarnings('ignore')

Import of BERT model including tool for tokenization:

In [None]:
model_class, tokenizer_class, pretrained_weights = (ppb.BertModel, ppb.BertTokenizer, 'bert-base-uncased')

tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
model = model_class.from_pretrained(pretrained_weights)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


We will now tokenize the preprocessed texts. Each text is split into tokens, which are then replaced by their identifiers. Finally, a [CLS] token is added at the beginning and a [SEP] token at the end of each text:

In [None]:
tokenized_texts = [tokenizer.encode(x, add_special_tokens=True) for x in preprocessed_rewiews]
tokenized_texts

[[101, 3185, 12459, 2146, 102],
 [101, 3185, 12459, 4030, 102],
 [101, 3185, 11867, 14659, 2100, 2204, 2204, 102]]

The result is a list of lists, where one list (a list of token identifiers of a given text) represents exactly one document. We are now transforming this output into a matrix form:

In [None]:
# Length of the longest token-id list (0 when there are no texts).
max_len = max((len(token_ids) for token_ids in tokenized_texts), default=0)

# Right-pad every list with 0 up to max_len so the rows form a rectangular matrix.
padded_rows = [token_ids + [0] * (max_len - len(token_ids)) for token_ids in tokenized_texts]
tokenized_texts_matrix = np.array(padded_rows)
tokenized_texts_matrix.shape
tokenized_texts_matrix

(3, 8)

array([[  101,  3185, 12459,  2146,   102,     0,     0,     0],
       [  101,  3185, 12459,  4030,   102,     0,     0,     0],
       [  101,  3185, 11867, 14659,  2100,  2204,  2204,   102]])

Then we create an auxiliary matrix. This instructs the BERT model to ignore the artificial fill we created during the generation of the above matrix.

In [None]:
attention_mask_matrix = np.where(tokenized_texts_matrix != 0, 1, 0)
attention_mask_matrix.shape
attention_mask_matrix

(3, 8)

array([[1, 1, 1, 1, 1, 0, 0, 0],
       [1, 1, 1, 1, 1, 0, 0, 0],
       [1, 1, 1, 1, 1, 1, 1, 1]])

We now create an input tensor out of the padded token matrix, and send that to BERT:

In [None]:
input_ids = torch.tensor(tokenized_texts_matrix)  
attention_mask = torch.tensor(attention_mask_matrix)

with torch.no_grad():
    last_hidden_states = model(input_ids, attention_mask=attention_mask)

last_hidden_states[0].numpy().shape

(3, 8, 768)

The results of the processing are stored in last_hidden_states; its first element has the shape (3, 8, 768). It can be viewed as 768 matrices with three rows (one for each document) and 8 columns (the number of tokens in the longest document, including the added first and last tokens). Of this output, we are mainly interested in the part corresponding to the first token, [CLS]. It represents the vector representations of the given preprocessed texts.

We obtain vector representations of the given texts by selecting the first column from all matrices. The vector representation of, for example, the first document then corresponds to a vector composed of values located in the first column and the first row across the 768 matrices.

In [None]:
features = last_hidden_states[0][:,0,:].numpy()
features.shape

(3, 768)

In [None]:
df = pd.DataFrame(features)
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,758,759,760,761,762,763,764,765,766,767
0,-0.140288,-0.087131,-0.054506,-0.019884,-0.065086,-0.246025,0.417704,0.253575,-0.464968,0.246298,...,0.31279,-0.215165,0.134716,-0.12048,0.122271,0.041049,0.101864,-0.354337,0.330353,0.151576
1,-0.185637,-0.097629,0.030227,0.08513,-0.002963,-0.207856,0.342549,0.278062,-0.376936,0.175713,...,0.241079,-0.187037,0.08814,0.022385,0.105548,-0.016295,0.045817,-0.193568,0.346718,0.090901
2,-0.116938,0.305265,-0.030803,0.046325,-0.075579,-0.485094,0.517049,0.243507,-0.266563,-0.004577,...,0.079356,-0.266142,0.145712,0.326372,0.16202,0.217919,-0.279785,-0.284029,0.281404,0.125248


## NLP use cases with Spacy

We will now show the use of the Spacy library using examples based on the presentation on NLP.

### Language detection 

In [None]:
!pip install spacy_langdetect

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
from spacy_langdetect import LanguageDetector
from spacy.language import Language

# Append the language detector to the pipeline only if it is not there yet,
# so re-running this cell does not try to add the same component twice.
# NOTE(review): passing a component instance to add_pipe is the spaCy v2 call
# style; on spaCy v3 the component would presumably have to be registered by
# name and added with nlp.add_pipe('language_detector', last=True) — confirm.
print(nlp.pipe_names)
if "language_detector" not in nlp.pipe_names:
    nlp.add_pipe(LanguageDetector(), name='language_detector', last=True)
print(nlp.pipe_names)

texts = ["My mom taught me to finish everything on my plate at dinner.",
          "Cvičení určené na procvičení stavby slov a vět a na určování slovních druhů.",
          "Tina ist neu in der Stadt und kennt sich noch nicht aus.",
          "Расцветали яблони и груши."]

docs = list(nlp.pipe(texts))

# detected language and its score for every processed document
[text._.language for text in docs]

['tagger', 'parser', 'ner']
['tagger', 'parser', 'ner', 'language_detector']


[{'language': 'en', 'score': 0.9999965292307103},
 {'language': 'cs', 'score': 0.9999973635809726},
 {'language': 'de', 'score': 0.9999964289791994},
 {'language': 'bg', 'score': 0.9999947633281014}]

### Named Entity Recognition (NER)

NER makes it easy to identify the key elements in a text, such as:

- people,
- places,
- brands,
- monetary values,
- and more. 

Extracting the main entities in a text helps sort unstructured data and detect important information, which is crucial if you have to deal with large datasets.

#### Example 1: NER from short text

In [None]:
# define text document
text = "Apple is looking at buying U.K. startup for $1 billion"
doc = nlp(text)

# identify and display NEs
[(ent.text, ent.label_) for ent in doc.ents]
    
displacy.render(doc, style="ent")

text = "Is always good to eat apple in London."
doc = nlp(text)

# identify and display NEs
[(ent.text, ent.label_) for ent in doc.ents]
    
displacy.render(doc, style="ent")

[('Apple', 'ORG'), ('U.K.', 'GPE'), ('$1 billion', 'MONEY')]

'<div class="entities" style="line-height: 2.5; direction: ltr">\n<mark class="entity" style="background: #7aecec; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;">\n    Apple\n    <span style="font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; text-transform: uppercase; vertical-align: middle; margin-left: 0.5rem">ORG</span>\n</mark>\n is looking at buying \n<mark class="entity" style="background: #feca74; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;">\n    U.K.\n    <span style="font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; text-transform: uppercase; vertical-align: middle; margin-left: 0.5rem">GPE</span>\n</mark>\n startup for \n<mark class="entity" style="background: #e4e7d2; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;">\n    $1 billion\n    <span style="font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35e

[('London', 'GPE')]

'<div class="entities" style="line-height: 2.5; direction: ltr">Is always good to eat apple in \n<mark class="entity" style="background: #feca74; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;">\n    London\n    <span style="font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; text-transform: uppercase; vertical-align: middle; margin-left: 0.5rem">GPE</span>\n</mark>\n.</div>'

From the above output, it is clear that in the first document three named entities are identified and classified (in the second document, only London is recognized, as a country/city entity):

- Apple (organization)
- U.K. (country)
- $1 billion (money)

#### Example 2: NER from newspaper article

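Suppose we want to find out which entities are most mentioned in the article *Peter Strzok: FBI fires agent who criticized Trump in text messages*, published on August 13, 2018 in The Guardian (https://www.theguardian.com/us-news/2018/aug/13/fbi-fires-peter-strzok-agent-who-criticized-trump-in-text-messages). The same story was covered by The New York Times as *F.B.I. Agent Peter Strzok, Who Criticized Trump in Texts, Is Fired* (https://www.nytimes.com/2018/08/13/us/politics/peter-strzok-fired-fbi.html?hp&action=click&pgtype=Homepage&clickSource=story-heading&module=first-column-region&region=top-news&WT.nav=top-news).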

In the first step, based on the url of the web page using the get method (HTTP method), we obtain an html file containing the analyzed article. Then we extract the text of the article from the html file.

In [None]:
from collections import Counter
from bs4 import BeautifulSoup
import requests
import re

def url_to_string(url, timeout=30):
    """Download a web page and return its text content as a single string.

    Args:
        url: Address of the page to fetch with an HTTP GET request.
        timeout: Seconds to wait for the server before giving up, so the
            notebook cannot hang indefinitely on an unresponsive host.

    Returns:
        The page text with <script>, <style> and <aside> elements removed and
        every run of newlines/tabs replaced by a single space.

    Raises:
        requests.RequestException: On connection problems, timeouts, or an
            HTTP error status (4xx/5xx).
    """
    # get html
    res = requests.get(url, timeout=timeout)
    # Fail loudly instead of silently analysing the text of an error page.
    res.raise_for_status()
    html = res.text

    # extract relevant text from html
    soup = BeautifulSoup(html, 'html5lib')
    for element in soup(["script", "style", 'aside']):
        element.extract()
    return " ".join(re.split(r'[\n\t]+', soup.get_text()))

# Download the text of the Guardian article, then run it through the spaCy pipeline.
ny_bb = url_to_string('https://www.theguardian.com/us-news/2018/aug/13/fbi-fires-peter-strzok-agent-who-criticized-trump-in-text-messages')
article = nlp(ny_bb)

In [None]:
ny_bb

"                      Peter Strzok: FBI fires agent who criticized Trump in text messages | FBI | The Guardian                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                 Skip to main contentSkip to navigationAdvertisementUS editionUS editionUK editionAustralian editionInternational editionThe Guardian - Back to homeThe Guardian: news website of the yearSearch jobs Sign inSearchNewsOpinionSportCultureLifestyleShowMoreShow MoreNewsUS newsWorld newsEnvironmentSoccerUS politicsBusinessTechScienceNewslettersFight to voteOpinionThe Guardian viewColumnistsLetters

We will now display:
- total number of recognized NEs
- number of NEs by individual categories
- most common/most frequent NEs

In [None]:
# total number of recognized NEs
len(article.ents)

# number of NEs by individual categories
Counter([ent.label_ for ent in article.ents])

# most common/most frequent NEs
Counter([ent.text for ent in article.ents]).most_common(10)

96

Counter({'CARDINAL': 2,
         'DATE': 11,
         'GPE': 2,
         'NORP': 11,
         'ORDINAL': 1,
         'ORG': 34,
         'PERSON': 33,
         'PRODUCT': 2})

[('Strzok', 16),
 ('FBI', 12),
 ('Trump', 5),
 ('Peter Strzok', 4),
 ('Russian', 3),
 ('ArchiveGuardian', 2),
 ('appGuardian', 2),
 ('more than 3 years', 2),
 ('Clinton', 2),
 ('Evan Vucci', 2)]

From the above overview, it is clear that the article informs about certain issues that are, among other things, associated with:
- Strzok,
- F.B.I.,
- Trump,
- Russia.

Finally, we display the first sentence of the article, including the named entities contained in it:

In [None]:
# Materialise the sentence spans of the article so they can be indexed.
sentences = list(article.sents)

# Render the first sentence with its named entities highlighted.
displacy.render(sentences[0], style="ent")

'<div class="entities" style="line-height: 2.5; direction: ltr">                      \n<mark class="entity" style="background: #aa9cfc; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;">\n    Peter Strzok\n    <span style="font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; text-transform: uppercase; vertical-align: middle; margin-left: 0.5rem">PERSON</span>\n</mark>\n: \n<mark class="entity" style="background: #7aecec; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;">\n    FBI\n    <span style="font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; text-transform: uppercase; vertical-align: middle; margin-left: 0.5rem">ORG</span>\n</mark>\n fires agent who criticized \n<mark class="entity" style="background: #7aecec; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;">\n    Trump\n    <span style="font-size: 0.8em; font-weight: bold; line-height: 1;

### Summarization

Summarization is the task of condensing a piece of text to a shorter version, reducing the size of the initial text while at the same time preserving key informational elements and the meaning of content.

#### Extractive Text Summarization with spaCy

In [None]:
import spacy
from collections import Counter
from heapq import nlargest

The purpose of the following procedure is to produce a summary that consists of the most important sentences of the original text. The importance of a sentence is expressed by the sum of the weights of the keywords that occur in the given sentence. In other words, the importance of a sentence depends on the importance of the keywords it contains.

We first identify the keywords in the text:

In [None]:
# Parts of speech that qualify a token as a keyword.
pos_tag = ['PROPN', 'ADJ', 'NOUN', 'VERB']

# Keep the text of every token that is not punctuation, whitespace or a stop
# word and whose part-of-speech tag is one of the selected ones.
keyword = [
    token.text
    for token in article
    if not (token.is_punct or token.is_space or token.is_stop)
    and token.pos_ in pos_tag
]

len(keyword)
keyword[:10]

429

['Peter',
 'Strzok',
 'FBI',
 'fires',
 'agent',
 'criticized',
 'Trump',
 'text',
 'messages',
 '|']

The analyzed text contains 429 keywords.

We will now calculate the frequency of occurrence of individual keywords in the text. Then we will display the 5 most frequented of them.

In [None]:
freq_word = Counter(keyword)
freq_word.most_common(5)

[('Strzok', 20),
 ('FBI', 13),
 ('Trump', 9),
 ('investigation', 8),
 ('Guardian', 7)]

We are now normalizing these frequencies for better processing. This is accomplished by dividing the frequency of each keyword by the maximum frequency. We get the weights of individual keywords.

In [None]:
# Rebuild the weights from the raw keyword list instead of dividing freq_word
# in place, so re-running this cell does not normalise the values twice.
keyword_counts = Counter(keyword)
max_freq = max(keyword_counts.values())
max_freq

# weight of a keyword = its frequency relative to the most frequent keyword
freq_word = Counter({word: count / max_freq for word, count in keyword_counts.items()})

freq_word.most_common(5)

20

[('Strzok', 1.0),
 ('FBI', 0.65),
 ('Trump', 0.45),
 ('investigation', 0.4),
 ('Guardian', 0.35)]

In this main part of the whole process, we determine the weights of the individual sentences of the text. The weight of a sentence is determined by the weights of individual keywords that occur in the given sentence. Sentence weight expresses the sum of the weights of individual keywords that occur in a sentence.

In [None]:
# Weight of a sentence = sum of the weights of the keywords occurring in it.
# Sentences without any keyword get no entry.
sent_strength = {}
for sentence in article.sents:
    for token in sentence:
        if token.text not in freq_word:  # not a keyword
            continue
        sent_strength[sentence] = sent_strength.get(sentence, 0) + freq_word[token.text]

list(sent_strength.values())[:5]

[3.150000000000001, 0.85, 0.35, 0.25, 0.05]

Finally, nlargest function is used to summarize the string. The nlargest function returns a list containing the top 3 sentences which are stored as *summarized_sentences*.

In [None]:
summarized_sentences = nlargest(3, sent_strength, key=sent_strength.get)

This can be converted to a string by the following lines of code:

In [None]:
final_sentences = [w.text for w in summarized_sentences]
summary = '\n'.join(final_sentences)
summary

"Strzok’s lawyer said FBI deputy director David Bowdich ordered the firing on Friday – overruling the bureau’s office of professional responsibility and going against the recommendation of the career FBI official responsible for employee discipline, who had said Strzok should be suspended for 60 days and stripped of his supervisory responsibilities.\nFBI agent rejects allegations of anti-Trump bias as a 'notch in Putin's belt'Read moreStrzok, a 21-year veteran of the organisation, helped oversee both the Russia inquiry and the investigation of Hillary Clinton’s emails, but sent texts critical of Trump including one where he labeled the future president an “idiot”.\nEvan Vucci/APFBI agent Peter Strzok, who once helped lead the bureau’s investigation into Russian election interference and sent texts disparaging Donald Trump, has been fired by the bureau."

#### Extractive Text Summarization with Gensim

In [None]:
import gensim
from gensim.summarization import summarize

# NOTE(review): the gensim.summarization module was removed in gensim 4.0, so
# this cell presumably needs gensim < 4 — confirm the installed version.
# Summarise the raw article text; word_count=100 limits the length of the summary.
extractive_summary = summarize(ny_bb, word_count=100)
extractive_summary

"Photograph: Evan Vucci/APFBI agent Peter Strzok, who once helped lead the bureau’s investigation into Russian election interference and sent texts disparaging Donald Trump, has been fired by the bureau.FBI agent rejects allegations of anti-Trump bias as a 'notch in Putin's belt'Read moreStrzok, a 21-year veteran of the organisation, helped oversee both the Russia inquiry and the investigation of Hillary Clinton’s emails, but sent texts critical of Trump including one where he labeled the future president an “idiot”.An inspector general’s report in June revealed a history of text messages sent during the 2016 presidential between Strzok and Lisa Page, then an FBI lawyer with whom he was having an affair.In one exchange, Page asked: “[Trump’s] not ever going to become president, right?"

### Word/Document vectors

The spaCy library allows you to convert words and entire documents into their vector representation. The library uses the trained Word2Vec static embedding model, which was trained on an extensive corpus. 

The created vector representations can be used, for example, to compare words or documents with each other.

#### Word vectors and similarity

Each existing token has a relationship to the trained model of word vectors, which can be characterized by three attributes:

- *has_vector*, if the token has a vector,
- *vector_norm*, L2 norm of the token’s vector (the square root of the sum of the values squared),
- *is_oov*, whether the token is out-of-vocabulary (OOV).

In [None]:
doc = nlp("I like salty fries and hamburgers.")

# Build the table in one step (one row per token, labelled by the token text)
# instead of growing an empty DataFrame cell by cell through .loc.
vectors = pd.DataFrame(
    {
        "has_vector": [token.has_vector for token in doc],
        "vector_norm": [token.vector_norm for token in doc],
        "is_oov": [token.is_oov for token in doc],
    },
    index=[token.text for token in doc],
)
vectors

Unnamed: 0,has_vector,vector_norm,is_oov
I,True,6.423194,False
like,True,4.78322,False
salty,True,6.918513,False
fries,True,7.299067,False
and,True,4.657798,False
hamburgers,True,7.088755,False
.,True,4.931635,False


If we have vectors of individual tokens, then we can proceed to compare these vectors. 

The *similarity* function is used to calculate the similarity of two vectors:

In [None]:
similarity_map = pd.DataFrame(columns=doc)

for token in doc:
    for token2 in doc:
        similarity_map.loc[token, token2] = token.similarity(token2)

similarity_map.apply(pd.to_numeric).style.background_gradient(cmap ='BuGn')

Unnamed: 0,I,like,salty,fries,and,hamburgers,.
I,1.0,0.555491,0.214086,0.212421,0.316079,0.181652,0.377928
like,0.555491,1.0,0.300753,0.280542,0.526748,0.306152,0.38702
salty,0.214086,0.300753,1.0,0.527844,0.249443,0.437844,0.100576
fries,0.212421,0.280542,0.527844,1.0,0.190629,0.828722,0.165359
and,0.316079,0.526748,0.249443,0.190629,1.0,0.175625,0.432417
hamburgers,0.181652,0.306152,0.437844,0.828722,0.175625,1.0,0.099524
.,0.377928,0.38702,0.100576,0.165359,0.432417,0.099524,1.0


#### Document vectors and similarity

As with words, you can measure the similarity of entire documents by calling the *similarity* function. The similarity of documents is measured using document vectors, the calculation of which also includes the vectors of individual words.

In the following example, we compare a query with two documents, in other words, we calculate the similarity between the query vector and the vectors of both documents.

In [None]:
texts = ['This is my sample sentence',
         'This car is beautiful',
        "Movie was not very good",
        "Driving around",
        "Political fights",
        "Boxing",
        "MMA"]
query = "Fistfight" #'Beautiful car'

docs = list(nlp.pipe(texts))
doc_q = nlp(query)

similarity_map = pd.DataFrame()

for doc in docs:
    similarity_map.loc[doc.text, doc_q.text] = doc_q.similarity(doc)
        
similarity_map.apply(pd.to_numeric).style.background_gradient(cmap ='BuGn')

Unnamed: 0,Fistfight
This is my sample sentence,0.155412
This car is beautiful,0.150065
Movie was not very good,0.185105
Driving around,0.164097
Political fights,0.445665
Boxing,0.443361
MMA,0.344525


In [None]:
type(doc_q[0])

spacy.tokens.token.Token

## NLP use cases with other libraries

### Sentiment analysis

SA analyses an incoming text and tells whether the underlying sentiment is:
- positive,
- negative or
- neutral. 

SA classifies texts according to the sentiment contained in them.

#### Rule-based/lexicon-based approach VADER 

The VADER (Valence Aware Dictionary and sEntiment Reasoner) method represents a modified dictionary approach (rule-based/lexicon-based approaches) to sentiment analysis. VADER is specifically attuned to sentiments expressed in social media.

Characteristics of the VADER method as rule-based/lexicon-based approach:

- unlike approaches based on ML methods, VADER does not require any training data,
- can very well understand the sentiment of a text containing emoticons, slangs, conjunctions, capital words, punctuations and much more,
- works excellent on social media text,
- can work with multiple domains.

After importing the *SentimentIntensityAnalyzer* method from the vaderSentiment library, we will use the method in the context of four reviews to determine:
- polarity score for each sentiment class,
- summary compound value.

In [None]:
! pip install vaderSentiment

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer 

sentiment = SentimentIntensityAnalyzer()

reviews = ["The food was great! But I didn't like the service.",
          "I will definitely come again. Great menu.",
          "The atmosphere is nice and the service was helpful.",
          "Not my style, I don't recommend it."]

# pair each review with the polarity scores VADER computes for it
[(r, sentiment.polarity_scores(r)) for r in reviews]

[("The food was great! But I didn't like the service.",
  {'compound': -0.1045, 'neg': 0.219, 'neu': 0.592, 'pos': 0.189}),
 ('I will definitely come again. Great menu.',
  {'compound': 0.7783, 'neg': 0.0, 'neu': 0.424, 'pos': 0.576}),
 ('The atmosphere is nice and the service was helpful.',
  {'compound': 0.6808, 'neg': 0.0, 'neu': 0.556, 'pos': 0.444}),
 ("Not my style, I don't recommend it.",
  {'compound': -0.2755, 'neg': 0.26, 'neu': 0.74, 'pos': 0.0})]

The calculated compound value can be used for classification purposes:

In [None]:
def classify_sentiment(compound_value, threshold=0.05):
    """Map a VADER compound score to a sentiment label.

    Args:
        compound_value: Compound score calculated by VADER.
        threshold: Non-negative cut-off. Scores of at least ``threshold`` are
            positive and scores of at most ``-threshold`` are negative.
            Defaults to 0.05.

    Returns:
        "Positive", "Negative" or "Neutral".
    """
    if compound_value >= threshold:
        return "Positive"
    if compound_value <= -threshold:
        return "Negative"
    return "Neutral"
        
        
[(r, classify_sentiment(sentiment.polarity_scores(r)['compound'])) for r in reviews]

[("The food was great! But I didn't like the service.", 'Negative'),
 ('I will definitely come again. Great menu.', 'Positive'),
 ('The atmosphere is nice and the service was helpful.', 'Positive'),
 ("Not my style, I don't recommend it.", 'Negative')]