In [2]:
import spacy
#load the English Language model
nlp = spacy.load("en_core_web_sm")
#Example text
text = "Apple is looking at buying U.k. startup for $1 billion"
#process the text
doc = nlp(text)

In [3]:
doc

Apple is looking at buying U.k. startup for $1 billion

In [4]:
#print the named entities in the text
print("Name  Entities, Phrases, and concepts :")
for ent in doc.ents:
    print(f"{ent.text:15} {ent.label_:10} {ent.start_char:10} {ent.end_char:10}")

Name  Entities, Phrases, and concepts :
Apple           ORG                 0          5
$1 billion      MONEY              44         54


In [5]:
#Text Summarization using Spacy
import spacy
nlp = spacy.load("en_core_web_sm")
doc = nlp("data science and ai  & gen ai has a greate career ahead")

In [6]:
doc

data science and ai  & gen ai has a greate career ahead

In [8]:
for token in doc:
    print(token.text)

data
science
and
ai
 
&
gen
ai
has
a
greate
career
ahead


In [9]:
doc

data science and ai  & gen ai has a greate career ahead

In [10]:
for token in doc:
    print(token.pos_)

NOUN
NOUN
CCONJ
VERB
SPACE
CCONJ
PROPN
PROPN
VERB
DET
ADJ
NOUN
ADV


In [11]:
for token in doc:
    print(token.text, ":", token.pos_)

data : NOUN
science : NOUN
and : CCONJ
ai : VERB
  : SPACE
& : CCONJ
gen : PROPN
ai : PROPN
has : VERB
a : DET
greate : ADJ
career : NOUN
ahead : ADV


In [12]:
for token in doc:
    print(token.text, ":", token.pos_,"-->", token.lemma_, token.dep_)
    

data : NOUN --> data compound
science : NOUN --> science nsubj
and : CCONJ --> and cc
ai : VERB --> ai conj
  : SPACE -->   dep
& : CCONJ --> & cc
gen : PROPN --> gen conj
ai : PROPN --> ai conj
has : VERB --> have ROOT
a : DET --> a det
greate : ADJ --> greate amod
career : NOUN --> career dobj
ahead : ADV --> ahead advmod


In [13]:
for token in doc:
    print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_, token.shape, token.is_alpha, token.is_stop)

data data NOUN NN compound 13110060611322374290 True False
science science NOUN NN nsubj 13110060611322374290 True False
and and CCONJ CC cc 4088098365541558500 True True
ai ai VERB VB conj 4370460163704169311 True False
    SPACE _SP dep 8532415787641010193 False False
& & CCONJ CC cc 15473034735919704609 False False
gen gen PROPN NNP conj 4088098365541558500 True False
ai ai PROPN NNP conj 4370460163704169311 True False
has have VERB VBZ ROOT 4088098365541558500 True True
a a DET DT det 11123243248953317070 True True
greate greate ADJ JJ amod 13110060611322374290 True False
career career NOUN NN dobj 13110060611322374290 True False
ahead ahead ADV RB advmod 13110060611322374290 True False


In [14]:
text = """There are broadly two types of extractive summarization tasks depending on what the summarization program focuses on. The first is generic summarization, which focuses on obtaining a generic summary or abstract of the collection (whether documents, or sets of images, or videos, news stories etc.). The second is query relevant summarization, sometimes called query-based summarization, which summarizes objects specific to a query. Summarization systems are able to create both query relevant text summaries and generic machine-generated summaries depending on what the user needs.
An example of a summarization problem is document summarization, which attempts to automatically produce an abstract from a given document. Sometimes one might be interested in generating a summary from a single source document, while others can use multiple source documents (for example, a cluster of articles on the same topic). This problem is called multi-document summarization. A related application is summarizing news articles. Imagine a system, which automatically pulls together news articles on a given topic (from the web), and concisely represents the latest news as a summary.
Image collection summarization is another application example of automatic summarization. It consists in selecting a representative set of images from a larger set of images.[4] A summary in this context is useful to show the most representative images of results in an image collection exploration system. Video summarization is a related domain, where the system automatically creates a trailer of a long video. This also has applications in consumer or personal videos, where one might want to skip the boring or repetitive actions. Similarly, in surveillance videos, one would want to extract important and suspicious activity, while ignoring all the boring and redundant frames captured """

In [15]:
text

'There are broadly two types of extractive summarization tasks depending on what the summarization program focuses on. The first is generic summarization, which focuses on obtaining a generic summary or abstract of the collection (whether documents, or sets of images, or videos, news stories etc.). The second is query relevant summarization, sometimes called query-based summarization, which summarizes objects specific to a query. Summarization systems are able to create both query relevant text summaries and generic machine-generated summaries depending on what the user needs.\nAn example of a summarization problem is document summarization, which attempts to automatically produce an abstract from a given document. Sometimes one might be interested in generating a summary from a single source document, while others can use multiple source documents (for example, a cluster of articles on the same topic). This problem is called multi-document summarization. A related application is summa

In [16]:
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from string import punctuation      

In [17]:
stopwords = list(STOP_WORDS)
stopwords

['thereby',
 'along',
 'herein',
 'n‘t',
 'himself',
 'nine',
 'does',
 'most',
 'wherein',
 'also',
 'could',
 'them',
 'the',
 'around',
 'off',
 'name',
 'such',
 'your',
 'anyone',
 'elsewhere',
 'whose',
 'are',
 'you',
 'then',
 'in',
 'ten',
 "'d",
 'since',
 'anyhow',
 'enough',
 'fifty',
 'myself',
 'ours',
 'their',
 'as',
 'from',
 'thereupon',
 'none',
 'besides',
 'many',
 'just',
 'but',
 'on',
 'with',
 'other',
 'about',
 'beyond',
 'hundred',
 'out',
 'into',
 '‘ll',
 'made',
 'upon',
 'for',
 'hereby',
 'whole',
 'be',
 '‘ve',
 'side',
 'again',
 'against',
 'when',
 'within',
 'even',
 'between',
 'will',
 'another',
 'seemed',
 'during',
 'twenty',
 "'m",
 '‘re',
 'amount',
 'six',
 'mine',
 'while',
 'everyone',
 'because',
 'up',
 'seem',
 'put',
 'becomes',
 'how',
 'therein',
 'was',
 'except',
 'an',
 'whoever',
 'our',
 'twelve',
 'without',
 'towards',
 'where',
 'and',
 'must',
 'afterwards',
 'becoming',
 "'re",
 'hereafter',
 'further',
 'or',
 'than',
 'k

In [18]:
len(stopwords)

326

In [19]:
nlp = spacy.load("en_core_web_sm")

In [20]:
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from string import punctuation

In [22]:
stopwords = list(STOP_WORDS)
stopwords

['thereby',
 'along',
 'herein',
 'n‘t',
 'himself',
 'nine',
 'does',
 'most',
 'wherein',
 'also',
 'could',
 'them',
 'the',
 'around',
 'off',
 'name',
 'such',
 'your',
 'anyone',
 'elsewhere',
 'whose',
 'are',
 'you',
 'then',
 'in',
 'ten',
 "'d",
 'since',
 'anyhow',
 'enough',
 'fifty',
 'myself',
 'ours',
 'their',
 'as',
 'from',
 'thereupon',
 'none',
 'besides',
 'many',
 'just',
 'but',
 'on',
 'with',
 'other',
 'about',
 'beyond',
 'hundred',
 'out',
 'into',
 '‘ll',
 'made',
 'upon',
 'for',
 'hereby',
 'whole',
 'be',
 '‘ve',
 'side',
 'again',
 'against',
 'when',
 'within',
 'even',
 'between',
 'will',
 'another',
 'seemed',
 'during',
 'twenty',
 "'m",
 '‘re',
 'amount',
 'six',
 'mine',
 'while',
 'everyone',
 'because',
 'up',
 'seem',
 'put',
 'becomes',
 'how',
 'therein',
 'was',
 'except',
 'an',
 'whoever',
 'our',
 'twelve',
 'without',
 'towards',
 'where',
 'and',
 'must',
 'afterwards',
 'becoming',
 "'re",
 'hereafter',
 'further',
 'or',
 'than',
 'k

In [23]:
nlp = spacy.load("en_core_web_sm")
doc = nlp(text)
doc

There are broadly two types of extractive summarization tasks depending on what the summarization program focuses on. The first is generic summarization, which focuses on obtaining a generic summary or abstract of the collection (whether documents, or sets of images, or videos, news stories etc.). The second is query relevant summarization, sometimes called query-based summarization, which summarizes objects specific to a query. Summarization systems are able to create both query relevant text summaries and generic machine-generated summaries depending on what the user needs.
An example of a summarization problem is document summarization, which attempts to automatically produce an abstract from a given document. Sometimes one might be interested in generating a summary from a single source document, while others can use multiple source documents (for example, a cluster of articles on the same topic). This problem is called multi-document summarization. A related application is summari

In [24]:
#lets get the tokens from the text
tokens = [token.text for token in doc]
tokens

['There',
 'are',
 'broadly',
 'two',
 'types',
 'of',
 'extractive',
 'summarization',
 'tasks',
 'depending',
 'on',
 'what',
 'the',
 'summarization',
 'program',
 'focuses',
 'on',
 '.',
 'The',
 'first',
 'is',
 'generic',
 'summarization',
 ',',
 'which',
 'focuses',
 'on',
 'obtaining',
 'a',
 'generic',
 'summary',
 'or',
 'abstract',
 'of',
 'the',
 'collection',
 '(',
 'whether',
 'documents',
 ',',
 'or',
 'sets',
 'of',
 'images',
 ',',
 'or',
 'videos',
 ',',
 'news',
 'stories',
 'etc',
 '.',
 ')',
 '.',
 'The',
 'second',
 'is',
 'query',
 'relevant',
 'summarization',
 ',',
 'sometimes',
 'called',
 'query',
 '-',
 'based',
 'summarization',
 ',',
 'which',
 'summarizes',
 'objects',
 'specific',
 'to',
 'a',
 'query',
 '.',
 'Summarization',
 'systems',
 'are',
 'able',
 'to',
 'create',
 'both',
 'query',
 'relevant',
 'text',
 'summaries',
 'and',
 'generic',
 'machine',
 '-',
 'generated',
 'summaries',
 'depending',
 'on',
 'what',
 'the',
 'user',
 'needs',
 '.',


In [25]:
#we have to calculate the word frequencies and then we will use those frequencies to get the weighted scores for each sentence
word_frequencies = {}
for word in doc:
    if word.text.lower() not in stopwords:
        if word.text.lower() not in punctuation:
            if word.text not in word_frequencies.keys():
                word_frequencies[word.text] = 1
            else:
                word_frequencies[word.text] += 1

In [27]:
word_frequencies

{'broadly': 1,
 'types': 1,
 'extractive': 1,
 'summarization': 11,
 'tasks': 1,
 'depending': 2,
 'program': 1,
 'focuses': 2,
 'generic': 3,
 'obtaining': 1,
 'summary': 4,
 'abstract': 2,
 'collection': 3,
 'documents': 2,
 'sets': 1,
 'images': 3,
 'videos': 3,
 'news': 4,
 'stories': 1,
 'etc': 1,
 'second': 1,
 'query': 4,
 'relevant': 2,
 'called': 2,
 'based': 1,
 'summarizes': 1,
 'objects': 1,
 'specific': 1,
 'Summarization': 1,
 'systems': 1,
 'able': 1,
 'create': 1,
 'text': 1,
 'summaries': 2,
 'machine': 1,
 'generated': 1,
 'user': 1,
 'needs': 1,
 '\n': 2,
 'example': 3,
 'problem': 2,
 'document': 4,
 'attempts': 1,
 'automatically': 3,
 'produce': 1,
 'given': 2,
 'interested': 1,
 'generating': 1,
 'single': 1,
 'source': 2,
 'use': 1,
 'multiple': 1,
 'cluster': 1,
 'articles': 3,
 'topic': 2,
 'multi': 1,
 'related': 2,
 'application': 2,
 'summarizing': 1,
 'Imagine': 1,
 'system': 3,
 'pulls': 1,
 'web': 1,
 'concisely': 1,
 'represents': 1,
 'latest': 1,
 'Ima

In [29]:
max_frequency = max(word_frequencies.values())
max_frequency

11

In [30]:
#to get normalized frequency we will divide each frequency by the maximum frequency
for word in word_frequencies.keys():
    word_frequencies[word] = word_frequencies[word]/max_frequency

In [31]:
word_frequencies

{'broadly': 0.09090909090909091,
 'types': 0.09090909090909091,
 'extractive': 0.09090909090909091,
 'summarization': 1.0,
 'tasks': 0.09090909090909091,
 'depending': 0.18181818181818182,
 'program': 0.09090909090909091,
 'focuses': 0.18181818181818182,
 'generic': 0.2727272727272727,
 'obtaining': 0.09090909090909091,
 'summary': 0.36363636363636365,
 'abstract': 0.18181818181818182,
 'collection': 0.2727272727272727,
 'documents': 0.18181818181818182,
 'sets': 0.09090909090909091,
 'images': 0.2727272727272727,
 'videos': 0.2727272727272727,
 'news': 0.36363636363636365,
 'stories': 0.09090909090909091,
 'etc': 0.09090909090909091,
 'second': 0.09090909090909091,
 'query': 0.36363636363636365,
 'relevant': 0.18181818181818182,
 'called': 0.18181818181818182,
 'based': 0.09090909090909091,
 'summarizes': 0.09090909090909091,
 'objects': 0.09090909090909091,
 'specific': 0.09090909090909091,
 'Summarization': 0.09090909090909091,
 'systems': 0.09090909090909091,
 'able': 0.09090909090

In [32]:
sentence_tokens = [sent for sent in doc.sents]
sentence_tokens

[There are broadly two types of extractive summarization tasks depending on what the summarization program focuses on.,
 The first is generic summarization, which focuses on obtaining a generic summary or abstract of the collection (whether documents, or sets of images, or videos, news stories etc.).,
 The second is query relevant summarization, sometimes called query-based summarization, which summarizes objects specific to a query.,
 Summarization systems are able to create both query relevant text summaries and generic machine-generated summaries depending on what the user needs.,
 An example of a summarization problem is document summarization, which attempts to automatically produce an abstract from a given document.,
 Sometimes one might be interested in generating a summary from a single source document, while others can use multiple source documents (for example, a cluster of articles on the same topic).,
 This problem is called multi-document summarization.,
 A related applica

In [33]:
len(sentence_tokens)

15

In [34]:
#we are gong to calculate the sentence scores, to calculate the sentence scores we will add the word frequencies of each word in the sentence
sentence_scores = {}
for sent in sentence_tokens:
    for word in sent:
        if word.text.lower() in word_frequencies.keys():
            if sent not in sentence_scores.keys():
                sentence_scores[sent] = word_frequencies[word.text.lower()]
            else:
                sentence_scores[sent] += word_frequencies[word.text.lower()]

In [35]:
sentence_scores

{There are broadly two types of extractive summarization tasks depending on what the summarization program focuses on.: 2.818181818181818,
 The first is generic summarization, which focuses on obtaining a generic summary or abstract of the collection (whether documents, or sets of images, or videos, news stories etc.).: 3.9999999999999987,
 The second is query relevant summarization, sometimes called query-based summarization, which summarizes objects specific to a query.: 3.909090909090909,
 Summarization systems are able to create both query relevant text summaries and generic machine-generated summaries depending on what the user needs.: 3.2727272727272716,
 An example of a summarization problem is document summarization, which attempts to automatically produce an abstract from a given document.: 3.9999999999999996,
 Sometimes one might be interested in generating a summary from a single source document, while others can use multiple source documents (for example, a cluster of artic

In [36]:
from heapq import nlargest

In [38]:
select_length = int(len(sentence_tokens)*0.4)
select_length

6

In [39]:
#we have to select maximum 40% of the sentences based on their scores
summary = nlargest(select_length, sentence_scores, key=sentence_scores.get)
summary

[An example of a summarization problem is document summarization, which attempts to automatically produce an abstract from a given document.,
 The first is generic summarization, which focuses on obtaining a generic summary or abstract of the collection (whether documents, or sets of images, or videos, news stories etc.).,
 The second is query relevant summarization, sometimes called query-based summarization, which summarizes objects specific to a query.,
 Summarization systems are able to create both query relevant text summaries and generic machine-generated summaries depending on what the user needs.,
 Image collection summarization is another application example of automatic summarization.,
 Imagine a system, which automatically pulls together news articles on a given topic (from the web), and concisely represents the latest news as a summary.]

In [41]:
#if i need to combine these top 3 sentence then
final_summary = [word.text for word in summary]
final_summary

['An example of a summarization problem is document summarization, which attempts to automatically produce an abstract from a given document.',
 'The first is generic summarization, which focuses on obtaining a generic summary or abstract of the collection (whether documents, or sets of images, or videos, news stories etc.).',
 'The second is query relevant summarization, sometimes called query-based summarization, which summarizes objects specific to a query.',
 'Summarization systems are able to create both query relevant text summaries and generic machine-generated summaries depending on what the user needs.\n',
 'Image collection summarization is another application example of automatic summarization.',
 'Imagine a system, which automatically pulls together news articles on a given topic (from the web), and concisely represents the latest news as a summary.\n']