In [12]:
import pandas as pd
import nltk
import textwrap
import numpy as np
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer , PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [3]:
# https://www.kaggle.com/shivamkushwaha/bbc-full-text-document-classification
!wget -nc https://lazyprogrammer.me/course_files/nlp/bbc_text_cls.csv


--2023-07-30 19:46:17--  https://lazyprogrammer.me/course_files/nlp/bbc_text_cls.csv
Resolving lazyprogrammer.me (lazyprogrammer.me)... 172.67.213.166, 104.21.23.210, 2606:4700:3030::ac43:d5a6, ...
Connecting to lazyprogrammer.me (lazyprogrammer.me)|172.67.213.166|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5085081 (4.8M) [text/csv]
Saving to: ‘bbc_text_cls.csv’


2023-07-30 19:46:17 (54.7 MB/s) - ‘bbc_text_cls.csv’ saved [5085081/5085081]



In [4]:
df=pd.read_csv('bbc_text_cls.csv')
df.head()

Unnamed: 0,text,labels
0,Ad sales boost Time Warner profit\n\nQuarterly...,business
1,Dollar gains on Greenspan speech\n\nThe dollar...,business
2,Yukos unit buyer faces loan claim\n\nThe owner...,business
3,High fuel prices hit BA's profits\n\nBritish A...,business
4,Pernod takeover talk lifts Domecq\n\nShares in...,business


In [5]:
doc=df[df.labels=='business']['text'].sample(random_state=42)

In [6]:
def wrap(x):
  return textwrap.fill(x,replace_whitespace=False,fix_sentence_endings=True)

In [7]:
print(wrap(doc.iloc[0]))

Christmas sales worst since 1981

UK retail sales fell in December,
failing to meet expectations and making it by some counts the worst
Christmas since 1981.

Retail sales dropped by 1% on the month in
December, after a 0.6% rise in November, the Office for National
Statistics (ONS) said.  The ONS revised the annual 2004 rate of growth
down from the 5.9% estimated in November to 3.2%. A number of
retailers have already reported poor figures for December.  Clothing
retailers and non-specialist stores were the worst hit with only
internet retailers showing any significant growth, according to the
ONS.

The last time retailers endured a tougher Christmas was 23 years
previously, when sales plunged 1.7%.

The ONS echoed an earlier
caution from Bank of England governor Mervyn King not to read too much
into the poor December figures.  Some analysts put a positive gloss on
the figures, pointing out that the non-seasonally-adjusted figures
showed a performance comparable with 2003. The Novembe

In [8]:
sents =nltk.sent_tokenize(doc.iloc[0].split("\n",1)[1])  #The split funtion is used to remove the title which is the first line of the doc.
#we weill tokenize on the basis of sentences and give score to each sentence
#based on the tfidf values of words in those sentence and then average those values.
#The sentences with higher averaged score will be displayed as summary.
sents

['\nUK retail sales fell in December, failing to meet expectations and making it by some counts the worst Christmas since 1981.',
 'Retail sales dropped by 1% on the month in December, after a 0.6% rise in November, the Office for National Statistics (ONS) said.',
 'The ONS revised the annual 2004 rate of growth down from the 5.9% estimated in November to 3.2%.',
 'A number of retailers have already reported poor figures for December.',
 'Clothing retailers and non-specialist stores were the worst hit with only internet retailers showing any significant growth, according to the ONS.',
 'The last time retailers endured a tougher Christmas was 23 years previously, when sales plunged 1.7%.',
 'The ONS echoed an earlier caution from Bank of England governor Mervyn King not to read too much into the poor December figures.',
 'Some analysts put a positive gloss on the figures, pointing out that the non-seasonally-adjusted figures showed a performance comparable with 2003.',
 'The November-De

In [9]:
featurizer =TfidfVectorizer(stop_words=stopwords.words('english'),norm='l1',)
#l1 normlizer to ensure we are not biased towards longer sentences as they have more frequency of words.

In [10]:
X= featurizer.fit_transform(sents)

Difference starts from here as textrank is a way to compute scores. From initial to this point, all of work will remain same.

In [13]:
#compute scores
s = cosine_similarity(X)

In [15]:
s.shape

(17, 17)

In [16]:
len(sents)

17

In [17]:
#normalize the matrix for each row to be summed to 1 (markov model)
s/=s.sum( axis=1,keepdims=True)
s[0].sum()

1.0

In [18]:
s[1].sum()

1.0

In [20]:
# uniform tranisition matrix
u = np.ones_like(s)/len(s)
u

array([[0.05882353, 0.05882353, 0.05882353, 0.05882353, 0.05882353,
        0.05882353, 0.05882353, 0.05882353, 0.05882353, 0.05882353,
        0.05882353, 0.05882353, 0.05882353, 0.05882353, 0.05882353,
        0.05882353, 0.05882353],
       [0.05882353, 0.05882353, 0.05882353, 0.05882353, 0.05882353,
        0.05882353, 0.05882353, 0.05882353, 0.05882353, 0.05882353,
        0.05882353, 0.05882353, 0.05882353, 0.05882353, 0.05882353,
        0.05882353, 0.05882353],
       [0.05882353, 0.05882353, 0.05882353, 0.05882353, 0.05882353,
        0.05882353, 0.05882353, 0.05882353, 0.05882353, 0.05882353,
        0.05882353, 0.05882353, 0.05882353, 0.05882353, 0.05882353,
        0.05882353, 0.05882353],
       [0.05882353, 0.05882353, 0.05882353, 0.05882353, 0.05882353,
        0.05882353, 0.05882353, 0.05882353, 0.05882353, 0.05882353,
        0.05882353, 0.05882353, 0.05882353, 0.05882353, 0.05882353,
        0.05882353, 0.05882353],
       [0.05882353, 0.05882353, 0.05882353, 0.058823

In [21]:
u[0].sum()

1.0

In [28]:
#smoothed similarity matrix
factor = 0.15
s = (1-factor) * s + factor*u
s[0].sum()

1.0

In [24]:
#find the limiting, stationary distribution
eigenvals, eigenvecs =  np.linalg.eig(s.T)


In [25]:
eigenvals

array([1.        , 0.24245466, 0.72108199, 0.67644122, 0.34790129,
       0.34417302, 0.3866884 , 0.40333562, 0.41608572, 0.44238593,
       0.63909999, 0.62556792, 0.58922572, 0.57452382, 0.48511399,
       0.51329157, 0.52975372])

In [26]:
eigenvecs[:,0]


array([-0.24206557, -0.27051337, -0.2213806 , -0.28613638, -0.25065894,
       -0.2499217 , -0.279622  , -0.21515455, -0.2226665 , -0.22745415,
       -0.2059112 , -0.20959727, -0.23526242, -0.24203809, -0.23663025,
       -0.2940483 , -0.20865607])

In [31]:
S=s

In [32]:
eigenvecs[:,0].dot(S)

array([-0.24178159, -0.26233512, -0.22683669, -0.27362274, -0.24799029,
       -0.24745764, -0.2689161 , -0.22233837, -0.22776575, -0.23122484,
       -0.21566005, -0.21832324, -0.23686631, -0.24176173, -0.23785456,
       -0.2793391 , -0.21764322])

In [34]:
eigenvecs[:,0] / eigenvecs[:,0].sum()

array([0.05907327, 0.06601563, 0.05402535, 0.06982824, 0.06117038,
       0.06099047, 0.06823848, 0.05250595, 0.05433915, 0.05550753,
       0.05025022, 0.05114976, 0.05741304, 0.05906657, 0.05774684,
       0.07175905, 0.05092007])

In [35]:
limiting_dist = np.ones(len(S)) / len(S)
threshold = 1e-8
delta = float('inf')
iters = 0
while delta > threshold:
  iters += 1

  # Markov transition
  p = limiting_dist.dot(S)

  # compute change in limiting distribution
  delta = np.abs(p - limiting_dist).sum()

  # update limiting distribution
  limiting_dist = p

print(iters)

21


In [36]:
limiting_dist

array([0.0583793 , 0.06293876, 0.0559302 , 0.06533972, 0.06033116,
       0.05979707, 0.06393736, 0.0551528 , 0.05611272, 0.05673308,
       0.05465814, 0.05531059, 0.05775591, 0.05879387, 0.05759406,
       0.06645494, 0.05478031])

In [37]:
limiting_dist.sum()

0.9999999999999984

In [38]:
np.abs(eigenvecs[:,0] / eigenvecs[:,0].sum() - limiting_dist).sum()

0.040645343551232796

In [39]:
scores = limiting_dist

In [40]:
sort_idx = np.argsort(-scores)

In [41]:
print("Generated summary:")
for i in sort_idx[:5]:
  print(wrap("%.2f: %s" % (scores[i], sents[i])))

Generated summary:
0.07: "The retail sales figures are very weak, but as Bank of England
governor Mervyn King indicated last night, you don't really get an
accurate impression of Christmas trading until about Easter," said Mr
Shaw.
0.07: A number of retailers have already reported poor figures for
December.
0.06: The ONS echoed an earlier caution from Bank of England governor
Mervyn King not to read too much into the poor December figures.
0.06: Retail sales dropped by 1% on the month in December, after a
0.6% rise in November, the Office for National Statistics (ONS) said.
0.06: Clothing retailers and non-specialist stores were the worst hit
with only internet retailers showing any significant growth, according
to the ONS.


In [43]:
doc.iloc[0].split("\n")[0]

'Christmas sales worst since 1981'

In [47]:
def summarize(text, factor = 0.15):
  # extract sentences
  sents = nltk.sent_tokenize(text)

  # perform tf-idf
  featurizer = TfidfVectorizer(
      stop_words=stopwords.words('english'),
      norm='l1')
  X = featurizer.fit_transform(sents)

  # compute similarity matrix
  S = cosine_similarity(X)

  # normalize similarity matrix
  S /= S.sum(axis=1, keepdims=True)

  # uniform transition matrix
  U = np.ones_like(S) / len(S)

  # smoothed similarity matrix
  S = (1 - factor) * S + factor * U

  # find the limiting / stationary distribution
  eigenvals, eigenvecs = np.linalg.eig(S.T)

  # compute scores
  scores = eigenvecs[:,0] / eigenvecs[:,0].sum()

  # sort the scores
  sort_idx = np.argsort(-scores)

  # print summary
  print("Summary: ")
  print("Title: " + doc.iloc[0].split("\n",1)[0] + "\n\n")
  for i in sort_idx[:5]:
    print(wrap("%.2f: %s" % (scores[i], sents[i])))

In [48]:
doc = df[df.labels == 'entertainment']['text'].sample(random_state=123)
summarize(doc.iloc[0].split("\n", 1)[1])

Summary: 
Title: Goodrem wins top female MTV prize


0.11: Goodrem, Green Day and the Black Eyed Peas took home two awards
each.
0.10: As well as best female, Goodrem also took home the Pepsi Viewers
Choice Award, whilst Green Day bagged the prize for best rock video
for American Idiot.
0.10: Other winners included Green Day, voted best group, and the
Black Eyed Peas.
0.10: The Black Eyed Peas won awards for best R 'n' B video and
sexiest video, both for Hey Mama.
0.10: Local singer and songwriter Missy Higgins took the title of
breakthrough artist of the year, with Australian Idol winner Guy
Sebastian taking the honours for best pop video.


## Summarization through library

In [None]:
!pip install sumy

In [50]:
from sumy.summarizers.text_rank import TextRankSummarizer
from sumy.summarizers.lsa import LsaSummarizer
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer

In [51]:
summarizer = TextRankSummarizer()
parser = PlaintextParser.from_string(
    doc.iloc[0].split("\n", 1)[1],
    Tokenizer("english"))
summary = summarizer(parser.document, sentences_count=5)

In [52]:
summary

(<Sentence: The 21-year-old singer won the award for best female artist, with Australian Idol runner-up Shannon Noll taking the title of best male at the ceremony.>,
 <Sentence: As well as best female, Goodrem also took home the Pepsi Viewers Choice Award, whilst Green Day bagged the prize for best rock video for American Idiot.>,
 <Sentence: The Black Eyed Peas won awards for best R 'n' B video and sexiest video, both for Hey Mama.>,
 <Sentence: Local singer and songwriter Missy Higgins took the title of breakthrough artist of the year, with Australian Idol winner Guy Sebastian taking the honours for best pop video.>,
 <Sentence: The ceremony was held at the Luna Park fairground in Sydney Harbour and was hosted by the Osbourne family.>)

In [53]:
for s in summary:
  print(wrap(str(s)))

The 21-year-old singer won the award for best female artist, with
Australian Idol runner-up Shannon Noll taking the title of best male
at the ceremony.
As well as best female, Goodrem also took home the Pepsi Viewers
Choice Award, whilst Green Day bagged the prize for best rock video
for American Idiot.
The Black Eyed Peas won awards for best R 'n' B video and sexiest
video, both for Hey Mama.
Local singer and songwriter Missy Higgins took the title of
breakthrough artist of the year, with Australian Idol winner Guy
Sebastian taking the honours for best pop video.
The ceremony was held at the Luna Park fairground in Sydney Harbour
and was hosted by the Osbourne family.


In [54]:
summarizer = LsaSummarizer()
summary = summarizer(parser.document, sentences_count=5)
for s in summary:
  print(wrap(str(s)))

Goodrem, known in both Britain and Australia for her role as Nina
Tucker in TV soap Neighbours, also performed a duet with boyfriend
Brian McFadden.
Other winners included Green Day, voted best group, and the Black Eyed
Peas.
Goodrem, Green Day and the Black Eyed Peas took home two awards each.
As well as best female, Goodrem also took home the Pepsi Viewers
Choice Award, whilst Green Day bagged the prize for best rock video
for American Idiot.
Artists including Carmen Electra, Missy Higgins, Kelly Osbourne, Green
Day, Ja Rule and Natalie Imbruglia gave live performances at the
event.


In [None]:
# https://radimrehurek.com/gensim_3.8.3/summarization/summariser.html
# https://arxiv.org/abs/1602.03606
# Parameters
# text (str) – Given text.
# ratio (float, optional) – Number between 0 and 1 that determines the
#     proportion of the number of sentences of the original text to be
#     chosen for the summary.
# word_count (int or None, optional) – Determines how many words will the
#     output contain. If both parameters are provided, the ratio will be
#     ignored.
# split (bool, optional) – If True, list of sentences will be returned.
#     Otherwise joined strings will bwe returned.
from gensim.summarization.summarizer import summarize #--->outdated code, gensim not included anymore
summary = summarize(doc.iloc[0].split("\n", 1)[1])
print(wrap(summary))

## The output should have looked something like this
The 21-year-old singer won the award for best female artist, with
Australian Idol runner-up Shannon Noll taking the title of best male
at the ceremony.
Local singer and songwriter Missy Higgins took the
title of breakthrough artist of the year, with Australian Idol winner
Guy Sebastian taking the honours for best pop video.