In [1]:
import pandas as pd
import numpy as np
import textwrap
import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
df = pd.read_csv(r'D:\Notes and Exercises\Machine-Learning\dataset\bbc_text_cls.csv')

In [3]:
df.head()

Unnamed: 0,text,labels
0,Ad sales boost Time Warner profit\n\nQuarterly...,business
1,Dollar gains on Greenspan speech\n\nThe dollar...,business
2,Yukos unit buyer faces loan claim\n\nThe owner...,business
3,High fuel prices hit BA's profits\n\nBritish A...,business
4,Pernod takeover talk lifts Domecq\n\nShares in...,business


In [4]:
doc = df[df.labels == 'business']['text'].sample(random_state=33)

In [5]:
def wrap(x):
    return textwrap.fill(x, replace_whitespace=False, fix_sentence_endings=True)

In [6]:
print(wrap(doc.iloc[0]))

Karachi stocks hit historic high

The Karachi Stock Exchange (KSE) has
recorded its largest single day gain, surging 3.5% to a new high.

The
index rose 225.79 points in four hours of furious trading, with many
investors optimistic that political stability could bring an economic
boom.  The KSE index closed at 6709.93 - an overall gain of nearly 400
points in the first two trading days of the week.  Energy and
telecommunication stocks performed particularly well, recording an
8%-10% rise since Monday morning.

In 2002, the KSE was the world's
best performing stock market, with the index rising 112%.

Pakistani
investors are expecting the KSE to repeat, if not improve on, its 2002
performance.  Jubilant investors danced on the streets as the market
closed for the day on Tuesday, confident that the boom will continue
at least until the public holiday on 22 January.  Others, however, who
had stayed out fearing an imminent collapse because of prices
overheating, continued to warn that the 

In [7]:
print(doc.iloc[0].split("\n", 1)[1])


The Karachi Stock Exchange (KSE) has recorded its largest single day gain, surging 3.5% to a new high.

The index rose 225.79 points in four hours of furious trading, with many investors optimistic that political stability could bring an economic boom. The KSE index closed at 6709.93 - an overall gain of nearly 400 points in the first two trading days of the week. Energy and telecommunication stocks performed particularly well, recording an 8%-10% rise since Monday morning.

In 2002, the KSE was the world's best performing stock market, with the index rising 112%.

Pakistani investors are expecting the KSE to repeat, if not improve on, its 2002 performance. Jubilant investors danced on the streets as the market closed for the day on Tuesday, confident that the boom will continue at least until the public holiday on 22 January. Others, however, who had stayed out fearing an imminent collapse because of prices overheating, continued to warn that the "bubble may burst any time". "That's 

In [8]:
sents = nltk.sent_tokenize(doc.iloc[0].split("\n",1)[1])

In [9]:
featurizer = TfidfVectorizer(
    stop_words = stopwords.words('english'),
    norm = 'l1'
)

In [10]:
X = featurizer.fit_transform(sents)

In [11]:
# compute cosine similarity
S = cosine_similarity(X)
S.shape

(29, 29)

In [12]:
# normalize similarity matrix
S /= S.sum(axis = 1, keepdims=True)

In [13]:
S[0].sum()

1.0

In [14]:
#uniform transition matrix
U = np.ones_like(S) / len(S)

In [15]:
U[0].sum()

0.9999999999999998

In [16]:
#smoothed similarity matrix
factor = 0.15
S = (1-factor) * S + factor * U

In [17]:
S[0].sum()

0.9999999999999998

In [18]:
# find the limiting / stationary distribution
eigenvals, eigenvecs = np.linalg.eig(S.T)

In [19]:
eigenvals

array([1.        , 0.29288592, 0.79674115, 0.77334548, 0.73421504,
       0.72570055, 0.70664256, 0.69688054, 0.64291039, 0.33510499,
       0.34284901, 0.36394732, 0.6178493 , 0.38247313, 0.39419023,
       0.3992621 , 0.42273888, 0.44771178, 0.59102348, 0.58050112,
       0.5563091 , 0.47141411, 0.48480058, 0.53159703, 0.50659103,
       0.50272007, 0.52332544, 0.85      , 0.85      ])

In [20]:
eigenvecs[:,0]

array([0.17638882, 0.18244345, 0.18276364, 0.16647279, 0.20056689,
       0.18498961, 0.17627064, 0.16195718, 0.16678443, 0.19651731,
       0.19018671, 0.2206736 , 0.1773583 , 0.1634555 , 0.18495442,
       0.1836724 , 0.2148786 , 0.20217924, 0.18181727, 0.17954981,
       0.21938245, 0.21276207, 0.18495442, 0.1693887 , 0.18308151,
       0.1627039 , 0.19488777, 0.17113507, 0.1715018 ])

In [21]:
eigenvecs[:,0].dot(S)

array([0.17638882, 0.18244345, 0.18276364, 0.16647279, 0.20056689,
       0.18498961, 0.17627064, 0.16195718, 0.16678443, 0.19651731,
       0.19018671, 0.2206736 , 0.1773583 , 0.1634555 , 0.18495442,
       0.1836724 , 0.2148786 , 0.20217924, 0.18181727, 0.17954981,
       0.21938245, 0.21276207, 0.18495442, 0.1693887 , 0.18308151,
       0.1627039 , 0.19488777, 0.17113507, 0.1715018 ])

In [22]:
eigenvecs[:,0] / eigenvecs[:,0].sum()

array([0.03288579, 0.03401461, 0.03407431, 0.03103706, 0.03739353,
       0.03448932, 0.03286376, 0.03019517, 0.03109516, 0.03663853,
       0.03545826, 0.04114221, 0.03306654, 0.03047452, 0.03448276,
       0.03424374, 0.0400618 , 0.03769414, 0.03389787, 0.03347513,
       0.04090149, 0.03966719, 0.03448276, 0.0315807 , 0.03413357,
       0.03033439, 0.03633472, 0.03190629, 0.03197466])

In [23]:
limiting_dist = np.ones(len(S)) / len(S)
threshold = 1e-8
delta = float('inf')
iters = 0
while delta > threshold:
  iters += 1

  # Markov transition
  p = limiting_dist.dot(S)

  # compute change in limiting distribution
  delta = np.abs(p - limiting_dist).sum()

  # update limiting distribution
  limiting_dist = p

print(iters)

54


In [24]:
limiting_dist

array([0.03288579, 0.03401461, 0.03407431, 0.03103707, 0.03739353,
       0.03448932, 0.03286376, 0.03019517, 0.03109516, 0.03663853,
       0.03545826, 0.04114221, 0.03306654, 0.03047452, 0.03448276,
       0.03424374, 0.0400618 , 0.03769414, 0.03389787, 0.03347513,
       0.04090149, 0.03966719, 0.03448276, 0.0315807 , 0.03413357,
       0.03033439, 0.03633472, 0.03190629, 0.03197466])

In [25]:
limiting_dist.sum()

0.9999999999999957

In [26]:
np.abs(eigenvecs[:,0] / eigenvecs[:,0].sum() - limiting_dist).sum()

3.3809605580470414e-08

In [27]:
scores = limiting_dist

In [28]:
sort_idx = np.argsort(-scores)

In [29]:
# Many options for how to choose which sentences to include:

# 1) top N sentences
# 2) top N words
# 3) top X% sentences or top X% words
# 4) sentences with scores > average score
# 5) sentences with scores > factor * average score

# You also don't have to sort. May make more sense in order.

print("Generated summary:")
for i in sort_idx[:5]:
  print(wrap("%.2f: %s" % (scores[i], sents[i])))

Generated summary:
0.04: Market analysts are inclined to agree with Mr Lakhani, arguing
that there are a number of major factors behind the KSE's performance.
0.04: Pakistanis are now hoping that energy and telecoms, two of the
strongest sectors in Pakistan, draw some of the Arab money to the KSE.
0.04: "Coupled with the 7% GDP growth expected by June this year, I am
least surprised at the market's performance," says Mr Lakhani.
0.04: Locally, too, say analysts, recent political developments have
worked to the market's advantage.
0.04: One leading Karachi broker said the real reasons may be
political.


In [30]:
doc.iloc[0].split("\n")[0]

'Karachi stocks hit historic high'

In [31]:
def summarize(text, factor = 0.15):
      # extract sentences
  sents = nltk.sent_tokenize(text)

  # perform tf-idf
  featurizer = TfidfVectorizer(
      stop_words=stopwords.words('english'),
      norm='l1')
  X = featurizer.fit_transform(sents)

  # compute similarity matrix
  S = cosine_similarity(X)

  # normalize similarity matrix
  S /= S.sum(axis=1, keepdims=True)

  # uniform transition matrix
  U = np.ones_like(S) / len(S)

  # smoothed similarity matrix
  S = (1 - factor) * S + factor * U

  # find the limiting / stationary distribution
  eigenvals, eigenvecs = np.linalg.eig(S.T)

  # compute scores
  scores = eigenvecs[:,0] / eigenvecs[:,0].sum()
  
  # sort the scores
  sort_idx = np.argsort(-scores)

  # print summary
  for i in sort_idx[:5]:
    print(wrap("%.2f: %s" % (scores[i], sents[i])))

In [32]:
doc = df[df.labels == 'entertainment']['text'].sample(random_state=123)
summarize(doc.iloc[0].split("\n", 1)[1])

0.11: Goodrem, Green Day and the Black Eyed Peas took home two awards
each.
0.10: As well as best female, Goodrem also took home the Pepsi Viewers
Choice Award, whilst Green Day bagged the prize for best rock video
for American Idiot.
0.10: Other winners included Green Day, voted best group, and the
Black Eyed Peas.
0.10: The Black Eyed Peas won awards for best R 'n' B video and
sexiest video, both for Hey Mama.
0.10: Local singer and songwriter Missy Higgins took the title of
breakthrough artist of the year, with Australian Idol winner Guy
Sebastian taking the honours for best pop video.


In [33]:
doc.iloc[0].split("\n")[0]

'Goodrem wins top female MTV prize'

### Libraries for Text Summarization

In [34]:
from sumy.summarizers.text_rank import TextRankSummarizer
from sumy.summarizers.lsa import LsaSummarizer
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer

In [35]:
summarizer = TextRankSummarizer()
parser = PlaintextParser.from_string(
    doc.iloc[0].split("\n", 1)[1],
    Tokenizer("english"))
summary = summarizer(parser.document, sentences_count=5)

In [36]:
summary

(<Sentence: The 21-year-old singer won the award for best female artist, with Australian Idol runner-up Shannon Noll taking the title of best male at the ceremony.>,
 <Sentence: As well as best female, Goodrem also took home the Pepsi Viewers Choice Award, whilst Green Day bagged the prize for best rock video for American Idiot.>,
 <Sentence: The Black Eyed Peas won awards for best R 'n' B video and sexiest video, both for Hey Mama.>,
 <Sentence: Local singer and songwriter Missy Higgins took the title of breakthrough artist of the year, with Australian Idol winner Guy Sebastian taking the honours for best pop video.>,
 <Sentence: The ceremony was held at the Luna Park fairground in Sydney Harbour and was hosted by the Osbourne family.>)

In [37]:
for s in summary:
  print(wrap(str(s)))

The 21-year-old singer won the award for best female artist, with
Australian Idol runner-up Shannon Noll taking the title of best male
at the ceremony.
As well as best female, Goodrem also took home the Pepsi Viewers
Choice Award, whilst Green Day bagged the prize for best rock video
for American Idiot.
The Black Eyed Peas won awards for best R 'n' B video and sexiest
video, both for Hey Mama.
Local singer and songwriter Missy Higgins took the title of
breakthrough artist of the year, with Australian Idol winner Guy
Sebastian taking the honours for best pop video.
The ceremony was held at the Luna Park fairground in Sydney Harbour
and was hosted by the Osbourne family.


In [38]:
summarizer = LsaSummarizer()
summary = summarizer(parser.document, sentences_count=5)
for s in summary:
  print(wrap(str(s)))

Goodrem, known in both Britain and Australia for her role as Nina
Tucker in TV soap Neighbours, also performed a duet with boyfriend
Brian McFadden.
Other winners included Green Day, voted best group, and the Black Eyed
Peas.
Goodrem, Green Day and the Black Eyed Peas took home two awards each.
As well as best female, Goodrem also took home the Pepsi Viewers
Choice Award, whilst Green Day bagged the prize for best rock video
for American Idiot.
Artists including Carmen Electra, Missy Higgins, Kelly Osbourne, Green
Day, Ja Rule and Natalie Imbruglia gave live performances at the
event.


In [53]:
import gensim

In [54]:
from gensim.summarization.summarizer import summarizer
# from gensim.summarization import keywords

ImportError: cannot import name 'summarizer' from 'gensim.summarization.summarizer' (c:\Users\Loyumba\AppData\Local\Programs\Python\Python310\lib\site-packages\gensim\summarization\summarizer.py)

In [39]:
# https://radimrehurek.com/gensim_3.8.3/summarization/summariser.html
# https://arxiv.org/abs/1602.03606
# Parameters
# text (str) – Given text.
# ratio (float, optional) – Number between 0 and 1 that determines the
#     proportion of the number of sentences of the original text to be
#     chosen for the summary.
# word_count (int or None, optional) – Determines how many words will the
#     output contain. If both parameters are provided, the ratio will be
#     ignored.
# split (bool, optional) – If True, list of sentences will be returned.
#     Otherwise joined strings will bwe returned.
from gensim.summarization.summarizer import summarize
summary = summarize(doc.iloc[0].split("\n", 1)[1])
print(wrap(summary))

ModuleNotFoundError: No module named 'gensim.summarization'

The summarization code was removed from Gensim 4.0. See:https://github.com/RaRe-Technologies/gensim/wiki/Migrating-from-Gensim-3.x-to-4#12-removed-gensimsummarization

If you need it, you could try:

- installing an older gensim version (such as 3.8.3, the last official release in which it remained); or…
- copy the source code out to your own local module

This can be done by creating an env and then installing gensim=3.8.3

pip install gensim=3.8.3
