In [2]:
import pandas as pd
import nltk
import textwrap
import numpy as np
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer , PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer

In [4]:
# Fetch the NLTK resources used below: the Punkt tokenizer models
# (needed for sentence splitting) and the stop-word lists.
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [5]:
# Dataset reference:
# https://www.kaggle.com/shivamkushwaha/bbc-full-text-document-classification
# -nc (no-clobber) skips the download when the CSV is already present.
!wget -nc https://lazyprogrammer.me/course_files/nlp/bbc_text_cls.csv


--2023-07-30 18:44:07--  https://lazyprogrammer.me/course_files/nlp/bbc_text_cls.csv
Resolving lazyprogrammer.me (lazyprogrammer.me)... 172.67.213.166, 104.21.23.210, 2606:4700:3030::ac43:d5a6, ...
Connecting to lazyprogrammer.me (lazyprogrammer.me)|172.67.213.166|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5085081 (4.8M) [text/csv]
Saving to: ‘bbc_text_cls.csv’


2023-07-30 18:44:08 (4.37 MB/s) - ‘bbc_text_cls.csv’ saved [5085081/5085081]



In [6]:
# Load the downloaded CSV of articles into a DataFrame.
df=pd.read_csv('bbc_text_cls.csv')

In [7]:
# Preview the first rows to check the available columns.
df.head()

Unnamed: 0,text,labels
0,Ad sales boost Time Warner profit\n\nQuarterly...,business
1,Dollar gains on Greenspan speech\n\nThe dollar...,business
2,Yukos unit buyer faces loan claim\n\nThe owner...,business
3,High fuel prices hit BA's profits\n\nBritish A...,business
4,Pernod takeover talk lifts Domecq\n\nShares in...,business


In [9]:
# Draw one article (reproducibly, via the fixed seed) from the 'business' category.
doc = df.loc[df.labels == 'business', 'text'].sample(random_state=42)

In [12]:
def wrap(x):
  """Wrap text to textwrap's default width while keeping its line breaks.

  Each input line (as split by ``str.splitlines``) is wrapped on its own and
  the results are joined with newlines, so blank lines between paragraphs
  survive. Wrapping the whole string at once with ``replace_whitespace=False``
  lets embedded newlines count towards the line width, which yields short,
  oddly broken lines right after every paragraph break.

  Args:
    x: The text to wrap; may contain newlines.

  Returns:
    The wrapped text as a single string.
  """
  return "\n".join(
      textwrap.fill(line, fix_sentence_endings=True)
      for line in x.splitlines()
  )

In [13]:
# Display the sampled article, wrapped for readability.
print(wrap(doc.iloc[0]))

Christmas sales worst since 1981

UK retail sales fell in December,
failing to meet expectations and making it by some counts the worst
Christmas since 1981.

Retail sales dropped by 1% on the month in
December, after a 0.6% rise in November, the Office for National
Statistics (ONS) said.  The ONS revised the annual 2004 rate of growth
down from the 5.9% estimated in November to 3.2%. A number of
retailers have already reported poor figures for December.  Clothing
retailers and non-specialist stores were the worst hit with only
internet retailers showing any significant growth, according to the
ONS.

The last time retailers endured a tougher Christmas was 23 years
previously, when sales plunged 1.7%.

The ONS echoed an earlier
caution from Bank of England governor Mervyn King not to read too much
into the poor December figures.  Some analysts put a positive gloss on
the figures, pointing out that the non-seasonally-adjusted figures
showed a performance comparable with 2003. The Novembe

In [15]:
sents =nltk.sent_tokenize(doc.iloc[0].split("\n",1)[1])  # The split drops the title, which is the first line of the doc.
# We tokenize the body into sentences and score each sentence by averaging
# the TF-IDF values of the words it contains.
# The sentences with the highest average scores are displayed as the summary.
sents

['\nUK retail sales fell in December, failing to meet expectations and making it by some counts the worst Christmas since 1981.',
 'Retail sales dropped by 1% on the month in December, after a 0.6% rise in November, the Office for National Statistics (ONS) said.',
 'The ONS revised the annual 2004 rate of growth down from the 5.9% estimated in November to 3.2%.',
 'A number of retailers have already reported poor figures for December.',
 'Clothing retailers and non-specialist stores were the worst hit with only internet retailers showing any significant growth, according to the ONS.',
 'The last time retailers endured a tougher Christmas was 23 years previously, when sales plunged 1.7%.',
 'The ONS echoed an earlier caution from Bank of England governor Mervyn King not to read too much into the poor December figures.',
 'Some analysts put a positive gloss on the figures, pointing out that the non-seasonally-adjusted figures showed a performance comparable with 2003.',
 'The November-De

In [22]:
featurizer =TfidfVectorizer(stop_words=stopwords.words('english'),norm='l1',)
# L1 normalization rescales each sentence's TF-IDF weights to sum to 1, intended
# to avoid favouring longer sentences, which contain more words.
# NOTE(review): because the weights are non-negative and sum to 1, the average of
# a row's non-zero weights (the score used later) equals 1 / (number of distinct
# non-stop-word terms), so the ranking simply prefers sentences with fewer terms.
# Consider a different norm or scoring rule.

In [23]:
# Fit the vectorizer on this document's sentences only; X holds one row per sentence.
X= featurizer.fit_transform(sents)

In [26]:
def get_sentences_score(tfidf_row):
  """Score a sentence as the mean of its non-zero TF-IDF weights.

  Args:
    tfidf_row: TF-IDF representation of one sentence. May be a sparse row
      (anything exposing ``toarray()``, e.g. ``X[i, :]`` from a vectorizer)
      or a dense array-like.

  Returns:
    The average of the non-zero entries, or 0.0 when the row has none
    (e.g. a sentence made up entirely of stop words). Returning 0.0 avoids
    the NaN and RuntimeWarning that the mean of an empty selection produces.

  NOTE(review): with L1-normalized, non-negative rows the entries sum to 1,
  so this mean reduces to 1 / (number of non-zero entries).
  """
  # Densify sparse rows; a single sentence row is small, so this is cheap.
  dense = tfidf_row.toarray() if hasattr(tfidf_row, "toarray") else tfidf_row
  values = np.asarray(dense).ravel()
  nonzero = values[values != 0]
  if nonzero.size == 0:
    return 0.0
  return nonzero.mean()

In [28]:
# One score per sentence, in the same order as `sents` / the rows of X.
scores = np.array(
    [get_sentences_score(X[row_idx, :]) for row_idx in range(len(sents))],
    dtype=float,
)

In [32]:
# Sentence indices ordered from highest to lowest score
# (argsort is ascending, so the scores are negated).
sort_idx=np.argsort(-scores)
sort_idx

array([ 3, 10,  2, 16, 12,  5,  1, 11,  4,  7,  6, 14,  0,  9, 13,  8, 15])

We have multiple ways to choose which sentences to display:
1. Top N sentences
2. Top N characters or words
3. A score threshold, e.g. keep only sentences whose average score exceeds a cutoff.


In [33]:
# Show the five best-scoring sentences, highest score first, each prefixed by its score.
print("Generated summary: ")
top_five = sort_idx[:5]
for sent_pos in top_five:
  scored_line = "%.2f: %s" % (scores[sent_pos], sents[sent_pos])
  print(wrap(scored_line))

Generated summary: 
0.14: A number of retailers have already reported poor figures for
December.
0.13: However, reports from some High Street retailers highlight the
weakness of the sector.
0.12: The ONS revised the annual 2004 rate of growth down from the
5.9% estimated in November to 3.2%.
0.10: "Our view is the Bank of England will keep its powder dry and
wait to see the big picture."
0.10: And a British Retail Consortium survey found that Christmas 2004
was the worst for 10 years.


In [34]:
# The article's title is its first line (everything before the first newline).
doc.iloc[0].split("\n",1)[0]  #Title

'Christmas sales worst since 1981'

In [41]:
#a function to do all the tasks of the summarizer
# A function that performs all the summarizer steps in one call.
def summarize(text, title=None, n_sentences=5):
  """Print an extractive summary of `text`.

  The text is split into sentences, the shared `featurizer` is re-fitted on
  those sentences, each sentence is scored with `get_sentences_score`, and
  the highest-scoring sentences are printed (best first) with their scores.

  Args:
    text: The article body to summarize (without its title line).
    title: Title to print above the summary. When omitted, the title is
      taken from the first line of the notebook-level `doc` variable, which
      preserves the previous behaviour; pass it explicitly to avoid depending
      on that global.
    n_sentences: How many top-scoring sentences to print (default 5).

  Returns:
    None. The summary is written to standard output.
  """
  sents = nltk.sent_tokenize(text)

  # Fitting per document means IDF is computed over this text's sentences only.
  X = featurizer.fit_transform(sents)

  scores = np.array(
      [get_sentences_score(X[i, :]) for i in range(len(sents))],
      dtype=float,
  )

  # Negate so that argsort's ascending order yields highest scores first.
  ranked = np.argsort(-scores)

  if title is None:
    # Legacy fallback: relies on the global `doc` holding the current article.
    title = doc.iloc[0].split("\n",1)[0]

  print("Summary: ")
  print("Title: " + title + "\n\n")
  for i in ranked[:n_sentences]:
    print(wrap("%.2f: %s" % (scores[i], sents[i])))

In [42]:
#lets try new document
doc = df[df.labels == 'entertainment']['text'].sample(random_state=123)
#try our function
summarize(doc.iloc[0].split("\n",1)[1])

Summary: 
Title: Goodrem wins top female MTV prize


0.11: The Black Eyed Peas won awards for best R 'n' B video and
sexiest video, both for Hey Mama.
0.10: The ceremony was held at the Luna Park fairground in Sydney
Harbour and was hosted by the Osbourne family.
0.10: Goodrem, Green Day and the Black Eyed Peas took home two awards
each.
0.10: Other winners included Green Day, voted best group, and the
Black Eyed Peas.
0.10: The VH1 First Music Award went to Cher honouring her
achievements within the music industry.
