In [1]:
import pandas as pd
import numpy as np
import textwrap
import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Fetch the NLTK resources this notebook relies on (nothing is re-downloaded
# when the package is already up to date):
#   'punkt'     - sentence tokenizer data used by nltk.sent_tokenize
#   'stopwords' - word lists behind stopwords.words('english')
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Loyumba\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Loyumba\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
# Dataset source:
# https://www.kaggle.com/shivamkushwaha/bbc-full-text-document-classification
# NOTE(review): this is an absolute, machine-specific path; the cell only runs
# where that exact file exists. Consider a configurable data directory.
df = pd.read_csv(r'D:\Notes and Exercises\Machine-Learning\dataset\bbc_text_cls.csv')

In [4]:
# Peek at the first rows: each record has the article `text` and its `labels`.
df.head()

Unnamed: 0,text,labels
0,Ad sales boost Time Warner profit\n\nQuarterly...,business
1,Dollar gains on Greenspan speech\n\nThe dollar...,business
2,Yukos unit buyer faces loan claim\n\nThe owner...,business
3,High fuel prices hit BA's profits\n\nBritish A...,business
4,Pernod takeover talk lifts Domecq\n\nShares in...,business


In [5]:
# Draw one business article (a length-1 Series); the fixed seed keeps the
# pick reproducible across runs.
doc = df.loc[df.labels == 'business', 'text'].sample(random_state=33)

In [6]:
def wrap(x):
    """Return `x` re-flowed to textwrap's default line width for display.

    Existing whitespace characters (including newlines) are kept as they are
    rather than being converted to spaces, and sentence endings are
    normalised to be followed by two spaces.
    """
    wrapper = textwrap.TextWrapper(
        replace_whitespace=False,
        fix_sentence_endings=True,
    )
    return wrapper.fill(x)

In [7]:
# Show the sampled article in full (headline included), wrapped for reading.
print(wrap(doc.iloc[0]))

Karachi stocks hit historic high

The Karachi Stock Exchange (KSE) has
recorded its largest single day gain, surging 3.5% to a new high.

The
index rose 225.79 points in four hours of furious trading, with many
investors optimistic that political stability could bring an economic
boom.  The KSE index closed at 6709.93 - an overall gain of nearly 400
points in the first two trading days of the week.  Energy and
telecommunication stocks performed particularly well, recording an
8%-10% rise since Monday morning.

In 2002, the KSE was the world's
best performing stock market, with the index rising 112%.

Pakistani
investors are expecting the KSE to repeat, if not improve on, its 2002
performance.  Jubilant investors danced on the streets as the market
closed for the day on Tuesday, confident that the boom will continue
at least until the public holiday on 22 January.  Others, however, who
had stayed out fearing an imminent collapse because of prices
overheating, continued to warn that the 

In [8]:
# Split once on the first newline and keep the part after it, so the first
# line (the headline) is left out; then break the remainder into sentences.
sents = nltk.sent_tokenize(doc.iloc[0].split("\n", 1)[1])

In [9]:
# TF-IDF over sentences, ignoring NLTK's English stop words.
# norm='l1' scales each row so its absolute values sum to 1, which means the
# per-sentence scores computed below depend on how many distinct terms a
# sentence has.
featurizer = TfidfVectorizer(stop_words=stopwords.words('english'), norm='l1')

In [10]:
# Fit on this article's sentences only: one row of X per sentence.
X = featurizer.fit_transform(sents)

In [11]:
def get_sentence_score(tfidf_row):
    """Score a sentence as the mean of its non-zero TF-IDF weights.

    Parameters
    ----------
    tfidf_row
        The TF-IDF representation of one sentence, e.g. ``X[i, :]`` from the
        vectorizer output (a plain NumPy array works as well).

    Returns
    -------
    float
        Average of the non-zero entries, or ``0.0`` when the row has no
        non-zero entry at all.
    """
    x = tfidf_row[tfidf_row != 0]
    # A row with nothing but zeros would otherwise produce the mean of an
    # empty selection: nan plus a RuntimeWarning. Give it the lowest score.
    if x.size == 0:
        return 0.0
    return x.mean()

In [12]:
# One score per sentence, in document order.
scores = np.array(
    [get_sentence_score(X[row, :]) for row in range(len(sents))],
    dtype=float,
)

In [13]:
# Sentence indices ordered from highest score to lowest (negate, then sort
# ascending).
sort_idx = (-scores).argsort()

In [14]:
# Many options for how to choose which sentences to include:

# 1) top N sentences
# 2) top N words or characters.
# 3) top X% sentences or top X% words
# 4) sentences with scores > average score
# 5) sentences with scores > factor * average score

# You also don't have to print the chosen sentences sorted by score:
# showing them in their original document order may read more naturally.

In [15]:
print("Generated summary")
# Five best-scoring sentences, highest score first, each prefixed by its score.
for idx in sort_idx[:5]:
    line = "%.2f:%s" % (scores[idx], sents[idx])
    print(wrap(line))

Generated summary
0.17:There are indications of yet another upgrade by the end of
February.
0.17:"You can call it a peace dividend," smiles one broker.
0.17:Standard & Poor's upgraded Pakistan a few weeks ago.
0.14:"Let us see how long one can reap its benefits."
0.12:Pakistani investors are expecting the KSE to repeat, if not
improve on, its 2002 performance.


In [16]:
# The text before the first newline is the headline; show it for comparison
# with the generated summary.
doc.iloc[0].split("\n", 1)[0]

'Karachi stocks hit historic high'

In [17]:
def summarize(text, top_n=5):
    """Print the highest-scoring sentences of `text` as an extractive summary.

    The text is split into sentences, the module-level `featurizer` is re-fit
    on those sentences, each sentence is scored with `get_sentence_score`,
    and the best ones are printed (wrapped) as ``"<score>: <sentence>"``,
    highest score first.

    Parameters
    ----------
    text : str
        Body text to summarise.
    top_n : int, default 5
        Maximum number of sentences to print.

    Returns
    -------
    None
        The summary is printed, not returned. Nothing is printed when `text`
        contains no sentences.
    """
    # extract sentences
    sents = nltk.sent_tokenize(text)
    if not sents:
        # Nothing to rank; don't hand the vectorizer an empty document list.
        return

    # perform tf-idf (this re-fits the shared featurizer on these sentences)
    X = featurizer.fit_transform(sents)

    # compute a score for each sentence, in document order
    scores = np.array(
        [get_sentence_score(X[i, :]) for i in range(len(sents))],
        dtype=float,
    )

    # indices from highest to lowest score
    sort_idx = np.argsort(-scores)

    # print summary
    for i in sort_idx[:top_n]:
        print(wrap("%.2f: %s" % (scores[i], sents[i])))

In [18]:
# Repeat the experiment on one reproducibly sampled entertainment article,
# summarising everything after the headline line.
doc = df.loc[df.labels == 'entertainment', 'text'].sample(random_state=123)
summarize(
    doc.iloc[0].split("\n", 1)[1]
)

0.11: The Black Eyed Peas won awards for best R 'n' B video and
sexiest video, both for Hey Mama.
0.10: The ceremony was held at the Luna Park fairground in Sydney
Harbour and was hosted by the Osbourne family.
0.10: Goodrem, Green Day and the Black Eyed Peas took home two awards
each.
0.10: Other winners included Green Day, voted best group, and the
Black Eyed Peas.
0.10: The VH1 First Music Award went to Cher honouring her
achievements within the music industry.


In [19]:
# Headline of the entertainment article (text before the first newline).
doc.iloc[0].split("\n", 1)[0]

'Goodrem wins top female MTV prize'

In [20]:
# Full entertainment article, wrapped, to compare against the summary above.
print(wrap(doc.iloc[0]))

Goodrem wins top female MTV prize

Pop singer Delta Goodrem has
scooped one of the top individual prizes at the first Australian MTV
Music Awards.

The 21-year-old singer won the award for best female
artist, with Australian Idol runner-up Shannon Noll taking the title
of best male at the ceremony.  Goodrem, known in both Britain and
Australia for her role as Nina Tucker in TV soap Neighbours, also
performed a duet with boyfriend Brian McFadden.  Other winners
included Green Day, voted best group, and the Black Eyed Peas.
Goodrem, Green Day and the Black Eyed Peas took home two awards each.
As well as best female, Goodrem also took home the Pepsi Viewers
Choice Award, whilst Green Day bagged the prize for best rock video
for American Idiot.  The Black Eyed Peas won awards for best R 'n' B
video and sexiest video, both for Hey Mama.  Local singer and
songwriter Missy Higgins took the title of breakthrough artist of the
year, with Australian Idol winner Guy Sebastian taking the honours f