In [None]:
import pandas as pd
import numpy as np

In [None]:
import warnings

# Silence every warning category for the rest of the session.
# NOTE(review): this also hides deprecation/convergence warnings raised by the
# libraries used further down; consider narrowing the filter.
warnings.filterwarnings("ignore")

In [None]:
# Path to the input CSV file.
# NOTE(review): hard-coded absolute path under /content/drive — presumably needs
# Google Drive mounted in Colab; adjust when running elsewhere.
path='/content/drive/MyDrive/extractive summarisation/csv files/Copy of data.csv'
# Read the CSV into a DataFrame.
df=pd.read_csv(path)
# Preview the first rows.
df.head()

Unnamed: 0,business,entertainment,sport,tech,politics
0,China now top trader with Japan\n\nChina overt...,Indie film nominations announced\n\nMike Leigh...,Uefa approves fake grass\n\nUefa says it will ...,File-swappers ready new network\n\nLegal attac...,Minister defends hunting ban law\n\nThe law ba...
1,Oil prices reach three-month low\n\nOil prices...,Berlin celebrates European cinema\n\nOrganiser...,,'Evil twin' fear for wireless net\n\nPeople us...,Tory expert denies defeat warning\n\nThe Conse...
2,Fed chief warning on US deficit\n\nFederal Res...,Robots march to US cinema summit\n\nAnimated m...,Man Utd through after Exeter test\n\nMancheste...,New Year's texting breaks record\n\nA mobile p...,Parties warned over 'grey vote'\n\nPolitical p...
3,US regulator to rule on pain drug\n\nUS food a...,Spike Lee backs student directors\n\nFilm-make...,Wenger handed summer war chest\n\nArsenal boss...,Blinx sequel purrs nicely\n\nThe original Blin...,Baron Kinnock makes Lords debut\n\nFormer Labo...
4,Insurance bosses plead guilty\n\nAnother three...,Singer Knight backs anti-gun song\n\nR&B star ...,Borders 19-20 Ulster\n\nUlster clung on for a ...,"Gamers could drive high-definition\n\nTV, film...",No to Royal succession shake-up\n\nA Labour pe...


In [None]:
# (number of rows, number of columns) of the loaded frame
df.shape

(347, 5)

In [None]:
# Working set: the first 10 entries of the 'entertainment' column as a plain list.
articles=df['entertainment'].to_list()[:10]
# Number of selected articles and the length (len) of the first one.
len(articles),len(articles[0])

(10, 2198)

In [None]:
import re,spacy
# Load spaCy's 'en_core_web_sm' pipeline once; `nlp` is the module-level
# object that preprocess() and get_sents() call.
nlp=spacy.load('en_core_web_sm')

In [None]:
def preprocess(text):
    '''
    function used to clean a text and return it as lowercase tokens joined by spaces

    Newlines are turned into spaces, every character that is neither a word
    character nor whitespace is removed and whitespace runs are collapsed.
    The result is tokenised with the module-level spaCy pipeline `nlp`; stop
    words and tokens whose lowercase form starts with 'xmath' or 'xcite' are
    dropped.

    parameters:text
    returns:cleaned text as a single string

    '''
    # Clean-up passes, applied in this order:
    # newline -> space, strip punctuation, collapse whitespace.
    substitutions = (
        (r'\n', ' '),
        (r'[^\w\s]', ''),
        (r'\s+', ' '),
    )
    cleaned = text
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    cleaned = cleaned.strip()

    skipped_prefixes = ('xmath', 'xcite')
    kept = []
    for token in nlp(cleaned):
        if token.is_stop:
            continue
        lowered = token.text.lower()
        if lowered.startswith(skipped_prefixes):
            continue
        kept.append(lowered)

    return " ".join(kept)

In [None]:
def get_sents(text):
    '''
    function used to split the text into sentences with the spaCy pipeline `nlp`

    parameters:text
    returns:list with the text of every sentence, in document order

    '''
    return [span.text for span in nlp(text).sents]

In [None]:
def sent_order_dict(text):
    '''
    function used to number the sentences of a text (starting at 0)

    parameters:text
    returns:two dicts - sentence -> position and position -> sentence.
            If the same sentence text occurs more than once, the
            sentence -> position dict keeps the last position seen.

    '''
    ordered = get_sents(text)
    sent_to_id = {sent: pos for pos, sent in enumerate(ordered)}
    id_to_sent = dict(enumerate(ordered))
    return sent_to_id, id_to_sent

In [None]:
!pip install sentence_transformers

In [None]:
from sentence_transformers import SentenceTransformer

# Load the pre-trained 'bert-base-nli-mean-tokens' SBERT model once;
# `sbert_model` is the module-level encoder used by vec_sent_map().
sbert_model = SentenceTransformer('bert-base-nli-mean-tokens')

In [None]:
# Encode a single sentence: the first sentence of the article at index 1.
# NOTE(review): the recorded output below shows a business headline although
# `articles` is built from the 'entertainment' column — the output looks stale
# (cells probably run out of order); re-run on a fresh kernel to confirm.
sentence=get_sents(articles[1])[0]
print("Sentence:", sentence)
# encode() on one string gives a single embedding vector.
sentence_embedding = sbert_model.encode(sentence)
print("Sentence Embedding Shape:", sentence_embedding.shape,type(sentence_embedding))

Sentence: Oil prices reach three-month low

Oil prices have fallen heavily for a second day, closing at three-month lows after news that US crude stocks have improved ahead of winter.


Sentence Embedding Shape: (768,) <class 'numpy.ndarray'>


In [None]:
def vec_sent_map(d):
    '''
    function used to embed every sentence with `sbert_model` and index the result

    parameters:dictionary with sentences as keys and sentence ids as values
    returns:two dicts - sentence id -> embedding vector, and
            tuple(embedding vector) -> sentence id

    '''
    id_to_vec = {}
    vec_to_id = {}
    for sentence in d:
        sent_id = d[sentence]
        embedding = sbert_model.encode(sentence)
        id_to_vec[sent_id] = embedding
        # arrays are not hashable, so the reverse lookup is keyed by a tuple
        vec_to_id[tuple(embedding)] = sent_id

    return id_to_vec, vec_to_id


In [None]:
!pip install umap-learn


In [None]:
def reduce_dim(sent_vec,n=100):
    '''
    function used to project the sentence embeddings to n dimensions with UMAP

    parameters:sent_vec - dictionary whose values are the embedding vectors
               n - number of UMAP components (default 100)
    returns:two dicts - tuple(original vector) -> reduced vector, and
            tuple(reduced vector) -> original vector

    '''
    from umap import UMAP

    originals = list(sent_vec.values())
    reducer = UMAP(n_components=n, n_neighbors=45)
    reduced = reducer.fit_transform(np.array(originals))

    high_low = {}
    low_high = {}
    for idx, original in enumerate(originals):
        projected = reduced[idx]
        # arrays are not hashable, so both lookups are keyed by tuples
        high_low[tuple(original)] = projected
        low_high[tuple(projected)] = original

    return high_low, low_high

In [None]:
def kmeans_k(data, k_range=range(5, 11), random_state=42):
    '''
    function used to find best k value using silhouette score

    A KMeans model is fitted for every feasible candidate k and the one with
    the highest average silhouette score is kept (the first one wins on ties).
    The score of every tried k is printed.

    parameters:data - reduced sentence vectors (one row per sentence)
               k_range - candidate numbers of clusters (default 5..10)
               random_state - seed passed to KMeans so that repeated runs
                              give the same clustering (default 42)
    returns:k value,kmeans model
    raises:ValueError when no candidate k fits the number of samples

    '''
    from sklearn.cluster import KMeans
    from sklearn.metrics import silhouette_score

    n_samples = len(data)
    # silhouette_score is only defined for 2 <= n_labels <= n_samples - 1, so
    # candidates outside that window are skipped instead of crashing on short
    # texts.
    candidates = [k for k in k_range if 2 <= k <= n_samples - 1]
    if not candidates:
        raise ValueError(
            f'no usable k in {list(k_range)} for {n_samples} samples; '
            'need 2 <= k <= n_samples - 1'
        )

    best_k, best_model = None, None
    best_score = float('-inf')
    for n_clusters in candidates:
        model = KMeans(n_clusters=n_clusters, random_state=random_state)
        model.fit(data)
        score = silhouette_score(data, model.labels_)

        # strict '>' keeps the smallest k when two candidates tie
        if score > best_score:
            best_k, best_model = n_clusters, model
            best_score = score

        print(f'k: {n_clusters}\tsilhouette score: {score}')

    print()
    return best_k, best_model


In [None]:
def distance(x,y):
    '''
    function used to calculate the euclidean distance between two vectors
    parameters:x,y - arrays supporting subtraction (e.g. cluster center and reduced vector)
    returns:euclidean distance

    '''
    offset = x - y
    return np.linalg.norm(offset)

In [None]:

def get_summary(text):
    '''
    function used to get one sentence per each cluster

    The text is split into sentences, every sentence is embedded, the
    embeddings are reduced with reduce_dim and clustered with kmeans_k.  For
    every cluster the sentence whose reduced vector is closest to the cluster
    center is selected.  The selection is printed and returned in the order
    the sentences appear in the text.

    parameters:text
    returns:list of (sentence, sentence id) tuples sorted by sentence id

    '''
    sent_order,id_sent=sent_order_dict(text)

    sent_vec,vec_sent=vec_sent_map(sent_order)

    high_low,low_high=reduce_dim(sent_vec,n=len(sent_order)-2)

    # reduced vectors, in the order reduce_dim stored them
    data=list(high_low.values())

    k,kmeans=kmeans_k(data)
    centers,labels=kmeans.cluster_centers_,kmeans.labels_

    k_sents=[]
    for cluster_id in range(k):
        center=centers[cluster_id]
        min_d,point=float('inf'),None

        for j,label in enumerate(labels):
            if label!=cluster_id:
                continue
            d=distance(center,data[j])
            # Store the distance itself as the running minimum.  Storing the
            # result of the comparison (a boolean) would make every later
            # member with distance < 1 replace the current choice.
            if d<min_d:
                min_d,point=d,data[j]

        if point is None:
            # cluster without any member: nothing to pick
            continue

        # walk back: reduced vector -> original embedding -> sentence id -> sentence
        high=low_high[tuple(point)]
        sent_no=vec_sent[tuple(high)]
        sent=id_sent[sent_no]
        k_sents.append((sent,sent_no))

    # present the chosen sentences in document order
    k_sents.sort(key=lambda x:x[1])
    print('\nExtractive summary : ',k,'(no of clusters) indicates no of sentences :\n')
    for i in k_sents:
        print(i[1],',',f'{i[0]}')
    return k_sents





In [None]:
# Summarise the article at index 1; get_summary prints the silhouette score of
# every tried k followed by the selected sentences with their ids.
extractive_summary=get_summary(articles[1])

k: 5	silhouette score: 0.23807385563850403
k: 6	silhouette score: 0.19890525937080383
k: 7	silhouette score: 0.159412682056427
k: 8	silhouette score: 0.11233250796794891
k: 9	silhouette score: 0.10403744876384735
k: 10	silhouette score: 0.059301797300577164


Extractive summary :  5 (no of clusters) indicates no of sentences :

0 , Berlin celebrates European cinema

Organisers say this year's Berlin Film Festival, which opens on Thursday with period epic Man to Man, will celebrate a revitalised European cinema.


5 , "It's just that there are more good European films.
7 , "There is no anti-American mood," he said.
9 , More than a dozen celebrities are scheduled to attend, among them Will Smith, Kevin Spacey and Keanu Reeves.
12 , The 10-day Berlinale runs until 20 February.



In [None]:
# Summarise the article at index 5 (overwrites the previous `extractive_summary`).
extractive_summary=get_summary(articles[5])

k: 5	silhouette score: 0.18146561086177826
k: 6	silhouette score: 0.18473830819129944
k: 7	silhouette score: 0.16574089229106903
k: 8	silhouette score: 0.12016880512237549
k: 9	silhouette score: 0.12461133301258087
k: 10	silhouette score: 0.10553476214408875


Extractive summary :  6 (no of clusters) indicates no of sentences :

7 , If this isn't one of the films of the year, I don't know what is."
9 , The production company is behind films such as My Beautiful Laundrette, Billy Elliot, About A Boy, Shaun of The Dead and Bridget Jones: The Edge of Reason.


10 , Simon Pegg, who stars in and co-wrote Shaun of the Dead, won the 2004 Peter Sellers Award For Comedy.
11 , Other winners included Emily Blunt and Nathalie Press who were jointly named ITV London Most Promising Newcomer Award for their performances in Pawel Pawlikowski's rites-of-passage story, My Summer of Love.
12 , Pawlikowski won the best screenplay statuette, while Roger Deakins won the Technical Achievement Award for his c

In [None]:
# Summarise the article at index 3 (overwrites the previous `extractive_summary`).
extractive_summary=get_summary(articles[3])

k: 5	silhouette score: 0.21380215883255005
k: 6	silhouette score: 0.13171420991420746
k: 7	silhouette score: 0.10425417125225067
k: 8	silhouette score: 0.09970640391111374
k: 9	silhouette score: 0.044045235961675644
k: 10	silhouette score: 0.023077398538589478


Extractive summary :  5 (no of clusters) indicates no of sentences :

4 , Returning to his old university, which educates only African American students, Lee discussed the challenges facing black people in the entertainment industry.
6 , But when it comes time to do a movie, he has to go to one of those gatekeepers," Lee said.
7 , He told aspiring young film-makers in the audience not to ignore non-traditional routes to getting a movie made, including raising funds independently and releasing films straight to DVD.
9 , "It's not something that should be looked upon as a stepchild."
10 , Lee has made more than 25 films, including Jungle Fever, Do the Right Thing, Summer of Sam and 1986 hit She's Gotta Have It.

