Centroids - most relevant tokens; tokens that contain the same meaning
1. Sum up the vector representations of the words that are part of a centroid => get the embedding representation of the centroid.
2. Every sentence is scored (cosine similarity) based on how similar it is to the centroid embedding.
3. Select sentences based on their score until a certain number of words (hyperparameter) is reached
4. Avoid redundancy - if a chosen sentence is too similar to the ones in the already produced summary, don't add it (cosine similarity + predefined threshold)

https://aclanthology.org/W17-1003.pdf

https://arxiv.org/pdf/1707.02268v3.pdf

News headlines

Web snippets from search results

In [122]:
# Small multi-sentence sample (with punctuation) used to exercise the steps below.
text = (
    'Some sample text containing punctuation. '
    'This is just an example - for testing. '
    'Nothing more'
)

In [123]:
import nltk
from nltk.corpus import stopwords
import numpy as np
import re
import string

from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline
from typing import List

In [124]:
# English stop words from the NLTK corpus, held in a set for fast membership tests.
STOP_WORDS = {*stopwords.words('english')}

In [125]:
# Alias used in the annotations below: a vector is a flat list of floats.
vector = List[float]


def dot(v: vector, w: vector) -> float:
    """Return the dot product of *v* and *w*.

    Components are paired with ``zip``, so if the vectors differ in length
    the surplus components of the longer one are ignored.
    """
    return sum(vi * wi for vi, wi in zip(v, w))


def cos_sim(v: vector, w: vector) -> float:
    """Return the cosine similarity between *v* and *w*.

    Returns ``0.0`` when either vector is empty or all-zero. In that case the
    cosine is undefined, and dividing by the zero norm product would raise
    ``ZeroDivisionError``.
    """
    norm_product = (dot(v, v) * dot(w, w)) ** .5
    if norm_product == 0:
        # At least one vector has zero length: treat it as "not similar".
        return 0.0
    return dot(v, w) / norm_product


In [126]:
class Preprocessing(object):
    """Stateful text cleaner.

    Every cleaning method rewrites ``self.text`` in place and also returns the
    new value, so steps can be applied one at a time or all at once through
    ``basic_pipeline``. Calling the instance returns the current text.
    """

    # Translation table that deletes every character in ``string.punctuation``.
    # Built once here instead of on every remove_punctuation() call.
    _PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

    def __init__(self, text: str) -> None:
        """Store *text* as the working copy and keep the input in ``oryg``."""
        self.text = text
        # Copy of the input that the cleaning methods never modify.
        self.oryg = text

    def lower(self) -> str:
        """Lower-case the working text and return it."""
        self.text = self.text.lower()
        return self.text

    def remove_punctuation(self) -> str:
        """Delete all ``string.punctuation`` characters and return the text."""
        self.text = self.text.translate(self._PUNCTUATION_TABLE)
        return self.text

    def remove_stop_words(self) -> str:
        """Drop whitespace-separated words found in ``STOP_WORDS``.

        The remaining words are re-joined with single spaces, so runs of
        whitespace are collapsed as a side effect.
        """
        self.text = ' '.join(
            word for word in self.text.split() if word not in STOP_WORDS
        )
        return self.text

    def remove_digits(self) -> str:
        """Delete every digit from the working text and return it.

        Uses ``\\d+`` rather than the character class ``[\\d+]``, which would
        also match (and silently delete) literal ``+`` characters.
        """
        self.text = re.sub(r'\d+', '', self.text)
        return self.text

    def basic_pipeline(self) -> str:
        """Run lower -> digits -> punctuation -> stop words; return the result.

        Lower-casing runs first so the stop-word comparison is done on
        lower-cased words.
        """
        # NOTE(review): punctuation is stripped before stop-word filtering, so
        # a contraction such as "don't" is compared as "dont" - confirm that
        # this matches the entries in STOP_WORDS.
        self.lower()
        self.remove_digits()
        self.remove_punctuation()
        self.remove_stop_words()
        return self.text

    def __call__(self) -> str:
        """Return the current working text."""
        return self.text

In [127]:
# Wrap the sample text and run every cleaning step on it. basic_pipeline()
# returns the cleaned string, which the notebook echoes as this cell's output.
cleaned_text = Preprocessing(text)
cleaned_text.basic_pipeline()

'sample text containing punctuation example testing nothing'

In [128]:
# Split the *raw* text into sentences first, then clean each sentence on its
# own. basic_pipeline() removes all punctuation, so tokenizing the already
# cleaned text leaves sent_tokenize no sentence-ending punctuation to split on.
sentences = [
    Preprocessing(sentence).basic_pipeline()
    for sentence in sent_tokenize(text)
]

In [129]:
# Dense tf-idf matrix with one row per entry of `sentences`: raw term counts
# followed by tf-idf weighting with no normalisation, no sublinear tf scaling
# and no idf smoothing.
tfidf = (
    Pipeline(steps=[
        ('count', CountVectorizer()),
        ('tfidf', TfidfTransformer(smooth_idf=False, sublinear_tf=False, norm=None)),
    ])
    .fit_transform(sentences)
    .toarray()
)