# **Extractive Summarisation**

# Libraries & Packages

In [None]:
import pandas as pd
import nltk
import spacy
import re
from bs4 import BeautifulSoup
!pip install pytextrank
import pytextrank
!python -m spacy download en_core_web_md
from collections import defaultdict
from collections import Counter
from nltk.corpus import wordnet
from nltk import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.tokenize import RegexpTokenizer
from nltk.tag import pos_tag
from heapq import nlargest
# Fetch the NLTK resources requested by name below: 'punkt',
# 'averaged_perceptron_tagger' and 'stopwords'.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')
from nltk.corpus import stopwords
# spaCy pipeline shared by the TF-IDF and TextRank sections further down.
# NOTE(review): the "textrank" component is presumably registered by the
# pytextrank import above - confirm against the installed pytextrank version.
nlp = spacy.load('en_core_web_md')
nlp.add_pipe("textrank")
# English stopword list consulted by return_frequencies and basic_preprocess.
stops = stopwords.words('english')
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
# Lemmatizer used by basic_preprocess.
lemmatizer = WordNetLemmatizer()

In [None]:
# Load the article table; later cells read its 'body' and 'headline' columns.
df = pd.read_csv("all_articles_cleaned_no_unnecessary_words.csv")

In [None]:
# Normalise whitespace in the article bodies.  Newlines and runs of spaces are
# collapsed to ONE space (not removed) so that neighbouring words are not
# glued together ("foo\nbar" must not become "foobar").
df['body'] = df['body'].str.replace(r"\s+", " ", regex=True)
# Un-escape apostrophes.  The pattern is a literal backslash followed by a
# quote; the previous pattern "\'" is the same string as "'" and therefore
# replaced apostrophes with themselves.  regex=False keeps the match literal
# regardless of the pandas version's default.
df['body'] = df['body'].str.replace("\\'", "'", regex=False)

In [None]:
# Work on the first 1000 articles only.  Take an explicit copy: later cells
# assign new columns to `df`, and writing into a bare slice of the original
# frame triggers pandas' SettingWithCopy behaviour.
df = df.iloc[:1000].copy()

In [None]:
# Inspect the dtype of every column (the cell output is shown below).
df.dtypes

id                   float64
body                  object
headline              object
article_name          object
article_url           object
date_published        object
article_length         int64
date_uploaded         object
article_start_url     object
source                object
dtype: object

In [None]:
# Coerce every body to a string so the summarisers always receive text.
# NOTE(review): missing values presumably become the literal string 'nan'
# here rather than staying NaN - confirm that is intended.
df['body'] = df['body'].values.astype(str)

In [None]:
# Coerce every headline to a string; headlines are used as the ROUGE
# references in the evaluation section.
# NOTE(review): missing values presumably become the literal string 'nan'
# here rather than staying NaN - confirm that is intended.
df['headline'] = df['headline'].values.astype(str)

# Lexical Chain

In [None]:
def noun_relations(nouns):
    """Collect WordNet neighbours for every noun.

    For each noun synset of a noun the following names are gathered, in order:
    every lemma name (followed by the lemma's first antonym, if it has one),
    the first hyponym of each hyponym, and the first hypernym of each
    hypernym.

    Args:
        nouns: sequence of noun strings; duplicates are processed again.

    Returns:
        A ``defaultdict(list)`` mapping each noun to a list holding one
        relation list per occurrence of that noun in ``nouns``.
    """
    related_by_noun = defaultdict(list)
    for noun in nouns:
        related = []
        for synset in wordnet.synsets(noun, pos=wordnet.NOUN):
            for lemma in synset.lemmas():
                related.append(lemma.name())
                if lemma.antonyms():
                    related.append(lemma.antonyms()[0].name())
            # Two levels down the hierarchy: first child of each child.
            related.extend(
                child.hyponyms()[0].name().split('.')[0]
                for child in synset.hyponyms()
                if child.hyponyms()
            )
            # Two levels up the hierarchy: first parent of each parent.
            related.extend(
                parent.hypernyms()[0].name().split('.')[0]
                for parent in synset.hypernyms()
                if parent.hypernyms()
            )
        related_by_noun[noun].append(related)
    return related_by_noun

def generate_lexical_chain(nouns, relation_list, threshold=0.5):
    """Group nouns into lexical chains of semantically related words.

    Each chain is a dict mapping a noun to the number of times it occurred.
    Nouns are processed in order:

    * a noun that already belongs to a chain has its count incremented
      (checked first, so a related key earlier in a chain can no longer
      reset the count of a repeated noun back to 1);
    * otherwise it joins the first chain holding a key that is listed as a
      relation of the noun (or vice versa) and whose first noun synset has a
      Wu-Palmer similarity of at least ``threshold`` with the noun's;
    * otherwise it starts a new chain.

    Args:
        nouns: sequence of noun strings.
        relation_list: mapping noun -> list whose first element is the list
            of related words (the structure built by ``noun_relations``).
        threshold: minimum Wu-Palmer similarity for two words to be chained.

    Returns:
        List of chains (dicts of noun -> count) in creation order.
    """
    def _similar(key, noun):
        # Compare the first noun synset of each word.  Words without a noun
        # synset, or pairs for which no similarity is available, are treated
        # as unrelated instead of raising.
        key_senses = wordnet.synsets(key, pos=wordnet.NOUN)
        noun_senses = wordnet.synsets(noun, pos=wordnet.NOUN)
        if not key_senses or not noun_senses:
            return False
        score = key_senses[0].wup_similarity(noun_senses[0])
        return score is not None and score >= threshold

    lexical = []
    for noun in nouns:
        owner = next((chain for chain in lexical if noun in chain), None)
        if owner is not None:
            owner[noun] += 1
            continue

        for chain in lexical:
            linked = any(
                (key in relation_list[noun][0] or noun in relation_list[key][0])
                and _similar(key, noun)
                for key in chain
            )
            if linked:
                chain[noun] = 1
                break
        else:
            # No existing chain accepted the noun.
            lexical.append({noun: 1})
    return lexical

def prune(lexical):
    """Drop chains that consist of a single noun seen exactly once.

    The input list is consumed (left empty) and the surviving chains are
    returned in reverse of their original order.

    Args:
        lexical: list of chains (dicts of noun -> count); emptied in place.

    Returns:
        List of the chains that were kept.
    """
    survivors = [
        chain
        for chain in reversed(lexical)
        if len(chain) != 1 or next(iter(chain.values())) != 1
    ]
    lexical.clear()
    return survivors

# Normalised scores outside the open interval (threshold_min, threshold_max)
# are discarded by return_frequencies.
threshold_min = 0.1
threshold_max = 0.9

def return_frequencies(words, lexical_chain):
    """Score every non-stopword token and keep the mid-range scores.

    A token that appears in a lexical chain is scored with the sum of the
    counts of the first chain containing it; any other token is scored with
    its occurrence count.  Scores are divided by the maximum score, and
    tokens whose normalised score is >= ``threshold_max`` or
    <= ``threshold_min`` are removed.

    Args:
        words: iterable of token lists (one list per sentence).
        lexical_chain: list of chains (dicts of noun -> count).

    Returns:
        ``defaultdict(int)`` mapping token -> normalised score.  Empty when
        there is nothing to score (previously this raised ``ValueError``
        from ``max()`` on an empty sequence).
    """
    # Resolve each chained word to its chain total once instead of scanning
    # every chain for every token; setdefault keeps the first chain's total.
    chain_total = {}
    for chain in lexical_chain:
        total = sum(chain.values())
        for member in chain:
            chain_total.setdefault(member, total)

    frequencies = defaultdict(int)
    for sent in words:
        for w in sent:
            if w in stops:
                continue
            if w in chain_total:
                frequencies[w] = chain_total[w]
            else:
                frequencies[w] += 1

    if not frequencies:
        return frequencies

    m = float(max(frequencies.values()))
    # Iterate over a snapshot of the keys because entries are deleted.
    for w in list(frequencies):
        frequencies[w] = frequencies[w] / m
        if frequencies[w] >= threshold_max or frequencies[w] <= threshold_min:
            del frequencies[w]
    return frequencies

def summarize(sentence, lexical_chain, n):
    """Pick the ``n`` highest-scoring sentences, in document order.

    Each sentence is lower-cased and tokenised; its score is the sum of the
    normalised frequencies (from ``return_frequencies``) of its tokens.

    Args:
        sentence: list of sentence strings.
        lexical_chain: list of chains (dicts of noun -> count).
        n: number of sentences to select; must not exceed ``len(sentence)``.

    Returns:
        List of the selected sentences ordered by their position in
        ``sentence``.  Empty when no sentence contains a scored token
        (previously this raised ``UnboundLocalError``).
    """
    assert n <= len(sentence)
    word_sentence = [word_tokenize(s.lower()) for s in sentence]
    frequencies = return_frequencies(word_sentence, lexical_chain)
    ranking = defaultdict(int)
    for i, sent in enumerate(word_sentence):
        for word in sent:
            if word in frequencies:
                ranking[i] += frequencies[word]
    # Rank once, after all scores are final; doing it inside the loop
    # repeated the work for every token and left the result undefined when
    # nothing was scored.
    final_index = sorted(rank(ranking, n))
    return [sentence[j] for j in final_index]

def rank(ranking, n):
    """Return the ``n`` keys of ``ranking`` with the largest values.

    Args:
        ranking: mapping of key -> score.
        n: maximum number of keys to return.

    Returns:
        List of keys ordered from highest to lowest score.
    """
    score_of = ranking.get
    best_keys = nlargest(n, ranking, key=score_of)
    return best_keys

# Penn Treebank tags treated as nouns.
position = ['NN', 'NNS', 'NNP', 'NNPS']
tokenizer = RegexpTokenizer(r'\w+')
# Private handle on the word tokenizer.  The public name `tokenizer` is
# rebound to the Pegasus tokenizer in the abstractive section, so the
# function below must not look it up by that name at call time.
_word_tokenizer = tokenizer

def generate_lexical_chain_summary(input_content):
    """Summarise a text with lexical chains.

    The text is split into sentences, its nouns are chained via WordNet
    relations, single-occurrence chains are pruned, and the best-scoring
    sentences are selected: three when the text has at least three
    sentences, otherwise one.

    Args:
        input_content: the text to summarise.

    Returns:
        ``[final_chain, summary_sentences]`` - the pruned chains and the list
        of selected sentences.  Both are empty for text without sentences
        (previously this tripped the assertion in ``summarize``).
    """
    sentence = sent_tokenize(input_content)
    if not sentence:
        return [[], []]
    tokens = [_word_tokenizer.tokenize(s) for s in sentence]
    tagged = [pos_tag(tok) for tok in tokens]
    nouns = [
        word.lower()
        for tagged_sentence in tagged
        for word, pos in tagged_sentence
        if pos in position
    ]
    relation = noun_relations(nouns)
    lexical = generate_lexical_chain(nouns, relation)
    final_chain = prune(lexical)
    n = 3 if len(sentence) >= 3 else 1
    s = summarize(sentence, final_chain, n)
    return [final_chain, s]

In [None]:
def lc_summ(df):
    """Return the lexical-chain summary for one row, or "" on failure.

    Args:
        df: a single row (anything indexable by ``'body'``), as passed by
            ``DataFrame.apply(..., axis=1)``.

    Returns:
        The selected sentences joined by spaces; "" when the body is not a
        string or summarisation fails.
    """
    try:
        body = df['body']
        if not isinstance(body, str):
            return ""
        return " ".join(generate_lexical_chain_summary(body)[1])
    except Exception:
        # Best effort per article.  Catching Exception (not a bare except)
        # lets KeyboardInterrupt / SystemExit stop a long-running apply().
        return ""

In [None]:
# Lexical-chain summary for every article, computed row by row.
df['lc_summ'] = df.apply(lc_summ, axis='columns')

# TF-IDF

In [None]:
def basic_preprocess(self_text):
    """Normalise raw article text into a space-separated string of lemmas.

    Steps: strip HTML tags, lower-case, replace non-letters with spaces,
    drop English stopwords, lemmatize.

    Args:
        self_text: raw text, possibly containing HTML markup.

    Returns:
        The cleaned tokens joined by single spaces.
    """
    # 1. Remove HTML tags.  The parser is named explicitly so the result does
    #    not depend on which optional parsers are installed; the separator
    #    keeps words from adjacent tags apart.
    words = BeautifulSoup(self_text, "html.parser").get_text(separator=" ")
    # 2. Lower-case the tag-free text (previously the raw input was used
    #    here, which threw away the result of step 1).
    words = words.lower()
    # 3. Replace everything that is not a letter with a space, then split.
    words = re.sub("[^a-zA-Z]", " ", words).split()
    # 4. Remove stopwords.
    words = [word for word in words if word not in stops]
    # 5. Lemmatize.
    words = [lemmatizer.lemmatize(w) for w in words]
    # 6. Join the words back into one string, separated by single spaces.
    return " ".join(words)

In [None]:
def top_sentence(input_doc, limit):
    """Return the highest-scoring sentences of a document as one string.

    Keywords are the proper nouns, adjectives, nouns and verbs of the
    preprocessed (lower-cased, lemmatized) text, weighted by their frequency
    relative to the most frequent keyword.  A sentence's strength is the sum
    of the weights of its tokens.

    Args:
        input_doc: raw document text.
        limit: maximum number of sentences to return (at least one sentence
            is returned whenever any sentence scored).

    Returns:
        The selected sentences, strongest first, joined by spaces; "" when
        the document yields no keywords (previously an ``IndexError``).
    """
    # Named keyword_pos rather than pos_tag so the NLTK pos_tag import is
    # not shadowed.
    keyword_pos = ('PROPN', 'ADJ', 'NOUN', 'VERB')
    doc = nlp(input_doc)
    processed_doc = nlp(basic_preprocess(input_doc))

    freq_word = Counter(
        token.text for token in processed_doc if token.pos_ in keyword_pos
    )
    if not freq_word:
        return ''
    max_freq = max(freq_word.values())
    for w in freq_word:
        freq_word[w] = freq_word[w] / max_freq

    sent_strength = {}
    for sent in doc.sents:
        for word in sent:
            # Keywords come from lower-cased text, so the lookup must be
            # lower-cased too; otherwise capitalised tokens never match.
            weight = freq_word.get(word.text.lower())
            if weight is not None:
                sent_strength[sent] = sent_strength.get(sent, 0) + weight

    ranked = sorted(sent_strength.items(), key=lambda kv: kv[1], reverse=True)
    summary = []
    for sent, _strength in ranked[:max(limit, 1)]:
        text = str(sent)
        # Upper-case only the first character; str.capitalize() would also
        # lower-case the rest of the sentence and destroy proper-noun casing.
        summary.append(text[:1].upper() + text[1:])
    return ' '.join(summary)

In [None]:
def tf_idf_summ(df):
    """Return the frequency-based summary for one row, or "" on failure.

    Args:
        df: a single row (anything indexable by ``'body'``), as passed by
            ``DataFrame.apply(..., axis=1)``.

    Returns:
        The top three sentences from ``top_sentence``; "" when the body is
        not a string or summarisation fails.
    """
    try:
        body = df['body']
        if not isinstance(body, str):
            return ""
        return top_sentence(body, 3)
    except Exception:
        # Best effort per article.  Catching Exception (not a bare except)
        # lets KeyboardInterrupt / SystemExit stop a long-running apply().
        return ""

In [None]:
# Frequency-based summary for every article, computed row by row.
df['tf_idf_summ'] = df.apply(tf_idf_summ, axis='columns')

# TextRank

In [None]:
def tr_summ(df):
    """Return the TextRank summary for one row, or "" on failure.

    Args:
        df: a single row (anything indexable by ``'body'``), as passed by
            ``DataFrame.apply(..., axis=1)``.

    Returns:
        Up to three sentences produced by the pipeline's textrank component
        (``limit_phrases=15``), joined by spaces; "" when the body is not a
        string or summarisation fails.
    """
    try:
        body = df['body']
        if not isinstance(body, str):
            return ""
        sentences = nlp(body)._.textrank.summary(limit_phrases = 15, limit_sentences = 3)
        return " ".join(str(sent) for sent in sentences)
    except Exception:
        # Best effort per article.  Catching Exception (not a bare except)
        # lets KeyboardInterrupt / SystemExit stop a long-running apply().
        return ""

In [None]:
# TextRank summary for every article, computed row by row.
df['tr_summ'] = df.apply(tr_summ, axis='columns')

# **Abstractive Summarisation**

# Libraries & Packages

In [None]:
!pip install torch torchvision torchaudio
!pip install transformers
!pip install sentencepiece
from transformers import PegasusForConditionalGeneration, PegasusTokenizer
import torch
import sentencepiece

In [None]:
# Plain Python list of article bodies, fed to the abstractive model below.
bodies = df['body'].tolist()

# Pegasus

In [None]:
# Load the pretrained Pegasus checkpoint and its tokenizer; the model is moved
# to the GPU when CUDA is available, otherwise it stays on the CPU.
# NOTE(review): this rebinds the module-level name `tokenizer`, which the
# lexical-chain section bound to a RegexpTokenizer earlier in the notebook.
model_name = 'google/pegasus-xsum'
torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'
tokenizer = PegasusTokenizer.from_pretrained(model_name)
model = PegasusForConditionalGeneration.from_pretrained(model_name).to(torch_device)

In [None]:
# Generate abstractive summaries in small batches.
# - The tokenizer is called directly; prepare_seq2seq_batch is deprecated.
# - Each encoded batch is moved to the model's device, otherwise generation
#   fails on CUDA with a device-mismatch error.
# - Chunking keeps memory bounded instead of encoding every article at once.
summary_batch_size = 8
output = []
for start in range(0, len(bodies), summary_batch_size):
    batch = tokenizer(
        bodies[start:start + summary_batch_size],
        truncation = True,
        padding = 'longest',
        return_tensors = 'pt',
    ).to(torch_device)
    translated = model.generate(**batch)
    output.extend(tokenizer.batch_decode(translated, skip_special_tokens = True))

In [None]:
# Store the decoded Pegasus summaries next to the extractive ones.
df['ab_summ'] = output

# **Summaries exported to CSV**

In [None]:
# Persist the articles together with all four summary columns.
df.to_csv("articles with summaries.csv")

# **Model Evaluation**

In [None]:
!pip install rouge

In [None]:
from rouge import Rouge

In [None]:
# One list of predictions per summariser; the headlines act as references.
lc_predictions = df['lc_summ'].tolist()
tf_idf_predictions = df['tf_idf_summ'].tolist()
tr_predictions = df['tr_summ'].tolist()
ab_predictions = df['ab_summ'].tolist()
references = df['headline'].tolist()

In [None]:
# Average ROUGE scores of each summariser against the headlines.
# ignore_empty=True skips pairs with an empty prediction or reference: the
# extractive helpers return "" when summarisation fails, and get_scores
# raises on an empty hypothesis otherwise.
eval_metric = Rouge()
eval_results_lc = eval_metric.get_scores(lc_predictions, references, avg = True, ignore_empty = True)
eval_results_tf_idf = eval_metric.get_scores(tf_idf_predictions, references, avg = True, ignore_empty = True)
eval_results_tr = eval_metric.get_scores(tr_predictions, references, avg = True, ignore_empty = True)
eval_results_ab = eval_metric.get_scores(ab_predictions, references, avg = True, ignore_empty = True)

In [None]:
# Averaged ROUGE scores for the lexical-chain summaries.
display(eval_results_lc)

In [None]:
# Averaged ROUGE scores for the frequency-based (TF-IDF section) summaries.
display(eval_results_tf_idf)

In [None]:
# Averaged ROUGE scores for the TextRank summaries.
display(eval_results_tr)

In [None]:
# Averaged ROUGE scores for the abstractive (Pegasus) summaries.
display(eval_results_ab)