## Data Preprocessing

In [12]:
import re
import string
import numpy as np
import nltk                         # NLP toolbox
from os import getcwd
import pandas as pd                 # Library for Dataframes 
from nltk.corpus import twitter_samples 
import matplotlib.pyplot as plt     # Library for visualization
import numpy as np                  # Library for math function
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer
def process_sentence(sentence):
    stemmer = PorterStemmer()
    stopwords_english = stopwords.words('english')
    # remove stock market tickers like $GE
    sentence = re.sub(r'\$\w*', '', sentence)
    # remove old style retweet text "RT"
    sentence = re.sub(r'^RT[\s]+', '', sentence)
    # remove hyperlinks
    sentence = re.sub(r'https?:\/\/.*[\r\n]*', '', sentence)
    # remove hashtags
    # only removing the hash # sign from the word
    sentence = re.sub(r'#', '', sentence)
    # tokenize tweets
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True,
                               reduce_len=True)
    sentence_tokens = tokenizer.tokenize(sentence)

    sentence_clean = []
    for word in sentence_tokens:
        if (word not in stopwords_english and  # remove stopwords
                word not in string.punctuation):  # remove punctuation
            # tweets_clean.append(word)
            stem_word = stemmer.stem(word)  # stemming word
            sentence_clean.append(stem_word)

    return sentence_clean


In [13]:
def build_freqs(sentences):
    freqs={}
    for word in process_sentence(sentences):
        if word in freqs:
            freqs[word]+=1
        else:
            freqs[word]=1
    return freqs

In [14]:
import nltk
nltk.download('punkt')
def separate_into_sentences(text):
    

    # Use the sent_tokenize function to split the text into sentences
    sentences = nltk.sent_tokenize(text)
    
    return sentences

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/jaydaksharora/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [15]:
sen = """
The International Society for Krishna Consciousness (ISKCON), known colloquially as the Hare Krishna movement, is a Gaudiya Vaishnava Hindu religious organization. It was founded on 13 July 1966 in New York City by A. C. Bhaktivedanta Swami Prabhupada.[2] Its main headquarters is located today in West Bengal, India.

Its unique form of monotheistic core beliefs are based on Hindu scriptures, particularly Prabhupada's commentaries and translations of the Bhagavad Gita and the Bhagavata Purana. ISKCON is "the largest and, arguably, most important branch" of Gaudiya Vaishnava tradition,[3] which has had adherents in India since the early 16th century and its American and European devotees since the early 1900s.[4] It has around 10 million followers worldwide.[5]

The religious organization practices vegetarianism and was initially formed to spread the practice of Bhakti yoga. Its followers, called bhaktas, dedicate both their thoughts and actions towards pleasing Krishna, whom they consider the Supreme Lord Godhead. They regard the rest of Hindu deities as secondary "demigods".[6] Its most rapid expansion in registered membership has been within India and (after the collapse of the Soviet Union) in Russia and other formerly Soviet-aligned states of Eastern Europe.[7]"""


## 1) Extractive Summarization

In [16]:
freqs=build_freqs(sen)

In [17]:
max_length=max(freqs.values())
for word in freqs.keys():
    freqs[word]=freqs[word]/max_length

In [18]:
sentence_tokens=separate_into_sentences(sen)
sentence_tokens

['\nThe International Society for Krishna Consciousness (ISKCON), known colloquially as the Hare Krishna movement, is a Gaudiya Vaishnava Hindu religious organization.',
 'It was founded on 13 July 1966 in New York City by A. C. Bhaktivedanta Swami Prabhupada.',
 '[2] Its main headquarters is located today in West Bengal, India.',
 "Its unique form of monotheistic core beliefs are based on Hindu scriptures, particularly Prabhupada's commentaries and translations of the Bhagavad Gita and the Bhagavata Purana.",
 'ISKCON is "the largest and, arguably, most important branch" of Gaudiya Vaishnava tradition,[3] which has had adherents in India since the early 16th century and its American and European devotees since the early 1900s.',
 '[4] It has around 10 million followers worldwide.',
 '[5]\n\nThe religious organization practices vegetarianism and was initially formed to spread the practice of Bhakti yoga.',
 'Its followers, called bhaktas, dedicate both their thoughts and actions toward

In [19]:
sentence_scores = {}

for sent in sentence_tokens:
    total_score = 0
    for word in process_sentence(sent):
        if word.lower() in freqs:
            total_score += freqs[word.lower()]
    
    # Normalize the score by sentence length
    sentence_scores[sent] = total_score / len(sent)

print(sentence_scores)


{'\nThe International Society for Krishna Consciousness (ISKCON), known colloquially as the Hare Krishna movement, is a Gaudiya Vaishnava Hindu religious organization.': 0.05284552845528455, 'It was founded on 13 July 1966 in New York City by A. C. Bhaktivedanta Swami Prabhupada.': 0.04166666666666667, '[2] Its main headquarters is located today in West Bengal, India.': 0.05128205128205128, "Its unique form of monotheistic core beliefs are based on Hindu scriptures, particularly Prabhupada's commentaries and translations of the Bhagavad Gita and the Bhagavata Purana.": 0.03558052434456928, 'ISKCON is "the largest and, arguably, most important branch" of Gaudiya Vaishnava tradition,[3] which has had adherents in India since the early 16th century and its American and European devotees since the early 1900s.': 0.045662100456620995, '[4] It has around 10 million followers worldwide.': 0.04761904761904762, '[5]\n\nThe religious organization practices vegetarianism and was initially formed 

In [20]:
summary_len = int(len(sentence_tokens)*0.3)
final_draft=[]
scores=list(sentence_scores.values())
for i in range(summary_len):
    score=max(scores)
    scores.remove(score)
    for sentence in sentence_scores.keys() :
        if sentence_scores[sentence]==score:
            final_draft.append(sentence)

In [21]:
summary = ' '.join(final_draft)

In [22]:
summary

'[7] \nThe International Society for Krishna Consciousness (ISKCON), known colloquially as the Hare Krishna movement, is a Gaudiya Vaishnava Hindu religious organization. [2] Its main headquarters is located today in West Bengal, India.'

In [23]:
len(sen)

1285

In [24]:
len(summary)

234

## Abstractive Summarization

In [25]:
from transformers import PegasusForConditionalGeneration,PegasusTokenizer

In [26]:
import sentencepiece as spm
from transformers import PegasusTokenizer

tokenizer = PegasusTokenizer.from_pretrained('google/pegasus-xsum')


In [27]:
model = PegasusForConditionalGeneration.from_pretrained('google/pegasus-xsum')

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-xsum and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [28]:
tokens=tokenizer(sen,truncation=True, padding="longest",return_tensors="pt")

In [29]:
tokens

{'input_ids': tensor([[  139,  1142,  2377,   118, 19154, 45398,   143,   187, 25275, 15289,
           312,   606, 92382,   130,   109, 32034, 19154,  1815,   108,   117,
           114, 83791,  4518,  7863,  3855, 81559, 14286,  3503,  1134,   107,
           168,   140,  3271,   124,  1428,  1307, 20556,   115,   351,   859,
           672,   141,   202,   107,   597,   107, 84648, 27002, 18363, 36864,
         69373, 50242,   107,  4101, 50558,  3096,   674,  7243,   117,   746,
           380,   115,  1167, 19655,   108,  1144,   107,  3096,   620,   515,
           113, 11325,   544, 10855,  1962,  6621,   127,   451,   124, 14286,
         28690,   108,  1533, 69373, 50242,   131,   116, 39687,   111, 16207,
           113,   109, 94472, 69188,   111,   109, 44885, 52667,   304, 94247,
           107,   125, 25275, 15289,   117,   198,   544,  1368,   111,   108,
         12488,   108,   205,   356,  4444,   194,   113, 83791,  4518,  7863,
          3855, 81559,  3636,   108,  

In [30]:
summary=model.generate(**tokens)

In [31]:
summary

tensor([[    0,   139,  1142,  2377,   118, 19154, 45398,   143,   187, 25275,
         15289,   312,   606, 92382,   130,   109, 32034, 19154,  1815,   108,
           117,   114, 83791,  4518,  7863,  3855, 81559, 14286,  3503,  1134,
           107,     1]])

In [32]:
tokenizer.decode(summary[0])

'<pad>The International Society for Krishna Consciousness (ISKCON), known colloquially as the Hare Krishna movement, is a Gaudiya Vaishnava Hindu religious organization.</s>'

In [33]:
from transformers import pipeline 
summarizer=pipeline("summarization",model=model,tokenizer=tokenizer)

In [36]:
summary = summarizer(sen,max_length=100,min_length=100)
print("Generated summary:\n", summary[0]['summary_text'])


Generated summary:
 The International Society for Krishna Consciousness (ISKCON), known colloquially as the Hare Krishna movement, is a Gaudiya Vaishnava Hindu religious organization. The International Society for Krishna Consciousness (ISKCON), known colloquially as the Hare Krishna movement, was founded on 13 July 1966 in New York City by A. C. Bhaktivedanta Swami Prabhupada, the founder of the International Society for Krishna Consciousness (ISKCON), has died at the age of 100 in the Indian city of Kolkata


In [37]:
len(sen)

1285

In [41]:
len(summary[0]['summary_text'])

494