# Text Rank

In [2]:
import os
import codecs

corpus_path = "news-corpus//"
article_paths = [os.path.join(corpus_path,p) for p in os.listdir(corpus_path)]

doc_complete = []
for path in article_paths:
    with open(path, 'rb') as f:
        doc_content = f.read().decode(errors='ignore')
        doc_complete.append(doc_content)

In [3]:
import re
for i in range(len(doc_complete)):
    doc_complete[i] = re.sub(r'[^\w\s.]', '', doc_complete[i])

In [4]:
if len(doc_complete) >= 2:
    doc_complete.pop(1)

In [5]:
len(doc_complete)

5

In [6]:
import nltk
from nltk.corpus import stopwords
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lex_rank import LexRankSummarizer

def textrank_summarize(document, num_sentences=3):
    
    # Tokenize the document into sentences
    sentences = nltk.sent_tokenize(document)
    
    # Combine the sentences into a single string
    text = ' '.join(sentences)
    
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    filtered_text = ' '.join(word for word in text.split() if word.lower() not in stop_words)
    
    # Create a parser and tokenizer for the filtered text
    parser = PlaintextParser.from_string(filtered_text, Tokenizer("english"))
    
    # Create a LexRank summarizer
    summarizer = LexRankSummarizer()
    
    # Generate the summary
    summary = summarizer(parser.document, num_sentences)
    
    # Extract the sentences from the summary
    extracted_sentences = [str(sentence) for sentence in summary]
    
    return extracted_sentences


In [7]:
generated_summaries = []
for doc in doc_complete:
    summary = textrank_summarize(doc)
    for sentence in summary:
        generated_summaries.append(summary)
        print(sentence)
    print("===================================================================================================")

large contributor Indians balanced diets.The Government India identified millet safe bet enhance farmers income reliable grain ensure Indias nutritional food security.We largest producer second largest exporter Shri Anna Millets world.
Public perception millets also changed due market increasingly dominated wheat.Pearl millet crop high content protein good fat content good fibre content carbohydrate content besides lot micronutrients especially iron zinc said Dr Suneha Goswami Scientist IARI New Delhi.However realising nutritious value climatic reliance millet production Indian government took upon revive practice adding millet countrys food basket again.
government also began referring millets nutricereals giving image makeover.It efforts India exported USD 64 million millet 2021. remarkable achievement considering India even touched export mark USD 30 million worth millets 2019 2020.India home 20 per cent global millet production staggering 80 per cent contribution Asias millet produ

In [8]:
def extract_text_between_markers(text, start_marker, end_marker):
    extracted_texts = []
    start_index = 0

    while True:
        start_index = text.find(start_marker, start_index)
        if start_index == -1:
            break

        start_index += len(start_marker)
        end_index = text.find(end_marker, start_index)
        if end_index == -1:
            break

        extracted_text = text[start_index:end_index].strip()
        extracted_texts.append(extracted_text)

        start_index = end_index + len(end_marker)

    return extracted_texts

In [9]:
from docx import Document

def read_docx(file_path):
    doc = Document(file_path)
    text = ""
    for paragraph in doc.paragraphs:
        text += paragraph.text + "\n"
    return text

reference_summaries =[]
file_path = "News articles.docx"
text_content = read_docx(file_path)


start_marker = "Summary:"
end_marker = "Top Sentences:"

extracted_texts = extract_text_between_markers(text_content, start_marker, end_marker)
for extracted_text in extracted_texts:
    reference_summaries.append(extracted_text)


In [10]:
reference_summaries

["The Indian government, led by Agriculture Minister Narendra Singh Tomar, has opened the 'Millets Experience Centre' in New Delhi to promote the production and consumption of millets. The center aims to raise awareness about the nutritional benefits of millets and encourage their adoption among the public. Tomar emphasized the health advantages of millets and their climate resilience, highlighting that increased millet production would benefit farmers and startups in the sector. The United Nations has declared 2023 as the International Year of Millets, further positioning India as a leader in promoting this crop.",
 "India is taking initiatives to educate people about the nutritional value of millets and promote their consumption. Despite being rich in protein and antioxidants, millets have not been considered fashionable foods. However, the Indian government has recognized millets as a reliable grain for enhancing farmers' income and ensuring food security. Efforts to revive millet p

# Bleu Scores

In [11]:
from nltk.translate.bleu_score import sentence_bleu

bleu_score = sentence_bleu(generated_summaries[0],reference_summaries[0])
print("BLEU score for Article",":", bleu_score)


BLEU score for Article : 0.5190636567203483


In [12]:
bleu_score = sentence_bleu(generated_summaries[2],reference_summaries[1])
print("BLEU score for Article",":", bleu_score)

BLEU score for Article : 0.562164828296763


In [13]:
bleu_score = sentence_bleu(generated_summaries[3],reference_summaries[2])
print("BLEU score for Article",":", bleu_score)

BLEU score for Article : 0.10843114042916478


In [14]:
bleu_score = sentence_bleu(generated_summaries[1],reference_summaries[3])
print("BLEU score for Article",":", bleu_score)

BLEU score for Article : 0.37687325899401314


In [15]:
bleu_score = sentence_bleu(generated_summaries[4],reference_summaries[4])
print("BLEU score for Article",":", bleu_score)

BLEU score for Article : 0.10063145766311772


In [17]:
type(generated_summaries)

list

# Perplexity

In [25]:
import nltk
from nltk.lm import MLE
from nltk.lm.preprocessing import padded_everygram_pipeline
from nltk.tokenize import word_tokenize

def calculate_perplexity(reference_text, generated_summary):
    # Tokenize the reference text and generated summary
    reference_tokens = word_tokenize(reference_text)
    summary_tokens = word_tokenize(generated_summary)

    # Create training data for the language model
    n = 2# Order of the n-gram language model
    train_data, padded_sents = padded_everygram_pipeline(n, reference_tokens)

    # Train the language model
    model = MLE(n)
    model.fit(train_data, padded_sents)

    # Calculate perplexity
    test_data = list(nltk.ngrams(summary_tokens, n))
    perplexity = model.perplexity(test_data)

    return perplexity


perplexity = calculate_perplexity(str(reference_summaries[0]), str(generated_summaries[0]))
print("Perplexity:", perplexity)


Perplexity: inf
