# Lex Rank

In [1]:
import os
import codecs
import warnings
warnings.filterwarnings("ignore")

corpus_path = "news-corpus//"
article_paths = [os.path.join(corpus_path,p) for p in os.listdir(corpus_path)]

doc_complete = []
for path in article_paths:
    with open(path, 'rb') as f:
        doc_content = f.read().decode(errors='ignore')
        doc_complete.append(doc_content)


In [2]:
import re
for i in range(len(doc_complete)):
    doc_complete[i] = re.sub(r'[^\w\s.]', '', doc_complete[i])

In [3]:
if len(doc_complete) >= 2:
    doc_complete.pop(1)

In [4]:
len(doc_complete)

5

In [5]:
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_distances
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lex_rank import LexRankSummarizer

# nltk.download('punkt')
# nltk.download('stopwords')
# nltk.download('wordnet')

# Preprocess the text data

# if len(doc_complete) >= 2:
#     doc_complete.pop(1)

# Define stopwords and lemmatizer

generated_summaries = []

stopwords = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Function to preprocess a document
def preprocess_document(document):
    # Split the document into sentences
    sentences = sent_tokenize(document)
    
    # Preprocess each sentence
    preprocessed_sentences = []
    for sentence in sentences:
        # Tokenize the sentence
        tokens = word_tokenize(sentence.lower())
        
        # Remove stopwords and punctuation
        tokens = [token for token in tokens if token.isalpha() and token not in stopwords]
        
        # Lemmatize the tokens
        tokens = [lemmatizer.lemmatize(token) for token in tokens]
        
        preprocessed_sentences.append(tokens)
    
    return preprocessed_sentences

# Create a list of preprocessed documents
preprocessed_documents = [preprocess_document(doc) for doc in doc_complete]

# Flatten the list of tokens
flattened_documents = [" ".join(token for sentence in document for token in sentence) for document in preprocessed_documents]

# Vectorize the documents (TF-IDF)
vectorizer = TfidfVectorizer()
vectorized_docs = vectorizer.fit_transform(flattened_documents)

# Perform clustering (K-means)
num_clusters = 5  # Adjust the number of clusters as needed
kmeans = KMeans(n_clusters=num_clusters, init='k-means++', random_state=42)
kmeans.fit(vectorized_docs)
cluster_labels = kmeans.labels_

# Generate summaries for each cluster
for cluster_id in range(num_clusters):
    cluster_docs = [doc_complete[i] for i, label in enumerate(cluster_labels) if label == cluster_id]
    
    # Concatenate the cluster documents
    cluster_text = "\n".join(cluster_docs)
    
    # Create a parser for the cluster text
    parser = PlaintextParser.from_string(cluster_text, Tokenizer("english"))
    
    # Create a LexRank summarizer
    summarizer = LexRankSummarizer()
    
    # Summarize the cluster text
    summary = summarizer(parser.document, 3)  # Adjust the number of sentences for summary
    
    # Extract key points from the summary
    key_points = [str(sentence) for sentence in summary]
    
    # Print the key points for the cluster
    print(f"Cluster {cluster_id + 1} Summary:")
    print("\n".join(key_points))
    generated_summaries.append("".join((key_points)))
    print("\n\n")
    print(f"Cluster {cluster_id + 1} Key Points:")
    for point in key_points:
        print("- " + point)
    print("==========================")


Cluster 1 Summary:
Food regulator FSSAI on Friday said there are 32 new cases where food business operators FBOs have been primafacie found in violation of misleading advertisements and claims.
FSSAI has referred the matter to licensing authorities concerned to issue notices to these FBOs for withdrawing misleading claims or scientifically substantiate claims.
In order to keep a close tab on the claims and advertisements being made by the FBOs on their products Advertisement Monitoring Committee of FSSAI has reported 32 fresh cases which have been found prima facie in contravention of the provisions of Food Safety and Standards Advertisements  Claims Regulations 2018 the regulator said in a statement.



Cluster 1 Key Points:
- Food regulator FSSAI on Friday said there are 32 new cases where food business operators FBOs have been primafacie found in violation of misleading advertisements and claims.
- FSSAI has referred the matter to licensing authorities concerned to issue notices to 

In [6]:
#Generated_summaries = list(generated_summaries)

In [7]:
#Generated_summaries 

In [8]:
#pip install python-docx

In [9]:
def extract_text_between_markers(text, start_marker, end_marker):
    extracted_texts = []
    start_index = 0

    while True:
        start_index = text.find(start_marker, start_index)
        if start_index == -1:
            break

        start_index += len(start_marker)
        end_index = text.find(end_marker, start_index)
        if end_index == -1:
            break

        extracted_text = text[start_index:end_index].strip()
        extracted_texts.append(extracted_text)

        start_index = end_index + len(end_marker)

    return extracted_texts

In [10]:
from docx import Document

def read_docx(file_path):
    doc = Document(file_path)
    text = ""
    for paragraph in doc.paragraphs:
        text += paragraph.text + "\n"
    return text

reference_summaries =[]
file_path = "News articles.docx"
text_content = read_docx(file_path)


start_marker = "Summary:"
end_marker = "Top Sentences:"

extracted_texts = extract_text_between_markers(text_content, start_marker, end_marker)
for extracted_text in extracted_texts:
    reference_summaries.append(extracted_text)


In [11]:
reference_summaries

["The Indian government, led by Agriculture Minister Narendra Singh Tomar, has opened the 'Millets Experience Centre' in New Delhi to promote the production and consumption of millets. The center aims to raise awareness about the nutritional benefits of millets and encourage their adoption among the public. Tomar emphasized the health advantages of millets and their climate resilience, highlighting that increased millet production would benefit farmers and startups in the sector. The United Nations has declared 2023 as the International Year of Millets, further positioning India as a leader in promoting this crop.",
 "India is taking initiatives to educate people about the nutritional value of millets and promote their consumption. Despite being rich in protein and antioxidants, millets have not been considered fashionable foods. However, the Indian government has recognized millets as a reliable grain for enhancing farmers' income and ensuring food security. Efforts to revive millet p

# Blue scores

In [13]:
from nltk.translate.bleu_score import sentence_bleu

bleu_score = sentence_bleu(generated_summaries[1],reference_summaries[0])
print("BLEU score for Article",":", bleu_score)


BLEU score for Article : 8.687066696188942e-232


In [14]:
bleu_score = sentence_bleu(generated_summaries[2],reference_summaries[1])
print("BLEU score for Article",":", bleu_score)


BLEU score for Article : 8.789870675111121e-232


In [15]:
bleu_score = sentence_bleu(generated_summaries[0],reference_summaries[2])
print("BLEU score for Article",":", bleu_score)

BLEU score for Article : 9.186109474684186e-232


In [16]:
bleu_score = sentence_bleu(generated_summaries[4],reference_summaries[3])
print("BLEU score for Article",":", bleu_score)

BLEU score for Article : 8.096973930238044e-232


In [17]:
bleu_score = sentence_bleu(generated_summaries[3],reference_summaries[4])
print("BLEU score for Article",":", bleu_score)

BLEU score for Article : 9.404418754951175e-232


# Lex Rank

In [18]:
import nltk
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lex_rank import LexRankSummarizer

# nltk.download('punkt')
# nltk.download('stopwords')
# nltk.download('wordnet')

# Define stopwords and lemmatizer
stopwords = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Function to preprocess a document
def preprocess_document(document):
    # Split the document into sentences
    sentences = sent_tokenize(document)
    
    # Preprocess each sentence
    preprocessed_sentences = []
    for sentence in sentences:
        # Tokenize the sentence
        tokens = nltk.word_tokenize(sentence.lower())
        
#         # Remove stopwords and punctuation
#         tokens = [token for token in tokens if token.isalpha() and token not in stopwords]
        
        # Lemmatize the tokens
        tokens = [lemmatizer.lemmatize(token) for token in tokens]
        
        preprocessed_sentences.append(" ".join(tokens))
    
    return preprocessed_sentences

# Create a list of preprocessed documents
generated_summaries =[]
preprocessed_documents = [preprocess_document(doc) for doc in doc_complete]

# Generate summaries for each document
for i, document in enumerate(preprocessed_documents):
    # Concatenate the document sentences
    document_text = "\n".join(document)
    
    # Create a parser for the document text
    parser = PlaintextParser.from_string(document_text, Tokenizer("english"))
    
    # Create a LexRank summarizer
    summarizer = LexRankSummarizer()
    
    # Summarize the document text
    summary = summarizer(parser.document, 3)  # Adjust the number of sentences for summary
    
    # Extract key points from the summary
    key_points = [str(sentence) for sentence in summary]
    
    # Print the key points for the document
    print(f"Document {i + 1} Summary:")
    print("\n")
    print("\n".join(key_points))
    generated_summaries.append("".join((key_points)))
    print("==========================")


Document 1 Summary:


at a time when the two extreme of malnourishment and obesity plague large portion of the world india ha taken it upon herself to educate the mass about these smallseeded grass that are highly beneficial to human health.millet and grain cereal despite being rich source of protein and antioxidant with high nutritional value have never been considered fashionable food however india ha done remarkably well when it ha come to meeting the caloric need and demand of her people.india the world largest producer and the world secondlargest exporter of millet are hoping to change the humble millet reputation worldwide.unlike a large part of the rest of the world almost every indian household is acquainted with the taste and the benefit of millet.millets have been a staple of the indian diet especially in rural india for year and remain prevalent even today .
they have been a large contributor to indian balanced diets.the government of india ha identified millet a a safe bet 

# Blue scores

In [19]:
bleu_score = sentence_bleu(generated_summaries[1],reference_summaries[0])
print("BLEU score for Article",":", bleu_score)

BLEU score for Article : 7.998667067471733e-232


In [20]:
bleu_score = sentence_bleu(generated_summaries[2],reference_summaries[1])
print("BLEU score for Article",":", bleu_score)


BLEU score for Article : 8.281277492238267e-232


In [21]:
bleu_score = sentence_bleu(generated_summaries[0],reference_summaries[2])
print("BLEU score for Article",":", bleu_score)

BLEU score for Article : 8.827980943354901e-232


In [22]:
bleu_score = sentence_bleu(generated_summaries[4],reference_summaries[3])
print("BLEU score for Article",":", bleu_score)

BLEU score for Article : 8.096973930238044e-232


In [23]:
bleu_score = sentence_bleu(generated_summaries[3],reference_summaries[4])
print("BLEU score for Article",":", bleu_score)

BLEU score for Article : 8.585020487185389e-232
