<a href="https://colab.research.google.com/github/KarkiAnuj17/Automatic-text-summarization/blob/main/Automatic_text_summarization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import zipfile

zip_ref = zipfile.ZipFile("/content/drive/MyDrive/dataset.zip", 'r')
zip_ref.extractall("/content/dataset")
zip_ref.close()

In [3]:
import cudf
import pandas as pd
import nltk
from tqdm import tqdm
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from nltk.corpus import wordnet


In [4]:
path="/content/dataset/dataset/test.csv"
df=pd.read_csv(path)
print(df.columns)
print(df.shape)

Index(['id', 'article', 'highlights'], dtype='object')
(11490, 3)


In [5]:
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [6]:
def get_wordnet_pos(treebank_tag):
    # Map POS tag to WordNet POS tag
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

def preprocess_text(text):
    # Split text into sentences
    sentences = sent_tokenize(text)

    # Initialize lemmatizer
    lemmatizer = WordNetLemmatizer()

    # Get English stopwords
    stop_words = set(stopwords.words('english'))

    # Preprocess each sentence
    processed_sentences = []
    for sentence in sentences:
        # Tokenize words in the sentence
        words = word_tokenize(sentence)

        # Get POS tags
        pos_tags = pos_tag(words)

        # Lemmatize and remove stopwords
        processed_words = []
        for word, pos in pos_tags:
            if word.lower() not in stop_words and word.isalnum():
                lemmatized_word = lemmatizer.lemmatize(word.lower(), get_wordnet_pos(pos))
                processed_words.append(lemmatized_word)

        # Append processed sentence
        processed_sentences.append(processed_words)

    return processed_sentences

def preprocess_csv(file_path, text_column):
    # Read the CSV file into a cuDF DataFrame
    df = cudf.read_csv(file_path)

    # Convert text column to a pandas Series for processing with NLTK
    text_series = df[text_column].to_pandas()

    # Apply preprocessing to the text column with progress bar
    processed_text = text_series.progress_apply(preprocess_text)

    # Convert the processed text back to a cuDF DataFrame
    df['processed_text'] = cudf.from_pandas(processed_text)

    return df

# Enable the tqdm progress_apply
tqdm.pandas()

file_path = '/content/dataset/dataset/test.csv'
text_column = 'article'
processed_df = preprocess_csv(file_path, text_column)

# Display the processed DataFrame
print(processed_df.head())


100%|██████████| 11490/11490 [08:02<00:00, 23.80it/s]


                                         id  \
0  92c514c913c0bdfe25341af9fd72b29db544099b   
1  2003841c7dc0e7c5b1a248f9cd536d727f27a45a   
2  91b7d2311527f5c2b63a65ca98d21d9c92485149   
3  caabf9cbdf96eb1410295a673e953d304391bfbb   
4  3da746a7d9afcaa659088c8366ef6347fe6b53ea   

                                             article  \
0  Ever noticed how plane seats appear to be gett...   
1  A drunk teenage boy had to be rescued by secur...   
2  Dougie Freedman is on the verge of agreeing a ...   
3  Liverpool target Neto is also wanted by PSG an...   
4  Bruce Jenner will break his silence in a two-h...   

                                          highlights  \
0  Experts question if  packed out planes are put...   
1  Drunk teenage boy climbed into lion enclosure ...   
2  Nottingham Forest are close to extending Dougi...   
3  Fiorentina goalkeeper Neto has been linked wit...   
4  Tell-all interview with the reality TV star, 6...   

                                      proce

In [7]:
import cupy as cp
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm import tqdm

# Cosine Similarity Function
def cosine_similarity_matrix(matrix):
    matrix = cp.array(matrix)
    dot_product = cp.dot(matrix, matrix.T)  # Compute dot product
    norm = cp.linalg.norm(matrix, axis=1)  # Compute norm (magnitude) of each vector
    similarity_matrix = dot_product / (cp.outer(norm, norm) + 1e-10)  # Cosine similarity
    return similarity_matrix

# Vectorization Function using TF-IDF
def vectorize_sentences_with_tfidf(sentences):
    vectorizer = TfidfVectorizer(stop_words='english')  # Optional: Remove stopwords
    term_doc_matrix_cpu = vectorizer.fit_transform(sentences)  # Compute the TF-IDF matrix

    # Convert the sparse matrix to a dense NumPy array
    term_doc_matrix_cpu = term_doc_matrix_cpu.toarray()

    # Convert to CuPy array for GPU acceleration
    term_doc_matrix = cp.array(term_doc_matrix_cpu)

    return term_doc_matrix

# Function to Compute Similarity for Articles
def compute_similarity_for_articles(df, text_column='processed_text'):
    text_series = df[text_column].to_pandas()  # Get text data from DataFrame
    similarity_matrices = []

    # Iterate over each article's processed text
    for index, processed_text in tqdm(text_series.items(), total=len(text_series), desc="Processing Articles"):

        # Ensure processed_text is not empty and is a list of sentences
        if processed_text and isinstance(processed_text, list):
            processed_sentences = [' '.join(sentence) for sentence in processed_text]  # Join words into sentences
        else:
            processed_sentences = []

        # Skip if the processed_sentences list is empty
        if not processed_sentences:
            similarity_matrices.append(None)
            continue

        # Vectorize sentences using TF-IDF approach
        term_doc_matrix = vectorize_sentences_with_tfidf(processed_sentences)

        # Compute the cosine similarity matrix
        similarity_matrix = cosine_similarity_matrix(term_doc_matrix)

        # Convert to CPU (NumPy) array for easier handling if necessary
        similarity_matrix_cpu = cp.asnumpy(similarity_matrix)

        # Store the similarity matrix for the article
        similarity_matrices.append(similarity_matrix_cpu)

    return similarity_matrices

# Enable tqdm for progress bar in pandas apply
tqdm.pandas()

# Assuming you have a DataFrame `processed_df` with a 'processed_text' column
final_similarity_matrices = compute_similarity_for_articles(processed_df, text_column='processed_text')

# Example output for the first article
print("Final Cosine Similarity Matrix for the First Article:\n", final_similarity_matrices[0])


Processing Articles: 100%|██████████| 11490/11490 [00:50<00:00, 226.36it/s]


Final Cosine Similarity Matrix for the First Article:
 [[1.         0.03550455 0.         0.04097448 0.02367313 0.
  0.         0.07210004 0.04264119 0.0403271  0.03092843 0.07493094
  0.11954024 0.         0.02426954 0.06009226]
 [0.03550455 1.         0.         0.03528285 0.02038477 0.
  0.08077285 0.02888779 0.03671804 0.         0.08621036 0.03002202
  0.         0.         0.         0.        ]
 [0.         0.         1.         0.53670032 0.0884582  0.12443423
  0.         0.04675813 0.         0.         0.         0.
  0.         0.         0.02943534 0.        ]
 [0.04097448 0.03528285 0.53670032 1.         0.0548335  0.04404127
  0.         0.07770597 0.04237492 0.         0.0307353  0.0346473
  0.         0.         0.02793048 0.        ]
 [0.02367313 0.02038477 0.0884582  0.0548335  1.         0.21782932
  0.         0.04489486 0.07925059 0.         0.0574819  0.06479821
  0.         0.         0.01613692 0.        ]
 [0.         0.         0.12443423 0.04404127 0.2178293

In [8]:
from sklearn.cluster import AgglomerativeClustering
import numpy as np

# Function to cluster sentences within a document
def cluster_sentences(similarity_matrix, n_clusters=5):
    # Ensure the similarity matrix is NumPy array
    similarity_matrix = np.array(similarity_matrix)

    # Convert similarity matrix to distance matrix (1 - similarity)
    distance_matrix = 1 - similarity_matrix

    # Ensure that the number of clusters does not exceed the number of sentences
    n_clusters = min(n_clusters, distance_matrix.shape[0])

    # Apply Agglomerative Clustering
    clustering_model = AgglomerativeClustering(
        n_clusters=n_clusters, metric='precomputed', linkage='average'
    )
    cluster_labels = clustering_model.fit_predict(distance_matrix)

    return cluster_labels

# Function to cluster sentences for all documents
def cluster_sentences_for_documents(similarity_matrices, n_clusters=5):
    sentence_clusters = []

    for similarity_matrix in similarity_matrices:
        if similarity_matrix is None:
            sentence_clusters.append(None)  # Skip if no similarity matrix
            continue

        # Cluster sentences for the document
        clusters = cluster_sentences(similarity_matrix, n_clusters)
        sentence_clusters.append(clusters)

    return sentence_clusters

# Apply clustering to your computed similarity matrices
n_clusters = 5  # Number of clusters per document (adjust based on data)
sentence_clusters = cluster_sentences_for_documents(final_similarity_matrices, n_clusters=n_clusters)

# Example: Display sentence clusters for the first document
print("Sentence Clusters for the First Document:")
print(sentence_clusters[0])


Sentence Clusters for the First Document:
[2 1 0 0 0 0 4 2 3 3 1 3 2 3 3 3]


In [12]:
import networkx as nx
import itertools
from tqdm import tqdm

# Function to generate a word graph from clustered sentences
def generate_word_graph(clustered_sentences, processed_sentences, threshold=1):
    G = nx.Graph()

    # Iterate through each unique cluster index
    for cluster_idx in set(clustered_sentences):
        # Get sentences belonging to the current cluster
        sentences_in_cluster = [processed_sentences[idx] for idx in range(len(clustered_sentences)) if clustered_sentences[idx] == cluster_idx]

        # Iterate through each sentence in the cluster
        for sentence in sentences_in_cluster:
            # Get the unique words in the sentence
            words = set(sentence)

            # Create edges between all pairs of words in the sentence (undirected graph)
            for word1, word2 in itertools.combinations(words, 2):
                # Add edge with weight (frequency of co-occurrence)
                if G.has_edge(word1, word2):
                    G[word1][word2]['weight'] += 1
                else:
                    G.add_edge(word1, word2, weight=1)

    # Remove edges with weight less than the threshold
    edges_to_remove = [(u, v) for u, v, data in G.edges(data=True) if data['weight'] < threshold]
    G.remove_edges_from(edges_to_remove)

    return G

# Function to process all articles without visualizing the word graph
def process_all_articles(processed_df, sentence_clusters, threshold=1):
    # Loop through all articles and process each article
    for article_idx in tqdm(range(len(processed_df)), desc="Processing Articles"):
        # Get the clustered sentences and processed sentences for the current article
        clustered_sentences = sentence_clusters[article_idx]
        processed_sentences = processed_df['processed_text'].iloc[article_idx]

        # Flatten the processed sentences if needed (flatten each sentence into individual words)
        processed_sentences_flattened = [item for sublist in processed_sentences for item in sublist]

        # Generate word graph for the clustered sentences
        word_graph = generate_word_graph(clustered_sentences, processed_sentences_flattened, threshold)

# Example: Process all articles without visualization
process_all_articles(processed_df, sentence_clusters, threshold=1)


Processing Articles: 100%|██████████| 11490/11490 [00:57<00:00, 201.07it/s]


In [13]:
import networkx as nx
import itertools
from collections import Counter
from tqdm import tqdm  # Import tqdm for progress bar

# Function to generate a word graph from clustered sentences
def generate_word_graph(clustered_sentences, processed_sentences, threshold=1):
    G = nx.Graph()

    # Iterate through each cluster in clustered_sentences
    for cluster_idx in clustered_sentences:
        # Ensure each cluster is a list of sentences
        if isinstance(cluster_idx, list):
            for sentence in cluster_idx:
                # Ensure the sentence is a list of words
                words = set(sentence)

                # Create edges between all pairs of words in the sentence (undirected graph)
                for word1, word2 in itertools.combinations(words, 2):
                    # Add edge with weight (frequency of co-occurrence)
                    if G.has_edge(word1, word2):
                        G[word1][word2]['weight'] += 1
                    else:
                        G.add_edge(word1, word2, weight=1)

    # Remove edges with weight less than the threshold
    edges_to_remove = [(u, v) for u, v, data in G.edges(data=True) if data['weight'] < threshold]
    G.remove_edges_from(edges_to_remove)

    return G


# Function to get the central words based on degree centrality
def get_central_words(word_graph, top_n=10):
    centrality = nx.degree_centrality(word_graph)
    sorted_centrality = sorted(centrality.items(), key=lambda x: x[1], reverse=True)
    central_words = [word for word, _ in sorted_centrality[:top_n]]
    return central_words


# Function to rank sentences based on central word count
def rank_sentences_by_central_words(clustered_sentences, processed_sentences, central_words):
    sentence_scores = []

    for i, sentence in enumerate(processed_sentences):
        central_word_count = sum(1 for word in sentence if word in central_words)
        sentence_scores.append((i, central_word_count))

    # Sort sentences by their centrality (most central words first)
    ranked_sentences = sorted(sentence_scores, key=lambda x: x[1], reverse=True)
    return ranked_sentences


# Function to generate summary based on ranked sentences
def generate_summary(ranked_sentences, processed_sentences, top_n_sentences=5):
    summary_sentences = [processed_sentences[i] for i, _ in ranked_sentences[:top_n_sentences]]
    summary = ' '.join([' '.join(sentence) for sentence in summary_sentences])
    return summary


# Process all articles and generate summaries
def process_all_articles(processed_df, sentence_clusters, top_n_sentences=5, word_graph_threshold=1, central_words_top_n=10):
    all_summaries = []

    # Use tqdm to add a progress bar to the iteration over the articles
    for article_idx in tqdm(range(len(processed_df)), desc="Processing Articles"):
        # Get the clustered sentences for the current article
        clustered_sentences = sentence_clusters[article_idx]

        # Get the processed sentences for the current article
        processed_sentences = processed_df['processed_text'].iloc[article_idx]

        # Generate word graph for the clustered sentences
        word_graph = generate_word_graph(clustered_sentences, processed_sentences, threshold=word_graph_threshold)

        # Get the central words from the word graph
        central_words = get_central_words(word_graph, top_n=central_words_top_n)

        # Rank sentences by their central word count
        ranked_sentences = rank_sentences_by_central_words(clustered_sentences, processed_sentences, central_words)

        # Generate the summary for the current article
        summary = generate_summary(ranked_sentences, processed_sentences, top_n_sentences)

        all_summaries.append(summary)

    return all_summaries


# Example: Generate summaries for all articles in the dataset
all_summaries = process_all_articles(processed_df, sentence_clusters, top_n_sentences=5, word_graph_threshold=1, central_words_top_n=10)

# Print the generated summary for the first article
print("Generated Summary for the First Article:")
print(all_summaries[0])  # Only print the summary for the first article


Processing Articles: 100%|██████████| 11490/11490 [00:54<00:00, 209.50it/s]

Generated Summary for the First Article:
ever notice plane seat appear get small small increase number people take sky expert question pack plane put passenger risk say shrink space aeroplane uncomfortable put health safety danger squabbling arm rest shrink space plane put health safety danger week consumer advisory group set department transportation say public hearing government happy set standard animal fly plane stipulate minimum amount space human





In [14]:
from tqdm import tqdm

# Function to reconstruct and improve sentence formatting
def reconstruct_summary(ranked_sentences, processed_sentences, top_n_sentences=5):
    # Check the number of available ranked sentences and limit the selection to that number
    max_sentences = min(top_n_sentences, len(ranked_sentences))

    # Ensure we don't exceed the length of processed_sentences
    summary_sentences = []
    for i, _ in ranked_sentences[:max_sentences]:
        if i < len(processed_sentences):  # Make sure the index is within range
            summary_sentences.append(processed_sentences[i])

    # Ensure each sentence is well-formed: capitalize and add a period if necessary
    summary = []
    for sentence in summary_sentences:
        sentence = ' '.join(sentence)  # Join the words in the sentence
        sentence = sentence.capitalize()  # Capitalize the first letter
        if not sentence.endswith('.'):  # Add period at the end if missing
            sentence += '.'
        summary.append(sentence)

    # Join the summary sentences into one final summary text
    return ' '.join(summary)

# Function to process all articles in the dataset
def process_all_articles(processed_df, sentence_clusters, top_n_sentences=5, word_graph_threshold=1, central_words_top_n=10):
    all_summaries = []

    for article_idx in tqdm(range(len(processed_df))):  # Use tqdm to display progress for all articles
        # Extract clustered sentences and processed sentences for the current article
        clustered_sentences = sentence_clusters[article_idx]
        processed_sentences = processed_df['processed_text'].iloc[article_idx]

        # Flatten the processed sentences
        processed_sentences_flattened = [item for sublist in processed_sentences for item in sublist]

        # Generate word graph for the clustered sentences
        word_graph = generate_word_graph(clustered_sentences, processed_sentences_flattened, threshold=word_graph_threshold)

        # Get central words from the word graph
        central_words = get_central_words(word_graph, top_n=central_words_top_n)

        # Rank sentences based on central words
        ranked_sentences = rank_sentences_by_central_words(clustered_sentences, processed_sentences_flattened, central_words)

        # Reconstruct summary for the current article
        summary = reconstruct_summary(ranked_sentences, processed_sentences, top_n_sentences)
        all_summaries.append(summary)

    return all_summaries

# Example: Process all articles in the dataset and get the summary for the first article
all_summaries = process_all_articles(processed_df, sentence_clusters, top_n_sentences=5, word_graph_threshold=1, central_words_top_n=10)

# Print the summary for the first article
print("Generated Summary for the First Article:")
print(all_summaries[0])


100%|██████████| 11490/11490 [00:55<00:00, 208.44it/s]


Generated Summary for the First Article:
Ever notice plane seat appear get small small. Increase number people take sky expert question pack plane put passenger risk. Say shrink space aeroplane uncomfortable put health safety danger. Squabbling arm rest shrink space plane put health safety danger. Week consumer advisory group set department transportation say public hearing government happy set standard animal fly plane stipulate minimum amount space human.


In [18]:
import locale
def getpreferredencoding(do_setlocale = True):
    return "UTF-8"
locale.getpreferredencoding = getpreferredencoding
!sudo apt-get update
!sudo apt-get install -y locales
!sudo locale-gen en_US.UTF-8
!sudo update-locale LANG=en_US.UTF-8
import locale
locale.setlocale(locale.LC_ALL, 'en_US.UTF-8')
!pip install rouge-score

0% [Working]            Get:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,626 B]
0% [Connecting to archive.ubuntu.com] [Connecting to security.ubuntu.com (185.10% [Connecting to archive.ubuntu.com] [Connecting to security.ubuntu.com (185.1                                                                               Get:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease [1,581 B]
0% [Connecting to archive.ubuntu.com] [Connecting to security.ubuntu.com (185.10% [Connecting to archive.ubuntu.com] [Connecting to security.ubuntu.com (185.10% [Connecting to archive.ubuntu.com (91.189.91.82)] [Waiting for headers] [Wai                                                                               Get:3 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Get:4 https://r2u.stat.illinois.edu/ubuntu jammy InRelease [6,555 B]
Get:5 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64 

In [23]:
import numpy as np
from tqdm import tqdm
from rouge_score import rouge_scorer

# Function to calculate ROUGE scores
def calculate_rouge_score(generated_summary, reference_summary):
    scorer = rouge_scorer.RougeScorer(["rouge1", "rouge2", "rougeL"], use_stemmer=True)
    scores = scorer.score(reference_summary, generated_summary)
    return scores

# Function to calculate the mean ROUGE scores across all articles
def calculate_mean_rouge_scores(all_rouge_scores):
    # Initialize arrays to store scores for ROUGE-1, ROUGE-2, ROUGE-L
    rouge1_precision = []
    rouge1_recall = []
    rouge1_fmeasure = []

    rouge2_precision = []
    rouge2_recall = []
    rouge2_fmeasure = []

    rougel_precision = []
    rougel_recall = []
    rougel_fmeasure = []

    # Loop through the ROUGE scores for each article and extract individual metrics
    for rouge_scores in all_rouge_scores:
        rouge1_precision.append(rouge_scores['rouge1'].precision)
        rouge1_recall.append(rouge_scores['rouge1'].recall)
        rouge1_fmeasure.append(rouge_scores['rouge1'].fmeasure)

        rouge2_precision.append(rouge_scores['rouge2'].precision)
        rouge2_recall.append(rouge_scores['rouge2'].recall)
        rouge2_fmeasure.append(rouge_scores['rouge2'].fmeasure)

        rougel_precision.append(rouge_scores['rougeL'].precision)
        rougel_recall.append(rouge_scores['rougeL'].recall)
        rougel_fmeasure.append(rouge_scores['rougeL'].fmeasure)

    # Calculate the mean of each metric
    mean_rouge1_precision = np.mean(rouge1_precision)
    mean_rouge1_recall = np.mean(rouge1_recall)
    mean_rouge1_fmeasure = np.mean(rouge1_fmeasure)

    mean_rouge2_precision = np.mean(rouge2_precision)
    mean_rouge2_recall = np.mean(rouge2_recall)
    mean_rouge2_fmeasure = np.mean(rouge2_fmeasure)

    mean_rougel_precision = np.mean(rougel_precision)
    mean_rougel_recall = np.mean(rougel_recall)
    mean_rougel_fmeasure = np.mean(rougel_fmeasure)

    # Return the mean scores
    return {
        'rouge1': {'precision': mean_rouge1_precision, 'recall': mean_rouge1_recall, 'fmeasure': mean_rouge1_fmeasure},
        'rouge2': {'precision': mean_rouge2_precision, 'recall': mean_rouge2_recall, 'fmeasure': mean_rouge2_fmeasure},
        'rougeL': {'precision': mean_rougel_precision, 'recall': mean_rougel_recall, 'fmeasure': mean_rougel_fmeasure}
    }

# Function to process all articles and calculate ROUGE scores
def process_all_articles_and_evaluate(processed_df, sentence_clusters, highlights, top_n_sentences=5, word_graph_threshold=1, central_words_top_n=10):
    all_summaries = []
    all_rouge_scores = []

    for article_idx in tqdm(range(len(processed_df))):  # Iterate over all articles
        # Extract data for the current article
        clustered_sentences = sentence_clusters[article_idx]
        processed_sentences = processed_df['processed_text'].iloc[article_idx]

        # Flatten the processed sentences
        processed_sentences_flattened = [item for sublist in processed_sentences for item in sublist]

        # Generate word graph for clustered sentences
        word_graph = generate_word_graph(clustered_sentences, processed_sentences_flattened, threshold=word_graph_threshold)

        # Get central words and rank sentences by those central words
        central_words = get_central_words(word_graph, top_n=central_words_top_n)
        ranked_sentences = rank_sentences_by_central_words(clustered_sentences, processed_sentences_flattened, central_words)

        # Reconstruct summary for the current article
        generated_summary = reconstruct_summary(ranked_sentences, processed_sentences, top_n_sentences)
        all_summaries.append(generated_summary)

        # Get the reference summary from the highlights of the article
        reference_summary = highlights[article_idx]

        # Calculate ROUGE scores between generated and reference summaries
        rouge_scores = calculate_rouge_score(generated_summary, reference_summary)
        all_rouge_scores.append(rouge_scores)

    # Calculate mean ROUGE scores
    mean_rouge_scores = calculate_mean_rouge_scores(all_rouge_scores)

    return all_summaries, all_rouge_scores, mean_rouge_scores

# Example: Process all articles and calculate ROUGE scores for each article
# Assuming highlights is already a list of reference summaries (e.g., from processed_df['highlights'])
highlights = processed_df['highlights'].to_arrow().to_pylist()  # Adjust according to your DataFrame structure
all_summaries, all_rouge_scores, mean_rouge_scores = process_all_articles_and_evaluate(processed_df, sentence_clusters, highlights, top_n_sentences=5, word_graph_threshold=1, central_words_top_n=10)

# Print mean ROUGE scores
print("Mean ROUGE Scores across all articles:")
print("ROUGE-1: Precision: {:.4f}, Recall: {:.4f}, F-Measure: {:.4f}".format(
    mean_rouge_scores['rouge1']['precision'],
    mean_rouge_scores['rouge1']['recall'],
    mean_rouge_scores['rouge1']['fmeasure']
))
print("ROUGE-2: Precision: {:.4f}, Recall: {:.4f}, F-Measure: {:.4f}".format(
    mean_rouge_scores['rouge2']['precision'],
    mean_rouge_scores['rouge2']['recall'],
    mean_rouge_scores['rouge2']['fmeasure']
))
print("ROUGE-L: Precision: {:.4f}, Recall: {:.4f}, F-Measure: {:.4f}".format(
    mean_rouge_scores['rougeL']['precision'],
    mean_rouge_scores['rougeL']['recall'],
    mean_rouge_scores['rougeL']['fmeasure']
))


100%|██████████| 11490/11490 [01:41<00:00, 112.73it/s]


Mean ROUGE Scores across all articles:
ROUGE-1: Precision: 0.2621, Recall: 0.3365, F-Measure: 0.2840
ROUGE-2: Precision: 0.0774, Recall: 0.1014, F-Measure: 0.0845
ROUGE-L: Precision: 0.1820, Recall: 0.2360, F-Measure: 0.1980
