<a href="https://colab.research.google.com/github/KarkiAnuj17/Automatic-text-summarization/blob/main/Automatic_text_summarization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import zipfile

zip_ref = zipfile.ZipFile("/content/drive/MyDrive/dataset.zip", 'r')
zip_ref.extractall("/content/dataset")
zip_ref.close()

In [3]:
import pandas as pd
import nltk
from tqdm import tqdm
import re
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk import pos_tag
import cudf

In [4]:
path="/content/dataset/dataset/test.csv"
df=pd.read_csv(path)
print(df.columns)
print(df.shape)

Index(['id', 'article', 'highlights'], dtype='object')
(11490, 3)


In [5]:
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [6]:
def get_wordnet_pos(treebank_tag):
    # Map POS tag to WordNet POS tag
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

def preprocess_text(text):
    # Split text into sentences
    sentences = sent_tokenize(text)

    # Initialize lemmatizer
    lemmatizer = WordNetLemmatizer()

    # Get English stopwords
    stop_words = set(stopwords.words('english'))

    # Preprocess each sentence
    processed_sentences = []
    for sentence in sentences:
        # Tokenize words in the sentence
        words = word_tokenize(sentence)

        # Get POS tags
        pos_tags = pos_tag(words)

        # Lemmatize and remove stopwords
        processed_words = []
        for word, pos in pos_tags:
            if word.lower() not in stop_words and word.isalnum():
                lemmatized_word = lemmatizer.lemmatize(word.lower(), get_wordnet_pos(pos))
                processed_words.append(lemmatized_word)

        # Append processed sentence
        processed_sentences.append(processed_words)

    return processed_sentences

def preprocess_csv(file_path, text_column):
    # Read the CSV file into a cuDF DataFrame
    df = cudf.read_csv(file_path)

    # Convert text column to a pandas Series for processing with NLTK
    text_series = df[text_column].to_pandas()

    # Apply preprocessing to the text column with progress bar
    processed_text = text_series.progress_apply(preprocess_text)

    # Convert the processed text back to a cuDF DataFrame
    df['processed_text'] = cudf.from_pandas(processed_text)

    return df

# Enable the tqdm progress_apply
tqdm.pandas()

file_path = '/content/dataset/dataset/test.csv'
text_column = 'article'
processed_df = preprocess_csv(file_path, text_column)

# Display the processed DataFrame
print(processed_df.head())


100%|██████████| 11490/11490 [08:10<00:00, 23.41it/s]


                                         id  \
0  92c514c913c0bdfe25341af9fd72b29db544099b   
1  2003841c7dc0e7c5b1a248f9cd536d727f27a45a   
2  91b7d2311527f5c2b63a65ca98d21d9c92485149   
3  caabf9cbdf96eb1410295a673e953d304391bfbb   
4  3da746a7d9afcaa659088c8366ef6347fe6b53ea   

                                             article  \
0  Ever noticed how plane seats appear to be gett...   
1  A drunk teenage boy had to be rescued by secur...   
2  Dougie Freedman is on the verge of agreeing a ...   
3  Liverpool target Neto is also wanted by PSG an...   
4  Bruce Jenner will break his silence in a two-h...   

                                          highlights  \
0  Experts question if  packed out planes are put...   
1  Drunk teenage boy climbed into lion enclosure ...   
2  Nottingham Forest are close to extending Dougi...   
3  Fiorentina goalkeeper Neto has been linked wit...   
4  Tell-all interview with the reality TV star, 6...   

                                      proce

In [7]:
import cupy as cp
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm import tqdm

# Cosine Similarity Function
def cosine_similarity_matrix(matrix):
    matrix = cp.array(matrix)
    dot_product = cp.dot(matrix, matrix.T)  # Compute dot product
    norm = cp.linalg.norm(matrix, axis=1)  # Compute norm (magnitude) of each vector
    similarity_matrix = dot_product / (cp.outer(norm, norm) + 1e-10)  # Cosine similarity
    return similarity_matrix

# Vectorization Function using TF-IDF
def vectorize_sentences_with_tfidf(sentences):
    vectorizer = TfidfVectorizer(stop_words='english')  # Optional: Remove stopwords
    term_doc_matrix_cpu = vectorizer.fit_transform(sentences)  # Compute the TF-IDF matrix

    # Convert the sparse matrix to a dense NumPy array
    term_doc_matrix_cpu = term_doc_matrix_cpu.toarray()

    # Convert to CuPy array for GPU acceleration
    term_doc_matrix = cp.array(term_doc_matrix_cpu)

    return term_doc_matrix

# Function to Compute Similarity for Articles
def compute_similarity_for_articles(df, text_column='processed_text'):
    text_series = df[text_column].to_pandas()  # Get text data from DataFrame
    similarity_matrices = []

    # Iterate over each article's processed text
    for index, processed_text in tqdm(text_series.items(), total=len(text_series), desc="Processing Articles"):

        # Ensure processed_text is not empty and is a list of sentences
        if processed_text and isinstance(processed_text, list):
            processed_sentences = [' '.join(sentence) for sentence in processed_text]  # Join words into sentences
        else:
            processed_sentences = []

        # Skip if the processed_sentences list is empty
        if not processed_sentences:
            similarity_matrices.append(None)
            continue

        # Vectorize sentences using TF-IDF approach
        term_doc_matrix = vectorize_sentences_with_tfidf(processed_sentences)

        # Compute the cosine similarity matrix
        similarity_matrix = cosine_similarity_matrix(term_doc_matrix)

        # Convert to CPU (NumPy) array for easier handling if necessary
        similarity_matrix_cpu = cp.asnumpy(similarity_matrix)

        # Store the similarity matrix for the article
        similarity_matrices.append(similarity_matrix_cpu)

    return similarity_matrices

# Enable tqdm for progress bar in pandas apply
tqdm.pandas()

# Assuming you have a DataFrame `processed_df` with a 'processed_text' column
final_similarity_matrices = compute_similarity_for_articles(processed_df, text_column='processed_text')

# Example output for the first article
print("Final Cosine Similarity Matrix for the First Article:\n", final_similarity_matrices[0])

Processing Articles: 100%|██████████| 11490/11490 [00:54<00:00, 210.32it/s]


Final Cosine Similarity Matrix for the First Article:
 [[1.         0.03550455 0.         0.04097448 0.02367313 0.
  0.         0.07210004 0.04264119 0.0403271  0.03092843 0.07493094
  0.11954024 0.         0.02426954 0.06009226]
 [0.03550455 1.         0.         0.03528285 0.02038477 0.
  0.08077285 0.02888779 0.03671804 0.         0.08621036 0.03002202
  0.         0.         0.         0.        ]
 [0.         0.         1.         0.53670032 0.0884582  0.12443423
  0.         0.04675813 0.         0.         0.         0.
  0.         0.         0.02943534 0.        ]
 [0.04097448 0.03528285 0.53670032 1.         0.0548335  0.04404127
  0.         0.07770597 0.04237492 0.         0.0307353  0.0346473
  0.         0.         0.02793048 0.        ]
 [0.02367313 0.02038477 0.0884582  0.0548335  1.         0.21782932
  0.         0.04489486 0.07925059 0.         0.0574819  0.06479821
  0.         0.         0.01613692 0.        ]
 [0.         0.         0.12443423 0.04404127 0.2178293

In [None]:
import networkx as nx
import numpy as np

# Function to build similarity graph
def build_similarity_graph(sentences, similarity_matrix):
    graph = nx.Graph()
    num_sentences = len(sentences)

    # Add nodes for all sentences
    graph.add_nodes_from(range(num_sentences))

    # Add edges for non-zero similarity scores
    for i in range(num_sentences):
        for j in range(i + 1, num_sentences):
            if similarity_matrix[i, j] > 0:  # Add edge if similarity > 0
                graph.add_edge(i, j, weight=similarity_matrix[i, j])
    return graph

# TextRank implementation
def textrank_summary(similarity_matrix, sentences, top_n=3):
    # Build graph from similarity matrix
    graph = build_similarity_graph(sentences, similarity_matrix)

    # Apply PageRank algorithm to rank sentences
    scores = nx.pagerank(graph)

    # Assign default score of 0 for sentences not in the graph
    scores = {i: scores.get(i, 0) for i in range(len(sentences))}

    # Sort sentences by score
    ranked_sentences = sorted(((scores[i], s) for i, s in enumerate(sentences)), reverse=True)

    # Extract top-ranked sentences
    top_sentences = [sent for _, sent in ranked_sentences[:top_n]]

    # Reorder sentences based on original position for coherence
    ordered_sentences = sorted(top_sentences, key=lambda s: sentences.index(s))

    # Reconstruct the summary
    summary = ' '.join(ordered_sentences)
    return summary

# Generate summaries for all articles
def generate_summaries(df, similarity_matrices, top_n=3):
    summaries = []
    for i, processed_text in enumerate(df['processed_text'].to_pandas()):
        if processed_text and similarity_matrices[i] is not None:
            sentences = [' '.join(sentence) for sentence in processed_text]
            similarity_matrix = similarity_matrices[i]
            summary = textrank_summary(similarity_matrix, sentences, top_n)
        else:
            summary = None
        summaries.append(summary)
    return summaries

# Apply summary generation
summaries = generate_summaries(processed_df, final_similarity_matrices)

# Example output for the first article
print("Summary for the First Article:\n", summaries[0])

Summary for the First Article:
 test conduct faa use plane 31 inch pitch standard airline decrease test conduct use plane 31 inch row seat standard airline decrease report detroit news united airline 30 inch space gulf air economy seat 29 32 inch air asia offer 29 inch spirit airline offer 28 inch


In [None]:
from rouge_score import rouge_scorer
import numpy as np

# Function to calculate ROUGE scores
def calculate_rouge(reference_summaries, generated_summaries):
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

    rouge1_scores = []
    rouge2_scores = []
    rougeL_scores = []

    for reference, generated in zip(reference_summaries, generated_summaries):
        # Compute ROUGE scores for each pair of reference and generated summary
        scores = scorer.score(reference, generated)

        # Store individual ROUGE scores
        rouge1_scores.append(scores['rouge1'])
        rouge2_scores.append(scores['rouge2'])
        rougeL_scores.append(scores['rougeL'])

    # Calculate the mean ROUGE scores
    mean_rouge1 = {
        'precision': np.mean([score.precision for score in rouge1_scores]),
        'recall': np.mean([score.recall for score in rouge1_scores]),
        'fmeasure': np.mean([score.fmeasure for score in rouge1_scores])
    }

    mean_rouge2 = {
        'precision': np.mean([score.precision for score in rouge2_scores]),
        'recall': np.mean([score.recall for score in rouge2_scores]),
        'fmeasure': np.mean([score.fmeasure for score in rouge2_scores])
    }

    mean_rougeL = {
        'precision': np.mean([score.precision for score in rougeL_scores]),
        'recall': np.mean([score.recall for score in rougeL_scores]),
        'fmeasure': np.mean([score.fmeasure for score in rougeL_scores])
    }

    return mean_rouge1, mean_rouge2, mean_rougeL

# Step 1: Generate summaries using TextRank
summaries = generate_summaries(processed_df, final_similarity_matrices)

# Step 2: Extract reference summaries (assuming 'highlights' column contains them)
reference_summaries = processed_df['highlights'].to_pandas()

# Step 3: Calculate ROUGE scores for the generated summaries
mean_rouge1, mean_rouge2, mean_rougeL = calculate_rouge(reference_summaries, summaries)

# Step 4: Print the mean ROUGE scores
print("Mean ROUGE-1 Scores:", mean_rouge1)
print("Mean ROUGE-2 Scores:", mean_rouge2)
print("Mean ROUGE-L Scores:", mean_rougeL)


Mean ROUGE-1 Scores: {'precision': 0.27874620980722675, 'recall': 0.23936717390104828, 'fmeasure': 0.24657258770837462}
Mean ROUGE-2 Scores: {'precision': 0.07585856244601576, 'recall': 0.06637691629397806, 'fmeasure': 0.06779815066016259}
Mean ROUGE-L Scores: {'precision': 0.2030583887684702, 'recall': 0.17502554704126508, 'fmeasure': 0.17982591290404076}


In [None]:
from sklearn.cluster import AgglomerativeClustering
import numpy as np

# Function to cluster sentences within a document
def cluster_sentences(similarity_matrix, n_clusters=5):
    # Ensure the similarity matrix is NumPy array
    similarity_matrix = np.array(similarity_matrix)

    # Convert similarity matrix to distance matrix (1 - similarity)
    distance_matrix = 1 - similarity_matrix

    # Ensure that the number of clusters does not exceed the number of sentences
    n_clusters = min(n_clusters, distance_matrix.shape[0])

    # Apply Agglomerative Clustering
    clustering_model = AgglomerativeClustering(
        n_clusters=n_clusters, metric='precomputed', linkage='average'
    )
    cluster_labels = clustering_model.fit_predict(distance_matrix)

    return cluster_labels

# Function to cluster sentences for all documents
def cluster_sentences_for_documents(similarity_matrices, n_clusters=5):
    sentence_clusters = []

    for similarity_matrix in similarity_matrices:
        if similarity_matrix is None:
            sentence_clusters.append(None)  # Skip if no similarity matrix
            continue

        # Cluster sentences for the document
        clusters = cluster_sentences(similarity_matrix, n_clusters)
        sentence_clusters.append(clusters)

    return sentence_clusters

# Apply clustering to your computed similarity matrices
n_clusters = 5  # Number of clusters per document (adjust based on data)
sentence_clusters = cluster_sentences_for_documents(final_similarity_matrices, n_clusters=n_clusters)

# Example: Display sentence clusters for the first document
print("Sentence Clusters for the First Document:")
print(sentence_clusters[0])

Sentence Clusters for the First Document:
[2 1 0 0 0 0 4 2 3 3 1 3 2 3 3 3]


In [None]:
import networkx as nx
import itertools
from tqdm import tqdm

# Function to generate a word graph from clustered sentences
def generate_word_graph(clustered_sentences, processed_sentences, threshold=1):
    G = nx.Graph()

    # Iterate through each unique cluster index
    for cluster_idx in set(clustered_sentences):
        # Get sentences belonging to the current cluster
        sentences_in_cluster = [processed_sentences[idx] for idx in range(len(clustered_sentences)) if clustered_sentences[idx] == cluster_idx]

        # Iterate through each sentence in the cluster
        for sentence in sentences_in_cluster:
            # Get the unique words in the sentence
            words = set(sentence)

            # Create edges between all pairs of words in the sentence (undirected graph)
            for word1, word2 in itertools.combinations(words, 2):
                # Add edge with weight (frequency of co-occurrence)
                if G.has_edge(word1, word2):
                    G[word1][word2]['weight'] += 1
                else:
                    G.add_edge(word1, word2, weight=1)

    # Remove edges with weight less than the threshold
    edges_to_remove = [(u, v) for u, v, data in G.edges(data=True) if data['weight'] < threshold]
    G.remove_edges_from(edges_to_remove)

    return G

# Function to process all articles without visualizing the word graph
def process_all_articles(processed_df, sentence_clusters, threshold=1):
    # Loop through all articles and process each article
    for article_idx in tqdm(range(len(processed_df)), desc="Processing Articles"):
        # Get the clustered sentences and processed sentences for the current article
        clustered_sentences = sentence_clusters[article_idx]
        processed_sentences = processed_df['processed_text'].iloc[article_idx]

        # Flatten the processed sentences if needed (flatten each sentence into individual words)
        processed_sentences_flattened = [item for sublist in processed_sentences for item in sublist]

        # Generate word graph for the clustered sentences
        word_graph = generate_word_graph(clustered_sentences, processed_sentences_flattened, threshold)

# Example: Process all articles without visualization
process_all_articles(processed_df, sentence_clusters, threshold=1)


Processing Articles: 100%|██████████| 11490/11490 [00:57<00:00, 199.95it/s]


In [None]:
import networkx as nx
import itertools
from collections import Counter
from tqdm import tqdm  # Import tqdm for progress bar

# Function to generate a word graph from clustered sentences
def generate_word_graph(clustered_sentences, processed_sentences, threshold=1):
    G = nx.Graph()

    # Iterate through each cluster in clustered_sentences
    for cluster_idx in clustered_sentences:
        # Ensure each cluster is a list of sentences
        if isinstance(cluster_idx, list):
            for sentence in cluster_idx:
                # Ensure the sentence is a list of words
                words = set(sentence)

                # Create edges between all pairs of words in the sentence (undirected graph)
                for word1, word2 in itertools.combinations(words, 2):
                    # Add edge with weight (frequency of co-occurrence)
                    if G.has_edge(word1, word2):
                        G[word1][word2]['weight'] += 1
                    else:
                        G.add_edge(word1, word2, weight=1)

    # Remove edges with weight less than the threshold
    edges_to_remove = [(u, v) for u, v, data in G.edges(data=True) if data['weight'] < threshold]
    G.remove_edges_from(edges_to_remove)

    return G


# Function to get the central words based on degree centrality
def get_central_words(word_graph, top_n=10):
    centrality = nx.degree_centrality(word_graph)
    sorted_centrality = sorted(centrality.items(), key=lambda x: x[1], reverse=True)
    central_words = [word for word, _ in sorted_centrality[:top_n]]
    return central_words


# Function to rank sentences based on central word count
def rank_sentences_by_central_words(clustered_sentences, processed_sentences, central_words):
    sentence_scores = []

    for i, sentence in enumerate(processed_sentences):
        central_word_count = sum(1 for word in sentence if word in central_words)
        sentence_scores.append((i, central_word_count))

    # Sort sentences by their centrality (most central words first)
    ranked_sentences = sorted(sentence_scores, key=lambda x: x[1], reverse=True)
    return ranked_sentences


# Function to generate summary based on ranked sentences
def generate_summary(ranked_sentences, processed_sentences, top_n_sentences=5):
    summary_sentences = [processed_sentences[i] for i, _ in ranked_sentences[:top_n_sentences]]
    summary = ' '.join([' '.join(sentence) for sentence in summary_sentences])
    return summary


# Process all articles and generate summaries
def process_all_articles(processed_df, sentence_clusters, top_n_sentences=7, word_graph_threshold=1, central_words_top_n=10):
    all_summaries = []

    # Use tqdm to add a progress bar to the iteration over the articles
    for article_idx in tqdm(range(len(processed_df)), desc="Processing Articles"):
        # Get the clustered sentences for the current article
        clustered_sentences = sentence_clusters[article_idx]

        # Get the processed sentences for the current article
        processed_sentences = processed_df['processed_text'].iloc[article_idx]

        # Generate word graph for the clustered sentences
        word_graph = generate_word_graph(clustered_sentences, processed_sentences, threshold=word_graph_threshold)

        # Get the central words from the word graph
        central_words = get_central_words(word_graph, top_n=central_words_top_n)

        # Rank sentences by their central word count
        ranked_sentences = rank_sentences_by_central_words(clustered_sentences, processed_sentences, central_words)

        # Generate the summary for the current article
        summary = generate_summary(ranked_sentences, processed_sentences, top_n_sentences)

        all_summaries.append(summary)

    return all_summaries


# Example: Generate summaries for all articles in the dataset
all_summaries = process_all_articles(processed_df, sentence_clusters, top_n_sentences=5, word_graph_threshold=1, central_words_top_n=10)

# Print the generated summary for the first article
print("Generated Summary for the First Article:")
print(all_summaries[0])  # Only print the summary for the first article


Processing Articles: 100%|██████████| 11490/11490 [00:49<00:00, 230.82it/s]

Generated Summary for the First Article:
ever notice plane seat appear get small small increase number people take sky expert question pack plane put passenger risk say shrink space aeroplane uncomfortable put health safety danger squabbling arm rest shrink space plane put health safety danger week consumer advisory group set department transportation say public hearing government happy set standard animal fly plane stipulate minimum amount space human





In [None]:
import nltk
from tqdm import tqdm

# Download the punkt tokenizer if not already downloaded
nltk.download('punkt')

# Function to reconstruct and improve sentence formatting
def reconstruct_summary(ranked_sentences, original_sentences, top_n_sentences=7):
    """
    Generate a summary by selecting top-ranked original sentences.

    Args:
    - ranked_sentences (list): List of tuples (index, score), sorted by score.
    - original_sentences (list): List of original sentences from the dataset.
    - top_n_sentences (int): Number of top sentences to include in the summary.

    Returns:
    - str: Final summary text.
    """
    # Check the number of available ranked sentences and limit the selection to that number
    max_sentences = min(top_n_sentences, len(ranked_sentences))

    # Select the top-ranked original sentences
    summary_sentences = []
    for i, _ in ranked_sentences[:max_sentences]:
        if i < len(original_sentences):  # Ensure the index is valid
            sentence = original_sentences[i].strip()  # Strip any extra spaces
            if sentence:  # Ensure the sentence is not empty
                summary_sentences.append(sentence)

    # Join the selected sentences into one final summary text
    return ' '.join(summary_sentences)

# Function to process all articles in the dataset
def process_all_articles(processed_df, sentence_clusters, top_n_sentences=5, word_graph_threshold=1, central_words_top_n=10):
    """
    Process all articles in the dataset to generate summaries using sentence clustering and ranking.

    Args:
    - processed_df (pd.DataFrame): DataFrame containing processed and original text for each article.
    - sentence_clusters (list): Clustering results for sentences of each article.
    - top_n_sentences (int): Number of top sentences to include in the summary.
    - word_graph_threshold (int): Threshold for word graph edge weight.
    - central_words_top_n (int): Number of top central words to consider for ranking sentences.

    Returns:
    - list: List of summaries for all articles.
    """
    all_summaries = []

    for article_idx in tqdm(range(len(processed_df))):  # Use tqdm to display progress for all articles
        # Extract clustered sentences and processed sentences for the current article
        clustered_sentences = sentence_clusters[article_idx]
        processed_sentences = processed_df['processed_text'].iloc[article_idx]

        # Use nltk's sentence tokenizer for better sentence splitting
        original_sentences = nltk.sent_tokenize(processed_df['article'].iloc[article_idx])

        # Flatten the processed sentences
        processed_sentences_flattened = [item for sublist in processed_sentences for item in sublist]

        # Generate word graph for the clustered sentences
        word_graph = generate_word_graph(clustered_sentences, processed_sentences_flattened, threshold=word_graph_threshold)

        # Get central words from the word graph
        central_words = get_central_words(word_graph, top_n=central_words_top_n)

        # Rank sentences based on central words
        ranked_sentences = rank_sentences_by_central_words(clustered_sentences, processed_sentences_flattened, central_words)

        # Reconstruct summary using original sentences
        summary = reconstruct_summary(ranked_sentences, original_sentences, top_n_sentences)
        all_summaries.append(summary)

    return all_summaries

# Example: Process all articles in the dataset and get the summary for the first article
all_summaries = process_all_articles(processed_df, sentence_clusters, top_n_sentences=5, word_graph_threshold=1, central_words_top_n=10)

# Print the summary for the first article
print("Generated Summary for the First Article (with Proper Sentence Splitting):")
print(all_summaries[0])


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
100%|██████████| 11490/11490 [01:34<00:00, 121.72it/s]

Generated Summary for the First Article (with Proper Sentence Splitting):
Ever noticed how plane seats appear to be getting smaller and smaller? With increasing numbers of people taking to the skies, some experts are questioning if having such packed out planes is putting passengers at risk. They say that the shrinking space on aeroplanes is not only uncomfortable - it's putting our health and safety in danger. More than squabbling over the arm rest, shrinking space on planes putting our health and safety in danger? This week, a U.S consumer advisory group set up by the Department of Transportation said at a public hearing that while the government is happy to set standards for animals flying on planes, it doesn't stipulate a minimum amount of space for humans.





In [15]:
import locale
def getpreferredencoding(do_setlocale = True):
    return "UTF-8"
locale.getpreferredencoding = getpreferredencoding
!sudo apt-get update
!sudo apt-get install -y locales
!sudo locale-gen en_US.UTF-8
!sudo update-locale LANG=en_US.UTF-8
import locale
locale.setlocale(locale.LC_ALL, 'en_US.UTF-8')
!pip install rouge-score

0% [Working]            Get:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,626 B]
0% [Connecting to archive.ubuntu.com (185.125.190.83)] [Waiting for headers] [10% [Connecting to archive.ubuntu.com (185.125.190.83)] [Waiting for headers] [C                                                                               Hit:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
Get:3 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Hit:4 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:5 https://r2u.stat.illinois.edu/ubuntu jammy InRelease [6,555 B]
Get:6 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
Get:7 https://r2u.stat.illinois.edu/ubuntu jammy/main amd64 Packages [2,636 kB]
Hit:8 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Get:9 http://security.ubuntu.com/ubuntu jammy-security/universe amd64 Packages [1,226 kB]
Hit:10 https://ppa.launchpadcontent.ne

In [None]:
import numpy as np
from tqdm import tqdm
from rouge_score import rouge_scorer
import nltk

# Function to calculate ROUGE scores
def calculate_rouge_score(generated_summary, reference_summary):
    scorer = rouge_scorer.RougeScorer(["rouge1", "rouge2", "rougeL"], use_stemmer=True)
    scores = scorer.score(reference_summary, generated_summary)
    return scores

# Function to calculate the mean ROUGE scores across all articles
def calculate_mean_rouge_scores(all_rouge_scores):
    rouge1_precision, rouge1_recall, rouge1_fmeasure = [], [], []
    rouge2_precision, rouge2_recall, rouge2_fmeasure = [], [], []
    rougel_precision, rougel_recall, rougel_fmeasure = [], [], []

    for rouge_scores in all_rouge_scores:
        rouge1_precision.append(rouge_scores['rouge1'].precision)
        rouge1_recall.append(rouge_scores['rouge1'].recall)
        rouge1_fmeasure.append(rouge_scores['rouge1'].fmeasure)

        rouge2_precision.append(rouge_scores['rouge2'].precision)
        rouge2_recall.append(rouge_scores['rouge2'].recall)
        rouge2_fmeasure.append(rouge_scores['rouge2'].fmeasure)

        rougel_precision.append(rouge_scores['rougeL'].precision)
        rougel_recall.append(rouge_scores['rougeL'].recall)
        rougel_fmeasure.append(rouge_scores['rougeL'].fmeasure)

    return {
        'rouge1': {
            'precision': np.mean(rouge1_precision),
            'recall': np.mean(rouge1_recall),
            'fmeasure': np.mean(rouge1_fmeasure)
        },
        'rouge2': {
            'precision': np.mean(rouge2_precision),
            'recall': np.mean(rouge2_recall),
            'fmeasure': np.mean(rouge2_fmeasure)
        },
        'rougeL': {
            'precision': np.mean(rougel_precision),
            'recall': np.mean(rougel_recall),
            'fmeasure': np.mean(rougel_fmeasure)
        }
    }

# Function to reconstruct and improve sentence formatting
def reconstruct_summary(ranked_sentences, original_sentences, top_n_sentences=7):
    max_sentences = min(top_n_sentences, len(ranked_sentences))
    summary_sentences = []

    for i, _ in ranked_sentences[:max_sentences]:
        if i < len(original_sentences):  # Ensure the index is valid
            sentence = original_sentences[i].strip()  # Strip any extra spaces
            if sentence:  # Ensure the sentence is not empty
                summary_sentences.append(sentence)

    return ' '.join(summary_sentences)

# Function to process all articles and calculate ROUGE scores
def process_all_articles_and_evaluate(processed_df, sentence_clusters, highlights, top_n_sentences=5, word_graph_threshold=1, central_words_top_n=10):
    all_summaries = []
    all_rouge_scores = []

    for article_idx in tqdm(range(len(processed_df))):  # Iterate over all articles
        clustered_sentences = sentence_clusters[article_idx]
        processed_sentences = processed_df['processed_text'].iloc[article_idx]
        processed_sentences_flattened = [item for sublist in processed_sentences for item in sublist]

        # Extract original sentences using nltk's sentence tokenizer
        original_sentences = nltk.sent_tokenize(processed_df['article'].iloc[article_idx])

        # Generate word graph for clustered sentences
        word_graph = generate_word_graph(clustered_sentences, processed_sentences_flattened, threshold=word_graph_threshold)

        # Get central words and rank sentences by those central words
        central_words = get_central_words(word_graph, top_n=central_words_top_n)
        ranked_sentences = rank_sentences_by_central_words(clustered_sentences, processed_sentences_flattened, central_words)

        # Reconstruct summary for the current article
        generated_summary = reconstruct_summary(ranked_sentences, original_sentences, top_n_sentences)
        all_summaries.append(generated_summary)

        # Get the reference summary from the highlights of the article
        reference_summary = highlights[article_idx]

        # Calculate ROUGE scores between generated and reference summaries
        rouge_scores = calculate_rouge_score(generated_summary, reference_summary)
        all_rouge_scores.append(rouge_scores)

    mean_rouge_scores = calculate_mean_rouge_scores(all_rouge_scores)

    return all_summaries, all_rouge_scores, mean_rouge_scores

# Example: Process all articles and calculate ROUGE scores for each article
highlights = processed_df['highlights'].to_arrow().to_pylist()  # Adjusted to use .to_arrow().to_pylist()
all_summaries, all_rouge_scores, mean_rouge_scores = process_all_articles_and_evaluate(
    processed_df,
    sentence_clusters,
    highlights,
    top_n_sentences=5,
    word_graph_threshold=1,
    central_words_top_n=10
)

# Print mean ROUGE scores
print("Mean ROUGE Scores across all articles:")
print("ROUGE-1: Precision: {:.4f}, Recall: {:.4f}, F-Measure: {:.4f}".format(
    mean_rouge_scores['rouge1']['precision'],
    mean_rouge_scores['rouge1']['recall'],
    mean_rouge_scores['rouge1']['fmeasure']
))
print("ROUGE-2: Precision: {:.4f}, Recall: {:.4f}, F-Measure: {:.4f}".format(
    mean_rouge_scores['rouge2']['precision'],
    mean_rouge_scores['rouge2']['recall'],
    mean_rouge_scores['rouge2']['fmeasure']
))
print("ROUGE-L: Precision: {:.4f}, Recall: {:.4f}, F-Measure: {:.4f}".format(
    mean_rouge_scores['rougeL']['precision'],
    mean_rouge_scores['rougeL']['recall'],
    mean_rouge_scores['rougeL']['fmeasure']
))

100%|██████████| 11490/11490 [02:47<00:00, 68.58it/s]

Mean ROUGE Scores across all articles:
ROUGE-1: Precision: 0.2768, Recall: 0.6299, F-Measure: 0.3728
ROUGE-2: Precision: 0.1273, Recall: 0.2939, F-Measure: 0.1722
ROUGE-L: Precision: 0.1761, Recall: 0.4073, F-Measure: 0.2384





In [10]:
from sklearn.cluster import KMeans
import numpy as np
import cupy as cp

def perform_kmeans_clustering(similarity_matrices, max_clusters=3):
    """
    Perform K-Means clustering on similarity matrices.

    Parameters:
        similarity_matrices (list): List of similarity matrices (one per article).
        max_clusters (int): Maximum number of clusters for K-Means.

    Returns:
        clustered_sentences (list): List of cluster labels for sentences in each article.
    """
    clustered_sentences = []

    for sim_matrix in similarity_matrices:
        if sim_matrix is not None and len(sim_matrix) > 0:  # Ensure the similarity matrix is valid and not empty
            n_sentences = len(sim_matrix)
            n_clusters = min(max_clusters, n_sentences)  # Adjust clusters dynamically based on sentence count

            if n_clusters < 2:  # Skip clustering if not enough sentences
                clustered_sentences.append([0] * n_sentences)  # Assign all sentences to one cluster
                continue

            # Convert similarity matrix rows to feature vectors
            sentence_features = sim_matrix

            # Apply K-Means clustering
            kmeans = KMeans(n_clusters=n_clusters, random_state=42)
            cluster_labels = kmeans.fit_predict(sentence_features)

            # Append cluster labels for the current article
            clustered_sentences.append(cluster_labels)
        else:
            clustered_sentences.append(None)  # Append None for empty or invalid articles

    return clustered_sentences

# Perform K-Means clustering for articles
max_clusters = 3  # Maximum number of clusters
clustered_sentences = perform_kmeans_clustering(final_similarity_matrices, max_clusters=max_clusters)

# Example: Display clusters for the first article
if clustered_sentences[0] is not None:
    print("Cluster labels for the first article's sentences:", clustered_sentences[0])


Cluster labels for the first article's sentences: [2 2 0 0 0 0 2 2 1 1 2 1 1 1 1 1]


In [12]:
import networkx as nx
from collections import Counter

def generate_word_graph(sentences):
    """
    Generate a word graph from a list of tokenized sentences.

    Parameters:
        sentences (list of lists): Tokenized sentences.

    Returns:
        graph (networkx.Graph): Word graph.
    """
    graph = nx.Graph()
    for sentence in sentences:
        for i, word in enumerate(sentence):
            for j in range(i + 1, len(sentence)):
                graph.add_edge(word, sentence[j])
    return graph

def get_central_words(word_graph, top_n=10):
    """
    Get the most central words from the word graph using PageRank.

    Parameters:
        word_graph (networkx.Graph): Word graph.
        top_n (int): Number of top central words to retrieve.

    Returns:
        central_words (list): List of central words.
    """
    pagerank_scores = nx.pagerank(word_graph)
    central_words = [word for word, _ in Counter(pagerank_scores).most_common(top_n)]
    return central_words

def rank_sentences(sentences, central_words):
    """
    Rank sentences based on the occurrence of central words.

    Parameters:
        sentences (list): List of tokenized sentences.
        central_words (list): List of central words.

    Returns:
        ranked_sentences (list): Ranked list of sentences.
    """
    sentence_scores = []
    for i, sentence in enumerate(sentences):
        score = sum(1 for word in sentence if word in central_words)
        sentence_scores.append((i, score))

    # Sort sentences by score in descending order
    ranked_sentences = sorted(sentence_scores, key=lambda x: x[1], reverse=True)
    return [sentences[i] for i, _ in ranked_sentences]

def reconstruct_summary(article, cluster_labels, n_top_sentences=3):
    """
    Generate a reconstructed summary for an article.

    Parameters:
        article (list of lists): List of tokenized sentences in the article.
        cluster_labels (list): Cluster labels for sentences.
        n_top_sentences (int): Number of top-ranked sentences per cluster.

    Returns:
        summary (list): List of reconstructed summary sentences.
    """
    summary = []
    unique_clusters = set(cluster_labels)

    for cluster in unique_clusters:
        # Get sentences in the current cluster
        cluster_sentences = [
            article[i] for i, label in enumerate(cluster_labels) if label == cluster
        ]

        if not cluster_sentences:
            continue

        # Generate word graph for the cluster
        word_graph = generate_word_graph(cluster_sentences)

        # Get central words
        central_words = get_central_words(word_graph)

        # Rank sentences within the cluster
        ranked_sentences = rank_sentences(cluster_sentences, central_words)

        # Add top-ranked sentences to the summary
        summary.extend([' '.join(sentence) for sentence in ranked_sentences[:n_top_sentences]])

    return summary

# Generate summaries for all articles
def generate_summaries(processed_df, clustered_sentences, n_top_sentences=3):
    """
    Generate summaries for all articles in the dataset.

    Parameters:
        processed_df (DataFrame): DataFrame with tokenized and preprocessed text.
        clustered_sentences (list): List of cluster labels for all articles.
        n_top_sentences (int): Number of top sentences per cluster.

    Returns:
        processed_df (DataFrame): Updated DataFrame with a 'summary' column.
    """
    all_summaries = []

    for i in range(len(processed_df)):
        if clustered_sentences[i] is not None:  # Ensure clustering was successful
            summary = reconstruct_summary(
                article=processed_df['processed_text'].iloc[i],
                cluster_labels=clustered_sentences[i],
                n_top_sentences=n_top_sentences
            )
            all_summaries.append('\n'.join(summary))  # Store the summary as a single string
        else:
            all_summaries.append("Summary not available for this article.")

    processed_df['summary'] = all_summaries
    return processed_df

# Example usage:
n_top_sentences = 3  # Adjust the number of sentences per cluster
processed_df = generate_summaries(processed_df, clustered_sentences, n_top_sentences=n_top_sentences)

# Display the first few summaries
print(processed_df[['article', 'summary']].head())


                                             article  \
0  Ever noticed how plane seats appear to be gett...   
1  A drunk teenage boy had to be rescued by secur...   
2  Dougie Freedman is on the verge of agreeing a ...   
3  Liverpool target Neto is also wanted by PSG an...   
4  Bruce Jenner will break his silence in a two-h...   

                                             summary  
0  week consumer advisory group set department tr...  
1  next level drunk intoxicate rahul kumar 17 cli...  
2  dougie freedman verge agree new deal remain no...  
3  report neto verbal agreement join serie champi...  
4  jenner picture walk back car malibu weekend hi...  


In [13]:
if 'summary' in processed_df.columns and not processed_df['summary'].empty:
    print("First Article Summary:")
    print(processed_df['summary'].iloc[0])
else:
    print("Summary not available for the first article.")

First Article Summary:
week consumer advisory group set department transportation say public hearing government happy set standard animal fly plane stipulate minimum amount space human
world animal right space food human say charlie leocha consumer representative committee
say shrink space aeroplane uncomfortable put health safety danger
united airline 30 inch space gulf air economy seat 29 32 inch air asia offer 29 inch spirit airline offer 28 inch
british airway seat pitch 31 inch easyjet 29 inch thomson short haul seat pitch 28 inch virgin atlantic
many economy seat united airline 30 inch room airline offer little 28 inch
could crowd plane lead serious issue fight space overhead locker crash elbow seat back kick
increase number people take sky expert question pack plane put passenger risk
ever notice plane seat appear get small small


In [21]:
from rouge_score import rouge_scorer
import numpy as np
from tqdm import tqdm

def calculate_rouge_metrics_with_progress(reference_texts, generated_summaries):
    """
    Calculate ROUGE Precision, Recall, and F-Measure for a list of reference texts and generated summaries with progress bar.

    Parameters:
        reference_texts (list): List of reference summaries (ground truth).
        generated_summaries (list): List of generated summaries.

    Returns:
        mean_metrics (dict): Mean Precision, Recall, and F-Measure for ROUGE-1, ROUGE-2, and ROUGE-L.
    """
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    metrics = {
        'rouge1': {'precision': [], 'recall': [], 'fmeasure': []},
        'rouge2': {'precision': [], 'recall': [], 'fmeasure': []},
        'rougeL': {'precision': [], 'recall': [], 'fmeasure': []}
    }

    # Add a progress bar for the calculation
    for reference, generated in tqdm(zip(reference_texts, generated_summaries),
                                      total=len(reference_texts),
                                      desc="Calculating ROUGE Metrics"):
        # Ensure both reference and generated text are valid strings
        if isinstance(reference, str) and isinstance(generated, str):
            scores = scorer.score(reference, generated)
            for rouge_type in metrics.keys():
                metrics[rouge_type]['precision'].append(scores[rouge_type].precision)
                metrics[rouge_type]['recall'].append(scores[rouge_type].recall)
                metrics[rouge_type]['fmeasure'].append(scores[rouge_type].fmeasure)

    # Compute mean metrics
    mean_metrics = {
        rouge_type: {
            metric: np.mean(metrics[rouge_type][metric]) for metric in ['precision', 'recall', 'fmeasure']
        }
        for rouge_type in metrics.keys()
    }
    return mean_metrics

# Prepare reference and generated summaries
reference_texts = processed_df['highlights'].to_arrow().to_pylist()
generated_summaries = processed_df['summary'].to_arrow().to_pylist()

# Compute ROUGE metrics with progress bar
mean_rouge_metrics = calculate_rouge_metrics_with_progress(reference_texts, generated_summaries)

# Display the results
print("Mean ROUGE Metrics:")
for rouge_type, scores in mean_rouge_metrics.items():
    print(f"{rouge_type.upper()}:")
    print(f"  Precision: {scores['precision']:.4f}")
    print(f"  Recall:    {scores['recall']:.4f}")
    print(f"  F-Measure: {scores['fmeasure']:.4f}")


Calculating ROUGE Metrics: 100%|██████████| 11490/11490 [01:27<00:00, 131.81it/s]

Mean ROUGE Metrics:
ROUGE1:
  Precision: 0.1507
  Recall:    0.3841
  F-Measure: 0.2102
ROUGE2:
  Precision: 0.0425
  Recall:    0.1103
  F-Measure: 0.0595
ROUGEL:
  Precision: 0.0924
  Recall:    0.2396
  F-Measure: 0.1295



