<a href="https://colab.research.google.com/github/KarkiAnuj17/Automatic-text-summarization/blob/main/Automatic_text_summarization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [23]:
# Mount Google Drive into the Colab runtime at /content/drive so files stored
# there can be opened with ordinary filesystem paths.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [24]:
import zipfile

# Unpack the dataset archive from Drive into the runtime's local filesystem.
# The context manager guarantees the archive handle is closed even when
# extraction fails part-way (for example a corrupt member or a full disk).
with zipfile.ZipFile("/content/drive/MyDrive/dataset.zip", 'r') as zip_ref:
    zip_ref.extractall("/content/dataset")

In [25]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import cudf
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords, wordnet
from nltk import pos_tag

In [26]:
# Quick look at the extracted test split with pandas: print its column names
# and its (rows, columns) shape before any preprocessing is done.
path="/content/dataset/dataset/test.csv"
df=pd.read_csv(path)
print(df.columns)
print(df.shape)

Index(['id', 'article', 'highlights'], dtype='object')
(11490, 3)


In [27]:
# Fetch the NLTK data packages this notebook relies on. Each call is a no-op
# when the package is already present (see the "already up-to-date" log lines).
# NOTE(review): the mapping below is inferred from the package names and the
# NLTK functions imported above -- confirm against the NLTK docs.
nltk.download('punkt_tab')  # presumably the sentence/word tokenizer models
nltk.download('stopwords')  # stop-word lists read via stopwords.words('english')
nltk.download('wordnet')  # WordNet data used by WordNetLemmatizer
nltk.download('averaged_perceptron_tagger_eng')  # presumably the English model behind pos_tag
nltk.download('omw-1.4')  # presumably Open Multilingual WordNet data

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [28]:
def get_wordnet_pos(treebank_tag):
    """Translate a Treebank-style POS tag into a WordNet POS constant.

    Only the first letter of the tag is inspected: ``J`` -> adjective,
    ``V`` -> verb, ``N`` -> noun, ``R`` -> adverb. Any other tag (including
    the empty string) falls back to noun.

    Args:
        treebank_tag: POS tag string, as produced by ``nltk.pos_tag``.

    Returns:
        One of ``wordnet.ADJ``, ``wordnet.VERB``, ``wordnet.NOUN`` or
        ``wordnet.ADV``.
    """
    prefix_table = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wordnet_pos in prefix_table:
        if treebank_tag.startswith(prefix):
            return wordnet_pos
    # Unrecognised tags are treated as nouns.
    return wordnet.NOUN

def preprocess_text(text):
    """Tokenise, filter and lemmatise an article, one sentence at a time.

    The text is split into sentences; each sentence is word-tokenised and
    POS-tagged, then every token that is alphanumeric and not an English
    stop word is lower-cased and lemmatised with its WordNet POS.

    Args:
        text: The raw article text.

    Returns:
        A list with one entry per sentence; each entry is the list of
        lemmatised tokens kept for that sentence (possibly empty).
    """
    raw_sentences = sent_tokenize(text)
    wn_lemmatizer = WordNetLemmatizer()
    english_stopwords = set(stopwords.words('english'))

    def _lemmas_for(sentence):
        # Tag the original-case tokens first; lower-casing happens only when
        # filtering against the stop-word set and when lemmatising.
        tagged_tokens = pos_tag(word_tokenize(sentence))
        return [
            wn_lemmatizer.lemmatize(token.lower(), get_wordnet_pos(tag))
            for token, tag in tagged_tokens
            if token.lower() not in english_stopwords and token.isalnum()
        ]

    return [_lemmas_for(sentence) for sentence in raw_sentences]

def preprocess_csv(file_path, text_column):
    """Load a CSV with cuDF and attach a ``processed_text`` column.

    Relies on ``tqdm.pandas()`` having been called beforehand, because the
    text column is processed through pandas' ``progress_apply``.

    Args:
        file_path: Path of the CSV file to read.
        text_column: Name of the column holding the raw text.

    Returns:
        The cuDF DataFrame read from ``file_path`` with an extra
        ``processed_text`` column holding the output of ``preprocess_text``
        for every row.
    """
    frame = cudf.read_csv(file_path)

    # NLTK runs on the CPU, so the text is pulled out as a pandas Series,
    # processed row by row with a progress bar, and then pushed back to cuDF.
    host_text = frame[text_column].to_pandas()
    tokens_per_article = host_text.progress_apply(preprocess_text)
    frame['processed_text'] = cudf.from_pandas(tokens_per_article)

    return frame

# Register tqdm's `progress_apply` on pandas objects; preprocess_csv calls it.
tqdm.pandas()

# Preprocess the 'article' column of the test split. The captured progress bar
# below shows this took several minutes for the full file.
file_path = '/content/dataset/dataset/test.csv'
text_column = 'article'
processed_df = preprocess_csv(file_path, text_column)

# Show the first rows, including the new 'processed_text' column
print(processed_df.head())

100%|██████████| 11490/11490 [07:36<00:00, 25.17it/s]


                                         id  \
0  92c514c913c0bdfe25341af9fd72b29db544099b   
1  2003841c7dc0e7c5b1a248f9cd536d727f27a45a   
2  91b7d2311527f5c2b63a65ca98d21d9c92485149   
3  caabf9cbdf96eb1410295a673e953d304391bfbb   
4  3da746a7d9afcaa659088c8366ef6347fe6b53ea   

                                             article  \
0  Ever noticed how plane seats appear to be gett...   
1  A drunk teenage boy had to be rescued by secur...   
2  Dougie Freedman is on the verge of agreeing a ...   
3  Liverpool target Neto is also wanted by PSG an...   
4  Bruce Jenner will break his silence in a two-h...   

                                          highlights  \
0  Experts question if  packed out planes are put...   
1  Drunk teenage boy climbed into lion enclosure ...   
2  Nottingham Forest are close to extending Dougi...   
3  Fiorentina goalkeeper Neto has been linked wit...   
4  Tell-all interview with the reality TV star, 6...   

                                      proce

In [29]:
import cupy as cp
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm import tqdm

# Cosine Similarity Function
def cosine_similarity_matrix(matrix):
    """Compute pairwise cosine similarity between the rows of ``matrix``.

    Args:
        matrix: 2-D array-like with one vector per row; it is converted to
            a CuPy array.

    Returns:
        A CuPy array whose ``[i, j]`` entry is the cosine similarity of
        rows ``i`` and ``j``.
    """
    vectors = cp.array(matrix)
    row_norms = cp.linalg.norm(vectors, axis=1)
    # The small epsilon keeps all-zero rows from producing a division by zero.
    denominator = cp.outer(row_norms, row_norms) + 1e-10
    return cp.dot(vectors, vectors.T) / denominator

# Vectorization Function using TF-IDF
def vectorize_sentences_with_tfidf(sentences):
    """Build a dense TF-IDF matrix for a list of sentence strings.

    A fresh ``TfidfVectorizer`` (with scikit-learn's English stop-word
    filtering) is fitted on the given sentences only.

    Args:
        sentences: Iterable of sentence strings.

    Returns:
        A CuPy array with one row per sentence and one column per
        vocabulary term.
    """
    tfidf = TfidfVectorizer(stop_words='english')
    # fit_transform returns a sparse matrix; it is densified before the copy
    # to the GPU.
    dense_rows = tfidf.fit_transform(sentences).toarray()
    return cp.array(dense_rows)

# Function to Compute Similarity for Articles
def compute_similarity_for_articles(df, text_column='processed_text'):
    """Compute one sentence-similarity matrix per article.

    Args:
        df: cuDF DataFrame containing ``text_column``.
        text_column: Column whose entries are lists of token lists (one
            token list per sentence).

    Returns:
        A list of NumPy arrays, in row order; each array is the cosine
        similarity matrix of the TF-IDF vectors of that article's sentences.
    """
    articles = df[text_column].to_pandas()
    progress = tqdm(articles.items(), total=len(articles), desc="Processing Articles")

    matrices = []
    for _, tokenized_sentences in progress:
        # TF-IDF expects strings, so each token list is re-joined with spaces.
        sentence_strings = [' '.join(tokens) for tokens in tokenized_sentences]
        tfidf_rows = vectorize_sentences_with_tfidf(sentence_strings)
        similarity_gpu = cosine_similarity_matrix(tfidf_rows)
        # Results are kept on the host as NumPy arrays.
        matrices.append(cp.asnumpy(similarity_gpu))

    return matrices

# Register tqdm's pandas integration (progress_apply).
# NOTE(review): compute_similarity_for_articles wraps its loop in tqdm(...)
# directly and does not call progress_apply, so this call looks redundant here.
tqdm.pandas()

# Requires `processed_df` from the preprocessing cell, with its
# 'processed_text' column; yields one similarity matrix per article.
final_similarity_matrices = compute_similarity_for_articles(processed_df, text_column='processed_text')

# Example output for the first article
print("Final Cosine Similarity Matrix for the First Article:\n", final_similarity_matrices[0])


Processing Articles: 100%|██████████| 11490/11490 [00:37<00:00, 306.41it/s]


Final Cosine Similarity Matrix for the First Article:
 [[1.         0.03550455 0.         0.04097448 0.02367313 0.
  0.         0.07210004 0.04264119 0.0403271  0.03092843 0.07493094
  0.11954024 0.         0.02426954 0.06009226]
 [0.03550455 1.         0.         0.03528285 0.02038477 0.
  0.08077285 0.02888779 0.03671804 0.         0.08621036 0.03002202
  0.         0.         0.         0.        ]
 [0.         0.         1.         0.53670032 0.0884582  0.12443423
  0.         0.04675813 0.         0.         0.         0.
  0.         0.         0.02943534 0.        ]
 [0.04097448 0.03528285 0.53670032 1.         0.0548335  0.04404127
  0.         0.07770597 0.04237492 0.         0.0307353  0.0346473
  0.         0.         0.02793048 0.        ]
 [0.02367313 0.02038477 0.0884582  0.0548335  1.         0.21782932
  0.         0.04489486 0.07925059 0.         0.0574819  0.06479821
  0.         0.         0.01613692 0.        ]
 [0.         0.         0.12443423 0.04404127 0.2178293

In [57]:
import nltk
import cupy as cp
import pandas as pd
from tqdm import tqdm

# Function to Compute TextRank Scores Using Precomputed Similarity Matrices
def compute_text_rank_from_similarity(similarity_matrix, damping_factor=0.85, max_iterations=100, tol=1e-6):
    """Run damped power iteration (TextRank/PageRank style) on a similarity matrix.

    Args:
        similarity_matrix: Square array of pairwise sentence similarities
            (the caller in this notebook passes a CuPy array).
        damping_factor: Weight of the propagated score versus the uniform
            teleport term.
        max_iterations: Upper bound on the number of update steps.
        tol: Iteration stops once the L1 change between successive score
            vectors drops below this value.

    Returns:
        A NumPy array with one score per row of ``similarity_matrix``. An
        empty matrix yields an empty array.
    """
    n = similarity_matrix.shape[0]
    if n == 0:
        # Nothing to rank; also avoids dividing by zero in the update below.
        return cp.asnumpy(cp.zeros(0))

    scores = cp.ones(n) / n  # Start from the uniform distribution
    # Row-normalise; the epsilon guards rows whose similarities sum to zero.
    transition_matrix = similarity_matrix / (similarity_matrix.sum(axis=1, keepdims=True) + 1e-10)

    for _ in range(max_iterations):
        new_scores = (1 - damping_factor) / n + damping_factor * cp.dot(transition_matrix.T, scores)
        delta = cp.linalg.norm(new_scores - scores, ord=1)
        # Store the update before testing convergence so the returned vector
        # is the newest iterate rather than the one preceding it.
        scores = new_scores
        if delta < tol:
            break

    return cp.asnumpy(scores)  # Convert to NumPy for easy handling


# Function to Extract Top N Sentences Using Original Text
def get_top_sentences_with_original_text(processed_text, original_sentences, scores, top_n=5):
    """Pick the ``top_n`` highest-scoring sentences, returned in document order.

    Args:
        processed_text: Unused; kept so existing callers keep working.
        original_sentences: Sequence of the article's original sentences.
        scores: Sequence of scores; ``scores[i]`` belongs to
            ``original_sentences[i]``.
        top_n: Maximum number of sentences to return.

    Returns:
        A list of at most ``top_n`` original sentences, ordered by their
        position in the article. Ties in score go to the earlier sentence.
    """
    # Rank positions by score, highest first; the sort is stable, so equal
    # scores keep their original relative order.
    ranked_positions = sorted(
        range(len(scores)),
        key=lambda position: scores[position],
        reverse=True,
    )
    # Keep the best ones and put them back into reading order.
    chosen_positions = sorted(ranked_positions[:top_n])
    return [original_sentences[position] for position in chosen_positions]


# Main Function to Rank Top Sentences Using Original Text and Precomputed Similarity Matrices
def rank_top_sentences_with_original_text(df, similarity_matrices, text_column='processed_text', original_text_column='original_text', top_n=5):
    """Select the top-ranked original sentences of every article.

    Args:
        df: cuDF DataFrame with an ``article`` column and ``text_column``.
        similarity_matrices: One precomputed sentence-similarity matrix per
            row of ``df``, in the same order.
        text_column: Column holding the preprocessed token lists.
        original_text_column: Column holding each article's sentence list.
            With the default name, or when ``df`` has no such column, the
            sentences are produced by sentence-tokenising ``article``.
        top_n: Number of sentences to keep per article.

    Returns:
        A list with one entry per article: the list of selected original
        sentences in document order.
    """
    # Sentence tokenisation is CPU-only, so work on a pandas copy.
    df_cpu = df.to_pandas()

    # The default column is always derived from 'article', as before. A custom
    # column name is honoured when present; when absent it is derived too,
    # instead of failing with a KeyError.
    if original_text_column == 'original_text' or original_text_column not in df_cpu.columns:
        original_text_series = df_cpu['article'].apply(nltk.sent_tokenize)
    else:
        original_text_series = df_cpu[original_text_column]

    text_series = df_cpu[text_column]
    all_top_sentences = []

    for processed_text, original_sentences, similarity_matrix in tqdm(
        zip(text_series, original_text_series, similarity_matrices),
        total=len(similarity_matrices),
        desc="Ranking Sentences",
    ):
        # Score the sentences from the precomputed similarity matrix (on GPU).
        text_rank_scores = compute_text_rank_from_similarity(cp.array(similarity_matrix))

        # Map the best-scoring positions back to the original sentences.
        top_sentences = get_top_sentences_with_original_text(
            processed_text, original_sentences, text_rank_scores, top_n=top_n
        )
        all_top_sentences.append(top_sentences)

    return all_top_sentences


# Rank sentences for every article using `processed_df` and the similarity
# matrices from the previous cell, keeping the 5 best sentences per article.
top_sentences_per_article = rank_top_sentences_with_original_text(
    processed_df, final_similarity_matrices, text_column='processed_text', original_text_column='original_text', top_n=5
)

# Example output for the first article; the number printed is the sentence's
# position within the selected list, not its TextRank rank.
print("Top 5 Sentences in Chronological Order for the First Article:")
for rank, sentence in enumerate(top_sentences_per_article[0], start=1):
    print(f"{rank}: {sentence}")


Ranking Sentences: 100%|██████████| 11490/11490 [04:14<00:00, 45.15it/s]

Top 5 Sentences in Chronological Order for the First Article:
1: Tests conducted by the FAA use planes with a 31 inch pitch, a standard which on some airlines has decreased .
2: Many economy seats on United Airlines have 30 inches of room, while some airlines offer as little as 28 inches .
3: But these tests are conducted using planes with 31 inches between each row of seats, a standard which on some airlines has decreased, reported the Detroit News.
4: While United Airlines has 30 inches of space, Gulf Air economy seats have between 29 and 32 inches, Air Asia offers 29 inches and Spirit Airlines offers just 28 inches.
5: British Airways has a seat pitch of 31 inches, while easyJet has 29 inches, Thomson's short haul seat pitch is 28 inches, and Virgin Atlantic's is 30-31.





In [58]:
from rouge_score import rouge_scorer
import numpy as np


def _mean_rouge(metric_scores):
    """Average precision, recall and F-measure over a list of ROUGE scores."""
    return {
        'precision': np.mean([score.precision for score in metric_scores]),
        'recall': np.mean([score.recall for score in metric_scores]),
        'fmeasure': np.mean([score.fmeasure for score in metric_scores])
    }


# Initialize ROUGE scorer
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

# Convert cuDF DataFrame to pandas for iteration
df_cpu = processed_df.to_pandas()

# No cell in this notebook creates 'extractive_summary', so on a fresh kernel
# the column is missing. Build it from the ranked sentences in that case; an
# existing column is used unchanged.
# NOTE(review): joining with a single space is an assumption about how the
# summary was originally assembled -- confirm.
if 'extractive_summary' not in df_cpu.columns:
    df_cpu['extractive_summary'] = [
        ' '.join(sentences) for sentences in top_sentences_per_article
    ]

# Score every generated summary against its reference highlights (one pass).
rouge_scores = [
    scorer.score(reference, generated_summary)
    for reference, generated_summary in zip(df_cpu['highlights'], df_cpu['extractive_summary'])
]

# Split the per-article results by metric
rouge1_scores = [scores['rouge1'] for scores in rouge_scores]
rouge2_scores = [scores['rouge2'] for scores in rouge_scores]
rougeL_scores = [scores['rougeL'] for scores in rouge_scores]

# Calculate the mean of each ROUGE score (precision, recall, and F-measure)
mean_rouge1 = _mean_rouge(rouge1_scores)
mean_rouge2 = _mean_rouge(rouge2_scores)
mean_rougeL = _mean_rouge(rougeL_scores)

# Print the mean ROUGE scores
print("Mean ROUGE-1 Scores:", mean_rouge1)
print("Mean ROUGE-2 Scores:", mean_rouge2)
print("Mean ROUGE-L Scores:", mean_rougeL)



Mean ROUGE-1 Scores: {'precision': 0.2390871506755061, 'recall': 0.5778617605608724, 'fmeasure': 0.3274576034573056}
Mean ROUGE-2 Scores: {'precision': 0.09936520360092581, 'recall': 0.24367598387319003, 'fmeasure': 0.1366565950135189}
Mean ROUGE-L Scores: {'precision': 0.15440856725146226, 'recall': 0.379245983189393, 'fmeasure': 0.2124052351617053}
