In [1]:
pip install nltk

Note: you may need to restart the kernel to use updated packages.


In [2]:
pip install rouge

Note: you may need to restart the kernel to use updated packages.


# Import necessary libraries

In [1]:
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from nltk.tokenize.treebank import TreebankWordDetokenizer
from nltk.cluster.util import cosine_distance
from operator import itemgetter
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from rouge import Rouge

# Download NLTK resources

In [2]:
# Fetch the NLTK data packages this notebook relies on: 'punkt' (tokenizer
# models) and 'stopwords' (the corpus read via stopwords.words("english")).
# The second call is the cell's last expression, so its return value is what
# the cell displays.
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# Sample paragraph

In [3]:
# Sample input for the summarizer: a six-sentence paragraph about NLP and text
# summarization. Note that the literal begins and ends with a newline.
paragraph = """
Natural language processing (NLP) is a field of artificial intelligence that focuses on the interaction between computers and humans using natural language. It enables computers to understand, interpret, and generate human-like text. NLP involves several challenges, including natural language understanding, language generation, and language translation. Text summarization is a specific NLP task that involves reducing the length of a document while retaining its key information. There are two main approaches to text summarization: extractive and abstractive. Extractive methods select important sentences from the original text, while abstractive methods generate new sentences to form the summary.
"""

# Function to perform sentence tokenization and remove stopwords

In [4]:
def preprocess_text(text):
    """Split `text` into sentences and reduce each one to its content tokens.

    Every sentence is lower-cased and word-tokenized. A token survives only if
    it is purely alphanumeric (``str.isalnum``) and is not in NLTK's English
    stop-word list.

    Returns:
        A list with one entry per sentence; each entry is the list of
        surviving tokens for that sentence (possibly empty).
    """
    raw_sentences = sent_tokenize(text)
    english_stops = set(stopwords.words("english"))

    cleaned = []
    for raw in raw_sentences:
        kept = []
        for token in word_tokenize(raw.lower()):
            # Drops punctuation-bearing tokens as well as stop words.
            if token.isalnum() and token not in english_stops:
                kept.append(token)
        cleaned.append(kept)
    return cleaned

# Function to calculate sentence similarity using TF-IDF

In [5]:
def calculate_similarity(sentences):
    """Build the pairwise cosine-similarity matrix of tokenized sentences.

    Args:
        sentences: An iterable of token lists, one per sentence.

    Returns:
        The result of ``cosine_similarity`` applied to the TF-IDF matrix of
        the sentences against itself, i.e. one row and one column per
        sentence.
    """
    # TfidfVectorizer consumes strings, so join each token list back together.
    joiner = TreebankWordDetokenizer()
    sentence_strings = list(map(joiner.detokenize, sentences))

    tfidf = TfidfVectorizer().fit_transform(sentence_strings)
    return cosine_similarity(tfidf, tfidf)

# Function to perform TextRank

In [6]:
def text_rank(sentences, similarity_matrix, d=0.85, max_iter=50, tol=0.001):
    """Score sentences by iterating a weighted TextRank-style update.

    Every score starts at 1.0. On each sweep, sentence ``i`` is assigned
    ``(1 - d) + d * sum_j(similarity_matrix[j][i] * scores[j] / row_total[j])``
    over all ``j != i``. Scores are updated in place, so within a sweep later
    sentences already see the new scores of earlier ones.

    Args:
        sentences: Sequence of sentences; only its length is used.
        similarity_matrix: Square, row-indexable matrix (nested lists or a
            2-D array) where ``similarity_matrix[j][i]`` is the weight of the
            link from sentence ``j`` to sentence ``i``.
        d: Damping factor in the update formula.
        max_iter: Maximum number of sweeps.
        tol: Iteration stops early once the summed absolute score change of a
            sweep falls below this value.

    Returns:
        A list of scores, one per sentence, in input order.
    """
    n = len(sentences)
    scores = [1.0] * n

    # The matrix never changes during iteration, so compute each row total
    # once instead of inside the innermost loop. As in the original
    # formulation here, the total spans the whole row (diagonal included).
    row_totals = [sum(similarity_matrix[j]) for j in range(n)]

    for _ in range(max_iter):
        prev_scores = scores.copy()
        for i in range(n):
            summation = 0
            for j in range(n):
                # A row total of zero means sentence j has no weight to hand
                # out (e.g. it had no tokens left); skipping it avoids a
                # division by zero / NaN scores.
                if i != j and row_totals[j] != 0:
                    summation += similarity_matrix[j][i] * scores[j] / row_totals[j]
            scores[i] = (1 - d) + d * summation

        if sum(abs(scores[i] - prev_scores[i]) for i in range(n)) < tol:
            break

    return scores


# Function to perform TextRank summarization

In [7]:
def text_rank_summarization(text, num_sentences=3):
    """Pick the highest-scoring sentences of `text` using TextRank.

    The text is preprocessed with ``preprocess_text``, a similarity matrix is
    built with ``calculate_similarity``, and the sentences are scored with
    ``text_rank``.

    Args:
        text: The document to summarize.
        num_sentences: Upper bound on the number of sentences returned.

    Returns:
        A list of at most `num_sentences` sentences, ordered from highest to
        lowest score. Each sentence is the preprocessed token list produced by
        ``preprocess_text``, not the original sentence string. Returns an
        empty list when the text yields no tokens at all.
    """
    sentences = preprocess_text(text)

    # With no tokens in any sentence there is no vocabulary for the TF-IDF
    # step, which would raise; there is also nothing to summarize.
    if not any(sentences):
        return []

    similarity_matrix = calculate_similarity(sentences)
    scores = text_rank(sentences, similarity_matrix)

    # sorted() is stable, so equally scored sentences keep document order.
    ranked = sorted(zip(sentences, scores), key=itemgetter(1), reverse=True)
    return [tokens for tokens, _ in ranked[:num_sentences]]


# Function to calculate Overlap Ratio

In [8]:
def overlap_ratio(reference, generated):
    """Measure the word-set overlap between two whitespace-tokenized strings.

    Both strings are split on whitespace and reduced to sets of distinct
    tokens (case- and punctuation-sensitive). The result is the number of
    shared tokens divided by the average size of the two sets.

    Args:
        reference: The reference text.
        generated: The text to compare against the reference.

    Returns:
        A float in [0, 1]; 0.0 when both strings contain no tokens.
    """
    reference_set = set(reference.split())
    generated_set = set(generated.split())

    total_count = len(reference_set) + len(generated_set)
    if total_count == 0:
        # Both inputs are empty or whitespace-only: avoid dividing by zero.
        return 0.0

    overlap_count = len(reference_set & generated_set)
    return overlap_count / (total_count / 2)  # Normalize by the average set size


# Function to calculate ROUGE

In [9]:
def calculate_rouge(reference, generated):
    """Return the ROUGE-1 F-score of `generated` measured against `reference`.

    Args:
        reference: The reference summary text.
        generated: The generated summary text (passed to ``Rouge.get_scores``
            as its first argument).

    Returns:
        The ``'f'`` value under ``'rouge-1'`` of the first score entry.
    """
    scorer = Rouge()
    first_result = scorer.get_scores(generated, reference)[0]
    rouge_1 = first_result['rouge-1']
    return rouge_1['f']


# Given reference summary (ground truth)

In [10]:
# Ground-truth summary used for scoring; it is the first sentence of
# `paragraph`, copied verbatim (original casing and punctuation).
reference_summary = "Natural language processing (NLP) is a field of artificial intelligence that focuses on the interaction between computers and humans using natural language."

# Perform TextRank summarization

In [11]:
# Summarize the sample paragraph with the default num_sentences=3.
generated_summary = text_rank_summarization(paragraph)

# Print the generated summary

In [13]:
# Each summary entry is a token list: join tokens with spaces, one sentence per line.
print(
    "Generated Summary:",
    "\n".join(" ".join(tokens) for tokens in generated_summary),
    sep="\n",
)

Generated Summary:
natural language processing nlp field artificial intelligence focuses interaction computers humans using natural language
nlp involves several challenges including natural language understanding language generation language translation
two main approaches text summarization extractive abstractive


# Calculate and print the performance metrics

In [16]:
# Flatten the summary (a list of token lists) into one space-separated string
# once, and score that same string with both metrics.
# NOTE(review): reference_summary keeps its original casing and punctuation,
# while the generated text is lower-cased with punctuation removed, so
# overlap_ratio (an exact token match) cannot pair e.g. "Natural" with
# "natural" - confirm whether the reference should be normalized the same way.
generated_text = " ".join(" ".join(tokens) for tokens in generated_summary)

overlap_ratio_score = overlap_ratio(reference_summary, generated_text)
rouge_score = calculate_rouge(reference_summary, generated_text)

print("\nOverlap Ratio:", overlap_ratio_score)
print("ROUGE Score:", rouge_score)



Overlap Ratio: 0.4583333333333333
ROUGE Score: 0.4680851014395655
