In [1]:
pip install numpy scikit-learn nltk rouge

Note: you may need to restart the kernel to use updated packages.


In [2]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.corpus import stopwords
from rouge import Rouge

In [3]:
import re

def preprocess_text(text):
    # Remove special characters and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    return text.lower()


In [9]:
def calculate_tfidf_scores(sentences):
    stop_words = list(set(stopwords.words('english')))
    
    # Create TF-IDF vectorizer
    vectorizer = TfidfVectorizer(stop_words=stop_words)
    
    # Calculate TF-IDF scores
    tfidf_matrix = vectorizer.fit_transform(sentences)
    
    return tfidf_matrix


In [13]:
def generate_summary(tfidf_matrix, sentences, num_sentences=3):
    sentence_scores = np.sum(tfidf_matrix, axis=1)
    top_sentences_idx = np.argsort(sentence_scores, axis=0)[-num_sentences:][::-1]
    top_sentences_idx = top_sentences_idx.flatten()
    
    # Ensure indices are integers
    top_sentences_idx = [int(idx) for idx in top_sentences_idx]

    summary = [sentences[i] for i in top_sentences_idx]
    return ' '.join(summary)



In [6]:
def calculate_rouge(reference, summary):
    rouge = Rouge()
    scores = rouge.get_scores(summary, reference)
    return scores[0]['rouge-1']['f']


In [7]:
def calculate_overlap_ratio(reference, summary):
    reference_words = set(reference.split())
    summary_words = set(summary.split())
    overlap = len(reference_words.intersection(summary_words))
    ratio = overlap / len(reference_words)
    return ratio


In [15]:
# Example input paragraph
input_paragraph = """
Natural language processing (NLP) is a field of artificial intelligence that focuses on the interaction between computers and humans using natural language. It enables computers to understand, interpret, and generate human-like text. NLP involves several challenges, including natural language understanding, language generation, and language translation. Text summarization is a specific NLP task that involves reducing the length of a document while retaining its key information. There are two main approaches to text summarization: extractive and abstractive. Extractive methods select important sentences from the original text, while abstractive methods generate new sentences to form the summary.
"""

# Preprocess the text
processed_text = preprocess_text(input_paragraph)

# Tokenize the text into sentences
sentences = processed_text.split('.')

# Calculate TF-IDF scores
tfidf_matrix = calculate_tfidf_scores(sentences)

# Generate summary using TF-IDF
summary = generate_summary(tfidf_matrix, sentences, num_sentences=3)

# Example reference summary for evaluation
reference_summary = """
Natural language processing (NLP) is a field of artificial intelligence that focuses on the interaction between computers and humans using natural language
"""

# Evaluate performance using ROUGE
rouge_score = calculate_rouge(reference_summary, summary)

# Calculate overlap ratio
overlap_ratio = calculate_overlap_ratio(reference_summary, summary)

# Display the results
print("Input Paragraph:")
print(input_paragraph)
print("\nGenerated Summary:")
print(summary)
print("\nReference Summary:")
print(reference_summary)
print("\nROUGE Score:", rouge_score)
print("Overlap Ratio:", overlap_ratio)


Input Paragraph:

Natural language processing (NLP) is a field of artificial intelligence that focuses on the interaction between computers and humans using natural language. It enables computers to understand, interpret, and generate human-like text. NLP involves several challenges, including natural language understanding, language generation, and language translation. Text summarization is a specific NLP task that involves reducing the length of a document while retaining its key information. There are two main approaches to text summarization: extractive and abstractive. Extractive methods select important sentences from the original text, while abstractive methods generate new sentences to form the summary.


Generated Summary:

natural language processing nlp is a field of artificial intelligence that focuses on the interaction between computers and humans using natural language it enables computers to understand interpret and generate humanlike text nlp involves several challeng