In [5]:
# Step 1: Data Collection (Example using newspaper3k)
from newspaper import Article

# Example article URL
article_url = 'https://timesofindia.indiatimes.com/india/url-bangladesh-pm-in-india-pm-modi-receives-sheikh-hasina-at-rashtrapati-bhavan-to-hold-bilateral-meet-today/articleshow/111181506.cms'

article = Article(article_url)
article.download()
article.parse()
text = article.text

In [10]:
print(text)

(With inputs from agencies)

NEW DELHI: Prime Minister Narendra Modi received his Bangladeshi counterpart Sheikh Hasina with a ceremonial welcome at the Rashtrapati Bhavan forecourt upon her arrival in the capital, on Saturday.Following the ceremony, the two leaders met with ministers and delegates from their respective nations at the forecourt, engaging in cordial interactions and exchanges.PM Sheikh Hasina arrived in Delhi for a four-day state visit, during which she is expected to hold wide-ranging talks with PM Modi and give a new momentum to the bilateral ties .Apart from holding bilateral talks, PM Hasina is also scheduled to meet with President Droupadi Murmu and Vice President Jagdeep Dhankhar during her visit.Earlier on Friday, External Affairs Minister S Jaishankar held a meeting with the Bangladeshi leader to discuss a range of bilateral matters."Delighted to call on Prime Minister Sheikh Hasina of Bangladesh this evening. Her State visit to India underlines our close and ab

In [6]:
# Step 2: Preprocessing (Example)
import re
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords

def preprocess_text(text):
    # Remove HTML tags and non-alphanumeric characters
    text = re.sub('<[^<]+?>', '', text)
    text = re.sub(r'\W+', ' ', text)
    text = text.lower()
    
    # Tokenize into sentences
    sentences = sent_tokenize(text)
    
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    cleaned_sentences = [sentence for sentence in sentences if sentence not in stop_words]
    
    return cleaned_sentences

cleaned_sentences = preprocess_text(text)


In [7]:
# Step 3: Feature Extraction (Example using TF-IDF)
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer()

# Fit and transform the data
tfidf_matrix = tfidf_vectorizer.fit_transform(cleaned_sentences)

In [8]:
# Step 4: Summarization Techniques (Example)
# Calculate sentence scores based on TF-IDF scores
sentence_scores = {}
for i in range(len(cleaned_sentences)):
    sentence_scores[i] = tfidf_matrix[i].sum()

# Rank sentences by scores and select top sentences for summary
import heapq

summary_sentences = heapq.nlargest(5, sentence_scores, key=sentence_scores.get)
summary = ' '.join([cleaned_sentences[i] for i in summary_sentences])

print("Generated Summary:")
print(summary)

Generated Summary:
 with inputs from agencies new delhi prime minister narendra modi received his bangladeshi counterpart sheikh hasina with a ceremonial welcome at the rashtrapati bhavan forecourt upon her arrival in the capital on saturday following the ceremony the two leaders met with ministers and delegates from their respective nations at the forecourt engaging in cordial interactions and exchanges pm sheikh hasina arrived in delhi for a four day state visit during which she is expected to hold wide ranging talks with pm modi and give a new momentum to the bilateral ties apart from holding bilateral talks pm hasina is also scheduled to meet with president droupadi murmu and vice president jagdeep dhankhar during her visit earlier on friday external affairs minister s jaishankar held a meeting with the bangladeshi leader to discuss a range of bilateral matters delighted to call on prime minister sheikh hasina of bangladesh this evening her state visit to india underlines our close

In [13]:
from rouge_score import rouge_scorer

# Assuming `reference_summary` is the human-written summary and `generated_summary` is the model's summary.
reference_summary = "Prime Minister Narendra Modi and Bangladeshi Prime Minister Sheikh Hasina met at Rashtrapati Bhavan forecourt after a ceremonial welcome. Hasina is expected to hold talks with Modi and discuss bilateral ties. She will also meet with President Droupadi Murmu and Vice President Jagdeep Dhankhar. External Affairs Minister S Jaishankar held a meeting with Hasina to discuss bilateral matters. Hasina's visit is expected to boost Bangladesh's 'celebrated bilateral partnership' with India. She was among seven prominent leaders from India's neighborhood who attended Modi's swearing-in ceremony."
generated_summary = summary  # The summary generated by your model

# Initialize the ROUGE scorer
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

# Compute ROUGE scores
scores = scorer.score(reference_summary, generated_summary)

# Print the scores
print("ROUGE-1:", scores['rouge1'])
print("ROUGE-2:", scores['rouge2'])
print("ROUGE-L:", scores['rougeL'])


ROUGE-1: Score(precision=0.3074074074074074, recall=0.9325842696629213, fmeasure=0.4623955431754875)
ROUGE-2: Score(precision=0.21189591078066913, recall=0.6477272727272727, fmeasure=0.31932773109243695)
ROUGE-L: Score(precision=0.26296296296296295, recall=0.797752808988764, fmeasure=0.39554317548746515)


In [18]:
import torch
from transformers import BartTokenizer, BartForConditionalGeneration
from newspaper import Article
import nltk


# Function to get the text from a news article
def get_article_text(url):
    article = Article(url)
    article.download()
    article.parse()
    return article.text

# Function to summarize text using BART
def summarize_text(text, max_length=500, min_length=30, num_beams=4):
    tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')
    model = BartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn')

    inputs = tokenizer.encode("summarize: " + text, return_tensors="pt", max_length=1024, truncation=True)
    summary_ids = model.generate(inputs, max_length=max_length, min_length=min_length, num_beams=num_beams, length_penalty=2.0, early_stopping=True)
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary

# Example usage
url = 'https://timesofindia.indiatimes.com/india/url-bangladesh-pm-in-india-pm-modi-receives-sheikh-hasina-at-rashtrapati-bhavan-to-hold-bilateral-meet-today/articleshow/111181506.cms'
text = get_article_text(url)
print("Original Text:\n", text)

summary = summarize_text(text)
print("\nSummarized Text:\n", summary)


Original Text:
 (With inputs from agencies)

NEW DELHI: Prime Minister Narendra Modi received his Bangladeshi counterpart Sheikh Hasina with a ceremonial welcome at the Rashtrapati Bhavan forecourt upon her arrival in the capital, on Saturday.Following the ceremony, the two leaders met with ministers and delegates from their respective nations at the forecourt, engaging in cordial interactions and exchanges. She also honored Mahatma Gandhi by visiting Raj Ghat memorial in Delhi, and laid a wreath to pay her respects to the leader.PM Sheikh Hasina arrived in Delhi for a four-day state visit, during which she is expected to hold wide-ranging talks with PM Modi and give a new momentum to the bilateral ties .Apart from holding bilateral talks, PM Hasina is also scheduled to meet with President Droupadi Murmu and Vice President Jagdeep Dhankhar during her visit.Earlier on Friday, External Affairs Minister S Jaishankar held a meeting with the Bangladeshi leader to discuss a range of bilatera

In [16]:
!pip install rouge

Collecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl.metadata (4.1 kB)
Downloading rouge-1.0.1-py3-none-any.whl (13 kB)
Installing collected packages: rouge
Successfully installed rouge-1.0.1


In [19]:
from rouge import Rouge

# Evaluate the summary
def evaluate_summary(original_text, summarized_text):
    rouge = Rouge()
    scores = rouge.get_scores(summarized_text, original_text)
    return scores

# Example usage for evaluation
scores = evaluate_summary(text, summary)
print("\nROUGE Scores:\n", scores)


ROUGE Scores:
 [{'rouge-1': {'r': 0.1686046511627907, 'p': 1.0, 'f': 0.28855721146110247}, 'rouge-2': {'r': 0.10112359550561797, 'p': 0.8709677419354839, 'f': 0.18120805182716995}, 'rouge-l': {'r': 0.1686046511627907, 'p': 1.0, 'f': 0.28855721146110247}}]


### Conclusion

In this project, I explored two text summarization approaches and evaluated their performance using ROUGE metrics.

#### Key Findings:

1. **Extractive Summarization:**
   - **ROUGE-1**: Precision: 0.3074, Recall: 0.9326, F-measure: 0.4624
   - **ROUGE-2**: Precision: 0.2119, Recall: 0.6477, F-measure: 0.3193
   - **ROUGE-L**: Precision: 0.2630, Recall: 0.7978, F-measure: 0.3955

2. **Abstractive Summarization using BART:**
   - **ROUGE-1**: Precision: 1.0, Recall: 0.1686, F-measure: 0.2886
   - **ROUGE-2**: Precision: 0.8710, Recall: 0.1011, F-measure: 0.1812
   - **ROUGE-L**: Precision: 1.0, Recall: 0.1686, F-measure: 0.2886

#### Analysis:

- **Extractive Summarization** achieves high recall, capturing most key information but includes some irrelevant details.
- **Abstractive Summarization** with BART shows high precision and generates concise summaries but may miss some key information.

#### Future Work:

1. **Combining Approaches**: Integrate extractive and abstractive techniques.
2. **Parameter Tuning**: Experiment with BART model parameters.
3. **Exploring Other Models**: Use advanced models like T5 or PEGASUS.
4. **Diverse Datasets**: Test on a variety of articles for robustness.

#### Practical Implications:

Effective text summarization aids in quickly grasping lengthy documents, crucial for news aggregation, research, and content curation. Continuous refinement can lead to more accurate and reliable summaries, enhancing automated summarization systems.
