
Text Preprocessing & Feature Extraction

In [30]:

import nltk
nltk.download('punkt_tab',quiet=True)
from nltk.tokenize import sent_tokenize, word_tokenize


text = input("Please enter your news article:")


sentences = sent_tokenize(text)


tokenized_sentences = [word_tokenize(sent.lower()) for sent in sentences]

print("Tokenized Sentences:")
for sentence in tokenized_sentences:
    print(sentence)



Tokenized Sentences:
['punjab-based', 'self-styled', 'preacher', 'bajinder', 'singh', 'found', 'himself', 'in', 'fresh', 'controversy', 'after', 'a', 'video', 'allegedly', 'showing', 'him', 'assaulting', 'a', 'woman', 'surfaced', 'online', '.']
['the', 'footage', 'of', 'the', 'incident', 'has', 'now', 'gone', 'viral', '.']
['it', 'shows', 'singh', ',', 'who', 'is', 'involved', 'in', 'multiple', 'sexual', 'harassment', 'cases', ',', 'throwing', 'a', 'pile', 'of', 'papers', 'at', 'a', 'woman', 'seated', 'with', 'a', 'child', 'in', 'his', 'office', '.']
['moments', 'later', ',', 'as', 'she', 'approaches', 'him', ',', 'he', 'appears', 'to', 'push', 'her', '.']
['the', 'situation', 'quickly', 'escalates', ',', 'with', 'others', 'in', 'the', 'room', 'intervening', 'to', 'prevent', 'a', 'further', 'altercation', '.']
['the', 'video', ',', 'shared', 'by', 'journalist', 'gagandeep', 'singh', ',', 'ends', 'as', 'both', 'of', 'them', 'were', 'seen', 'continuing', 'their', 'heated', 'exchange', '.

## Phase 3: Sentence Ranking using TF-IDF

In [31]:
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from collections import Counter
import re


vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(sentences)

def extract_summary(sentences, tfidf_matrix, top_n=3):
    sentence_scores = np.array(tfidf_matrix.sum(axis=1)).flatten()
    ranked_sentences = [sentences[i] for i in sentence_scores.argsort()[::-1][:top_n]]
    return ' '.join(ranked_sentences)

summary = extract_summary(sentences, tfidf_matrix)


def get_word_frequencies(text):
    words = re.findall(r'\b\w+\b', text.lower())
    word_counts = Counter(words)
    return word_counts.most_common(5)

word_frequencies = get_word_frequencies(summary)


print("Extracted Summary:")
print(summary, "\n")

print("Word Frequencies:")
for word, count in word_frequencies:
    print(f"{word}: {count} times")



Extracted Summary:
He is accused of raping a woman from Zirakpur, who had alleged that Singh lured her into his religious circle in 2017 before sexually assaulting her at his Mohali residence. It shows Singh, who is involved in multiple sexual harassment cases, throwing a pile of papers at a woman seated with a child in his office. Singh, who runs a church in Majri, was first arrested on July 20, 2018, at Delhi airport while attempting to board a flight to London. 

Word Frequencies:
a: 6 times
in: 4 times
who: 3 times
singh: 3 times
his: 3 times


## Phase 4: LSTM-Based Summarization Model

In [28]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import pandas as pd
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split
from nltk.tokenize import word_tokenize
import nltk
import os
nltk.download('punkt', quiet=True)

# Function to read files from folders
def read_files_from_folder(folder_path):
    data = []
    categories = ['business', 'entertainment', 'politics', 'sport', 'tech']
    
    for category in categories:
        category_path = os.path.join(folder_path, category)
        if os.path.exists(category_path):
            print(f"Reading files from category: {category}")
            for filename in os.listdir(category_path):
                file_path = os.path.join(category_path, filename)
                if os.path.isfile(file_path):
                    try:
                        with open(file_path, 'r', encoding='utf-8') as file:
                            content = file.read()
                    except UnicodeDecodeError:
                        with open(file_path, 'r', encoding='ISO-8859-1') as file:
                            content = file.read()
                    data.append((category, filename, content))
        else:
            print(f"Category folder not found: {category_path}")
    return data

# Paths to the folders
articles_folder = 'C:/Users/Manish/OneDrive/Desktop/NLP/archive/BBC News Summary/News Articles'
summaries_folder = 'C:/Users/Manish/OneDrive/Desktop/NLP/archive/BBC News Summary/Summaries'

# Read the data
articles_data = read_files_from_folder(articles_folder)
summaries_data = read_files_from_folder(summaries_folder)

# Create DataFrames
df_articles = pd.DataFrame(articles_data, columns=['Category', 'Filename', 'Article'])
df_summaries = pd.DataFrame(summaries_data, columns=['Category', 'Filename', 'Summary'])

# Merge the DataFrames
data = pd.merge(df_articles, df_summaries, on=['Category', 'Filename'])

# Tokenization
data['tokenized_articles'] = data['Article'].apply(lambda x: word_tokenize(x.lower()))
data['tokenized_summaries'] = data['Summary'].apply(lambda x: word_tokenize(x.lower()))

# Train Word2Vec model
word2vec_model = Word2Vec(data['tokenized_articles'].tolist() + data['tokenized_summaries'].tolist(), 
                          vector_size=100, window=5, min_count=1, workers=4)

# Convert sentences to vectors
def vectorize_sentence(sentence, model, vector_size=100):
    vectors = [model.wv[word] for word in sentence if word in model.wv]
    if len(vectors) == 0:
        return np.zeros((vector_size,))
    return np.mean(vectors, axis=0)

data['article_vectors'] = data['tokenized_articles'].apply(lambda x: vectorize_sentence(x, word2vec_model))
data['summary_vectors'] = data['tokenized_summaries'].apply(lambda x: vectorize_sentence(x, word2vec_model))

# Convert to numpy arrays
X = np.vstack(data['article_vectors'].values)
y = np.vstack(data['summary_vectors'].values)

# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define LSTM Model
class LSTMSummarizer(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, num_layers=1):
        super(LSTMSummarizer, self).__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        lstm_out, _ = self.lstm(x.unsqueeze(1))
        output = self.fc(lstm_out[:, -1, :])
        return output

input_size = 100
hidden_size = 128
output_size = 100
num_layers = 1

model = LSTMSummarizer(input_size, hidden_size, output_size, num_layers)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training Loop
epochs = 10
for epoch in range(epochs):
    model.train()
    optimizer.zero_grad()
    outputs = model(torch.tensor(X_train, dtype=torch.float32))
    loss = criterion(outputs, torch.tensor(y_train, dtype=torch.float32))
    loss.backward()
    optimizer.step()
    print(f"Epoch {epoch+1}/{epochs}, Loss: {loss.item():.4f}")

print("LSTM Model Training Complete!")


Reading files from category: business
Reading files from category: entertainment
Reading files from category: politics
Reading files from category: sport
Reading files from category: tech
Reading files from category: business
Reading files from category: entertainment
Reading files from category: politics
Reading files from category: sport
Reading files from category: tech
Epoch 1/10, Loss: 0.1672
Epoch 2/10, Loss: 0.1594
Epoch 3/10, Loss: 0.1519
Epoch 4/10, Loss: 0.1443
Epoch 5/10, Loss: 0.1366
Epoch 6/10, Loss: 0.1286
Epoch 7/10, Loss: 0.1203
Epoch 8/10, Loss: 0.1116
Epoch 9/10, Loss: 0.1027
Epoch 10/10, Loss: 0.0937
LSTM Model Training Complete!


## Phase 5: Save Extracted Summary to File

In [32]:
# Function to summarize unseen user input
def summarize_text(input_text, model, word2vec_model, max_sentences=3):
    tokenized_sentences = sent_tokenize(input_text)
    tokenized_words = [word_tokenize(sent.lower()) for sent in tokenized_sentences]
    sentence_vectors = np.array([vectorize_sentence(sent, word2vec_model) for sent in tokenized_words])

    if sentence_vectors.shape[0] == 0:
        return "No meaningful sentences found for summarization."

    with torch.no_grad():
        sentence_scores = model(torch.tensor(sentence_vectors, dtype=torch.float32)).squeeze().cpu().numpy()
        
        # Handle the case when sentence_scores is a single value
        if np.isscalar(sentence_scores) or len(sentence_scores.shape) == 0:
            ranked_indices = [0]
        else:
            # For multi-dimensional arrays, take the sum along the last axis
            if len(sentence_scores.shape) > 1:
                sentence_scores = np.sum(sentence_scores, axis=1)
            
            # Get the indices of the top-scoring sentences
            ranked_indices = np.argsort(sentence_scores)[::-1][:min(len(tokenized_sentences), max_sentences)]

    # Convert numpy array to list before sorting
    ranked_indices_list = ranked_indices.tolist()
    
    # Extract sentences in their original order
    ranked_sentences = [tokenized_sentences[i] for i in sorted(ranked_indices_list)]
    formatted_summary = ' '.join(ranked_sentences)

    return formatted_summary

# Use user input from Phase 1
try:
    summarized_text = summarize_text(text, model, word2vec_model)

    # Save summary to file
    output_file = "summary.txt"
    with open(output_file, 'w', encoding='utf-8') as file:
        file.write("Original Text:\n")
        file.write(text + "\n\n")
        file.write("Summarized Text:\n")
        file.write(summarized_text)

    print(f"📄 Original text and summary saved to {output_file}")
    
except Exception as e:
    print(f"An error occurred during summarization: {e}")
    print("Falling back to TF-IDF summary method.")
    
    # Fallback to the TF-IDF method that was working earlier
    summary = extract_summary(sentences, tfidf_matrix)
    
    output_file = "summary.txt"
    with open(output_file, 'w', encoding='utf-8') as file:
        file.write("Original Text:\n")
        file.write(text + "\n\n")
        file.write("Summarized Text (TF-IDF method):\n")
        file.write(summary)
    
    print(f"📄 Original text and TF-IDF summary saved to {output_file}")

📄 Original text and summary saved to summary.txt


In [None]:
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from wordcloud import WordCloud
import pandas as pd

# Sample article and summary for demonstration
original_text = text

lstm_summary = summary

# 1. Text Length Comparison
def plot_text_length_comparison(original, summary):
    plt.figure(figsize=(10, 6))
    
    # Calculate lengths
    original_words = len(word_tokenize(original))
    summary_words = len(word_tokenize(summary))
    original_sentences = len(sent_tokenize(original))
    summary_sentences = len(sent_tokenize(summary))
    
    # Text compression rate
    compression_rate = (1 - (summary_words / original_words)) * 100
    
    # Create bar chart
    labels = ['Words', 'Sentences']
    original_counts = [original_words, original_sentences]
    summary_counts = [summary_words, summary_sentences]
    
    x = np.arange(len(labels))
    width = 0.35
    
    fig, ax = plt.subplots(figsize=(10, 6))
    rects1 = ax.bar(x - width/2, original_counts, width, label='Original Article')
    rects2 = ax.bar(x + width/2, summary_counts, width, label='Summary')
    
    # Add labels, title and legend
    ax.set_ylabel('Count')
    ax.set_title(f'Article vs Summary Comparison\nCompression Rate: {compression_rate:.1f}%')
    ax.set_xticks(x)
    ax.set_xticklabels(labels)
    ax.legend()
    
    # Add count labels above bars
    def autolabel(rects):
        for rect in rects:
            height = rect.get_height()
            ax.annotate(f'{height}',
                        xy=(rect.get_x() + rect.get_width()/2, height),
                        xytext=(0, 3),
                        textcoords="offset points",
                        ha='center', va='bottom')
                        
    autolabel(rects1)
    autolabel(rects2)
    
    fig.tight_layout()
    plt.savefig('text_length_comparison.png', dpi=300, bbox_inches='tight')
    plt.close()
    return 'text_length_comparison.png'

# 2. Word Cloud Comparison
def generate_word_clouds(original, summary):
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 8))
    
    # Generate word clouds
    wc1 = WordCloud(width=800, height=400, background_color='white', 
                   max_words=100, contour_width=3, contour_color='steelblue')
    wc2 = WordCloud(width=800, height=400, background_color='white',
                   max_words=100, contour_width=3, contour_color='steelblue')
    
    # Generate from original text
    wc1.generate(original)
    
    # Generate from summary
    wc2.generate(summary)
    
    # Display the word clouds
    ax1.imshow(wc1, interpolation='bilinear')
    ax1.set_title('Original Article', fontsize=20)
    ax1.axis('off')
    
    ax2.imshow(wc2, interpolation='bilinear')
    ax2.set_title('Summary', fontsize=20)
    ax2.axis('off')
    
    plt.tight_layout()
    plt.savefig('wordcloud_comparison.png', dpi=300, bbox_inches='tight')
    plt.close()
    return 'wordcloud_comparison.png'

# 3. TF-IDF Term Importance
def plot_tfidf_importance(original, summary):
    # Create a corpus from the original and summary
    corpus = [original, summary]
    
    # Create the TF-IDF vectorizer
    vectorizer = TfidfVectorizer(stop_words='english')
    tfidf_matrix = vectorizer.fit_transform(corpus)
    
    # Get feature names
    feature_names = vectorizer.get_feature_names_out()
    
    # Get TF-IDF scores
    original_scores = tfidf_matrix[0].toarray()[0]
    summary_scores = tfidf_matrix[1].toarray()[0]
    
    # Create a DataFrame for the scores
    df = pd.DataFrame({
        'term': feature_names,
        'original_score': original_scores,
        'summary_score': summary_scores
    })
    
    # Calculate term importance in summary relative to original
    df['importance'] = df['summary_score'] / (df['original_score'] + 0.0001)  # Avoid division by zero
    
    # Sort by importance (highest to lowest)
    df = df.sort_values('importance', ascending=False)
    
    # Get top terms that appear in both texts
    top_terms = df[(df['original_score'] > 0) & (df['summary_score'] > 0)].head(15)
    
    # Create horizontal bar chart
    plt.figure(figsize=(12, 8))
    plt.barh(top_terms['term'], top_terms['importance'], color='skyblue')
    plt.xlabel('Relative Importance')
    plt.ylabel('Terms')
    plt.title('Top Terms by Importance in Summary Relative to Original')
    plt.grid(axis='x', linestyle='--', alpha=0.7)
    plt.tight_layout()
    plt.savefig('term_importance.png', dpi=300, bbox_inches='tight')
    plt.close()
    return 'term_importance.png'

# 4. Sentence Selection Heatmap
def plot_sentence_selection(original, summary):
    original_sentences = sent_tokenize(original)
    summary_sentences = sent_tokenize(summary)
    
    # Create a matrix showing which original sentences were selected
    selection_matrix = np.zeros((len(original_sentences), 1))
    
    # Mark which sentences were selected
    for i, orig_sent in enumerate(original_sentences):
        if orig_sent in summary_sentences:
            selection_matrix[i, 0] = 1
    
    # Plot heatmap
    plt.figure(figsize=(10, len(original_sentences)/2))
    ax = sns.heatmap(selection_matrix, cmap=['lightgray', 'forestgreen'], 
                     cbar=False, linewidths=.5, linecolor='gray')
    plt.title('Sentence Selection Pattern')
    plt.ylabel('Original Sentence Index')
    plt.xlabel('Selected for Summary')
    
    # Add sentence texts as y-tick labels (shortened)
    shortened_sentences = [s[:50] + '...' if len(s) > 50 else s for s in original_sentences]
    plt.yticks(np.arange(len(original_sentences)) + 0.5, shortened_sentences, fontsize=8)
    
    plt.tight_layout()
    plt.savefig('sentence_selection.png', dpi=300, bbox_inches='tight')
    plt.close()
    return 'sentence_selection.png'

# 5. Model Performance Radar Chart
def plot_performance_radar():
    # Evaluation metrics (example scores - replace with actual evaluation)
    categories = ['Content Coverage', 'Relevance', 'Coherence', 'Non-redundancy', 'Readability']
    
    # Example scores (0-1 scale)
    tfidf_scores = [0.65, 0.70, 0.60, 0.75, 0.80]
    lstm_scores = [0.75, 0.80, 0.65, 0.70, 0.85]
    
    # Number of categories
    N = len(categories)
    
    # Create angle for each category
    angles = [n / float(N) * 2 * np.pi for n in range(N)]
    angles += angles[:1]  # Close the loop
    
    # Add values for each method (also close the loop)
    tfidf_scores += tfidf_scores[:1]
    lstm_scores += lstm_scores[:1]
    
    # Initialize the figure
    fig, ax = plt.subplots(figsize=(10, 8), subplot_kw=dict(polar=True))
    
    # Draw one axis per variable and add labels
    plt.xticks(angles[:-1], categories, fontsize=12)
    
    # Draw the outlines for each method
    ax.plot(angles, tfidf_scores, linewidth=2, linestyle='solid', label='TF-IDF')
    ax.plot(angles, lstm_scores, linewidth=2, linestyle='solid', label='LSTM')
    
    # Fill areas for each method
    ax.fill(angles, tfidf_scores, alpha=0.25)
    ax.fill(angles, lstm_scores, alpha=0.25)
    
    # Add legend
    plt.legend(loc='upper right', bbox_to_anchor=(0.1, 0.1))
    
    plt.title('Model Performance Comparison', fontsize=15)
    plt.tight_layout()
    plt.savefig('model_performance_radar.png', dpi=300, bbox_inches='tight')
    plt.close()
    return 'model_performance_radar.png'

# Generate all visualizations
text_chart = plot_text_length_comparison(original_text, lstm_summary)
wordcloud = generate_word_clouds(original_text, lstm_summary)
term_importance = plot_tfidf_importance(original_text, lstm_summary)
sentence_selection = plot_sentence_selection(original_text, lstm_summary)
performance_radar = plot_performance_radar()

print("All visualizations have been generated successfully!")

All visualizations have been generated successfully!


<Figure size 1000x600 with 0 Axes>

In [34]:
pip install numpy pandas matplotlib seaborn nltk scikit-learn gensim torch wordcloud


Collecting matplotlib
  Downloading matplotlib-3.10.1-cp312-cp312-win_amd64.whl.metadata (11 kB)
Collecting seaborn
  Downloading seaborn-0.13.2-py3-none-any.whl.metadata (5.4 kB)
Collecting wordcloud
  Downloading wordcloud-1.9.4-cp312-cp312-win_amd64.whl.metadata (3.5 kB)
Collecting contourpy>=1.0.1 (from matplotlib)
  Downloading contourpy-1.3.1-cp312-cp312-win_amd64.whl.metadata (5.4 kB)
Collecting cycler>=0.10 (from matplotlib)
  Downloading cycler-0.12.1-py3-none-any.whl.metadata (3.8 kB)
Collecting fonttools>=4.22.0 (from matplotlib)
  Downloading fonttools-4.56.0-cp312-cp312-win_amd64.whl.metadata (103 kB)
Collecting kiwisolver>=1.3.1 (from matplotlib)
  Downloading kiwisolver-1.4.8-cp312-cp312-win_amd64.whl.metadata (6.3 kB)
Collecting pyparsing>=2.3.1 (from matplotlib)
  Downloading pyparsing-3.2.1-py3-none-any.whl.metadata (5.0 kB)
Downloading matplotlib-3.10.1-cp312-cp312-win_amd64.whl (8.1 MB)
   ---------------------------------------- 0.0/8.1 MB ? eta -:--:--
   --------


[notice] A new release of pip is available: 24.3.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip
