NER (to remove sentences without named entities) and BERT (word embeddings)

In [None]:
import fitz  # PyMuPDF for PDF text extraction
import spacy
import torch
from transformers import BertModel, BertTokenizer
import numpy as np
from sklearn.cluster import KMeans
import nltk
from nltk.tokenize import sent_tokenize

# Fetch the NLTK sentence-tokenizer data packages needed by sent_tokenize
for _resource in ('punkt', 'punkt_tab'):
    nltk.download(_resource)

# Step 1: Extract Text from PDF
def extract_text_from_pdf(pdf_path):
    """Return the plain text of every page of a PDF, joined by spaces.

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF (``fitz``).

    Returns:
        The text of all pages separated by single spaces, with leading and
        trailing whitespace stripped.
    """
    # The context manager closes the document even if extraction fails;
    # previously the file handle was never released.
    with fitz.open(pdf_path) as doc:
        # Join once instead of growing a string page by page.
        return " ".join(page.get_text("text") for page in doc).strip()

# Path of the input PDF; change this to point at your own file
pdf_path = "/content/nlp_ner_news_summ.pdf"
# Full text of the PDF as returned by extract_text_from_pdf
document_text = extract_text_from_pdf(pdf_path)

# Step 2: Split the extracted text into sentences with NLTK's sent_tokenize
sentences = sent_tokenize(document_text)

# Load the spaCy "en_core_web_sm" pipeline; its entity annotations (doc.ents)
# drive the sentence filter in Step 3
nlp = spacy.load("en_core_web_sm")

# Step 3: Perform Named Entity Recognition (NER) and Filter Sentences
def filter_sentences_with_entities(sentences):
    """Keep only the sentences in which spaCy finds a named entity.

    Args:
        sentences: Iterable of sentence strings.

    Returns:
        A new list holding, in their original order, the sentences whose
        parsed document has a non-empty ``ents``.
    """
    # Each sentence is parsed with the module-level ``nlp`` pipeline.
    return [text for text in sentences if len(nlp(text).ents) > 0]

# Drop every sentence without a named entity, then show what is left
filtered_sentences = filter_sentences_with_entities(sentences)
print(f"Sentences after NER filtering: {filtered_sentences}")

# Load the pretrained "bert-base-uncased" tokenizer and encoder; both are
# used as module-level globals by get_sentence_embedding below
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertModel.from_pretrained("bert-base-uncased")

# Step 4: Convert Sentences to BERT Embeddings
def get_sentence_embedding(sentence):
    """Embed one sentence as the mean of BERT's final-layer token vectors.

    Args:
        sentence: Text to encode; the tokenizer truncates it to 512 tokens.

    Returns:
        A NumPy array obtained by averaging ``last_hidden_state`` over
        dim 1 and squeezing out singleton dimensions.
    """
    encoded = tokenizer(
        sentence,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=512,
    )
    # Inference only, so gradient tracking is switched off.
    with torch.no_grad():
        hidden_states = model(**encoded).last_hidden_state
    pooled = hidden_states.mean(dim=1)
    return pooled.squeeze().numpy()

# One embedding row per filtered sentence
sentence_embeddings = np.array([get_sentence_embedding(sent) for sent in filtered_sentences])

# Step 5: Cluster the Sentences
# KMeans raises ValueError when asked for more clusters than samples, so the
# desired 5 clusters are capped at the number of sentences that survived filtering.
num_clusters = min(5, len(filtered_sentences))
kmeans = KMeans(n_clusters=num_clusters, random_state=42, n_init=10)
kmeans.fit(sentence_embeddings)
clusters = kmeans.labels_  # cluster label of each filtered sentence, same order

# Step 6: Generate Summary (Select One Representative Sentence from Each Cluster)
# The representative is simply the first sentence, in document order, that
# was assigned to the cluster.
summary = []
for cluster_id in range(num_clusters):
    cluster_sentences = [
        sent for sent, label in zip(filtered_sentences, clusters) if label == cluster_id
    ]
    if cluster_sentences:
        summary.append(cluster_sentences[0])

# Print the final summary
print("\nGenerated Summary:")
for sentence in summary:
    print("-", sentence)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Sentences after NER filtering: ['The recent parliamentary elections in Washington, D.C. resulted in a coalition government, \nshifting the balance of power.', 'The new administration, led by the Democratic Party, proposed \na tax reform bill, sparking intense debates across the country.', 'Opposition leaders from the \nRepublican Party called for revisions, claiming the bill favors large corporations like Amazon \nand Google over citizens.', 'Protests erupted in major cities such as New York and Los Angeles \nas activists demanded a more transparent legislative process.', 'The national football team of Brazil won a dramatic final match in Rio de Janeiro, clinching \ntheir first championship title in years.', "Analysts from ESPN praised the coach's strategic \nsubstitutions, crediting them for the late-game turnaround.", 'Meanwhile, FIFA announced an \nexpansion plan, adding two new teams next season.', 'A highly anticipated Hollywood blockbuster produced by Warner Bros. broke box offic

In [None]:
!pip install pymupdf



# **CLUSTERING (BERT, SPACY, AND K-MEANS)**

In [None]:
import fitz  # PyMuPDF for PDF text extraction
import spacy
import torch
from transformers import BertModel, BertTokenizer
import numpy as np
from sklearn.cluster import KMeans
import nltk
from nltk.tokenize import sent_tokenize

# Make sure the NLTK sentence-tokenizer data used by sent_tokenize is present
for _package in ('punkt', 'punkt_tab'):
    nltk.download(_package)

# Step 1: Extract Text from PDF
def extract_text_from_pdf(pdf_path):
    """Read a PDF with PyMuPDF and return its text as one string.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        Page texts joined by single spaces, stripped of leading and trailing
        whitespace.
    """
    # Open via the context manager so the document is always closed; the
    # earlier version leaked the open file.
    with fitz.open(pdf_path) as doc:
        page_texts = [page.get_text("text") for page in doc]
    # A single join avoids repeated string concatenation.
    return " ".join(page_texts).strip()

# Input PDF to process; edit this path to use a different file
pdf_path = "/content/nlp_ner_news_summ.pdf"
# Whole-document text produced by extract_text_from_pdf
document_text = extract_text_from_pdf(pdf_path)

# Step 2: Break the document text into sentences (NLTK sent_tokenize)
sentences = sent_tokenize(document_text)

# Load the spaCy "en_core_web_sm" pipeline whose doc.ents the NER filter reads
nlp = spacy.load("en_core_web_sm")

# Step 3: Perform Named Entity Recognition (NER) and Filter Sentences
def filter_sentences_with_entities(sentences):
    """Return the sentences that contain at least one spaCy named entity.

    Args:
        sentences: Iterable of sentence strings.

    Returns:
        List of the sentences, in input order, for which the module-level
        ``nlp`` pipeline reports one or more entities.
    """
    def has_entity(text):
        # True when the parsed sentence carries any entity span.
        return len(nlp(text).ents) > 0

    return [text for text in sentences if has_entity(text)]

# Keep only sentences with named entities and list them, one per line
filtered_sentences = filter_sentences_with_entities(sentences)
print(f"\n🔹 Sentences after NER filtering ({len(filtered_sentences)} sentences):\n")
for sent in filtered_sentences:
    print("-", sent)

# Load the pretrained "bert-base-uncased" tokenizer and encoder, used as
# globals by get_sentence_embedding
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertModel.from_pretrained("bert-base-uncased")

# Step 4: Convert Sentences to BERT Embeddings
def get_sentence_embedding(sentence):
    """Return a mean-pooled BERT embedding for a single sentence.

    Args:
        sentence: Text to encode (truncated to at most 512 tokens).

    Returns:
        NumPy array: ``last_hidden_state`` averaged over dim 1, with
        singleton dimensions squeezed away.
    """
    tokenizer_kwargs = {
        "return_tensors": "pt",
        "padding": True,
        "truncation": True,
        "max_length": 512,
    }
    model_inputs = tokenizer(sentence, **tokenizer_kwargs)
    # No gradients are needed for feature extraction.
    with torch.no_grad():
        encoder_output = model(**model_inputs)
    mean_vector = encoder_output.last_hidden_state.mean(dim=1)
    return mean_vector.squeeze().numpy()

# One embedding row per filtered sentence
sentence_embeddings = np.array([get_sentence_embedding(sent) for sent in filtered_sentences])

# Step 5: Cluster the Sentences
# Cap the desired 5 clusters at the sentence count: KMeans raises ValueError
# when n_clusters is larger than the number of samples.
num_clusters = min(5, len(filtered_sentences))
kmeans = KMeans(n_clusters=num_clusters, random_state=42, n_init=10)
kmeans.fit(sentence_embeddings)
clusters = kmeans.labels_  # cluster label of each filtered sentence, same order

# Step 6: Print Clusters and their Sentences
print("\n🔹 Clusters and their Sentences:\n")
# cluster id -> sentences assigned to it, in document order
cluster_dict = {i: [] for i in range(num_clusters)}
for sentence, label in zip(filtered_sentences, clusters):
    cluster_dict[label].append(sentence)

for cluster_id, cluster_sentences in cluster_dict.items():
    print(f"\n📌 Cluster {cluster_id + 1}:")  # shown 1-based for readability
    for sent in cluster_sentences:
        print("-", sent)

# Step 7: Generate Summary (Select One Representative Sentence from Each Cluster)
# The first sentence of each non-empty cluster stands in for the whole cluster.
summary = []
for cluster_id in range(num_clusters):
    if cluster_dict[cluster_id]:
        summary.append(cluster_dict[cluster_id][0])



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!



🔹 Sentences after NER filtering (21 sentences):

- The recent parliamentary elections in Washington, D.C. resulted in a coalition government, 
shifting the balance of power.
- The new administration, led by the Democratic Party, proposed 
a tax reform bill, sparking intense debates across the country.
- Opposition leaders from the 
Republican Party called for revisions, claiming the bill favors large corporations like Amazon 
and Google over citizens.
- Protests erupted in major cities such as New York and Los Angeles 
as activists demanded a more transparent legislative process.
- The national football team of Brazil won a dramatic final match in Rio de Janeiro, clinching 
their first championship title in years.
- Analysts from ESPN praised the coach's strategic 
substitutions, crediting them for the late-game turnaround.
- Meanwhile, FIFA announced an 
expansion plan, adding two new teams next season.
- A highly anticipated Hollywood blockbuster produced by Warner Bros. broke box o

In [None]:
import fitz  # PyMuPDF for PDF text extraction
import spacy
import torch
from transformers import BertModel, BertTokenizer
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from nltk.tokenize import sent_tokenize

# Download sentence tokenizer data.
# 'punkt_tab' is fetched too, as the earlier cells of this notebook do;
# recent NLTK releases load sent_tokenize's data from that package, so
# downloading only 'punkt' can leave sent_tokenize without its resources.
nltk.download('punkt')
nltk.download('punkt_tab')

# Step 1: Extract Text from PDF
def extract_text_from_pdf(pdf_path):
    """Extract the text of all pages of a PDF using PyMuPDF.

    Args:
        pdf_path: Path of the PDF file.

    Returns:
        One string with the page texts separated by single spaces and no
        leading or trailing whitespace.
    """
    # ``with`` guarantees the document is closed, which the previous
    # implementation never did.
    with fitz.open(pdf_path) as doc:
        joined = " ".join(page.get_text("text") for page in doc)
    return joined.strip()

# PDF to analyse; replace with the path of your own file
pdf_path = "/content/nlp_ner_news_summ.pdf"
# Raw text of the whole PDF
document_text = extract_text_from_pdf(pdf_path)

# Step 2: Split the raw text into sentences using NLTK
sentences = sent_tokenize(document_text)

# Load the spaCy "en_core_web_sm" pipeline; Step 3 reads its doc.ents
nlp = spacy.load("en_core_web_sm")

# Step 3: Perform Named Entity Recognition (NER) and Filter Sentences
def filter_sentences_with_entities(sentences):
    """Select the sentences for which spaCy reports a named entity.

    Args:
        sentences: Iterable of sentence strings.

    Returns:
        List of the qualifying sentences in their original order.
    """
    # ``nlp`` is the module-level spaCy pipeline; a sentence qualifies when
    # its parsed document has a non-empty ``ents``.
    parsed = ((text, nlp(text)) for text in sentences)
    return [text for text, doc in parsed if len(doc.ents) > 0]

# Discard sentences that contain no named entity
filtered_sentences = filter_sentences_with_entities(sentences)

# Show the surviving sentences, one per line
print("\n🔹 Sentences after NER Preprocessing:")
for sent in filtered_sentences:
    print("-", sent)

# Load the pretrained "bert-base-uncased" tokenizer and encoder; both are
# read as globals by get_sentence_embedding
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertModel.from_pretrained("bert-base-uncased")

# Step 4: Convert Sentences to BERT Embeddings
def get_sentence_embedding(sentence):
    """Encode a sentence with BERT and mean-pool the token vectors.

    Args:
        sentence: Text to encode; input longer than 512 tokens is truncated.

    Returns:
        NumPy array produced by averaging ``last_hidden_state`` over dim 1
        and squeezing singleton dimensions.
    """
    batch = tokenizer(
        sentence, return_tensors="pt", padding=True, truncation=True, max_length=512
    )
    # Disable autograd: the encoder is only used to extract features.
    with torch.no_grad():
        last_hidden = model(**batch).last_hidden_state
    sentence_vector = last_hidden.mean(dim=1).squeeze()
    return sentence_vector.numpy()

# One embedding row per filtered sentence
sentence_embeddings = np.array([get_sentence_embedding(sent) for sent in filtered_sentences])

# Step 5: Cluster the Sentences
# Aim for 5 clusters, but never more than there are sentences: KMeans raises
# ValueError when n_clusters exceeds the number of samples.
num_clusters = min(5, len(filtered_sentences))
kmeans = KMeans(n_clusters=num_clusters, random_state=42, n_init=10)
kmeans.fit(sentence_embeddings)
clusters = kmeans.labels_  # cluster label of each filtered sentence, same order

# Step 6: Identify Centroid Sentence in Each Cluster
def find_centroid_sentence(cluster_sentences, cluster_embeddings):
    """Pick the sentence that is, on average, most similar to its cluster.

    Args:
        cluster_sentences: Sentences belonging to one cluster.
        cluster_embeddings: Embeddings of those sentences, one row per
            sentence and in the same order.

    Returns:
        Tuple ``(sentence, index, similarity_matrix)``: the chosen sentence,
        its position within the cluster, and the pairwise cosine-similarity
        matrix of the cluster.
    """
    similarity_matrix = cosine_similarity(cluster_embeddings)
    # Row means include each sentence's similarity with itself.
    mean_similarity = similarity_matrix.mean(axis=1)
    centroid_idx = np.argmax(mean_similarity)
    return cluster_sentences[centroid_idx], centroid_idx, similarity_matrix

# Step 7: Identify Misclassified Sentences
# First group sentences and their embeddings by cluster id.
cluster_dict = {i: [] for i in range(num_clusters)}
cluster_embeddings_dict = {i: [] for i in range(num_clusters)}
for label, sentence, embedding in zip(clusters, filtered_sentences, sentence_embeddings):
    cluster_dict[label].append(sentence)
    cluster_embeddings_dict[label].append(embedding)

misclassified_sentences = []     # (sentence, cluster_id, similarity to centroid sentence)
centroid_sentences = {}          # cluster_id -> centroid sentence
intra_cluster_similarities = {}  # cluster_id -> pairwise similarity matrix

for cluster_id in range(num_clusters):
    if not cluster_dict[cluster_id]:
        continue  # nothing was assigned to this cluster

    centroid_sentence, centroid_idx, similarity_matrix = find_centroid_sentence(
        cluster_dict[cluster_id], np.array(cluster_embeddings_dict[cluster_id])
    )
    centroid_sentences[cluster_id] = centroid_sentence
    centroid_embedding = cluster_embeddings_dict[cluster_id][centroid_idx]
    intra_cluster_similarities[cluster_id] = similarity_matrix

    # Similarity of every member to the centroid sentence, computed once.
    # The threshold (the mean of those similarities) does not depend on the
    # sentence being tested, so it is hoisted out of the loop instead of
    # being recomputed with a full cosine_similarity call per sentence.
    centroid_similarities = cosine_similarity(
        [centroid_embedding], cluster_embeddings_dict[cluster_id]
    )[0]
    threshold = np.mean(centroid_similarities)

    # A sentence is flagged when it is less similar to the centroid sentence
    # than the cluster average.
    for sentence, similarity in zip(cluster_dict[cluster_id], centroid_similarities):
        if similarity < threshold:
            misclassified_sentences.append((sentence, cluster_id, similarity))

# Step 8: Print each cluster's centroid sentence followed by all of its members.
# Only clusters present in centroid_sentences (i.e. non-empty ones) are shown.
print("\n🔹 Clusters and their Centroids:")
for cluster_id, sentence in centroid_sentences.items():
    print(f"📌 Cluster {cluster_id + 1} Centroid Sentence:")  # ids shown 1-based
    print(f"   - {sentence}\n")
    print("   Sentences in Cluster:")
    for sent in cluster_dict[cluster_id]:
        print(f"   - {sent}")
    print()


# Step 9: Print Intra-Cluster Similarity Scores
# Each matrix is the pairwise cosine similarity returned by find_centroid_sentence.
print("\n🔹 Intra-Cluster Similarity Scores:")
for cluster_id, similarity_matrix in intra_cluster_similarities.items():
    print(f"📌 Cluster {cluster_id + 1} Similarity Scores:")
    print(similarity_matrix)

# Step 10: Compute and Print Pairwise Similarity Scores
# Cosine similarity between every pair of filtered sentences, ignoring clusters.
print("\n🔹 Pairwise Similarity Scores Between Sentences:")
pairwise_similarity = cosine_similarity(sentence_embeddings)
print(pairwise_similarity)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!



🔹 Sentences after NER Preprocessing:
- The recent parliamentary elections in Washington, D.C. resulted in a coalition government, 
shifting the balance of power.
- The new administration, led by the Democratic Party, proposed 
a tax reform bill, sparking intense debates across the country.
- Opposition leaders from the 
Republican Party called for revisions, claiming the bill favors large corporations like Amazon 
and Google over citizens.
- Protests erupted in major cities such as New York and Los Angeles 
as activists demanded a more transparent legislative process.
- The national football team of Brazil won a dramatic final match in Rio de Janeiro, clinching 
their first championship title in years.
- Analysts from ESPN praised the coach's strategic 
substitutions, crediting them for the late-game turnaround.
- Meanwhile, FIFA announced an 
expansion plan, adding two new teams next season.
- A highly anticipated Hollywood blockbuster produced by Warner Bros. broke box office 
recor

# **COSINE SIMILARITY WITHIN CLUSTER**

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Intra-cluster similarity: report every unordered pair of sentences that
# share a cluster. Clusters with fewer than two sentences are skipped.
print("\n🔹 Intra-Cluster Similarity Scores:")
for cluster_id in range(num_clusters):
    if len(cluster_dict[cluster_id]) <= 1:
        continue
    cluster_embeddings = np.array(cluster_embeddings_dict[cluster_id])
    similarity_matrix = cosine_similarity(cluster_embeddings)
    print(f"\n📌 Cluster {cluster_id + 1}:")
    for i, first in enumerate(cluster_dict[cluster_id]):
        # Only look at later members so each pair is printed once.
        for j, second in enumerate(cluster_dict[cluster_id][i + 1:], start=i + 1):
            print(f"   • \"{first}\"")
            print(f"     🔹 ↔ Similarity with → \"{second}\"")
            print(f"       ✅ Score: {similarity_matrix[i][j]:.4f}\n")

# Similarity between every unordered pair of filtered sentences, regardless
# of cluster membership
print("\n🔹 Pairwise Sentence Similarity Scores:")
sentence_sim_matrix = cosine_similarity(sentence_embeddings)
for i, first in enumerate(filtered_sentences):
    for j, second in enumerate(filtered_sentences[i + 1:], start=i + 1):
        print(f"   🔵 Sentence 1: \"{first}\"")
        print(f"   🔵 Sentence 2: \"{second}\"")
        print(f"     🔹 ✅ Similarity Score: {sentence_sim_matrix[i][j]:.4f}\n")



🔹 Intra-Cluster Similarity Scores:

📌 Cluster 1:
   • "The national football team of Brazil won a dramatic final match in Rio de Janeiro, clinching 
their first championship title in years."
     🔹 ↔ Similarity with → "Meanwhile, FIFA announced an 
expansion plan, adding two new teams next season."
       ✅ Score: 0.7261


📌 Cluster 2:
   • "The recent parliamentary elections in Washington, D.C. resulted in a coalition government, 
shifting the balance of power."
     🔹 ↔ Similarity with → "The new administration, led by the Democratic Party, proposed 
a tax reform bill, sparking intense debates across the country."
       ✅ Score: 0.8025

   • "The recent parliamentary elections in Washington, D.C. resulted in a coalition government, 
shifting the balance of power."
     🔹 ↔ Similarity with → "Protests erupted in major cities such as New York and Los Angeles 
as activists demanded a more transparent legislative process."
       ✅ Score: 0.7483

   • "The new administration, led by th

In [None]:
def find_centroid_sentence(cluster_sentences, cluster_embeddings):
    """Return the cluster member with the highest mean cosine similarity.

    Args:
        cluster_sentences: Sentences of a single cluster.
        cluster_embeddings: Matching embeddings, one row per sentence.

    Returns:
        A 3-tuple ``(sentence, index, similarity_matrix)``; callers that do
        not need the matrix can unpack it into ``_``.
    """
    similarity_matrix = cosine_similarity(cluster_embeddings)
    per_sentence_mean = np.mean(similarity_matrix, axis=1)
    centroid_idx = np.argmax(per_sentence_mean)
    centroid_sentence = cluster_sentences[centroid_idx]
    return centroid_sentence, centroid_idx, similarity_matrix

# **CLUSTER SUMMARY**

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Function to compute cosine similarity between all sentence embeddings in a cluster
def compute_cosine_similarities(embeddings):
    """Return the pairwise cosine-similarity matrix of ``embeddings``.

    Thin wrapper around ``cosine_similarity``: the argument is passed through
    unchanged and the result is returned as is.
    """
    pairwise = cosine_similarity(embeddings)
    return pairwise

# Function to find the most central (most similar) sentence in each cluster
def find_cluster_summary(cluster_sentences, cluster_embeddings):
    """Choose the most central sentence of a cluster as its summary.

    Args:
        cluster_sentences: Sentences belonging to the cluster.
        cluster_embeddings: Embeddings of those sentences, in the same order.

    Returns:
        The only sentence when the cluster has exactly one member; otherwise
        the sentence whose average cosine similarity to the cluster's
        sentences (itself included) is highest.
    """
    if len(cluster_sentences) != 1:
        # Row i of the matrix holds sentence i's similarity to every member.
        similarity = compute_cosine_similarities(cluster_embeddings)
        most_central = np.argmax(similarity.mean(axis=1))
        return cluster_sentences[most_central]

    # A singleton cluster is its own summary; no similarities are needed.
    return cluster_sentences[0]

# Step: Print the summary for each cluster based on cosine similarity
print("\n🔹 Cluster Summaries Based on Cosine Similarity:\n")
cluster_summaries = {}  # cluster id -> summary sentence
# Take the cluster count from the grouping that was actually built rather
# than hard-coding 5 again: if cluster_dict holds fewer clusters, a fixed 5
# would raise KeyError on the missing ids.
num_clusters = len(cluster_dict)
for cluster_id in range(num_clusters):
    cluster_sentences = cluster_dict[cluster_id]

    if not cluster_sentences:
        continue  # empty clusters have nothing to summarise

    # Embeddings of the sentences in the current cluster
    cluster_embeddings = np.array(cluster_embeddings_dict[cluster_id])

    # Most central sentence of this cluster
    summary_sentence = find_cluster_summary(cluster_sentences, cluster_embeddings)
    cluster_summaries[cluster_id] = summary_sentence

    print(f"\n📌 Cluster {cluster_id + 1} Summary:")
    print("-", summary_sentence)

# Optional: the summaries as a plain list, in cluster order
final_summary = list(cluster_summaries.values())



🔹 Cluster Summaries Based on Cosine Similarity:


📌 Cluster 1 Summary:
- The national football team of Brazil won a dramatic final match in Rio de Janeiro, clinching 
their first championship title in years.

📌 Cluster 2 Summary:
- The new administration, led by the Democratic Party, proposed 
a tax reform bill, sparking intense debates across the country.

📌 Cluster 3 Summary:
- A highly anticipated Hollywood blockbuster produced by Warner Bros. broke box office 
records, earning the highest opening weekend revenue.

📌 Cluster 4 Summary:
- However, privacy concerns emerged as critics 
questioned data security measures implemented by Meta and Google.

📌 Cluster 5 Summary:
- Tech giants like Microsoft and Tesla reported record earnings, causing a sharp rise in the stock 
market.


# **EUCLIDEAN DISTANCE FOR CLUSTERING**

In [None]:
from scipy.spatial.distance import euclidean
import numpy as np

# Step: for every non-empty cluster, print its centroid sentence and the
# Euclidean distance of each member's embedding from the centroid's embedding
print("\n🔹 Euclidean Distances of Sentences from Cluster Centroids:")

for cluster_id in range(num_clusters):
    if len(cluster_dict[cluster_id]) == 0:
        continue

    member_embeddings = np.array(cluster_embeddings_dict[cluster_id])
    # The similarity matrix returned as the third value is not needed here.
    centroid_sentence, centroid_idx, _ = find_centroid_sentence(
        cluster_dict[cluster_id], member_embeddings
    )
    centroid_embedding = member_embeddings[centroid_idx]

    print(f"\n📌 Cluster {cluster_id + 1} Centroid Sentence:")
    print(f"   - {centroid_sentence}\n")

    # Pair each sentence with its distance, nearest first (stable sort, so
    # ties keep their cluster order).
    distances = sorted(
        (
            (sentence, euclidean(centroid_embedding, embedding))
            for sentence, embedding in zip(
                cluster_dict[cluster_id], cluster_embeddings_dict[cluster_id]
            )
        ),
        key=lambda pair: pair[1],
    )

    print("   Sentences sorted by distance from centroid:")
    for sent, dist in distances:
        print(f"   - {sent} (Distance: {dist:.4f})")



🔹 Euclidean Distances of Sentences from Cluster Centroids:

📌 Cluster 1 Centroid Sentence:
   - The national football team of Brazil won a dramatic final match in Rio de Janeiro, clinching 
their first championship title in years.

   Sentences sorted by distance from centroid:
   - The national football team of Brazil won a dramatic final match in Rio de Janeiro, clinching 
their first championship title in years. (Distance: 0.0000)
   - Meanwhile, FIFA announced an 
expansion plan, adding two new teams next season. (Distance: 7.0771)

📌 Cluster 2 Centroid Sentence:
   - The new administration, led by the Democratic Party, proposed 
a tax reform bill, sparking intense debates across the country.

   Sentences sorted by distance from centroid:
   - The new administration, led by the Democratic Party, proposed 
a tax reform bill, sparking intense debates across the country. (Distance: 0.0000)
   - The recent parliamentary elections in Washington, D.C. resulted in a coalition government

# **CLUSTER ANALYSIS**

In [None]:
# Extract the centroid sentence of Cluster 4
cluster_id = 3  # Cluster 4 (zero-based index)
centroid_sentence, centroid_idx, _ = find_centroid_sentence(
    cluster_dict[cluster_id], np.array(sentence_embeddings[clusters == cluster_id])
)

# Encode the centroid sentence exactly as get_sentence_embedding does
inputs = tokenizer(centroid_sentence, return_tensors="pt", padding=True, truncation=True, max_length=512)

# Recover the token strings from the encoded ids so that they line up
# one-to-one with the rows of the model output. tokenizer.tokenize() leaves
# out the [CLS]/[SEP] special tokens that the encoder input contains, which
# paired every printed token with the embedding of the previous position
# (the first word was shown with the [CLS] vector).
tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0].tolist())

with torch.no_grad():  # inference only
    outputs = model(**inputs)

# Extract embeddings for each token
word_embeddings = outputs.last_hidden_state.squeeze(0)  # Shape: (num_tokens, embedding_dim)

# Print word embeddings (cluster number shown 1-based)
print(f"\n🔹 Word Embeddings for Each Token in Centroid Sentence of Cluster {cluster_id + 1}:\n")
for token, embedding in zip(tokens, word_embeddings):
    print(f"Token: {token}\nEmbedding: {embedding.numpy()}\n")



🔹 Word Embeddings for Each Token in Centroid Sentence of Cluster 4:

Token: however
Embedding: [-3.46439213e-01 -1.51526749e-01 -5.01923561e-01  2.48411775e-01
 -5.89512467e-01  1.78395405e-01  5.63737333e-01  3.09803672e-02
  1.67283744e-01  2.78489858e-01 -1.15326405e-01 -2.28878736e-01
 -9.54669788e-02  8.59639049e-01  1.69955015e-01 -1.62474141e-01
 -3.40217620e-01  6.07969284e-01  5.95120668e-01  2.09461376e-01
 -1.53136134e-01 -2.31477737e-01  7.86720216e-02 -6.18944969e-03
 -9.95777249e-02 -5.48874140e-01  2.65589114e-02 -5.19021153e-01
 -9.00311768e-02  2.59114504e-01 -6.88675046e-01  2.20381320e-01
 -6.20640397e-01 -1.95054412e-01  5.05710125e-01 -2.82753080e-01
 -4.98129845e-01 -5.90378791e-02  1.47453085e-01  4.40258980e-02
 -7.65779197e-01 -7.39695907e-01 -3.57899554e-02  2.28708878e-01
 -7.47926831e-02  3.32332969e-01 -3.45462561e+00  1.65664759e-02
 -4.30441827e-01  9.57752317e-02  2.68383138e-02  1.63775921e-01
  3.21890175e-01  4.89583880e-01 -1.00370355e-01  2.4560183

In [None]:
# Average the per-token vectors (dim 0 of word_embeddings) into a single
# sentence vector and convert it to NumPy
sentence_embedding = torch.mean(word_embeddings, dim=0).numpy()

# Show the resulting sentence embedding
print("\n🔹 Sentence Embedding for Centroid Sentence of Cluster 4:\n")
print(sentence_embedding)



🔹 Sentence Embedding for Centroid Sentence of Cluster 4:

[ 9.99366269e-02 -4.13544569e-03 -1.47739604e-01  1.26819029e-01
  4.15043496e-02 -2.68197089e-01  1.63271919e-01  1.46766558e-01
  1.70333669e-01  2.74609625e-01 -2.60941595e-01 -2.93949068e-01
 -1.05388366e-01  4.31152493e-01 -2.70907223e-01  1.28974155e-01
 -2.15744719e-01  6.80573881e-02  1.74858615e-01  2.94718772e-01
  1.06000826e-01  7.45671242e-02 -5.45856059e-01  2.20329940e-01
  5.04620492e-01 -1.64990932e-01  5.57422033e-03 -9.49430391e-02
 -2.82103658e-01  4.73854654e-02  3.67633581e-01  2.35726431e-01
 -4.08473969e-01 -2.89161336e-02  7.63622671e-02  5.99936880e-02
 -3.33974093e-01  4.54657264e-02 -3.45965326e-01  7.60238916e-02
 -6.09268069e-01 -5.24685383e-01 -8.40122104e-02  4.40280557e-01
  2.85898391e-02  7.87970796e-02  2.45611817e-02  1.35790467e-01
 -1.25296935e-02  1.27179980e-01 -6.74433649e-01  5.98512053e-01
 -1.05758958e-01 -1.13812096e-01  4.63457638e-03  3.99669349e-01
  3.96942347e-02 -8.21343303e-0

# **MODIFICATIONS IN MY CODE**

In [None]:
import spacy
import pdfplumber
import re
import nltk
import numpy as np
import torch
from transformers import BertTokenizer, BertModel
from sklearn.cluster import KMeans

# Download NLTK tokenizer data.
# 'punkt_tab' is fetched as well, as the first cells of this notebook do;
# recent NLTK releases load the sentence tokenizer's data from that package,
# so 'punkt' alone can leave nltk.sent_tokenize without its resources.
nltk.download('punkt')
nltk.download('punkt_tab')

# Load spaCy English model (used for named-entity filtering)
nlp = spacy.load("en_core_web_sm")

# Function to extract text from PDF
def extract_text_from_pdf(pdf_path):
    """Return the text of a PDF, read with pdfplumber, as one string.

    Args:
        pdf_path: Path of the PDF file.

    Returns:
        The text of every page that yields any text, joined by single
        spaces. Pages for which ``extract_text()`` returns nothing are
        skipped.
    """
    with pdfplumber.open(pdf_path) as pdf:
        # Extract each page once; previously extract_text() ran twice per
        # page (once for the filter, once for the value), doubling the work.
        page_texts = (page.extract_text() for page in pdf.pages)
        text = " ".join(page_text for page_text in page_texts if page_text)
    return text

# Function to clean and tokenize text into sentences
def preprocess_text(text):
    """Flatten line breaks and split the text into sentences.

    Args:
        text: Raw text, possibly containing newlines.

    Returns:
        The sentences produced by ``nltk.sent_tokenize`` after every run of
        newlines has been replaced by a single space.
    """
    flattened = re.sub(r'\n+', ' ', text)
    return nltk.sent_tokenize(flattened)

# Function to filter sentences that contain Named Entities
def filter_sentences_with_entities(sentences):
    """Filter out sentences in which spaCy detects no named entity.

    Args:
        sentences: Iterable of sentence strings.

    Returns:
        List of the sentences, in input order, whose parsed document has at
        least one entry in ``ents``.
    """
    # The module-level ``nlp`` pipeline parses one sentence at a time.
    return [candidate for candidate in sentences if len(nlp(candidate).ents) > 0]

# Load the pretrained "bert-base-uncased" tokenizer and encoder; both are
# used as module-level globals by get_sentence_embedding below
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertModel.from_pretrained("bert-base-uncased")

# Function to get sentence embeddings using BERT
def get_sentence_embedding(sentence):
    """Compute a sentence embedding by mean-pooling BERT's last hidden state.

    Args:
        sentence: Text to encode; truncated to 512 tokens by the tokenizer.

    Returns:
        NumPy array: ``last_hidden_state`` averaged over dim 1 and squeezed.
    """
    features = tokenizer(
        sentence,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=512,
    )
    # Gradients are not needed when only extracting features.
    with torch.no_grad():
        result = model(**features)
    averaged = result.last_hidden_state.mean(dim=1)  # Mean pooling for sentence embedding
    return averaged.squeeze().numpy()

# Function to perform clustering and generate a summary
def generate_summary(sentences, num_clusters=5):
    """Summarise sentences by K-Means clustering of their BERT embeddings.

    Args:
        sentences: Sentences to summarise.
        num_clusters: Desired number of clusters. It is capped at
            ``len(sentences)`` because KMeans raises ValueError when asked
            for more clusters than there are samples.

    Returns:
        A list holding the first sentence (in input order) of every
        non-empty cluster, ordered by cluster id. Empty when ``sentences``
        is empty.
    """
    # Nothing to embed or cluster; KMeans would raise on zero samples.
    if len(sentences) == 0:
        return []

    num_clusters = min(num_clusters, len(sentences))
    sentence_embeddings = np.array([get_sentence_embedding(sent) for sent in sentences])

    # Apply K-Means clustering (fixed seed for reproducible labels)
    kmeans = KMeans(n_clusters=num_clusters, random_state=42, n_init=10)
    kmeans.fit(sentence_embeddings)
    clusters = kmeans.labels_

    # Record the first sentence seen for each cluster in a single pass
    # instead of rescanning all sentences once per cluster.
    first_in_cluster = {}
    for sentence, label in zip(sentences, clusters):
        first_in_cluster.setdefault(int(label), sentence)

    return [first_in_cluster[cluster_id] for cluster_id in sorted(first_in_cluster)]

# Path of the input PDF; change this to point at your own file
pdf_path = "/content/nlp_ner_news_summ.pdf"

# Step 1: Extract the PDF text and split it into sentences
text = extract_text_from_pdf(pdf_path)
sentences = preprocess_text(text)

# Step 2: Keep only the sentences that contain named entities
filtered_sentences = filter_sentences_with_entities(sentences)

# Step 3: Cluster the remaining sentences and pick one per cluster
# (generate_summary's default of 5 clusters is used)
summary = generate_summary(filtered_sentences)

# Step 4: Display the summary, one sentence per line
print("\n🔹 Final Summary:")
for sent in summary:
    print("-", sent)


In [None]:
!pip install pdfplumber