In [1]:
# New Cell for focused download (or modify your existing focused download cell)
import nltk
import os

custom_nltk_data_path = "C:\\nltk_data"
if not os.path.exists(custom_nltk_data_path):
    try:
        os.makedirs(custom_nltk_data_path)
    except Exception as e:
        print(f"Could not create folder {custom_nltk_data_path}: {e}")

if custom_nltk_data_path not in nltk.data.path:
    nltk.data.path.append(custom_nltk_data_path)

print(f"Attempting to download 'punkt' to {custom_nltk_data_path}...")
nltk.download('punkt', download_dir=custom_nltk_data_path, quiet=False, force=True)


print(f"Attempting to download 'punkt_tab' to {custom_nltk_data_path}...")
nltk.download('punkt_tab', download_dir=custom_nltk_data_path, quiet=False, force=True)


print(f"Attempting to download 'stopwords' to {custom_nltk_data_path}...")
nltk.download('stopwords', download_dir=custom_nltk_data_path, quiet=False, force=True)

print("Download attempts finished.")

# Quick test
try:
    from nltk.tokenize import word_tokenize
    print("Successfully imported word_tokenize.")
    test_tokens = word_tokenize("This is a test sentence for tokenization.")
    print("word_tokenize test successful:", test_tokens)
except Exception as e:
    print("Error during quick test of word_tokenize:", e)

Attempting to download 'punkt' to C:\nltk_data...


[nltk_data] Downloading package punkt to C:\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


Attempting to download 'punkt_tab' to C:\nltk_data...


[nltk_data] Downloading package punkt_tab to C:\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.


Attempting to download 'stopwords' to C:\nltk_data...


[nltk_data] Downloading package stopwords to C:\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


Download attempts finished.
Successfully imported word_tokenize.
word_tokenize test successful: ['This', 'is', 'a', 'test', 'sentence', 'for', 'tokenization', '.']


In [1]:
# Cell 1: Imports and Article Fetching/Preprocessing Setup

import nltk
import os 
from newspaper import Article 
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# --- Point NLTK to a specific data directory ---

custom_nltk_data_path = "C:\\nltk_data" # Use double backslashes for Windows paths

if not os.path.exists(custom_nltk_data_path):
    try:
        os.makedirs(custom_nltk_data_path)
        print(f"Created NLTK data directory: {custom_nltk_data_path}")
    except Exception as e:
        print(f"Could not create NLTK data directory {custom_nltk_data_path}. Please create it manually. Error: {e}")

if custom_nltk_data_path not in nltk.data.path:
    nltk.data.path.append(custom_nltk_data_path)
    print(f"Added {custom_nltk_data_path} to NLTK data path.")
else:
    print(f"{custom_nltk_data_path} is already in NLTK data path.")
# --- End NLTK data path setup ---


# Download NLTK resources if not already present, TO THE SPECIFIED DIRECTORY
try:
    # Check if resources are found in ANY of the NLTK paths
    nltk.data.find('corpora/stopwords')
    print("NLTK stopwords found.")
except LookupError: # More specific exception
    print(f"Downloading NLTK stopwords to {custom_nltk_data_path}...")
    nltk.download('stopwords', download_dir=custom_nltk_data_path, quiet=False) # quiet=False for visibility
try:
    nltk.data.find('tokenizers/punkt')
    print("NLTK punkt tokenizer found.")
except LookupError:
    print(f"Downloading NLTK punkt tokenizer to {custom_nltk_data_path}...")
    nltk.download('punkt', download_dir=custom_nltk_data_path, quiet=False) # quiet=False for visibility


stop_words_nltk = set(stopwords.words('english'))

def fetch_and_preprocess_article_nltk(url):
    """Fetches article from URL and preprocesses its text using NLTK."""
    print(f"Attempting to fetch article from: {url}")
    try:
        article_obj = Article(url)
        article_obj.download()
        article_obj.parse()
        raw_text = article_obj.text
        title = article_obj.title
        # Check if title is None or empty, which can happen if parsing fails
        if not title:
            title = "Title not found"
            print("Warning: Article title not found by newspaper3k.")
        else:
            print(f"Successfully fetched: {title}")

        # Check if raw_text is None or very short, indicating a fetching/parsing issue
        if not raw_text or len(raw_text) < 100: # Arbitrary short length check
             print(f"Warning: Fetched raw text is very short or None for {url}. Content might be missing or paywalled/JS-rendered.")
             # return None, title, None # Optionally return early if text is bad

    except Exception as e:
        print(f"Error fetching/parsing article from {url}: {e}")
        return None, None, None

    if not raw_text: # Double check after the try-except
        print(f"No content found for article (raw_text is None): {url}")
        return None, title, None # title might have been set if initial parse got that far

    # NLTK Preprocessing
    text_lower = raw_text.lower()
    text_no_punct = re.sub(r'\W', ' ', text_lower) # Remove non-alphanumeric as punctuation
    text_no_extra_space = re.sub(r'\s+', ' ', text_no_punct).strip() # Remove extra whitespace
    
    tokens = word_tokenize(text_no_extra_space) # This is where the LookupError was happening
    
    filtered_tokens = [
        word for word in tokens
        if word not in stop_words_nltk and len(word) > 2 and word.isalpha() # Keep only alpha words > 2 chars
    ]
    preprocessed_text = " ".join(filtered_tokens)

    return raw_text, title, preprocessed_text

# --- Test the function ---
test_article_url = 'https://www.bbc.com/news/articles/c87j5v4xjxqo'

print("--- Testing Article Fetching and NLTK Preprocessing ---")
raw_content, article_title, processed_content_nltk = fetch_and_preprocess_article_nltk(test_article_url)

if raw_content:
    print(f"\nTitle: {article_title}") # article_title should be defined even if raw_content is minimal
    print("\n--- Raw Content (first 500 chars) ---")
    print(raw_content[:500] + "...")
    if processed_content_nltk:
        print("\n--- NLTK Processed Content (first 500 chars) ---")
        print(processed_content_nltk[:500] + "...")
    else:
        print("\n--- NLTK Processed Content: Not generated (likely due to issues with raw_content or tokenization).")


    documents_for_bertopic = [raw_content] if raw_content else []
    processed_documents_for_bertopic = [processed_content_nltk] if processed_content_nltk else []
else:
    print(f"\nCould not get raw content for article: {test_article_url}")
    # Initialize these as empty lists if raw_content is None to avoid errors later
    documents_for_bertopic = []
    processed_documents_for_bertopic = []

C:\nltk_data is already in NLTK data path.
NLTK stopwords found.
NLTK punkt tokenizer found.
--- Testing Article Fetching and NLTK Preprocessing ---
Attempting to fetch article from: https://www.bbc.com/news/articles/c87j5v4xjxqo
Successfully fetched: Sidhu Moose Wala: Gangster tells BBC why India's biggest hip-hop star was murdered

Title: Sidhu Moose Wala: Gangster tells BBC why India's biggest hip-hop star was murdered

--- Raw Content (first 500 chars) ---
Gangster tells BBC why India's biggest hip-hop star was murdered

11 June 2025 Share Save Soutik Biswas & Ishleen Kaur BBC Eye Investigations Share Save

BBC Sidhu Moose Wala was shot dead in a hail of bullets in 2022

It was a killing that shocked India: Punjabi hip-hop star Sidhu Moose Wala shot dead through the windscreen of his car by hired gunmen. Within hours, a Punjabi gangster named Goldy Brar had used Facebook to claim responsibility for ordering the hit. But three years after the murde...

--- NLTK Processed Content (fi

In [3]:
# Cell 2: Topic Extraction with BERTopic (Final Attempt - with Sentence Splitting)

from bertopic import BERTopic
import hdbscan
from nltk.tokenize import sent_tokenize # Import the sentence tokenizer from NLTK

print("\n--- Topic Extraction on a Single Document by Splitting into Sentences ---")

# We will use the 'raw_content' variable from the first cell
if 'raw_content' in locals() and raw_content and len(raw_content) > 1:

    # 1. Split the single document (raw_content) into a list of sentences
    print("Splitting the article into sentences...")
    try:
        # NLTK's sentence tokenizer is generally reliable
        sentences = sent_tokenize(raw_content)
        print(f"Successfully split the article into {len(sentences)} sentences.")
        
        # Optional: Filter out very short sentences if they might be noise (e.g., just a date or byline)
        min_sentence_length = 10 # in characters
        sentences = [s for s in sentences if len(s) > min_sentence_length]
        print(f"Using {len(sentences)} sentences after filtering short ones.")

    except Exception as e:
        print(f"Error during sentence tokenization: {e}")
        sentences = [] # Set to empty list to prevent further errors

    # Proceed only if we have enough sentences to work with
    if len(sentences) > 5: # Arbitrary threshold, clustering needs a decent number of points
        
        # 2. Initialize BERTopic Model
        # We can go back to using UMAP now, as it will have enough data points (sentences) to work with.
        # Or stick with PCA, which is often faster. Let's try UMAP again.
        topic_model = BERTopic(
            embedding_model="all-MiniLM-L6-v2",
            # We don't need to specify umap_model or hdbscan_model if defaults are okay.
            # Let's use the defaults first.
            min_topic_size=3, # A topic must contain at least 3 sentences
            nr_topics="auto",
            verbose=True
        )

        # 3. Fit the model on the list of sentences
        print("\nFitting BERTopic model on the list of sentences...")
        try:
            topics, probabilities = topic_model.fit_transform(sentences)

            print("\n--- BERTopic Results ---")
            topic_info_df = topic_model.get_topic_info()
            print("Topic Info:")
            print(topic_info_df)

            if not topic_info_df.empty and not (len(topic_info_df) == 1 and topic_info_df.iloc[0]["Topic"] == -1):
                print("\nKeywords for each topic:")
                for topic_id in topic_info_df["Topic"]:
                    if topic_id != -1:
                        topic_keywords = topic_model.get_topic(topic_id)
                        print(f"Topic {topic_id}: {topic_keywords}")
            else:
                print("\nNo distinct topics found. All sentences were considered outliers or the result was empty.")

        except Exception as e:
            print(f"\nAn error occurred during BERTopic fit_transform: {e}")

    else:
        print("\nNot enough sentences in the article to perform topic modeling. Please try a longer article.")

else:
    print("Variable 'raw_content' not found or is empty. Please run the previous cell first.")

  from .autonotebook import tqdm as notebook_tqdm
2025-06-28 23:12:12,158 - BERTopic - Embedding - Transforming documents to embeddings.



--- Topic Extraction on a Single Document by Splitting into Sentences ---
Splitting the article into sentences...
Successfully split the article into 79 sentences.
Using 79 sentences after filtering short ones.

Fitting BERTopic model on the list of sentences...


Batches: 100%|██████████| 3/3 [00:02<00:00,  1.50it/s]
2025-06-28 23:12:24,271 - BERTopic - Embedding - Completed ✓
2025-06-28 23:12:24,272 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-06-28 23:12:38,433 - BERTopic - Dimensionality - Completed ✓
2025-06-28 23:12:38,433 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-06-28 23:12:38,461 - BERTopic - Cluster - Completed ✓
2025-06-28 23:12:38,462 - BERTopic - Representation - Extracting topics using c-TF-IDF for topic reduction.
2025-06-28 23:12:38,503 - BERTopic - Representation - Completed ✓
2025-06-28 23:12:38,503 - BERTopic - Topic reduction - Reducing number of topics
2025-06-28 23:12:38,518 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-06-28 23:12:38,548 - BERTopic - Representation - Completed ✓
2025-06-28 23:12:38,550 - BERTopic - Topic reduction - Reduced number of topics from 6 to 6



--- BERTopic Results ---
Topic Info:
   Topic  Count                      Name  \
0     -1     16        -1_the_and_in_with   
1      0     31      0_wala_moose_and_the   
2      1     13           1_he_was_it_for   
3      2      8           2_the_of_in_had   
4      3      7  3_bishnoi_student_is_the   
5      4      4      4_get_as_simple_same   

                                      Representation  \
0  [the, and, in, with, sport, need, village, gan...   
1  [wala, moose, and, the, to, in, of, with, sidh...   
2  [he, was, it, for, but, or, people, money, him...   
3  [the, of, in, had, was, his, bbc, punjabi, ind...   
4  [bishnoi, student, is, the, brar, and, in, law...   
5  [get, as, simple, same, water, want, someone, ...   

                                 Representative_Docs  
0  [This led to a spell in jail which hardened hi...  
1  ["Lawrence [Bishnoi] was in touch with Sidhu [...  
2  ["The day he died, people cried for him., So, ...  
3  [In the weeks that followed th

In [4]:
# Cell 3: Sentiment Analysis with NLTK VADER

from nltk.sentiment.vader import SentimentIntensityAnalyzer

print("\n--- Performing Sentiment Analysis on the Article ---")

# We will use the 'raw_content' variable from the first cell, as VADER
# works best with text that includes punctuation, capitalization, and stopwords.
if 'raw_content' in locals() and raw_content and len(raw_content) > 1:
    
    # Download the VADER lexicon if not already present
    try:
        nltk.data.find('sentiment/vader_lexicon.zip')
        print("VADER lexicon found.")
    except LookupError:
        print("Downloading VADER lexicon...")
        # Use the same custom path to keep all NLTK data together
        custom_nltk_data_path = "C:\\nltk_data"
        nltk.download('vader_lexicon', download_dir=custom_nltk_data_path, quiet=False)

    # Initialize the VADER sentiment analyzer
    analyzer = SentimentIntensityAnalyzer()

    # --- 1. Overall Article Sentiment ---
    print("\nAnalyzing sentiment of the entire article...")
    # Get the polarity scores for the whole raw text
    overall_sentiment_scores = analyzer.polarity_scores(raw_content)
    
    print("\nOverall Article Sentiment Scores:")
    print(f"  - Negative: {overall_sentiment_scores['neg']:.3f}")
    print(f"  - Neutral:  {overall_sentiment_scores['neu']:.3f}")
    print(f"  - Positive: {overall_sentiment_scores['pos']:.3f}")
    print(f"  - Compound: {overall_sentiment_scores['compound']:.3f}")
    
    # Interpretation of the compound score
    compound_score = overall_sentiment_scores['compound']
    if compound_score >= 0.05:
        print("  --> Overall sentiment: Positive")
    elif compound_score <= -0.05:
        print("  --> Overall sentiment: Negative")
    else:
        print("  --> Overall sentiment: Neutral")


    # --- 2. Sentence-level Sentiment (Example) ---
    # This is useful for finding specific positive or negative statements.
    print("\nAnalyzing sentiment of the first 5 sentences (as an example)...")
    if 'sentences' in locals() and len(sentences) > 0:
        for i, sentence in enumerate(sentences[:5]): # Analyze the first 5 sentences
            sentence_scores = analyzer.polarity_scores(sentence)
            print(f"\nSentence {i+1}: '{sentence}'")
            print(f"  - Scores: {sentence_scores}")
            # You can add the Positive/Negative/Neutral interpretation here too if you want
    else:
        print("Could not find the 'sentences' list. Please ensure the BERTopic cell (Cell 2) has been run.")

else:
    print("Variable 'raw_content' not found or is empty. Please run the previous cell first.")


--- Performing Sentiment Analysis on the Article ---
Downloading VADER lexicon...


[nltk_data] Downloading package vader_lexicon to C:\nltk_data...



Analyzing sentiment of the entire article...

Overall Article Sentiment Scores:
  - Negative: 0.137
  - Neutral:  0.797
  - Positive: 0.066
  - Compound: -0.999
  --> Overall sentiment: Negative

Analyzing sentiment of the first 5 sentences (as an example)...

Sentence 1: 'Gangster tells BBC why India's biggest hip-hop star was murdered

11 June 2025 Share Save Soutik Biswas & Ishleen Kaur BBC Eye Investigations Share Save

BBC Sidhu Moose Wala was shot dead in a hail of bullets in 2022

It was a killing that shocked India: Punjabi hip-hop star Sidhu Moose Wala shot dead through the windscreen of his car by hired gunmen.'
  - Scores: {'neg': 0.241, 'neu': 0.611, 'pos': 0.148, 'compound': -0.891}

Sentence 2: 'Within hours, a Punjabi gangster named Goldy Brar had used Facebook to claim responsibility for ordering the hit.'
  - Scores: {'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}

Sentence 3: 'But three years after the murder, no-one has faced trial - and Goldy Brar is still on