In [1]:
pip install gensim pandas

Note: you may need to restart the kernel to use updated packages.


In [2]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from gensim import corpora
from gensim.models import LdaModel
import pandas as pd
import os

# Add error handling to print helpful information
# Fixed seed so LDA training -- and therefore the topic id assigned to each
# article -- is the same on every "Restart Kernel -> Run All".
RANDOM_SEED = 42

# NLTK data packages fetched before preprocessing. 'punkt_tab' is the tokenizer
# data that newer NLTK releases load in word_tokenize; 'punkt' is kept so older
# releases keep working.
NLTK_RESOURCES = ('stopwords', 'punkt', 'punkt_tab', 'wordnet')


def choose_min_doc_count(n_docs):
    """Return the `no_below` threshold used when filtering the dictionary.

    Corpora of 20 documents or fewer use a fixed threshold of 2; larger ones
    use one tenth of the corpus size, capped at 15.
    """
    return min(15, n_docs // 10) if n_docs > 20 else 2


def choose_num_topics(n_docs):
    """Return the number of LDA topics to train for a corpus of `n_docs`.

    Corpora of 10 documents or fewer get 2 topics; larger ones get one topic
    per five documents, capped at 5.
    """
    return min(5, n_docs // 5) if n_docs > 10 else 2


def pick_dominant_topic(topics):
    """Return the topic id with the highest probability, or -1 if none.

    `topics` is a sequence of (topic_id, probability) pairs. Ties resolve to
    the first pair with the maximal probability.
    """
    return max(topics, key=lambda pair: pair[1])[0] if topics else -1


# The whole pipeline is wrapped so that any failure is reported with a message
# and a traceback instead of aborting the notebook run.
try:
    # Fetch the NLTK data quietly to keep the cell output short.
    for resource in NLTK_RESOURCES:
        nltk.download(resource, quiet=True)

    # Show where relative paths are resolved from.
    print(f"Current working directory: {os.getcwd()}")

    # ---- Load documents -------------------------------------------------
    if not os.path.exists('npr.csv'):
        print("Error: 'npr.csv' file not found in the current directory!")
        print("Please make sure you've uploaded the file and it's in the correct location.")

        # Fall back to a tiny built-in sample so the rest of the cell can
        # still be demonstrated without the real data set.
        print("Using example data for demonstration...")
        documents = [
            "Rafael Nadal Joins Roger Federer in Missing U.S. Open",
            "Rafael Nadal Is Out of the Australian Open",
            "Biden Announces Virus Measures",
            "Biden's Virus Plans Meet Reality",
            "Where Biden's Virus Plan Stands"
        ]
    else:
        print("Reading 'npr.csv'...")
        df = pd.read_csv('npr.csv')
        print(f"CSV loaded successfully. Shape: {df.shape}")

        # The text to model must live in a column named 'Article'.
        if 'Article' not in df.columns:
            print(f"Error: 'Article' column not found in CSV. Available columns: {df.columns.tolist()}")
            raise KeyError("'Article' column not found in the CSV file")

        # Drop rows without article text.
        null_count = df['Article'].isnull().sum()
        if null_count > 0:
            print(f"Warning: Found {null_count} null values in 'Article' column")
            df = df.dropna(subset=['Article'])
            print(f"Dropped null values. New shape: {df.shape}")

        documents = df['Article'].tolist()
        print(f"Loaded {len(documents)} documents for processing")

        # Preview the first document, truncated to 100 characters. str() keeps
        # the preview from failing if the first value is not a string.
        if len(documents) > 0:
            print("First document sample (truncated):")
            first_doc = str(documents[0])
            print((first_doc[:100] + "...") if len(first_doc) > 100 else first_doc)

    # ---- Preprocess -----------------------------------------------------
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    def preprocess_text(text):
        """Tokenize one document into lowercase, lemmatized content words.

        Non-string input is converted with str(). Tokens that are not purely
        alphanumeric and tokens in `stop_words` are dropped; the rest are
        lemmatized.

        Returns:
            list[str]: the processed tokens, or an empty list if this
            particular document could not be processed.

        Raises:
            LookupError: if a required NLTK resource is missing. That problem
            affects every document, so it is propagated instead of being
            reported once per document.
        """
        try:
            if not isinstance(text, str):
                text = str(text)

            tokens = word_tokenize(text.lower())
            return [
                lemmatizer.lemmatize(token)
                for token in tokens
                if token.isalnum() and token not in stop_words
            ]
        except LookupError:
            # Missing NLTK data is not a per-document problem; fail fast so the
            # outer handler reports it once.
            raise
        except Exception as e:
            print(f"Error preprocessing text: {str(e)}")
            print(f"Problematic text (truncated): {str(text)[:50]}...")
            return []  # Keep positions aligned with `documents`

    print("Preprocessing documents...")
    preprocessed_documents = [preprocess_text(doc) for doc in documents]
    non_empty_count = sum(1 for doc in preprocessed_documents if len(doc) > 0)
    print(f"Preprocessed {len(preprocessed_documents)} documents, {non_empty_count} non-empty")

    if non_empty_count == 0:
        raise ValueError("All documents are empty after preprocessing!")

    # ---- Dictionary -----------------------------------------------------
    print("Creating dictionary...")
    dictionary = corpora.Dictionary(preprocessed_documents)
    original_tokens = len(dictionary)

    # Scale the rare-term threshold with the corpus size.
    min_doc_count = choose_min_doc_count(len(documents))
    print(f"Filtering dictionary: min_doc_count={min_doc_count}, max_ratio=0.5")

    dictionary.filter_extremes(no_below=min_doc_count, no_above=0.5)
    filtered_tokens = len(dictionary)
    print(f"Dictionary created: {original_tokens} original tokens, {filtered_tokens} after filtering")

    if filtered_tokens == 0:
        raise ValueError("Dictionary is empty after filtering! Try adjusting filter parameters.")

    # ---- Bag-of-words corpus --------------------------------------------
    print("Creating document-term matrix...")
    corpus = [dictionary.doc2bow(doc) for doc in preprocessed_documents]
    non_empty_corpus = sum(1 for doc in corpus if len(doc) > 0)
    print(f"Created corpus with {len(corpus)} documents, {non_empty_corpus} non-empty")

    if non_empty_corpus == 0:
        raise ValueError("Corpus is empty! All documents filtered out.")

    # ---- Train LDA ------------------------------------------------------
    n_topics = choose_num_topics(len(documents))
    print(f"Running LDA with {n_topics} topics...")
    lda_model = LdaModel(
        corpus,
        num_topics=n_topics,
        id2word=dictionary,
        passes=15,
        random_state=RANDOM_SEED,  # reproducible topics across runs
    )
    print("LDA model trained successfully")

    # ---- Label each document with its dominant topic --------------------
    print("Determining dominant topic for each document...")
    article_labels = []
    # `corpus` already holds the bag-of-words vector for every document, in the
    # same order as `documents`, so it is reused instead of being recomputed.
    for bow in corpus:
        if len(bow) > 0:
            article_labels.append(pick_dominant_topic(lda_model.get_document_topics(bow)))
        else:
            article_labels.append(-1)  # Nothing left after filtering

    df_result = pd.DataFrame({"Article": documents, "Topic": article_labels})
    print("\nResults - Topic Distribution:")
    print(df_result["Topic"].value_counts())

    print("\nSample of Results (first 5 rows):")
    pd.set_option('display.max_colwidth', 50)  # Limit column width for display
    print(df_result.head())

    # ---- Describe the topics --------------------------------------------
    # Fetch the (word, weight) pairs once per topic and use them for both
    # listings below.
    topic_terms = [
        lda_model.show_topic(topic_id, topn=10)
        for topic_id in range(lda_model.num_topics)
    ]

    print("\nTop terms for each topic:")
    for topic_id, terms in enumerate(topic_terms):
        print(f"\nTop terms for Topic #{topic_id}:")
        print([word for word, _ in terms])

    print("\nDetailed weights for each topic:")
    for topic_id, terms in enumerate(topic_terms):
        print(f"Topic {topic_id}:")
        for word, weight in terms:
            # Quoted word and three decimals, taken from the numeric pairs
            # rather than parsed out of a formatted display string.
            print(f'- "{word}" (weight: {weight:.3f})')
        print()

except Exception as e:
    print(f"An error occurred: {str(e)}")
    import traceback
    traceback.print_exc()

Current working directory: C:\Users\HP\Downloads\Lab 9
Reading 'npr.csv'...
CSV loaded successfully. Shape: (11992, 1)
Loaded 11992 documents for processing
First document sample (truncated):
In the Washington of 2016, even when the policy can be bipartisan, the politics cannot. And in that ...
Preprocessing documents...
Preprocessed 11992 documents, 11992 non-empty
Creating dictionary...
Filtering dictionary: min_doc_count=15, max_ratio=0.5
Dictionary created: 86155 original tokens, 15974 after filtering
Creating document-term matrix...
Created corpus with 11992 documents, 11992 non-empty
Running LDA with 5 topics...
LDA model trained successfully
Determining dominant topic for each document...

Results - Topic Distribution:
Topic
2    3686
1    2633
4    2045
3    1903
0    1725
Name: count, dtype: int64

Sample of Results (first 5 rows):
                                             Article  Topic
0  In the Washington of 2016, even when the polic...      0
1    Donald Trump has used 