In [38]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string

# Gensim imports
import gensim
from gensim import corpora
from gensim.models import LdaModel

In [40]:
# Download required NLTK data.
# - 'punkt' / 'punkt_tab': tokenizer models used by nltk.word_tokenize
#   (which one is looked up depends on the installed NLTK release, so both
#   are fetched); without them tokenization raises LookupError on a fresh
#   environment.
# - 'stopwords': English stop-word list.
# - 'wordnet': data used by WordNetLemmatizer.
for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(resource)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\New\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\New\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [42]:
# Read the BBC News training set into `df`.
# A missing file is reported with a message instead of raising, so the
# later cells can check whether `df` exists before using it.
file_path = 'data_files/BBC News Train.csv'
try:
    df = pd.read_csv(file_path)
except FileNotFoundError:
    print("Error: Dataset not found!!!!!")

In [44]:
# Topic modelling is unsupervised, so only the raw article text is used.
if 'df' in locals():
    # Quick look at the data and its size
    print(df.head())
    print(f"\nWe have {df.shape[0]} articles to analyze.")
    # The 'Text' column is the corpus for every later step
    documents = df['Text']

   ArticleId                                               Text  Category
0       1833  worldcom ex-boss launches defence lawyers defe...  business
1        154  german business confidence slides german busin...  business
2       1101  bbc poll indicates economic gloom citizens in ...  business
3       1976  lifestyle  governs mobile choice  faster  bett...      tech
4        917  enron bosses in $168m payout eighteen former e...  business

We have 1490 articles to analyze.


##### Advanced Text Preprocessing

In [46]:
# Shared NLP resources, created once and reused for every document.
lemmatizer = WordNetLemmatizer()
# A set gives fast membership checks during token filtering.
stop_words = {*stopwords.words('english')}

In [48]:

def preprocess_text_lda(text):
    """
    Turn one raw document into a list of cleaned tokens for LDA.

    Steps:
    1. Tokenize
    2. Lowercase
    3. Keep purely alphabetic tokens (drops punctuation, numbers and mixes)
    4. Remove stop words
    5. Lemmatize
    6. Remove short words (<= 3 chars after lemmatization)

    Parameters
    ----------
    text : str
        Raw document text. A non-string value (for example the NaN that
        pandas uses for an empty cell) or a blank string yields an empty
        list, so one bad row does not abort the whole run and the output
        stays aligned with the input rows.

    Returns
    -------
    list of str
        The surviving lemmatized tokens, in their original order.
    """
    # Guard first: the tokenizer raises on non-string input, and blank text
    # has no tokens anyway.
    if not isinstance(text, str) or not text.strip():
        return []

    processed_tokens = []

    for token in nltk.word_tokenize(text):
        token = token.lower()

        # isalpha() already rejects punctuation, digits and mixed tokens,
        # so no separate punctuation test is needed.
        if not token.isalpha() or token in stop_words:
            continue

        lemmatized_token = lemmatizer.lemmatize(token)

        # Remove very short words
        if len(lemmatized_token) > 3:
            processed_tokens.append(lemmatized_token)

    return processed_tokens

In [50]:
# Run the cleaning function over the whole corpus
if 'documents' in locals():
    print("\nStarting text preprocessing for LDA...")
    # One token list per article, in the same order as `documents`
    processed_docs = list(map(preprocess_text_lda, documents))
    print("Preprocessing complete.")

    # Sanity check: the first 20 tokens of the first article
    print("\n--- Example Preprocessed Document ---")
    print(processed_docs[0][:20])


Starting text preprocessing for LDA...
Preprocessing complete.

--- Example Preprocessed Document ---
['worldcom', 'launch', 'defence', 'lawyer', 'defending', 'former', 'worldcom', 'chief', 'bernie', 'ebbers', 'battery', 'fraud', 'charge', 'called', 'company', 'whistleblower', 'first', 'witness', 'cynthia', 'cooper']


##### Create Gensim Corpus and Dictionary

In [51]:
if 'processed_docs' in locals():
    # Step 1 - Dictionary: assigns an integer ID to every distinct token
    dictionary = corpora.Dictionary(processed_docs)

    # Prune the vocabulary: drop tokens seen in fewer than 5 documents
    # or in more than 50% of all documents
    dictionary.filter_extremes(no_below=5, no_above=0.5)

    # Step 2 - Bag-of-words corpus: each document becomes a list of
    # (token_id, count) pairs
    corpus = list(map(dictionary.doc2bow, processed_docs))

    print(f"\nNumber of unique words (tokens) in dictionary: {len(dictionary)}")

    # Show the first few (token_id, count) pairs of document 0
    print("\n--- Example Corpus Entry (Doc 0) ---")
    print(corpus[0][:10])


Number of unique words (tokens) in dictionary: 5823

--- Example Corpus Entry (Doc 0) ---
[(0, 7), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1)]


##### Train the LDA Model

In [52]:
if 'corpus' in locals():
    # How many topics the model should look for
    num_topics = 5

    print(f"\nTraining LDA model to find {num_topics} topics...")

    # Fit the model on the bag-of-words corpus.
    # - `passes` is the number of full sweeps over the corpus (like epochs)
    # - `random_state` fixes the seed so the run is repeatable
    # - 'auto' asks gensim to learn the alpha and eta priors from the data
    lda_model = LdaModel(
        corpus=corpus,
        num_topics=num_topics,
        id2word=dictionary,
        passes=10,
        random_state=100,
        eta='auto',
        alpha='auto',
    )

    print("Model training complete.")


Training LDA model to find 5 topics...
Model training complete.


##### Interpret the Results

In [53]:
if 'lda_model' in locals():
    print("\n--- Discovered Topics and Key Words ---")

    # Each entry is (topic_id, "weight*word + ...") with the top 10 words
    topics = lda_model.print_topics(num_words=10)

    for topic_id, keywords in topics:
        print(f"Topic {topic_id}: {keywords}\n")


--- Discovered Topics and Key Words ---
Topic 0: 0.012*"people" + 0.007*"could" + 0.005*"government" + 0.005*"many" + 0.004*"system" + 0.004*"software" + 0.004*"user" + 0.004*"firm" + 0.004*"home" + 0.004*"service"

Topic 1: 0.015*"film" + 0.010*"best" + 0.007*"award" + 0.006*"first" + 0.006*"show" + 0.005*"star" + 0.005*"last" + 0.005*"music" + 0.005*"world" + 0.005*"number"

Topic 2: 0.018*"game" + 0.007*"player" + 0.007*"first" + 0.006*"time" + 0.006*"england" + 0.005*"gadget" + 0.004*"world" + 0.004*"wale" + 0.004*"back" + 0.004*"technology"

Topic 3: 0.009*"election" + 0.009*"government" + 0.008*"labour" + 0.008*"party" + 0.006*"economy" + 0.006*"blair" + 0.005*"minister" + 0.005*"country" + 0.005*"tory" + 0.005*"growth"

Topic 4: 0.008*"mobile" + 0.008*"firm" + 0.008*"phone" + 0.007*"company" + 0.005*"service" + 0.005*"club" + 0.005*"people" + 0.005*"sale" + 0.004*"share" + 0.004*"market"



##### Interpretation

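By looking at the keywords, we can manually label these topics:

Topic 0 leans towards "Tech" (software, user, system, service), mixed with general terms such as "people" and "government".

Topic 1 is clearly about "Entertainment" (film, award, star, music).

Topic 2 is mostly about "Sport" (game, player, england), with a few tech terms (gadget, technology) mixed in.

Topic 3 is about "Politics" (election, labour, party, blair, minister), with some economy terms (economy, growth).

Topic 4 is mostly about "Business" (firm, company, sale, share, market), mixed with mobile-phone terms (mobile, phone).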

##### Validate with Real Labels

In [65]:
if 'lda_model' in locals():
    # Topic distribution of every document: a list of (topic_id, probability)
    doc_topics = [lda_model.get_document_topics(bow) for bow in corpus]

    # Dominant topic = the topic_id with the largest probability
    dominant_topic = [
        max(distribution, key=lambda pair: pair[1])[0]
        for distribution in doc_topics
    ]

    # Put the known category next to the discovered topic for each article
    df_results = pd.DataFrame({
        'Original_Category': df['Category'],
        'Discovered_Topic': dominant_topic,
    })

    print("\n--- Validation: Original Category vs. Discovered Topic ---")

    # Count articles for every (category, topic) combination
    crosstab = pd.crosstab(df_results['Original_Category'], df_results['Discovered_Topic'])
    print(crosstab)


--- Validation: Original Category vs. Discovered Topic ---
Discovered_Topic     0    1    2    3    4
Original_Category                         
business            29    0    2  151  154
entertainment       16  251    0    3    3
politics            88    2    3  177    4
sport                0   91  123   11  121
tech               138    3   61    1   58


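##### This shows the model recovered part of the underlying structure without ever being told the labels: entertainment maps almost entirely to Topic 1 (251 of 273 articles), and politics and tech each have a dominant topic (Topic 3 and Topic 0). The separation is far from perfect, though: business is split between Topics 3 and 4, and sport is spread across Topics 1, 2 and 4.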