# Lab Assignment 3 - Topic Modeling
**Name:** Ahmad Aizat, Muhammad Aidil  
**ID:** IS01082871, IS01082943


In [16]:
import pandas as pd
import nltk
import re
import gensim
import gensim.corpora as corpora
from nltk.tokenize import word_tokenize
from gensim.models import CoherenceModel, LdaModel
from nltk.stem.porter import PorterStemmer
from gensim.utils import simple_preprocess

# Hardcoded English stopwords, grouped by grammatical role for readability.
custom_stopwords = {
    # personal, possessive and reflexive pronouns
    'i', 'me', 'my', 'myself',
    'we', 'our', 'ours', 'ourselves',
    'you', 'your', 'yours', 'yourself', 'yourselves',
    'he', 'him', 'his', 'himself',
    'she', 'her', 'hers', 'herself',
    'it', 'its', 'itself',
    'they', 'them', 'their', 'theirs', 'themselves',
    # interrogatives and demonstratives
    'what', 'which', 'who', 'whom', 'this', 'that', 'these', 'those',
    # forms of "be", "have" and "do"
    'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being',
    'have', 'has', 'had', 'having',
    'do', 'does', 'did', 'doing',
    # articles and conjunctions
    'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while',
    # prepositions and directional words
    'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into',
    'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from',
    'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under',
    # adverbs of sequence, place and question
    'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how',
    # quantifiers and determiners
    'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such',
    # negation, comparison and degree
    'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very',
    # contraction fragments and remaining function words
    's', 't', 'can', 'will', 'just', 'don', 'should', 'now',
}


In [2]:
# Load the news dataset and discard rows whose 'text' field is missing,
# then preview the first rows.
df = pd.read_csv("news_dataset.csv").dropna(subset=['text'])
df.head()


Unnamed: 0.1,Unnamed: 0,text,target,title,date
0,0,I was wondering if anyone out there could enli...,7,rec.autos,2022-08-02 13:48:37.251043
1,17,I recently posted an article asking what kind ...,7,rec.autos,2022-08-02 13:48:37.251043
2,29,\nIt depends on your priorities. A lot of peo...,7,rec.autos,2022-08-02 13:48:37.251043
3,56,an excellent automatic can be found in the sub...,7,rec.autos,2022-08-02 13:48:37.251043
4,64,: Ford and his automobile. I need information...,7,rec.autos,2022-08-02 13:48:37.251043


In [15]:
stemmer = PorterStemmer()

def preprocess_text(text):
    """Turn one raw document into a list of stemmed, stopword-free tokens.

    The text is tokenized with gensim's ``simple_preprocess`` (called with
    ``deacc=True``), tokens present in ``custom_stopwords`` are dropped, and
    every remaining token is passed through the Porter stemmer.

    Args:
        text: The raw document string.

    Returns:
        list[str]: Stemmed tokens in their original order.
    """
    return [
        stemmer.stem(token)
        for token in simple_preprocess(text, deacc=True)
        if token not in custom_stopwords
    ]

# Store the token list of every document next to its raw text.
df['tokens'] = df['text'].apply(preprocess_text)


In [25]:
# Vocabulary built from the tokenized documents.
id2word = corpora.Dictionary(df['tokens'])
# Bag-of-words representation of each document against that vocabulary.
corpus = list(map(id2word.doc2bow, df['tokens']))


In [13]:
# Train a 4-topic LDA model on the bag-of-words corpus.
lda_model = LdaModel(
    # data
    corpus=corpus,
    id2word=id2word,
    # model size and priors
    num_topics=4,
    alpha='auto',
    # training schedule
    passes=5,
    chunksize=50,
    update_every=1,
    # extras
    per_word_topics=True,
    random_state=42,  # fixed seed
)


In [12]:
# Score the trained model with the c_v coherence measure.
coherence_model_lda = CoherenceModel(
    model=lda_model,
    texts=df['tokens'],
    dictionary=id2word,
    coherence='c_v',
)
coherence_score = coherence_model_lda.get_coherence()

# List the ten top-weighted terms of each topic, numbering topics from 1.
topics = lda_model.print_topics(num_words=10)
for topic_number, (_topic_id, topic_terms) in enumerate(topics, start=1):
    print(f"Topic {topic_number}: {topic_terms}")

print(f"\nCoherence Score: {coherence_score}")


Topic 1: 0.025*"bh" + 0.014*"mov" + 0.012*"nist" + 0.011*"di" + 0.010*"al" + 0.010*"ncsl" + 0.009*"cx" + 0.009*"bl" + 0.005*"expn" + 0.005*"ax"
Topic 2: 0.010*"would" + 0.010*"one" + 0.009*"peopl" + 0.007*"govern" + 0.006*"right" + 0.006*"say" + 0.006*"know" + 0.006*"like" + 0.006*"think" + 0.005*"go"
Topic 3: 0.065*"chip" + 0.051*"db" + 0.021*"bit" + 0.020*"devic" + 0.013*"serial" + 0.011*"block" + 0.009*"si" + 0.009*"disk" + 0.008*"turkey" + 0.007*"byte"
Topic 4: 0.024*"key" + 0.015*"use" + 0.015*"encrypt" + 0.009*"system" + 0.008*"secur" + 0.007*"clipper" + 0.007*"inform" + 0.006*"public" + 0.006*"privaci" + 0.006*"anonym"

Coherence Score: 0.49238843810090444


### Interpretation of Coherence Score
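The coherence score provides a quantitative measure of topic interpretability. A higher coherence score (closer to 1.0) means that the top words within a topic are more semantically related, indicating better topic quality. This model obtained a score of about 0.49, which sits just below the level of 0.5 or higher that typically suggests reasonably coherent topics in real-world text data. The score is an average over all topics, so it should be read alongside the topic word lists: for example, Topic 1 consists mostly of short, non-word tokens (such as "bh", "cx", and "ax"), which suggests that additional token filtering could improve the result.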
