Lab Assignment 3

Muhammad Ammar Wafiy - IS01082517
Nur Aisya Safiyyah - IS01082522

In [1]:
import pandas as pd
import numpy as np
from gensim import corpora
from gensim.models import LdaModel
from gensim.models.coherencemodel import CoherenceModel
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import nltk
import re

In [2]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /Users/ammar/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/ammar/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/ammar/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [4]:
data = pd.read_csv('news_dataset.csv')
print(f'The dataset contains {data.shape[0]} records and {data.shape[1]} columns.')
data = data[['text']].dropna()
print(f'Dataset after removing null values: {data.shape[0]} records')

The dataset contains 11314 records and 5 columns.
Dataset after removing null values: 11096 records


In [5]:
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    tokens = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word.isalpha() and 2 < len(word) <= 20 and word not in stop_words]
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return ' '.join(tokens)

In [6]:
data['processed_text'] = data['text'].apply(preprocess_text)
print("\nSample of processed text:")
print(data['processed_text'].head(10))


Sample of processed text:
0    wondering anyone could enlighten car saw day d...
1    recently posted article asking kind rate singl...
2    depends priority lot people put higher priorit...
3    excellent automatic found subaru legacy switch...
4    ford automobile need information whether ford ...
5    watch attributionsi didnt say isnt appropriate...
6    avoid problem entirely installing oil drain va...
7    acura integra speed mile positively worst car ...
8    assuming non turbo gruffness characteristic la...
9    addition restricted mileage many classic insur...
Name: processed_text, dtype: object


In [7]:
bow_dictionary = corpora.Dictionary([text.split() for text in data['processed_text']])
bow_corpus = [bow_dictionary.doc2bow(text.split()) for text in data['processed_text']]
print(f"\nNumber of unique tokens : {len(bow_dictionary)}")


Number of unique tokens : 73869


In [8]:
tfidf_vectorizer = TfidfVectorizer(min_df=5, max_df=0.5)
tfidf_matrix = tfidf_vectorizer.fit_transform(data['processed_text'])

In [9]:
tfidf_corpus = [[(i, float(tfidf_matrix[row, i])) for i in tfidf_matrix[row].nonzero()[1]] for row in range(tfidf_matrix.shape[0])]

In [10]:
combined_corpus = []
for bow_doc, tfidf_doc in zip(bow_corpus, tfidf_corpus):
    combined = {}
    for word_id, bow_weight in bow_doc:
        combined[word_id] = bow_weight * 0.5
    for word_id, tfidf_weight in tfidf_doc:
        combined[word_id] = combined.get(word_id, 0) + tfidf_weight * 0.5
    combined_corpus.append(list(combined.items()))

In [11]:
lda_model_combined = LdaModel(corpus=combined_corpus, id2word=bow_dictionary, num_topics=4, passes=10, random_state=42)

In [12]:
print("\nExtracted Topics :")
for idx, topic in lda_model_combined.show_topics(num_topics=4, formatted=False):
    topic_words = [word for word, _ in topic]
    print(f"Topic {idx+1}: {', '.join(topic_words)}")


Extracted Topics :
Topic 1: would, one, people, dont, think, know, like, time, get, say
Topic 2: file, window, use, program, one, get, system, version, problem, thanks
Topic 3: team, game, league, season, player, play, hockey, win, period, san
Topic 4: key, encryption, chip, system, use, information, clipper, privacy, security, data


In [13]:
coherence_cv = CoherenceModel(model=lda_model_combined, texts=[text.split() for text in data['processed_text']], dictionary=bow_dictionary, coherence='c_v').get_coherence()

In [15]:
print(f"CV Coherence Score: {coherence_cv:.4f}")

CV Coherence Score: 0.6350


The CV coherence score of 0.6350 indicates that the topics generated by the LDA model are moderately coherent and reasonably interpretable. This score suggests that the words within each topic have a decent level of semantic similarity, meaning the topics are meaningful and relevant but not highly optimized.