In [14]:
#Group member:
#Dayang Nurin Syazwina Binti Ramlan (IS01081494)
#Lina Batrisyia Binti Mohd Mazlan (IS01081499)

import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from gensim import corpora
from gensim.models.ldamodel import LdaModel
from gensim.models import CoherenceModel

# Download necessary NLTK data files
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')


[nltk_data] Downloading package punkt to C:\Users\Dayang
[nltk_data]     Nurin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\Dayang
[nltk_data]     Nurin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Dayang
[nltk_data]     Nurin\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to C:\Users\Dayang
[nltk_data]     Nurin\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [15]:
df = pd.read_csv('news_dataset.csv')
texts = df['text'].dropna().tolist()


In [16]:
# Stopwords
stop_words = set(stopwords.words('english'))

# Lemmatizer
lemmatizer = WordNetLemmatizer()

# Function to preprocess text
def preprocess_text(text):
    # Tokenize
    tokens = word_tokenize(text.lower())
    # Remove stopwords and non-alphabetic characters, lemmatize
    tokens = [lemmatizer.lemmatize(token) for token in tokens if token.isalpha() and token not in stop_words]
    return tokens

# Apply preprocessing to the texts
processed_texts = [preprocess_text(text) for text in texts]


In [17]:
# Create a dictionary representation of the documents
dictionary = corpora.Dictionary(processed_texts)
# Filter out extremes to limit the number of features
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)

# Create a corpus: Term Document Frequency
corpus = [dictionary.doc2bow(text) for text in processed_texts]


In [18]:
# Set parameters for LDA
num_topics = 4  # Number of topics

# Build LDA model
lda_model = LdaModel(corpus=corpus,
                     id2word=dictionary,
                     num_topics=num_topics,
                     random_state=100,
                     update_every=1,
                     chunksize=100,
                     passes=10,
                     alpha='auto',
                     per_word_topics=True)


In [19]:
# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=processed_texts, dictionary=dictionary, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)



Coherence Score:  0.5347159504916545


In [20]:
# Print the topics
topics = lda_model.print_topics(num_words=10)
for topic in topics:
    print(topic)


(0, '0.065*"db" + 0.041*"e" + 0.034*"q" + 0.031*"k" + 0.029*"n" + 0.022*"april" + 0.022*"x" + 0.019*"f" + 0.015*"b" + 0.015*"p"')
(1, '0.017*"people" + 0.015*"government" + 0.010*"law" + 0.008*"u" + 0.007*"one" + 0.007*"state" + 0.006*"would" + 0.006*"right" + 0.005*"armenian" + 0.005*"say"')
(2, '0.019*"would" + 0.013*"one" + 0.012*"like" + 0.011*"could" + 0.011*"know" + 0.010*"get" + 0.008*"time" + 0.008*"think" + 0.007*"good" + 0.006*"much"')
(3, '0.038*"key" + 0.019*"chip" + 0.016*"encryption" + 0.015*"system" + 0.014*"use" + 0.011*"clipper" + 0.009*"information" + 0.009*"message" + 0.009*"phone" + 0.008*"algorithm"')


In [None]:
#The LDA model's topics highlight significant patterns throughout the dataset,
#despite the existence of considerable noise in the first topic. 
#Topic 1 appears to concentrate on political and legal issues, as evidenced by phrases such as "people," "government," and "law." 
#Topic 2 is more conversational and casual, including terms like "would," "like," and "know," implying general talks or viewpoints.
#Topic 3 focuses on technology and encryption, as seen by terminology like "key," "chip," and "encryption." 
#The first subject, which contains seemingly random characters and symbols, might suggest preprocessing difficulties or non-standard content. 
#The coherence score of 0.5347 indicates a moderate amount of interpretability and coherence across the topics, 
#implying that while the model has caught some relevant themes, there is still potential for improvement, particularly in addressing data noise.
