In [1]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from gensim import corpora
from gensim.models import LdaModel
from gensim.models.coherencemodel import CoherenceModel
import re
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv('news_dataset.csv')

texts = df['text'].dropna().tolist()

In [3]:
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    text = re.sub(r'\W', ' ', text)
    tokens = text.lower().split()
    tokens = [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]
    return tokens

processed_texts = [preprocess_text(text) for text in texts]

[nltk_data] Downloading package stopwords to C:\Users\Tharma
[nltk_data]     Raj\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Tharma
[nltk_data]     Raj\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to C:\Users\Tharma
[nltk_data]     Raj\AppData\Roaming\nltk_data...


In [7]:
dictionary = corpora.Dictionary(processed_texts)

dictionary.filter_extremes(no_below=15, no_above=0.5)

corpus = [dictionary.doc2bow(text) for text in processed_texts]

lda_model = LdaModel(corpus, num_topics=4, id2word=dictionary, passes=15)

In [8]:
coherence_model_lda = CoherenceModel(model=lda_model, texts=processed_texts, dictionary=dictionary, coherence='c_v')
coherence_score = coherence_model_lda.get_coherence()

print(f'Coherence Score: {coherence_score}')

Coherence Score: 0.7337524525375043


In [13]:
topics = lda_model.print_topics(num_words=10)
for topic in topics:
    print(topic)
    
#Tharma Raj(IS01081129)
#Yovesh Varma(IS01081505)
    
#Explanation
#The coherence score is a statistic that is employed in topic models, such the LDA (Latent Dirichlet Allocation) model utilized in this investigation, to assess the quality and interpretability of resulting topics. The concepts that the LDA model identified have a coherence score of 0.7337524525375043, which means that the words inside each subject are logically connected and make sense when combined. Better subject coherence is often indicated by a coherence score nearer to 1, while less significant topics are indicated by a score nearer to 0. Consequently, a score of roughly 0.73 indicates that, while there may still be space for development, the model has generated themes that are well-formed and interpretable. This coherence score suggests that the LDA model parameters and preprocessing techniques, including stopword removal and lemmatization, are successfully capturing the underlying themes in the news dataset.

(0, '0.060*"1" + 0.058*"0" + 0.040*"2" + 0.038*"x" + 0.027*"3" + 0.026*"4" + 0.025*"5" + 0.021*"6" + 0.021*"w" + 0.019*"7"')
(1, '0.607*"ax" + 0.063*"q" + 0.045*"max" + 0.029*"3" + 0.020*"p" + 0.017*"r" + 0.015*"g" + 0.014*"7" + 0.011*"n" + 0.007*"pl"')
(2, '0.011*"key" + 0.007*"use" + 0.007*"system" + 0.007*"file" + 0.006*"one" + 0.005*"edu" + 0.005*"chip" + 0.005*"program" + 0.004*"encryption" + 0.004*"window"')
(3, '0.009*"would" + 0.008*"one" + 0.008*"people" + 0.005*"think" + 0.005*"know" + 0.005*"time" + 0.004*"u" + 0.004*"like" + 0.004*"say" + 0.004*"year"')
