# Lab Assignment 3

## Mohammed Samir Ali (SW01080809)
## Muhammad Farish Naufal Bin Norzali (SW01081139)

### Import Libraries & Resources

In [7]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from gensim import corpora
from gensim.models import LdaModel
from gensim.models.coherencemodel import CoherenceModel

# Download NLTK resources
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/hamody/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/hamody/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/hamody/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

### Data Reading

In [8]:
# Read the data (use only the ‘text’ column)
df = pd.read_csv('news_dataset.csv', usecols=['text'])

### Perform text pre-processing

In [9]:
# Remove null values
df.dropna(subset=['text'], inplace=True)

# Initialize stopwords, lemmatizer
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    # Tokenize the text into words and convert to lowercase
    tokens = word_tokenize(text.lower())
    # Filter out non-alphanumeric tokens and remove stopwords
    tokens = [token for token in tokens if token.isalnum() and token not in stop_words]
    # Lemmatize each tokenn
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    return tokens

# Apply preprocessing to each document in the dataset
df['processed_text'] = df['text'].apply(preprocess_text)

### LDA Model Training

In [10]:
# Create a Gensim Dictionary object from the preprocessed documents
dictionary = corpora.Dictionary(df['processed_text'])
# Convert each preprocessed document into a bag-of-words representation using the dictionary
corpus = [dictionary.doc2bow(doc) for doc in df['processed_text']]

# Train an LDA model on the corpus with 4 topics using Gensim's LdaModel class
lda_model = LdaModel(corpus, num_topics=4, id2word=dictionary, passes=15)


### Coherence Score Calculation

In [11]:
# Calculate the coherence score for the LDA model
coherence_model_lda = CoherenceModel(model=lda_model, texts=df['processed_text'], dictionary=dictionary, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()

# Display the Topic Coherence Score
print(f'Topic Coherence Score (C_V): {coherence_lda:.4f}')

Topic Coherence Score (C_V): 0.6691


### Results Interpretation

In [13]:
# Print the top terms for each topic
print("\nTop Terms for Each Topic:")
for idx, topic in lda_model.print_topics():
    print(f"Topic {idx}:")
    terms = [term.strip() for term in topic.split("+")]
    for term in terms:
        weight, word = term.split("*")
        print(f"- {word.strip()} (weight: {weight.strip()})")
    print()


Top Terms for Each Topic:
Topic 0:
- "1" (weight: 0.032)
- "max" (weight: 0.026)
- "0" (weight: 0.026)
- "q" (weight: 0.023)
- "2" (weight: 0.021)
- "g" (weight: 0.017)
- "r" (weight: 0.016)
- "7" (weight: 0.016)
- "3" (weight: 0.014)
- "p" (weight: 0.013)

Topic 1:
- "x" (weight: 0.017)
- "key" (weight: 0.011)
- "file" (weight: 0.008)
- "use" (weight: 0.006)
- "system" (weight: 0.006)
- "program" (weight: 0.005)
- "encryption" (weight: 0.005)
- "chip" (weight: 0.005)
- "information" (weight: 0.005)
- "available" (weight: 0.004)

Topic 2:
- "db" (weight: 0.010)
- "one" (weight: 0.007)
- "would" (weight: 0.006)
- "get" (weight: 0.005)
- "like" (weight: 0.005)
- "drive" (weight: 0.005)
- "window" (weight: 0.005)
- "use" (weight: 0.005)
- "problem" (weight: 0.005)
- "know" (weight: 0.004)

Topic 3:
- "would" (weight: 0.008)
- "one" (weight: 0.007)
- "people" (weight: 0.007)
- "think" (weight: 0.005)
- "know" (weight: 0.004)
- "time" (weight: 0.004)
- "say" (weight: 0.004)
- "like" (weigh

### Interpretation of the coherence score
Interpretation of the Coherence Score:
The coherence score of 0.6691 indicates the degree of semantic similarity between the top words in each topic generated by the LDA model. A higher coherence score generally suggests that the topics are more interpretable and meaningful to humans. For practical applications, a higher coherence score is desirable as it implies that the topics are more coherent and easier to understand, leading to more useful insights from the data. In this case, the score provides a quantitative measure of the quality of the topics derived from the news articles.
