In [None]:
%load_ext autoreload
%autoreload 2

from data.dataloader import DataLoader

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [1]:
# Shell escape: print the kernel's current working directory.
!pwd

/home/haiyao/kefan_workspace/NLP_A2_2025


In [None]:
import pandas as pd
# Read the dataset into a DataFrame; 'dataset.csv' is a relative path.
data = pd.read_csv('dataset.csv')

In [None]:
# Build a `cleaned_title` column by running every title through
# `text_clean(title, nlp)`.
# NOTE(review): neither `text_clean` nor `nlp` is defined in the cells visible
# here - confirm both are in scope on a fresh kernel.
data['cleaned_title'] = data['title'].apply(text_clean, args=(nlp,))

In [None]:
# generate wordcloud for the title in posts

from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Concatenate all cleaned titles into one string and render it as a word cloud.
# NOTE(review): `data_1` is not defined in the cells visible here (the
# `cleaned_title` column above is created on `data`) - confirm where it comes from.
text = " ".join(data_1['cleaned_title'])
wordc = WordCloud(
    background_color='white',
    width=800,
    height=600,
).generate(text)

# Show the cloud as an image with the axes hidden.
plt.figure()
plt.imshow(wordc, interpolation="bilinear")
plt.axis("off")
plt.show()

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Score every title term by inverse document frequency (IDF) and collect the
# low-IDF (i.e. very common) terms as corpus-specific stop words.
#
# No `max_df` cap is applied: a cap removes the most frequent terms from the
# vocabulary before they are scored, so the terms with the very lowest IDF
# could never reach the stop-word list built below.
vectorizer = TfidfVectorizer()
tfidf = vectorizer.fit_transform(data['cleaned_title'])

feature_names = vectorizer.get_feature_names_out()
idf_scores = vectorizer.idf_

# create a dictionary mapping each term to its IDF score.
idf_dict = dict(zip(feature_names, idf_scores))

# sort the terms based on the IDF scores (lowest first).
sorted_terms = sorted(idf_dict.items(), key=lambda item: item[1])

# custom stop words: every term whose IDF score is at most 4 (threshold <= 4),
# kept in ascending-IDF order.
custom_stop_words = [term for term, score in sorted_terms if score <= 4]

# Second stop-word pass, this time against the corpus-specific list.
def custom_stop_word_removal(text):
    """Return `text` with every token listed in `custom_stop_words` removed.

    The text is split with `word_tokenize`; each token is lower-cased for the
    membership check only, so surviving tokens keep their original casing.
    The kept tokens are re-joined with single spaces.

    NOTE(review): `word_tokenize` is not imported in the cells visible here -
    confirm it is in scope on a fresh kernel.
    """
    kept_tokens = []
    for token in word_tokenize(text):
        if token.lower() in custom_stop_words:
            continue
        kept_tokens.append(token)
    return " ".join(kept_tokens)

# Overwrite the cleaned titles with their stop-word-filtered version.
# NOTE(review): the stop-word list is derived from `data['cleaned_title']`
# but applied to `data_1`, which is not defined in the visible cells - confirm.
data_1['cleaned_title'] = data_1['cleaned_title'].apply(custom_stop_word_removal)

In [None]:
# Word cloud of the titles after the custom stop-word removal.
# Reads `cleaned_title` - the column the stop-word pass writes back to -
# rather than `cleaned_title_x`, which no visible cell creates or updates.
# NOTE(review): `data_1` itself is not defined in the visible cells - confirm.
updated_text = " ".join(data_1['cleaned_title'])
updated_wordc = WordCloud(width=800, height=600, background_color='white').generate(updated_text)

# Show the cloud as an image with the axes hidden.
plt.figure()
plt.imshow(updated_wordc, interpolation="bilinear")
plt.axis("off")
plt.show()

In [None]:
# topic modelling - LDA

from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

# Fit a 10-topic LDA model on the `cleaned_text` column and print the ten
# highest-weighted words of each topic.
# NOTE(review): `data_1` and its `cleaned_text` column are not created in the
# cells visible here - confirm they exist on a fresh kernel.
LDA_vectorizer = CountVectorizer(max_df=0.85)
# Despite its name, this matrix holds raw term counts (CountVectorizer), not
# TF-IDF weights; the name is kept so other cells that use it still work.
LDA_tfidf = LDA_vectorizer.fit_transform(data_1['cleaned_text'])

# Fixed seed so the fitted topics are the same on every run.
lda = LatentDirichletAllocation(n_components=10, random_state=42)
lda.fit(LDA_tfidf)

# Note: this rebinds `feature_names`, previously the TF-IDF title vocabulary.
feature_names = LDA_vectorizer.get_feature_names_out()
for topic_idx, topic in enumerate(lda.components_):
    # argsort is ascending, so reverse it to list the strongest word first.
    top_words = [feature_names[i] for i in topic.argsort()[::-1][:10]]
    print(f"Topic {topic_idx}: {' '.join(top_words)}")

In [None]:
# K-means clustering with tf-idf

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

# Vectorise the `cleaned_text` column as TF-IDF, ignoring terms that appear
# in more than 85% of the documents.
# NOTE(review): `data_1` and its `cleaned_text` column are not created in the
# cells visible here - confirm they exist on a fresh kernel.
k_vectorizer = TfidfVectorizer(max_df=0.85)
k_tfidf = k_vectorizer.fit_transform(data_1['cleaned_text'])

# Perform k-means clustering with 10 clusters; the seed is fixed so the
# cluster assignment is the same on every run.
kmeans = KMeans(n_clusters=10, random_state=42)
kmeans.fit(k_tfidf)


# For each cluster centre, order the term indices by descending weight.
order_centroids = kmeans.cluster_centers_.argsort()[:, ::-1]
terms = k_vectorizer.get_feature_names_out()

# Print the ten strongest terms per cluster. The loop bound comes from the
# fitted model, so it cannot drift from `n_clusters` above.
for i in range(kmeans.n_clusters):
    top_terms = [terms[ind] for ind in order_centroids[i, :10]]
    print(f"Cluster {i}: {' '.join(top_terms)}")