step 1: read the raw corpus, 

In [1]:
import os

folder_path = "./corpus_deepseek/"
documents = []

for filename in os.listdir(folder_path):
    if filename.endswith(".txt") and filename != "all_texts.txt":  # ignore combined file
        file_path = os.path.join(folder_path, filename)
        with open(file_path, 'r', encoding='utf-8', errors='ignore') as file:
            documents.append(file.read())

print(f"Loaded {len(documents)} documents for topic modeling")
print("First document preview:\n", documents[0][:500])


Loaded 120 documents for topic modeling
First document preview:
 Try unlimited access Only Skr10 for 4 weeks

Then Skr739 per month. Complete digital access to quality FT journalism on any device. Cancel anytime during your trial.


2️⃣ Preprocess the Texts
We need to lowercase, remove punctuation/numbers, tokenize, remove stopwords, and lemmatize.

In [3]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

stop_words = set(stopwords.words('english'))
custom_stopwords = {'deepseek','ai','chinese','say'}  # noise words
lemmatizer = WordNetLemmatizer()

docs_tokens = []

for text in documents:
    text = text.lower()                          # lowercase
    text = re.sub(r'[^a-z\s]', '', text)         # keep only letters
    tokens = nltk.word_tokenize(text)            # tokenize
    tokens = [t for t in tokens if t not in stop_words and t not in custom_stopwords]
    tokens = [lemmatizer.lemmatize(t) for t in tokens]  # lemmatize
    
    if tokens:
        docs_tokens.append(tokens)

print("Example tokens from first doc:", docs_tokens[0][:50])


[nltk_data] Downloading package punkt to /Users/lulu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/lulu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/lulu/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Example tokens from first doc: ['try', 'unlimited', 'access', 'skr', 'week', 'skr', 'per', 'month', 'complete', 'digital', 'access', 'quality', 'ft', 'journalism', 'device', 'cancel', 'anytime', 'trial']


Now we have docs_tokens = list of lists, where each inner list is a document’s cleaned tokens.

3️⃣ Create Dictionary and Corpus for LDA

In [5]:
from gensim import corpora

dictionary = corpora.Dictionary(docs_tokens)
dictionary.filter_extremes(no_below=2, no_above=0.5)  # optional filtering

corpus = [dictionary.doc2bow(text) for text in docs_tokens]

print("Number of unique tokens:", len(dictionary))
print("Number of documents:", len(corpus))


Number of unique tokens: 5881
Number of documents: 119


4️⃣ Train LDA Topic Model

In [7]:
from gensim.models import LdaModel

num_topics = 5  # adjust based on your corpus size

lda_model = LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=num_topics,
    random_state=42,
    passes=10,
    alpha='auto',
    per_word_topics=True
)

# Print topics
for idx, topic in lda_model.print_topics(num_words=10):
    print(f"Topic {idx+1}: {topic}")


Topic 1: 0.007*"use" + 0.006*"data" + 0.005*"government" + 0.004*"make" + 0.004*"nvidia" + 0.004*"like" + 0.004*"trump" + 0.004*"question" + 0.004*"world" + 0.004*"firm"
Topic 2: 0.006*"question" + 0.006*"response" + 0.005*"answer" + 0.005*"like" + 0.005*"image" + 0.005*"asked" + 0.004*"say" + 0.004*"chatbot" + 0.004*"taiwan" + 0.004*"world"
Topic 3: 0.005*"monday" + 0.005*"nvidia" + 0.005*"power" + 0.004*"trump" + 0.004*"firm" + 0.004*"american" + 0.004*"last" + 0.004*"investment" + 0.003*"billion" + 0.003*"per"
Topic 4: 0.004*"report" + 0.004*"last" + 0.004*"government" + 0.004*"firm" + 0.004*"use" + 0.003*"according" + 0.003*"may" + 0.003*"liang" + 0.003*"information" + 0.003*"would"
Topic 5: 0.008*"data" + 0.006*"newsletter" + 0.005*"r" + 0.005*"may" + 0.004*"google" + 0.004*"use" + 0.004*"bn" + 0.004*"openais" + 0.004*"privacy" + 0.004*"information"


5️⃣ (Optional) Visualize Topics
If you want interactive topic visualization:

In [9]:
import pyLDAvis
import pyLDAvis.gensim

pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, dictionary)
vis
