In [1]:
# read the files
import glob
directory_path = "negative_DeepSeek_texts"
text_files = glob.glob(f"{directory_path}/*.txt")

texts = []

for filepath in text_files:
    with open(filepath, 'r', encoding='utf-8') as f:
        texts.append(f.read())

2️⃣ Preprocess the Texts
We need to lowercase, remove punctuation/numbers, tokenize, remove stopwords, and lemmatize.

In [3]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

stop_words = set(stopwords.words('english'))
custom_stopwords = {'deepseek','ai','chinese','say'}  # noise words
lemmatizer = WordNetLemmatizer()

texts_tokens = []

for text in texts:
    text = text.lower()                          # lowercase
    text = re.sub(r'[^a-z\s]', '', text)         # keep only letters
    tokens = nltk.word_tokenize(text)            # tokenize
    tokens = [t for t in tokens if t not in stop_words and t not in custom_stopwords]
    tokens = [lemmatizer.lemmatize(t) for t in tokens]  # lemmatize
    
    if tokens:
        texts_tokens.append(tokens)

print("Example tokens from first doc:", texts_tokens[0][:50])


[nltk_data] Downloading package punkt to /Users/lulu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/lulu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/lulu/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Example tokens from first doc: ['julian', 'gewirtz', 'publishedjul', 'print', 'page', 'writer', 'former', 'senior', 'director', 'china', 'taiwan', 'affair', 'white', 'house', 'national', 'security', 'council', 'senior', 'research', 'scholar', 'columbia', 'school', 'international', 'public', 'affair', 'phone', 'call', 'donald', 'trump', 'day', 'inauguration', 'sam', 'altman', 'chief', 'executive', 'openai', 'told', 'incoming', 'president', 'u', 'would', 'achieve', 'humanlevel', 'artificial', 'general', 'intelligence', 'term', 'office', 'altman', 'framed']


Now we have texts_tokens = list of lists, where each inner list is a document’s cleaned tokens.

3️⃣ Create Dictionary and Corpus for LDA

In [5]:
from gensim import corpora

dictionary = corpora.Dictionary(texts_tokens)
dictionary.filter_extremes(no_below=2, no_above=0.5)  # optional filtering

corpus = [dictionary.doc2bow(text) for text in texts_tokens]

print("Number of unique tokens:", len(dictionary))
print("Number of documents:", len(corpus))


Number of unique tokens: 379
Number of documents: 11


4️⃣ Train LDA Topic Model

In [7]:
from gensim.models import LdaModel

num_topics = 2  # adjust based on your corpus size

lda_model = LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=num_topics,
    random_state=42,
    passes=10,
    alpha='auto',
    per_word_topics=True
)

# Print topics
for idx, topic in lda_model.print_topics(num_words=10):
    print(f"Topic {idx+1}: {topic}")


Topic 1: 0.021*"version" + 0.020*"taiwan" + 0.014*"response" + 0.012*"technology" + 0.012*"however" + 0.011*"answer" + 0.010*"word" + 0.010*"political" + 0.009*"app" + 0.008*"video"
Topic 2: 0.021*"app" + 0.012*"chatgpt" + 0.011*"trump" + 0.011*"monday" + 0.011*"company" + 0.010*"month" + 0.009*"south" + 0.009*"last" + 0.009*"data" + 0.008*"rival"


5️⃣ (Optional) Visualize Topics
If you want interactive topic visualization:

In [9]:
import pyLDAvis
import pyLDAvis.gensim

pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, dictionary)
vis


In [50]:
import tomotopy as tp
import little_mallet_wrapper
import seaborn
import glob
from pathlib import Path
import pandas as pd

In [64]:
directory = "negative_DeepSeek_texts"

In [66]:
files = glob.glob(f"{directory}/*.txt")

In [68]:
training_data = []
original_texts = []
titles = []

for file in files:
    text = open(file, encoding='utf-8').read()
    processed_text = little_mallet_wrapper.process_string(text, numbers='remove')
    training_data.append(processed_text)
    original_texts.append(text)
    titles.append(Path(file).stem)
print(titles)


['Global_AI_rivalry_is_a_dangerous_game', "What_questions_will_China's_DeepSeek_not_answer?_–", "DeepSeek:_Is_China's_AI_tool_as_good_as_it_seems?", 'First_Thing:_Donald_Trump_calls_China’s_DeepSeek_A', 'South_Korea_Bans_Downloads_of_DeepSeek,_the_Chines', 'DeepSeek_has_ripped_away_AI’s_veil_of_mystique._Th', 'Diving_into_DeepSeek:_inside_the_7_February_Guardi']


In [70]:
len(training_data), len(original_texts), len(titles)

(7, 7, 7)

In [77]:
# Number of topics to return
num_topics = 3
# Numer of topic words to print out
num_topic_words = 10
# Intialize the model
model = tp.LDAModel(k=num_topics)

# Add each document to the model, after splitting it up into words
for text in training_data:
    model.add_doc(text.strip().split())
    
print("Topic Model Training...\n\n")
# Iterate over the data 10 times
iterations = 10
for i in range(0, 100, iterations):
    model.train(iterations)
    print(f'Iteration: {i}\tLog-likelihood: {model.ll_per_word}')

print("\nTopic Model Results:\n\n")
# Print out top 10 words for each topic
topics = []
topic_individual_words = []
for topic_number in range(0, num_topics):
    topic_words = ' '.join(word for word, prob in model.get_topic_words(topic_id=topic_number, top_n=num_topic_words))
    topics.append(topic_words)
    topic_individual_words.append(topic_words.split())
    print(f"✨Topic {topic_number}✨\n\n{topic_words}\n")

Topic Model Training...


Iteration: 0	Log-likelihood: -8.995984777730174
Iteration: 10	Log-likelihood: -8.960639241389044
Iteration: 20	Log-likelihood: -8.94466734824113
Iteration: 30	Log-likelihood: -8.940468452542062
Iteration: 40	Log-likelihood: -8.931678306548205
Iteration: 50	Log-likelihood: -8.910380112384805
Iteration: 60	Log-likelihood: -8.925085632143041
Iteration: 70	Log-likelihood: -8.882198243062712
Iteration: 80	Log-likelihood: -8.888182460949682
Iteration: 90	Log-likelihood: -8.887349981822982

Topic Model Results:


✨Topic 0✨

china chinese version english political government human part responses new

✨Topic 1✨

taiwan however agi one said asked intelligence even word trump

✨Topic 2✨

deepseek technology chatbot questions response chatgpt app answers openai time

