In [10]:
# read the files
import glob
directory_path = "negative_DeepSeek_texts"
text_files = glob.glob(f"{directory_path}/*.txt")

texts = []

for filepath in text_files:
    with open(filepath, 'r', encoding='utf-8') as f:
        texts.append(f.read())

2️⃣ Preprocess the Texts
We need to lowercase, remove punctuation/numbers, tokenize, remove stopwords, and lemmatize.

In [20]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

stop_words = set(stopwords.words('english'))
custom_stopwords = {'deepseek','ai','chinese','say'}  # noise words
lemmatizer = WordNetLemmatizer()

texts_tokens = []

for text in texts:
    text = text.lower()                          # lowercase
    text = re.sub(r'[^a-z\s]', '', text)         # keep only letters
    tokens = nltk.word_tokenize(text)            # tokenize
    tokens = [t for t in tokens if t not in stop_words and t not in custom_stopwords]
    tokens = [lemmatizer.lemmatize(t) for t in tokens]  # lemmatize
    
    if tokens:
        texts_tokens.append(tokens)

print("Example tokens from first doc:", texts_tokens[0][:50])


Example tokens from first doc: ['julian', 'gewirtz', 'publishedjul', 'print', 'page', 'writer', 'former', 'senior', 'director', 'china', 'taiwan', 'affair', 'white', 'house', 'national', 'security', 'council', 'senior', 'research', 'scholar', 'columbia', 'school', 'international', 'public', 'affair', 'phone', 'call', 'donald', 'trump', 'day', 'inauguration', 'sam', 'altman', 'chief', 'executive', 'openai', 'told', 'incoming', 'president', 'u', 'would', 'achieve', 'humanlevel', 'artificial', 'general', 'intelligence', 'term', 'office', 'altman', 'framed']


[nltk_data] Downloading package punkt to /Users/lulu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/lulu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/lulu/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Now we have texts_tokens = list of lists, where each inner list is a document’s cleaned tokens.

3️⃣ Create Dictionary and Corpus for LDA

In [22]:
from gensim import corpora

dictionary = corpora.Dictionary(texts_tokens)
dictionary.filter_extremes(no_below=2, no_above=0.5)  # optional filtering

corpus = [dictionary.doc2bow(text) for text in texts_tokens]

print("Number of unique tokens:", len(dictionary))
print("Number of documents:", len(corpus))


Number of unique tokens: 243
Number of documents: 7


4️⃣ Train LDA Topic Model

In [24]:
from gensim.models import LdaModel

num_topics = 5  # adjust based on your corpus size

lda_model = LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=num_topics,
    random_state=42,
    passes=10,
    alpha='auto',
    per_word_topics=True
)

# Print topics
for idx, topic in lda_model.print_topics(num_words=10):
    print(f"Topic {idx+1}: {topic}")


Topic 1: 0.052*"version" + 0.038*"response" + 0.024*"word" + 0.021*"political" + 0.021*"asked" + 0.016*"xi" + 0.014*"topic" + 0.014*"let" + 0.014*"party" + 0.014*"right"
Topic 2: 0.004*"human" + 0.004*"agi" + 0.004*"fear" + 0.004*"time" + 0.004*"less" + 0.004*"silicon" + 0.004*"power" + 0.004*"altman" + 0.004*"leading" + 0.004*"first"
Topic 3: 0.004*"version" + 0.004*"agi" + 0.004*"human" + 0.004*"power" + 0.004*"altman" + 0.004*"fear" + 0.004*"response" + 0.004*"less" + 0.004*"war" + 0.004*"leading"
Topic 4: 0.053*"app" + 0.021*"company" + 0.020*"response" + 0.015*"good" + 0.015*"monday" + 0.014*"made" + 0.014*"version" + 0.014*"web" + 0.014*"like" + 0.014*"look"
Topic 5: 0.030*"agi" + 0.019*"fear" + 0.018*"human" + 0.016*"time" + 0.016*"less" + 0.014*"first" + 0.013*"leading" + 0.013*"may" + 0.013*"power" + 0.012*"altman"


5️⃣ (Optional) Visualize Topics
If you want interactive topic visualization:

In [26]:
import pyLDAvis
import pyLDAvis.gensim

pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, dictionary)
vis


| Topic | Theme/Label                    | Keywords (Top Words)                             |
| ----- | ------------------------------ | ------------------------------------------------ |
| 1     | Government & Tech Industry     | use, data, government, nvidia, trump             |
| 2     | Chatbots & AI Interaction      | question, response, answer, chatbot, taiwan      |
| 3     | Tech Business & US Politics    | nvidia, investment, billion, trump, american     |
| 4     | Government Reports & Tech News | report, government, firm, information, according |
| 5     | Data Privacy & Big Tech        | data, google, privacy, openais, information      |
