In [None]:
import numpy as np
from scipy.special import digamma
from utils import *
from LDA import *


In [None]:
# Locations of the NYT inputs read by the cells below:
# the per-document count file and the vocabulary file.
data_path = "data/nyt_data.txt"
vocab_path = "data/nyt_vocab.txt"

In [None]:
# Read the per-document counts and the vocabulary from disk.
docs_counts = load_nyt_counts(data_path)
vocab = load_vocab(vocab_path)
V = len(vocab)

# Build the token matrix used for training and keep a copy on disk
# so later cells can reuse exactly the same sample.
words = build_corpus_tokens(
    docs_counts,
    N=200,
    min_len=150,
    seed=0,
)  # [M, N]
np.save("outputs/words.npy", words)

In [None]:
# Train the LDA model on the token matrix, then write its parameters to "outputs".
lda_hyperparams = dict(
    K=25,
    N=200,  # same N that was passed to build_corpus_tokens
    max_em_iters=50,
    max_e_iters=100,
    tol_e=1e-2,
    tol_alpha=1e-4,
    seed=0,
)
lda = LDA(**lda_hyperparams)
lda.fit(words, V=len(vocab))
lda.save_params("outputs")


=== EM iter 0 ===
[E-step] iter=00 avg-delta=3.569e-02
[E-step] iter=05 avg-delta=2.695e-02
[E-step] iter=10 avg-delta=3.144e-02
[E-step] iter=15 avg-delta=3.055e-02
[E-step] iter=20 avg-delta=2.462e-02
[E-step] iter=25 avg-delta=1.835e-02
[E-step] iter=30 avg-delta=1.370e-02
[E-step] iter=35 avg-delta=1.061e-02
[E-step] converged at iter=37 (avg-delta=9.676e-03)
[M-step α] iter=00 ||Δα||=3.067e-02 α_mean=0.103
[M-step α] iter=05 ||Δα||=2.234e-03 α_mean=0.109
[M-step α] iter=10 ||Δα||=1.140e-04 α_mean=0.109
[M-step α] converged at iter=11 (||Δα||=6.327e-05)

=== EM iter 1 ===
[E-step] iter=00 avg-delta=3.958e-02
[E-step] iter=05 avg-delta=2.239e-02
[E-step] iter=10 avg-delta=1.558e-02
[E-step] iter=15 avg-delta=1.143e-02
[E-step] converged at iter=18 (avg-delta=9.721e-03)
[M-step α] iter=00 ||Δα||=2.854e-02 α_mean=0.104
[M-step α] iter=05 ||Δα||=1.120e-03 α_mean=0.0986
[M-step α] iter=10 ||Δα||=6.197e-05 α_mean=0.0983
[M-step α] converged at iter=10 (||Δα||=6.197e-05)

=== EM iter 2 =

In [None]:
# Settings for the topic listing below.
beta_path = "outputs/beta.npy"       # saved topic-word matrix
vocab_path = "data/nyt_vocab.txt"    # vocabulary file
num_topics = 25                      # how many topics to sample and print
top_n = 10                           # words shown per topic
seed = 18                            # seed passed to sample_topics

In [None]:
def top_words_for_topic(beta: np.ndarray, vocab: list[str], k: int, top_n: int = 10):
    """Return the highest-weight words of topic ``k``.

    Args:
        beta: Topic-word matrix; row ``k`` holds the word weights of topic ``k``.
        vocab: Vocabulary, indexed by the column positions of ``beta``.
        k: Row of ``beta`` to inspect.
        top_n: Maximum number of words to return. Values larger than the
            number of columns are clamped; values <= 0 yield an empty list.

    Returns:
        A list of ``(word, weight)`` tuples sorted by descending weight.
    """
    row = beta[k]
    # Clamp so argpartition never receives an out-of-range kth, and so that
    # top_n == 0 does not degenerate into the slice [-0:], i.e. "everything".
    n_keep = min(int(top_n), row.shape[0])
    if n_keep <= 0:
        return []
    # argpartition selects the n_keep largest entries without a full sort;
    # only that small candidate set is then ordered.
    candidates = np.argpartition(row, -n_keep)[-n_keep:]
    ranked = candidates[np.argsort(row[candidates])[::-1]]
    return [(vocab[i], float(row[i])) for i in ranked]

def sample_topics(K: int, num: int = 5, seed: int = 0) -> np.ndarray:
    """Pick ``num`` distinct topic indices from ``range(K)``, in ascending order.

    The draw is made without replacement from a generator seeded with
    ``seed``, so the same arguments always give the same selection.
    """
    generator = np.random.default_rng(seed)
    picked = generator.choice(K, size=num, replace=False)
    picked.sort()
    return picked

In [None]:
# Load the saved topic-word matrix and the vocabulary its columns index into.
beta = np.load(beta_path)
vocab = load_vocab(vocab_path)
K, V = beta.shape

# Print the top words for a seeded random selection of topics.
topics = sample_topics(K, num_topics, seed)
for k in topics:
    # Use a dedicated name here: `words` is the corpus token matrix built in
    # the data-loading cell and must not be overwritten by a list of tuples.
    topic_words = top_words_for_topic(beta, vocab, k, top_n)
    print(f"\nTopic {k} (top {top_n} words):")
    print(", ".join(w for w, _ in topic_words))



Topic 0 (top 10 words):
home, open, city, house, area, small, place, lot, start, street

Topic 1 (top 10 words):
price, company, market, percent, sell, american, food, change, industry, service

Topic 2 (top 10 words):
political, state, vote, republican, party, leader, support, campaign, election, issue

Topic 3 (top 10 words):
company, percent, executive, market, industry, chief, analyst, increase, business, stock

Topic 4 (top 10 words):
official, issue, law, president, question, member, pay, company, case, government

Topic 5 (top 10 words):
percent, school, program, state, child, problem, student, public, city, pay

Topic 6 (top 10 words):
company, business, stock, market, executive, sell, buy, deal, large, percent

Topic 7 (top 10 words):
country, political, military, states, power, american, government, group, official, war

Topic 8 (top 10 words):
art, life, world, child, woman, play, write, present, open, artist

Topic 9 (top 10 words):
official, government, political, leader,

In [None]:
# Settings for inspecting per-document topic mixtures.
beta_path = "outputs/beta.npy"
gamma_path = "outputs/gamma.npy"
vocab_path = "data/nyt_vocab.txt"

top_topics = 3    # topics reported per document
top_words = 10    # words reported per topic

In [None]:
beta = np.load(beta_path)   # [K, V]
gamma = np.load(gamma_path) # [M, K]
vocab = load_vocab(vocab_path)

# Normalise each row of gamma so it sums to 1, giving per-document topic proportions.
theta = gamma / gamma.sum(axis=1, keepdims=True)

# The two articles of interest are the last two rows of gamma.
doc_indices = [-2, -1]
names = ["[Music] Pink Floyd Article", "[Sport] Lakers Article"]

for d, name in zip(doc_indices, names):
    theta_d = theta[d]
    # Indices of the `top_topics` largest proportions, largest first.
    topk = np.argsort(theta_d)[-top_topics:][::-1]
    print(f"\n=== Document {d} {name} ===")
    for k in topk:
        print(f"  Topic {k}: θ={theta_d[k]:.4f}")
        # Named `topic_words` so the corpus token matrix `words` is not clobbered.
        topic_words = top_words_for_topic(beta, vocab, k, top_words)
        print("   Top words:", ", ".join(w for w, _ in topic_words))



=== Document -2 [Music] Pink Floyd Article ===
Top topics and their θ weights:
  Topic 15: θ=0.9608
   Top words: life, mrs, home, place, thing, add, live, building, house, large
  Topic 7: θ=0.0372
   Top words: country, political, military, states, power, american, government, group, official, war
  Topic 20: θ=0.0001
   Top words: states, american, country, official, government, percent, international, market, increase, world

=== Document -1 [Sport] Lakers Article ===
Top topics and their θ weights:
  Topic 12: θ=0.5701
   Top words: case, campaign, hold, state, ask, public, candidate, win, tell, play
  Topic 5: θ=0.3910
   Top words: percent, school, program, state, child, problem, student, public, city, pay
  Topic 8: θ=0.0371
   Top words: art, life, world, child, woman, play, write, present, open, artist


In [None]:
def infer_theta(words, alpha, beta, max_iters=50, tol=None):
    """Infer per-document topic proportions with LDA variational updates.

    Starting from ``gamma = alpha + N / K``, the function alternates
    ``phi ∝ beta[:, word] * exp(digamma(gamma))`` and
    ``gamma = alpha + sum_n phi`` while keeping ``alpha`` and ``beta`` fixed.

    Args:
        words: Integer array of shape [M, N]; each entry is a column index
            into ``beta``.
        alpha: 1-D array of length K (Dirichlet parameter).
        beta: Array of shape [K, V] with the topic-word weights.
        max_iters: Maximum number of update sweeps.
        tol: Optional early-stopping threshold. When given, iteration stops
            as soon as the mean absolute change in ``gamma`` between two
            sweeps drops below ``tol``. ``None`` (default) always runs
            ``max_iters`` sweeps.

    Returns:
        Array of shape [M, K]: ``gamma`` with each row normalised to sum to 1.
    """
    M, N = words.shape
    K = len(alpha)

    # The per-token topic likelihoods do not change between sweeps, so gather
    # them once: beta[:, words] is [K, M, N]; move the topic axis last to get
    # [M, N, K]. A contiguous float64 copy keeps the arithmetic identical to
    # filling a freshly allocated np.zeros((M, N, K)) buffer.
    beta_lookup = np.ascontiguousarray(
        np.moveaxis(beta[:, words], 0, -1), dtype=np.float64
    )

    gamma = np.tile(alpha + N / K, (M, 1))

    for _ in range(max_iters):
        weights = np.exp(digamma(gamma))
        phi = beta_lookup * weights[:, None, :]
        # The small constant guards against division by zero when every topic
        # assigns zero weight to a token.
        phi /= phi.sum(axis=2, keepdims=True) + 1e-12
        gamma_next = alpha + phi.sum(axis=1)

        converged = tol is not None and np.abs(gamma_next - gamma).mean() < tol
        gamma = gamma_next
        if converged:
            break

    return gamma / gamma.sum(axis=1, keepdims=True)

In [None]:
# Paths and settings for re-inferring the topic mixtures of the last two documents.
alpha_path = "outputs/alpha.npy"
beta_path = "outputs/beta.npy"
data_path = "data/nyt_data.txt"
vocab_path = "data/nyt_vocab.txt"
N = 200
seed = 0
top_topics = 3
top_words = 10

alpha = np.load(alpha_path)
beta = np.load(beta_path)
vocab = load_vocab(vocab_path)
docs = load_nyt_counts(data_path)

# Draw exactly N tokens from each of the last two documents.
last_two = docs[-2:]
rng = np.random.default_rng(seed)
sampled = []
for pairs in last_two:
    toks = expand_counts_to_tokens(pairs)
    n_tokens = toks.shape[0]
    if n_tokens < N:
        # Too few tokens: sample positions with replacement to reach N.
        idx = rng.integers(0, n_tokens, size=N)
    else:
        idx = rng.choice(n_tokens, size=N, replace=False)
    sampled.append(toks[idx])
words = np.stack(sampled, 0).astype(np.int32)  # [2,N]

theta = infer_theta(words, alpha, beta, max_iters=100, tol=1e-3)

# Defined locally so this cell does not rely on a variable from an earlier
# cell; theta has two rows, so -2 and -1 address them in order.
doc_indices = [-2, -1]
names = ["[Music] Pink Floyd Article", "[Sport] Lakers Article"]
for d, name in zip(doc_indices, names):
    theta_d = theta[d]
    topk = np.argsort(theta_d)[-top_topics:][::-1]
    print(f"\n=== Document {d} {name} ===")
    for k in topk:
        print(f"  Topic {k}: θ={theta_d[k]:.4f}")
        # Separate name so the token matrix `words` is not overwritten.
        topic_words = top_words_for_topic(beta, vocab, k, top_words)
        print("   Top words:", ", ".join(w for w, _ in topic_words))



=== Document -2 [Music] Pink Floyd Article ===
  Topic 24: θ=0.3950
   Top words: thing, place, life, book, play, turn, world, write, live, man
  Topic 21: θ=0.1964
   Top words: play, film, television, life, art, music, woman, man, performance, movie
  Topic 8: θ=0.1711
   Top words: art, life, world, child, woman, play, write, present, open, artist

=== Document -1 [Sport] Lakers Article ===
  Topic 18: θ=0.4796
   Top words: game, play, team, win, player, point, second, victory, season, final
  Topic 19: θ=0.2850
   Top words: game, play, win, thing, team, season, hit, player, lose, point
  Topic 5: θ=0.0869
   Top words: percent, school, program, state, child, problem, student, public, city, pay


In [32]:
def generate_doc(theta_row, beta, vocab, length=30, seed=0):
    """Sample a synthetic document from a topic mixture.

    A topic is drawn for every position from ``theta_row``; each position then
    draws one word index from that topic's row of ``beta``.

    Returns:
        A tuple ``(text, topics)`` where ``text`` is the space-joined sampled
        words and ``topics`` is the array of topic indices, one per position.
    """
    generator = np.random.default_rng(seed)
    num_topics, vocab_size = beta.shape

    # Draw all topic assignments first, then one word per assignment, so the
    # generator is consumed in a fixed order.
    topics = generator.choice(num_topics, size=length, p=theta_row)
    tokens = []
    for topic in topics:
        word_id = generator.choice(vocab_size, p=beta[topic])
        tokens.append(vocab[word_id])

    return " ".join(tokens), topics


In [None]:

# Paths and settings for generating synthetic documents from inferred mixtures.
alpha_path = "outputs/alpha.npy"
beta_path = "outputs/beta.npy"
data_path = "data/nyt_data.txt"
vocab_path = "data/nyt_vocab.txt"
N = 200   # tokens sampled from each source document
L = 30    # length of each generated document
seed = 0
alpha = np.load(alpha_path)
beta = np.load(beta_path)
vocab = load_vocab(vocab_path)

docs_raw = load_nyt_counts(data_path)
last_two = docs_raw[-2:]
rng = np.random.default_rng(seed)
sampled = []
# Iterate over this cell's own `last_two` rather than a `docs` variable left
# behind by a previous cell.
for pairs in last_two:
    toks = expand_counts_to_tokens(pairs)
    sampled.append(sample_document_tokens(toks, N, rng))
words = np.stack(sampled)
theta = infer_theta(words, alpha, beta)

names = ["[Music] Pink Floyd Article", "[Sport] Lakers Article"]
for i, name in enumerate(names):
    gen_text, topics = generate_doc(theta[i], beta, vocab, L, seed=42 + i)
    # One bin per topic, taken from beta's row count instead of a literal 25.
    counts = np.bincount(topics, minlength=beta.shape[0])
    print(f"Generated {L}-word doc for {name}")
    print(gen_text)
    top3 = np.argsort(counts)[-3:][::-1]
    print(f"Topics: {', '.join(f'{t}(count={counts[t]})' for t in top3)}")
    print()


Generated 30-word doc for [Music] Pink Floyd Article
sing tale water step standard allow small star study corporate victory dry murder important colleague minority foot hit aide political return play mention dinner ride tend practice writer live hold
Topics: 24(count=16), 8(count=6), 21(count=5)

Generated 30-word doc for [Sport] Lakers Article
away estimate large goal sure big past college raise final word win ball team guy add measure confidence minute team kind love medium force play contact class ball reject hot
Topics: 18(count=13), 19(count=10), 1(count=4)

