### ***Problem01***
---

In [1]:
### Import library & data set
# library
from google.colab import drive
drive.mount("/content/drive")
import os
import re
import time
import sys
import nltk
import random
import numpy as np
import pandas as pd
from itertools import pairwise
import matplotlib.pyplot as plt
from nltk.tokenize import word_tokenize
from sklearn.decomposition import NMF
from sklearn.datasets import load_digits
from keras.preprocessing.text import Tokenizer
from sklearn.datasets import fetch_20newsgroups
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
# Tokenizer resources for word_tokenize.
nltk.download('punkt')
# Recent NLTK releases look up the 'punkt_tab' resource inside word_tokenize
# instead of the pickled 'punkt' models; fetch both so the notebook runs on a
# fresh kernel with either an old or a new NLTK.
nltk.download('punkt_tab')
# dataset
# 20ng: restrict the corpus to four newsgroups. No `subset` argument is
# passed, so scikit-learn's default split is what gets loaded.
categories = ['alt.atheism', 'talk.religion.misc','comp.graphics', 'sci.space']
ng = fetch_20newsgroups(categories=categories)

Mounted at /content/drive


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [12]:
### 20ng dataset
# Replace every non-letter character with a space, then squeeze each run of
# whitespace down to a single space.
text = [
  re.sub(r"\s+", " ", re.sub(r"[^a-zA-Z]", " ", raw_post))
  for raw_post in ng.data
]
df = pd.DataFrame(text, columns=["text"])

# normalization: trim the ends and lowercase in one chained pass
df["text"] = df["text"].str.strip().str.lower()
df_20ng = np.array(df["text"])

# tokenization: one list of tokens per post
docs = [list(word_tokenize(str(post))) for post in df_20ng]
# only the first 100 tokenized posts are handed to the sampler
X = docs[:100]

In [17]:
### gibbs
def gibbs_sampling_lda(documents, num_topics, num_iterations, alpha=1.0, beta=1.0, seed=None):
  """Estimate LDA topic-word distributions with collapsed Gibbs sampling.

  Args:
    documents: sequence of tokenized documents (each a list of hashable,
      sortable tokens such as strings).
    num_topics: number of topics to fit; must be at least 1.
    num_iterations: number of full sweeps over every token.
    alpha: additive smoothing on the document-topic counts. The default 1.0
      matches the previously hard-coded "+ 1".
    beta: additive smoothing on the topic-word counts. The default 1.0
      matches the previously hard-coded "+ 1".
    seed: optional non-negative int (< 2**32). When given, private random
      generators are seeded with it so the run is repeatable. When None, the
      module-level `random` and `np.random` streams are used, exactly as
      before.

  Returns:
    np.ndarray of shape (num_topics, vocabulary size). Row t is the smoothed
    word distribution of topic t and sums to 1; column j corresponds to
    `vocab[j]`.

  Raises:
    ValueError: if num_topics < 1, or alpha / beta is not positive.

  Side effects:
    Rebinds the module-level name `vocab` to the sorted list of distinct
    tokens, so callers can map column indices back to words.
  """
  # Kept as a global on purpose: code outside this function reads `vocab`.
  global vocab

  if num_topics < 1:
    raise ValueError("num_topics must be at least 1")
  if alpha <= 0 or beta <= 0:
    raise ValueError("alpha and beta must be positive")

  # Seeded runs get private generators; unseeded runs keep drawing from the
  # module-level streams.
  py_rng = random if seed is None else random.Random(seed)
  np_rng = np.random if seed is None else np.random.RandomState(seed)

  # Vocabulary: every distinct token, in sorted order.
  vocab = sorted({word for doc in documents for word in doc})

  # Dict lookup instead of list.index: O(1) per token rather than a linear
  # scan of the vocabulary. Indices are identical because vocab is unique.
  word_to_index = {word: idx for idx, word in enumerate(vocab)}
  doc_indices = [[word_to_index[word] for word in doc] for doc in documents]

  # Count matrices used by the sampler.
  num_docs = len(documents)
  num_words = len(vocab)
  doc_topic_counts = np.zeros((num_docs, num_topics))    # tokens of doc d assigned to topic t
  topic_word_counts = np.zeros((num_topics, num_words))  # times word w is assigned to topic t
  topic_counts = np.zeros(num_topics)                    # total tokens assigned to topic t

  # Random initialisation: every token position gets a uniformly random
  # topic, and the three count tables are filled in to match.
  doc_assignments = [[py_rng.randint(0, num_topics - 1) for _ in doc] for doc in documents]
  for d, word_ids in enumerate(doc_indices):
    for i, w in enumerate(word_ids):
      t = doc_assignments[d][i]
      doc_topic_counts[d, t] += 1
      topic_word_counts[t, w] += 1
      topic_counts[t] += 1

  # Total smoothing mass added to each topic's word-count denominator.
  vocab_mass = num_words * beta

  for _ in range(num_iterations):
    for d, word_ids in enumerate(doc_indices):
      for i, w in enumerate(word_ids):
        t = doc_assignments[d][i]

        # Take the current token out of the counts before resampling it.
        doc_topic_counts[d, t] -= 1
        topic_word_counts[t, w] -= 1
        topic_counts[t] -= 1

        # Unnormalised posterior over topics for this token:
        # (count of word w in topic + beta) * (count of topic in doc d + alpha)
        #   / (tokens in topic + vocabulary size * beta)
        topic_probs = (topic_word_counts[:, w] + beta) * (doc_topic_counts[d] + alpha) / (topic_counts + vocab_mass)
        new_t = np_rng.choice(num_topics, p=topic_probs / topic_probs.sum())

        # Put the token back under its newly sampled topic.
        doc_topic_counts[d, new_t] += 1
        topic_word_counts[new_t, w] += 1
        topic_counts[new_t] += 1
        doc_assignments[d][i] = new_t

  # Smoothed, row-normalised topic-word distribution.
  topic_word_probs = (topic_word_counts + beta) / (topic_counts[:, np.newaxis] + vocab_mass)

  return topic_word_probs

In [15]:
# hyperparameters
num_topics = 5
num_iterations = 50
TOP_N_WORDS = 5  # how many highest-probability words to print per topic
SEED = 0

# The sampler draws from both the `random` module and `np.random`, so seed
# both streams to make this cell repeatable across re-runs.
random.seed(SEED)
np.random.seed(SEED)

# implement lda
topic_word_probs = gibbs_sampling_lda(X, num_topics, num_iterations)

# result
# `vocab` is the module-level word list that gibbs_sampling_lda publishes via
# `global vocab`; column j of topic_word_probs corresponds to vocab[j].
for t, topic_probs in enumerate(topic_word_probs):
    top_words = [vocab[i] for i in np.argsort(topic_probs)[::-1][:TOP_N_WORDS]]
    print("Topic {}: {}".format(t+1, top_words))

Topic 1: ['the', 'of', 'to', 'a', 'is']
Topic 2: ['de', 'van', 'het', 'een', 'en']
Topic 3: ['dm', 'koresh', 'tek', 'evil', 'almanac']
Topic 4: ['graphics', 'nasa', 'gov', 'systems', 'edu']
Topic 5: ['scodal', 'hendrix', 'clementine', 'information', 'email']
