In [451]:
import pandas as pd
import numpy as np
from scipy.special import gammaln, psi
import math
import random
from numpy.random import multinomial
from numpy.random import randint
from sklearn.feature_extraction.text import CountVectorizer

In [452]:
stopwords =['.', '$', '#', '[', ']', '(', ')', '|', '*', 
        ':', '=', '/', '>', '<', '+', '{', '}', ',', '?', '&', '-', '@', "'",'%', '^']

In [467]:
def data_preprocess(corpus, drop_n_freq=1000):
    """preprocess code refer KaffuChino's implement for its efficiency: https://www.zhihu.com/people/hongkongjournalist/posts
    """
    num_docs = len(corpus)
    all_words = list()
    raw_docs = list()
    # Remove meaningless symbols, get all words of which length > 2
    for i in range(num_docs):
        doc_text = corpus[i].lower()
        for s in stopwords:
            doc_text = doc_text.replace(s, ' ')
        raw_docs.append(list())
        for word in doc_text.split():
            if len(word) > 2:
                raw_docs[-1].append(word)
        all_words.extend(raw_docs[-1])

    # Get global word frequency and sort, remove top n frequent words, which is likely to be meanless.
    unique_words = list(set(all_words))
    word_frequency = dict()
    for wd in unique_words:
        word_frequency[wd] = 0
    for wd in all_words:
        word_frequency[wd] += 1
    unique_words.sort(key=lambda wd: word_frequency[wd], reverse=True)
    vocabulary_list = unique_words[drop_n_freq:]

    # Each word in the vocabulary will be mapped to an positive integer for convenience of indexing.
    # Thus to look up a certain word in the vocabulary takes O(1) time instead of O(log|V|)
    vocabulary_idx = dict()
    for t, wd in enumerate(vocabulary_list):
        vocabulary_idx[wd] = t
    vocabulary_set = set(vocabulary_list)

    # Map words in the documents to integers.
    # Append only words in the vocabulary set. 
    data_set_mapped = []
    for raw_doc in raw_docs:
        data_set_mapped.append([])
        for word in raw_doc:
            if word in vocabulary_set:
                data_set_mapped[-1].append(vocabulary_idx[word])

    return data_set_mapped, vocabulary_list
    
    

In [468]:
import numpy as np


class LDAmodel:
    def __init__(self, corpus, words, K, alpha=1, gamma=1, iter_step=200):
        # self.W = self.preprocess(corpus)
        self.X = corpus
        self.W = words
        self.N_D = len(self.X)  # num of docs
        self.N_W = len(self.W)  # num of words
        self.N_K = K  # num of topics
        self.alpha = alpha # Dirichlet priors
        self.gamma = gamma # Dirichlet priors
        self.iter = iter_step
        self.L = np.array([len(x) for x in self.X])
        self.initialization()


    def initialization(self):
        # Z: word topic
        self.Z = np.zeros(shape=[self.N_D, self.N_W], dtype=int)
        self.n_ik = np.zeros(shape=[self.N_D, self.N_K], dtype=int)
        self.n_wk = np.zeros(shape=[self.N_W, self.N_K], dtype=int)
        self.n_k = np.zeros(shape=self.N_K, dtype=int)
        for i in range(self.N_D):
            for w in self.X[i]:
                # Randomly assign topic to each word
                k = np.random.randint(self.N_K)
                self.Z[i, w] = k
                self.n_ik[i, k] += 1
                self.n_wk[w, k] += 1
                self.n_k[k] += 1
    
    def p_z_computation(self, w, i):
        tmp1 = (self.n_wk[w, :] + self.gamma) / (self.n_k + self.N_W * self.gamma)
        tmp2 = (self.n_ik[i, :] + self.alpha) / (self.L[i] + self.N_K * self.alpha)
        p_z_iw = tmp1 * tmp2
        p_z_iw /= np.sum(p_z_iw)
        return p_z_iw


    def GibbsSampling(self):
        iter_steps = self.iter
        for it in range(iter_steps):
            print("iter step:", it)
            for i in range(self.N_D):
                for w in self.X[i]:
                    k = self.Z[i, w]
                    self.n_ik[i, k] -= 1
                    self.n_wk[w, k] -= 1
                    self.n_k[k] -= 1
                    # p(z_iw | .)
                    p_z_iw = self.p_z_computation(w, i)
                    # Resample word topic assignment
                    k = np.random.multinomial(1, p_z_iw).argmax()
                    # Update counts
                    self.Z[i, w] = k
                    self.n_ik[i, k] += 1
                    self.n_wk[w, k] += 1
                    self.n_k[k] += 1
    
    def doc_topic_dist(self):
        tmp1 = self.n_ik + self.alpha
        tmp2 = np.sum(self.n_ik + self.alpha, axis=1)[:, np.newaxis]
        return tmp1 / tmp2
    
    def topic_word_dist(self):
        tmp1 = (self.n_wk + self.gamma).T
        tmp2 = np.sum(tmp1, axis=1)[:, np.newaxis]
        return tmp1 / tmp2
    


df = pd.read_csv("./amazon_reviews.txt", sep="\t")
texts = df["REVIEW_TEXT"].values.tolist()
data_set_mapped, vocabulary_list = data_preprocess(corpus=texts)
num_docs = len(data_set_mapped)
num_vocabulary = len(vocabulary_list)


k_topics = 5
lda = LDAmodel(corpus=data_set_mapped, words=vocabulary_list, K=k_topics)
lda.GibbsSampling()
theta = lda.doc_topic_dist()
beta = lda.topic_word_dist()



iter step: 0
iter step: 1
iter step: 2
iter step: 3
iter step: 4
iter step: 5
iter step: 6
iter step: 7
iter step: 8
iter step: 9
iter step: 10
iter step: 11
iter step: 12
iter step: 13
iter step: 14
iter step: 15
iter step: 16
iter step: 17
iter step: 18
iter step: 19
iter step: 20
iter step: 21
iter step: 22
iter step: 23
iter step: 24
iter step: 25
iter step: 26
iter step: 27
iter step: 28
iter step: 29
iter step: 30
iter step: 31
iter step: 32
iter step: 33
iter step: 34
iter step: 35
iter step: 36
iter step: 37
iter step: 38
iter step: 39
iter step: 40
iter step: 41
iter step: 42
iter step: 43
iter step: 44
iter step: 45
iter step: 46
iter step: 47
iter step: 48
iter step: 49
iter step: 50
iter step: 51
iter step: 52
iter step: 53
iter step: 54
iter step: 55
iter step: 56
iter step: 57
iter step: 58
iter step: 59
iter step: 60
iter step: 61
iter step: 62
iter step: 63
iter step: 64
iter step: 65
iter step: 66
iter step: 67
iter step: 68
iter step: 69
iter step: 70
iter step: 71
it

In [469]:
def topics_words(n):
    'Return top n words of each topic'
    topic_words_map = np.argsort(-beta, axis=1)
    top_words = [[] for i in range(k_topics)]
    for i in range(k_topics):
        for j in range(0, n): 
            top_words[i].append(vocabulary_list[topic_words_map[i, j]])
    return top_words

for i, words in enumerate(topics_words(n=5)):
    print('Topic %d:' % i, words)

Topic 0: ['height', 'cant', 'mat', 'pulled', 'lid']
Topic 1: ['stylish', 'carrying', 'packaged', 'sizes', 'covers']
Topic 2: ['smells', 'eating', 'sweet', 'delicious', 'mix']
Topic 3: ['complaints', 'worst', 'charged', 'user', 'cons']
Topic 4: ['funny', 'remember', 'author', 'beginning', 'sense']


In [470]:

for k_topics in [3,5,10,15]:
    lda = LDAmodel(corpus=data_set_mapped, words=vocabulary_list, K=k_topics, iter_step=500)
    lda.GibbsSampling()
    theta = lda.doc_topic_dist()
    beta = lda.topic_word_dist()
    for i, words in enumerate(topics_words(n=5)):
        print('Topic %d:' % i, words)

iter step: 0
iter step: 1
iter step: 2
iter step: 3
iter step: 4
iter step: 5
iter step: 6
iter step: 7
iter step: 8
iter step: 9
iter step: 10
iter step: 11
iter step: 12
iter step: 13
iter step: 14
iter step: 15
iter step: 16
iter step: 17
iter step: 18
iter step: 19
iter step: 20
iter step: 21
iter step: 22
iter step: 23
iter step: 24
iter step: 25
iter step: 26
iter step: 27
iter step: 28
iter step: 29
iter step: 30
iter step: 31
iter step: 32
iter step: 33
iter step: 34
iter step: 35
iter step: 36
iter step: 37
iter step: 38
iter step: 39
iter step: 40
iter step: 41
iter step: 42
iter step: 43
iter step: 44
iter step: 45
iter step: 46
iter step: 47
iter step: 48
iter step: 49
iter step: 50
iter step: 51
iter step: 52
iter step: 53
iter step: 54
iter step: 55
iter step: 56
iter step: 57
iter step: 58
iter step: 59
iter step: 60
iter step: 61
iter step: 62
iter step: 63
iter step: 64
iter step: 65
iter step: 66
iter step: 67
iter step: 68
iter step: 69
iter step: 70
iter step: 71
it

iter step: 46
iter step: 47
iter step: 48
iter step: 49
iter step: 50
iter step: 51
iter step: 52
iter step: 53
iter step: 54
iter step: 55
iter step: 56
iter step: 57
iter step: 58
iter step: 59
iter step: 60
iter step: 61
iter step: 62
iter step: 63
iter step: 64
iter step: 65
iter step: 66
iter step: 67
iter step: 68
iter step: 69
iter step: 70
iter step: 71
iter step: 72
iter step: 73
iter step: 74
iter step: 75
iter step: 76
iter step: 77
iter step: 78
iter step: 79
iter step: 80
iter step: 81
iter step: 82
iter step: 83
iter step: 84
iter step: 85
iter step: 86
iter step: 87
iter step: 88
iter step: 89
iter step: 90
iter step: 91
iter step: 92
iter step: 93
iter step: 94
iter step: 95
iter step: 96
iter step: 97
iter step: 98
iter step: 99
iter step: 100
iter step: 101
iter step: 102
iter step: 103
iter step: 104
iter step: 105
iter step: 106
iter step: 107
iter step: 108
iter step: 109
iter step: 110
iter step: 111
iter step: 112
iter step: 113
iter step: 114
iter step: 115
iter

iter step: 82
iter step: 83
iter step: 84
iter step: 85
iter step: 86
iter step: 87
iter step: 88
iter step: 89
iter step: 90
iter step: 91
iter step: 92
iter step: 93
iter step: 94
iter step: 95
iter step: 96
iter step: 97
iter step: 98
iter step: 99
iter step: 100
iter step: 101
iter step: 102
iter step: 103
iter step: 104
iter step: 105
iter step: 106
iter step: 107
iter step: 108
iter step: 109
iter step: 110
iter step: 111
iter step: 112
iter step: 113
iter step: 114
iter step: 115
iter step: 116
iter step: 117
iter step: 118
iter step: 119
iter step: 120
iter step: 121
iter step: 122
iter step: 123
iter step: 124
iter step: 125
iter step: 126
iter step: 127
iter step: 128
iter step: 129
iter step: 130
iter step: 131
iter step: 132
iter step: 133
iter step: 134
iter step: 135
iter step: 136
iter step: 137
iter step: 138
iter step: 139
iter step: 140
iter step: 141
iter step: 142
iter step: 143
iter step: 144
iter step: 145
iter step: 146
iter step: 147
iter step: 148
iter step: 14

iter step: 95
iter step: 96
iter step: 97
iter step: 98
iter step: 99
iter step: 100
iter step: 101
iter step: 102
iter step: 103
iter step: 104
iter step: 105
iter step: 106
iter step: 107
iter step: 108
iter step: 109
iter step: 110
iter step: 111
iter step: 112
iter step: 113
iter step: 114
iter step: 115
iter step: 116
iter step: 117
iter step: 118
iter step: 119
iter step: 120
iter step: 121
iter step: 122
iter step: 123
iter step: 124
iter step: 125
iter step: 126
iter step: 127
iter step: 128
iter step: 129
iter step: 130
iter step: 131
iter step: 132
iter step: 133
iter step: 134
iter step: 135
iter step: 136
iter step: 137
iter step: 138
iter step: 139
iter step: 140
iter step: 141
iter step: 142
iter step: 143
iter step: 144
iter step: 145
iter step: 146
iter step: 147
iter step: 148
iter step: 149
iter step: 150
iter step: 151
iter step: 152
iter step: 153
iter step: 154
iter step: 155
iter step: 156
iter step: 157
iter step: 158
iter step: 159
iter step: 160
iter step: 161


In [None]:
Topic 0: ['eating', 'healthy', 'stylish', 'mix', 'smells']
Topic 1: ['funny', 'author', 'age', 'personal', 'learning']
Topic 2: ['pros', 'cons', 'internet', 'attached', 'assemble']

In [None]:
Topic 0: ['wheels', 'trips', 'adjustable', 'sides', 'camping']
Topic 1: ['healthy', 'eating', 'delicious', 'sweet', 'mix']
Topic 2: ['internet', 'returning', 'pros', 'options', 'user']
Topic 3: ['delivered', 'attention', 'packaged', 'mom', 'price!']
Topic 4: ['funny', 'remember', 'author', 'sense', 'girl']

In [None]:
Topic 0: ['delivered', 'bottles', 'clearly', 'starting', 'everywhere']
Topic 1: ['bulky', 'silver', 'ease', 'combination', 'supplement']
Topic 2: ['crazy', 'seeing', 'unique', 'girlfriend', 'enjoying']
Topic 3: ['cons', 'pros', 'returning', 'amzn', 'upon']
Topic 4: ['stays', 'velcro', 'somewhat', 'various', 'anymore']
Topic 5: ['particularly', 'catch', 'obviously', 'cant', 'packaged']
Topic 6: ['fell', 'seeing', 'covers', 'too!', 'cheaply']
Topic 7: ['hang', 'shot', 'track', 'creative', 'tag']
Topic 8: ['covered', 'price!', 'eating', 'mother', 'rain']
Topic 9: ['sister', 'happen', 'remember', 'appears', 'starting']

In [None]:
Topic 0: ['stays', 'assemble', 'bathroom', 'figured', 'cant']
Topic 1: ['ways', 'software', 'personally', 'plays', 'list']
Topic 2: ['chose', 'variety', 'angle', 'learning', 'mess']
Topic 3: ['rate', 'cheaply', 'rating', 'surprise', 'doubt']
Topic 4: ['changed', 'variety', 'exact', 'toys', 'class']
Topic 5: ['tag', 'below', 'seeing', 'ability', 'creative']
Topic 6: ['sold', 'wire', 'process', 'tested', 'reliable']
Topic 7: ['terrible', 'usual', 'holder', 'sister', 'spray']
Topic 8: ['remember', 'loss', 'wet', 'honestly', 'listen']
Topic 9: ['whatever', 'runs', 'useless', 'throughout', 'intended']
Topic 10: ['provided', 'slip', 'wow', 'camping', 'bracelet']
Topic 11: ['clothes', 'hurt', 'adjustable', 'wheels', 'assemble']
Topic 12: ['construction', 'reviewers', 'fell', 'too!', 'sets']
Topic 13: ['cons', 'pros', 'amzn', 'worst', 'share']
Topic 14: ['sweet', 'covered', 'smells', 'comfort', 'camping']

In [471]:
len(data_set_mapped)

21000

In [472]:
len(vocabulary_list)

38544