Topic Modelling

In [None]:
# --- IMPORTS ---
import collections
import math
import random
from collections import Counter

import numpy as np
import pandas as pd
import spacy

# corpus: training split of 20 Newsgroups with headers and footers removed
from sklearn.datasets import fetch_20newsgroups
newsgroups_train = fetch_20newsgroups(subset='train', remove=('headers', 'footers'))
train = newsgroups_train.data

# shuffle the documents in place with a fixed seed so their order is reproducible
shuffler = random.Random(42)
shuffler.shuffle(train)


## Data Preparation

In [None]:
# tokenization
# stop word removal
# remove uncommon words (appearing fewer than 10 times, see min_occ)
# take a subset of the corpus (documents are added until 300 000 tokens are collected, see word_limit)

In [None]:
# use spacy for tokenization
# (`tokenize` below only uses the pipeline's tokenizer and its vocabulary)
nlp = spacy.load("en_core_web_sm")

In [None]:
# tokenizes a document from the corpus given its index in `train`
def tokenize(id):
    """Tokenize one corpus document and record the result in the global state.

    The document `train[id]` is split with the spaCy tokenizer and every token
    is lower-cased. Stop words, punctuation, whitespace, digits, number-like
    tokens, URLs, e-mail addresses and a few stray symbols are dropped.

    Side effects:
        * `freqs[t]` is incremented for every kept token `t`.
        * The list of kept tokens is appended to `W` (one list per call).

    Args:
        id: index of the document in `train`.

    Returns:
        None.
    """
    # symbols to drop in addition to what the lexical flags below catch
    extra_symbols = {'>', '<', '|', '=', '$', '+', '`'}

    # read the referenced document
    doc = train[id]

    # make a list of the words in the document, with stop words removed
    word_list = []
    for t in [d.text.lower() for d in nlp.tokenizer(doc)]:
        token = nlp.vocab[t]
        unwanted = (
            token.is_stop
            or token.is_punct
            or token.is_space
            or token.is_digit
            or token.like_num
            or token.like_url
            or token.like_email
            # test the lower-cased text itself (a str), not the Lexeme, so the
            # check does not depend on how spaCy compares a Lexeme with a str
            or t in extra_symbols
        )
        if unwanted:
            continue
        freqs[t] += 1
        word_list.append(t)

    # append the word list for the given document to the corpus list
    W.append(word_list)

In [None]:
# parameters
K = 10               # number of topics
word_limit = 300000  # stop reading documents once this many tokens are collected
min_occ = 10         # minimum corpus count for a word to enter the vocabulary

# list containing each document's word list
W = []

# vocabulary storing counts for each token in the corpus
freqs = Counter()

# read in new documents to the dataset until the word_limit is reached
# (or the corpus is exhausted, whichever comes first)
i = 0
num_tokens = 0
while num_tokens < word_limit and i < len(train):
    tokenize(i)
    # tokenize() appends exactly one list to W holding the tokens it counted,
    # so a running total replaces re-summing the whole counter per document
    num_tokens += len(W[-1])
    i += 1

# filter away words with low occurrence from the dictionary
freqs = Counter({word: n for word, n in freqs.items() if n >= min_occ})
# show a summary rather than dumping the entire counter
print(len(freqs), 'distinct words kept; most frequent:', freqs.most_common(10))

# make vocabulary where each word has a unique index from 0 to (V-1)
vocab = {word: idx for idx, word in enumerate(freqs)}  # a standard vocab mapping word to index
vocab_inv = {idx: word for word, idx in vocab.items()}  # a reverse vocab mapping index to word
v = len(vocab)  # vocabulary size V

# dedicated, seeded generator so the initial topic assignment is reproducible
topic_rng = random.Random(42)

# build up a word list of format: W_Z[document_idx][word_idx] = (token, topic)
W_Z = []
assigned_topics = {k+1: 0 for k in range(K)}  # for counting topic assignments
for doc in W:
    tokens = []
    for token in doc:
        if token in vocab:  # disregard tokens not appearing in vocabulary
            k = topic_rng.randint(1, K)  # topics are numbered 1..K
            tokens.append((token, k))
            assigned_topics[k] += 1
    W_Z.append(tokens)
assigned_topics



{1: 22627,
 2: 22282,
 3: 22333,
 4: 22761,
 5: 22441,
 6: 22717,
 7: 22620,
 8: 22625,
 9: 22457,
 10: 22620}

## Gibbs Sampling

In [None]:
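# Topic Modelling:

# Initialize the topic assignments uniformly, i.e. for each z_dj draw one of the K topics uniformly.
# Next, start Gibbs sampling. In every iteration:
#   count the current topic assignments (per document and per vocabulary word)
#   for every document d: update theta_d by drawing from its Dirichlet distribution
#   for every topic k:    update phi_k by drawing from its Dirichlet distribution
#   for every word j in every document d: draw a new z_dj with the updated parameters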

In [None]:
num_docs = len(W_Z)
voc_size = len(vocab)


# Gibbs sampling
num_gibbs_iterations = 150
alpha = 0.1  # added to every per-document topic count before drawing theta
beta = 0.1   # added to every per-topic word count before drawing phi

# seed NumPy's global generator so the sampler gives the same result on every run
np.random.seed(42)

# gibbs outer loop
for i in range(num_gibbs_iterations):
    if i%10==0:
        print(i, 'iterations')

    n_dk = np.zeros((num_docs, K))  # num. words with topic k in document d
    m_kv = np.zeros((K, voc_size))  # num. of each word in vocab with topic k

    # count the current assignments; topics are stored 1-based, hence the -1
    for doc, tokens in enumerate(W_Z):
        for w_token, w_topic in tokens:
            n_dk[doc, w_topic-1] += 1
            m_kv[w_topic-1, vocab[w_token]] += 1

    # draw theta and phi from the distribution
    theta = np.zeros((num_docs, K))
    phi = np.zeros((K, voc_size))

    # per-document topic distribution: Dirichlet(alpha + topic counts in d)
    # (explicit array arithmetic instead of `list += ndarray`, which only
    # produces an element-wise sum through NumPy's operator dispatch)
    for d in range(num_docs):
        theta[d, :] = np.random.dirichlet(alpha + n_dk[d, :])

    # per-topic word distribution: Dirichlet(beta + word counts for topic k)
    for k in range(K):
        phi[k, :] = np.random.dirichlet(beta + m_kv[k, :])

    # reassign topics to all words
    for doc, tokens in enumerate(W_Z):
        theta_d = theta[doc, :]  # constant for the whole document
        for word, (w_token, w_topic) in enumerate(tokens):
            phi_w = phi[:, vocab[w_token]]

            # build up probability vector, with the probability of each topic
            weights = phi_w * theta_d
            z_probs = weights / weights.sum()

            # draw new topic from the distribution and assign (back to 1-based)
            z_dj = np.argmax(np.random.multinomial(1, z_probs)) + 1
            tokens[word] = (w_token, z_dj)


0 iterations
10 iterations
20 iterations
30 iterations
40 iterations
50 iterations
60 iterations
70 iterations
80 iterations
90 iterations
100 iterations
110 iterations
120 iterations
130 iterations
140 iterations


In [None]:
# group the tokens by their current topic: assigned_topics[topic] -> list of tokens
assigned_topics = {topic: [] for topic in range(1, K + 1)}

for tokens in W_Z:
    for w_token, w_topic in tokens:
        assigned_topics[w_topic].append(w_token)

In [None]:
# find the most common words for each topic

M = 20  # number of top words

most_common = []      # per topic: (word, count) pairs ranked by raw count
most_common_rel = []  # per topic: (word, share) pairs ranked by relative count

for topic_words in assigned_topics.values():
    topic_counts = collections.Counter(topic_words)

    # the M words assigned to this topic most often
    most_common.append(topic_counts.most_common(M))

    # share of each word's total count in `freqs` that was assigned to this topic
    ranked_by_share = sorted(
        ((word, float(n) / freqs[word]) for word, n in topic_counts.items()),
        key=lambda pair: pair[1],
        reverse=True,
    )
    most_common_rel.append(ranked_by_share[:M])



In [None]:
# most common words by relative count
print('most common words (relative count)')

# keep only the words of each topic's ranking, dropping the scores
common_words_rel = [[word for word, _ in topic_top] for topic_top in most_common_rel]
common_words_rel

most common words (relative count)


[['yankees',
  'braves',
  'season',
  'reds',
  'winfield',
  'coach',
  'unmoderated',
  'cox',
  'francis',
  'olson',
  'thompson',
  'hitter',
  'pitching',
  'cubs',
  'islanders',
  'fans',
  'espn',
  'leafs',
  'baseball',
  'playoffs'],
 ['elias',
  'davidsson',
  'israeli',
  'palestinian',
  'arab',
  'grants',
  'governments',
  'palestine',
  'phill',
  'senator',
  'democratic',
  'propaganda',
  'nazi',
  'palestinians',
  'zionists',
  'inhabitants',
  'wiretap',
  'encrypt',
  'nsa',
  'crypto'],
 ['taxes',
  'pregnancy',
  'guns',
  'gang',
  'abiding',
  'gun',
  'homicides',
  'defenses',
  'hospital',
  'doctor',
  'treatments',
  'handgun',
  'assault',
  'firearms',
  'neighbors',
  'dose',
  'physician',
  'pound',
  'accidents',
  'handguns'],
 ['hendricks',
  'sexual',
  'marriage',
  'dreams',
  'believer',
  'disorder',
  'mozumder',
  'tear',
  'harvard',
  'constitute',
  'cramer',
  'observations',
  'constructed',
  'khomeini',
  'atomic',
  'hypothesis

In [None]:
# transpose so that every topic becomes a column and every rank a row
topic_table = pd.DataFrame(common_words_rel)
df = topic_table.transpose()
csv_name = 'common_words_rel_' + str(K) + '.csv'
df.to_csv(csv_name, sep=',')

In [None]:
# most common words by raw count
print('most common words (raw count)')

# keep only the words of each topic's ranking, dropping the counts
common_words = [[word for word, _ in topic_top] for topic_top in most_common]
common_words

most common words (raw count)


[['team',
  'year',
  'game',
  'games',
  'season',
  'play',
  'writes',
  'article',
  'players',
  'good',
  'win',
  'league',
  'baseball',
  'gm',
  'think',
  'best',
  'hockey',
  'player',
  'points',
  'san'],
 ['people',
  'government',
  'key',
  'writes',
  'think',
  'right',
  'article',
  'israel',
  'know',
  'president',
  'israeli',
  'like',
  'state',
  'public',
  'encryption',
  'time',
  'rights',
  'going',
  'jews',
  'chip'],
 ['gun',
  'people',
  'insurance',
  'article',
  'guns',
  'crime',
  'health',
  'control',
  'writes',
  'like',
  'rate',
  'study',
  'new',
  'weapons',
  'canada',
  'tax',
  'problem',
  'firearms',
  'medical',
  'care'],
 ['people',
  'writes',
  'article',
  'time',
  'theory',
  'think',
  'science',
  'case',
  'etc',
  'men',
  'like',
  'way',
  'believe',
  'universe',
  'fact',
  'years',
  'things',
  'post',
  'wrong',
  'use'],
 ["max>'ax>'ax>'ax>'ax>'ax>'ax>'ax>'ax>'ax>'ax>'ax>'ax>'ax>'ax",
  'space',
  'm',
  'lau

In [None]:
# transpose so that every topic becomes a column and every rank a row
topic_table = pd.DataFrame(common_words)
df = topic_table.transpose()
csv_name = 'common_words_' + str(K) + '.csv'
df.to_csv(csv_name, sep=',')

## Coherence Score

In [None]:
# document-term indicator matrix:
# word_occurrence[d, v] is 1 if vocabulary word v occurs in document d, else 0
word_occurrence = np.zeros((num_docs, voc_size))

for doc_idx, tokens in enumerate(W_Z):
    for w_token, w_topic in tokens:
        word_occurrence[doc_idx, vocab[w_token]] = 1


In [None]:
# number of documents containing the word referenced by v_index
def document_frequency(v_index, occurrence=None):
    """Count the documents in which a vocabulary word occurs.

    Args:
        v_index: column index of the word in the occurrence matrix.
        occurrence: 2-D NumPy array with one row per document and one column
            per word, holding a non-zero entry where the word occurs. When
            omitted, the notebook-level `word_occurrence` matrix is used.

    Returns:
        The sum of the word's column (a NumPy scalar); for a 0/1 matrix this
        is the number of documents containing the word.
    """
    if occurrence is None:
        occurrence = word_occurrence
    # ndarray.sum() avoids iterating the column element by element in Python
    return occurrence[:, v_index].sum()

# number of documents containing both word v1 and v2
def co_document_frequency(v1_index, v2_index, occurrence=None):
    """Count the documents in which two vocabulary words both occur.

    Args:
        v1_index: column index of the first word in the occurrence matrix.
        v2_index: column index of the second word in the occurrence matrix.
        occurrence: 2-D NumPy array with one row per document and one column
            per word, holding a non-zero entry where the word occurs. When
            omitted, the notebook-level `word_occurrence` matrix is used.

    Returns:
        int: number of rows in which both columns are non-zero.
    """
    if occurrence is None:
        occurrence = word_occurrence
    # element-wise AND of the two columns replaces the per-document Python loop
    in_both = np.logical_and(occurrence[:, v1_index], occurrence[:, v2_index])
    return int(np.count_nonzero(in_both))



In [None]:
# Coherence of every topic, computed from its top words by raw count:
#   sum over all pairs m > l of log2((D(v_m, v_l) + 1) / D(v_l))
# where D(v) is the document frequency of word v and D(v_m, v_l) the number of
# documents containing both words (this has the form of the UMass coherence).
coherence_scores = []
for k in range(K):
    top_m_words = common_words[k]
    # look up each top word's vocabulary index once instead of per pair
    top_indices = [vocab[w] for w in top_m_words]

    topic_coherence_k = 0
    # a topic can have fewer than M distinct words, so bound by the actual length
    for m in range(1, len(top_indices)):
        v_m = top_indices[m]
        for l in range(m):
            v_l = top_indices[l]

            # calculate the sub score
            subscore = math.log2((co_document_frequency(v_m, v_l) + 1) / document_frequency(v_l))
            topic_coherence_k += subscore

    coherence_scores.append(round(topic_coherence_k, 2))

df = pd.DataFrame(coherence_scores)
df.to_csv('coherence_scores_' + str(K) + '.csv', sep=',')

# last expression of the cell, so the scores are actually displayed
coherence_scores

In [None]:
# export the top words (raw count) of the topic with the highest coherence score
best_topic = np.argmax(coherence_scores)
best_topic_words = common_words[best_topic]
df = pd.DataFrame(best_topic_words)
df.to_csv('coherence_scores_' + str(K) + '_top.csv', sep=',')

In [None]:
# show the top words (by raw count) of the first topic in `common_words`
common_words[0]

['team',
 'year',
 'game',
 'games',
 'season',
 'play',
 'writes',
 'article',
 'players',
 'good',
 'win',
 'league',
 'baseball',
 'gm',
 'think',
 'best',
 'hockey',
 'player',
 'points',
 'san']