# BGSE Text Mining Homework 2
## Euan Dowers, Veronika Kyuchukova, and Laura Roman

### Exercise 1

The objective of this exercise is to implement uncollapsed gibbs sampling for fitting an LDA model to state of the union speeches from 1945 onwards, with documents being defined at the paragraph level. 

First, we need to read in and process the data, as in the first homework set:

In [1]:
import numpy as np
import pandas as pd
import nltk
import os
import sys
import scipy.sparse as ssp
import time
import matplotlib
import tqdm
from tqdm import tqdm
from numpy.random import dirichlet
from collections import Counter
from utils import data_processing, get_vocab, make_count
%matplotlib inline

In [2]:
data = pd.read_table("HW1/speech_data_extend.txt",encoding="utf-8")
data_post1945 = data.loc[data.year >= 1945]
%time stemmed, processed_data = data_processing(data_post1945)

CPU times: user 8.9 s, sys: 16 ms, total: 8.92 s
Wall time: 8.92 s


Now we will create a function that implements uncollapsed gibbs sampling on our processed data.
This essentially works by repeatedly sampling from the posterior distributions of $Z$, $\Theta$, and $\beta$ and updating values using the most recent sample. 

In [5]:
def Gibbs_sampling_LDA(stemmed, K, alpha = None, eta = None, m=3, n_samples = 200, burnin = 500, perplexity = False):
    '''
    Gibbs sampler for LDA model
    '''

    def Z_class_1(Beta, Theta):
        Z = [np.ndarray.tolist( np.argmax( Beta[:,[idx[word] for word in stemmed[i]]] * \
        Theta[i,:].reshape((K, 1)), axis = 0) ) for i in range(Theta.shape[0] )]
        return Z

    def Beta_sample(eta, Z):
        z_s = [z for sublist in Z for z in sublist ]
        M = np.zeros(shape=(K,V))
        for k in range(K):
            words = [s[i] for i in range(len(z_s)) if z_s[i] == k]
            counts = Counter(words)
            for word in set(words):
                M[k,idx[word]] = counts[word]
        Beta = [dirichlet(alpha = eta + M[i],size = 1)[0] for i in range(K)]
        return np.array(Beta)

    def Theta_sample(alpha, Z):
        N   = np.zeros(shape=(D,K))
        for i in range(D):
            counts   = Counter(Z[i])
            for j in set(counts.keys()):
                N[i,j]  = counts[j]
        Theta = [dirichlet(alpha = alpha + N[i],size = 1)[0] for i in range(D)]
        return np.array(Theta)

    def onehotencode(Z):
        '''
        Create function to one-hot encode topic allocation
        '''
        a       = np.array([i for sublist in Z for i in sublist ])
        b       = np.zeros((a.size, a.max()+1))
        b[np.arange(a.size),a] = 1
        return(b)

    def perplexity(Theta, Beta, count_matrix):
        '''
        Calculate perplexity for given sample
        '''
        ltb     = np.log(Theta.dot(Beta))
        num     = np.sum(count_matrix.multiply(ltb))
        denom   = len(s)
        return np.exp(-num/denom)

    # Get params needed for passing to sampling functions
    s       = [i for sublist in stemmed for i in sublist ]
    vocab   = get_vocab(stemmed)
    D       = len(stemmed)
    V       = len(vocab)
    idx     = dict(zip(vocab,range(len(vocab))))
    count_matrix = make_count(stemmed, idx)
    perp   = []

    # Initialise params
    if eta == None:
        eta = 200/V
    if alpha == None:
        alpha = 50/K

    Theta   = dirichlet(alpha = [alpha]*K, size = D)
    Beta    = dirichlet(alpha = [eta]*V, size = K)
    Z       = Z_class_1(Beta, Theta)
    labels  = np.zeros((n_samples, len(s)))

    # SAMPLING
    print('TIME:', time.strftime("%H:%M:%S", time.gmtime()))
    for i in tqdm(range(burnin)):
        Z       = Z_class_1(Beta, Theta)
        Beta    = Beta_sample(eta, Z)
        Theta   = Theta_sample(alpha, Z)
        if i%20 == 0:
            if perplexity:
                perp.append(perplexity(Theta, Beta, count_matrix))
            #print('Burnin iteration {}'.format(i))

    print('TIME:', time.strftime("%H:%M:%S", time.gmtime()))
    for i in tqdm(range(m*n_samples)):
        Z       = Z_class_1(Beta, Theta)
        Beta    = Beta_sample(eta, Z)
        Theta   = Theta_sample(alpha, Z)

        # Add every m-th sample to output
        if i%m == 0:
            Z_s = [i for sublist in Z for i in sublist ]
            j = np.int(i/m)
            labels[j, :] = Z_s
        if i%20 == 0:
            if perplexity:
                perp.append(perplexity(Theta, Beta, count_matrix))
            #print( "Iteration {}".format(i))

    return (labels, perp)

In [6]:
LDA_labels, perp = Gibbs_sampling_LDA(stemmed,
                                      K = 10,
                                      n_samples = 500,
                                      perplexity=True,
                                      burnin = 1000)

  0%|          | 0/1000 [00:00<?, ?it/s]

TIME: 10:31:47


100%|██████████| 1000/1000 [14:24<00:00,  1.38it/s]
  0%|          | 0/1500 [00:00<?, ?it/s]

TIME: 10:46:11


100%|██████████| 1500/1500 [22:28<00:00,  1.34it/s]


As you can see this sampling function takes around 35 minutes to complete 2500 iterations. 

### Exercise 2

The objective of this exercise is to run the collapsed Gibbs sampling version for fitting an LDA model to the same data, hyperparameters and K, and compare its predictive distribution with the one we get from uncollapes Gibbs sampling. In order to do so, we will work with a Python3 adapted version of the collapsed Gibbs sampler that can be found in https: //github.com/sekhansen/text-mining-tutorial 


#### a) Plot the perplexity across sampling iterations. Which algorithm appears to burn in faster?

In [None]:
import topicmodels

In [None]:
os.chdir('/Users/Laura/Desktop/text_mining_hw2/ex1')
from utils import data_processing, get_vocab, make_count
os.chdir('/Users/Laura/Desktop/text_mining_hw2')
data = pd.read_table("speech_data_extend.txt",encoding="utf-8")
data_post1945 = data.loc[data.year >= 1945]
%time stemmed, processed_data = data_processing(data_post1945)

ldaobj = topicmodels.LDA.LDAGibbs(stemmed, 10)


ldaobj.sample(0, 20, 75)

perp2 = ldaobj.perplexity()
perp_2 =  pd.DataFrame(perp_2)
pd.DataFrame.to_csv(perp_2,path_or_buf='perplexity_collapsed.csv',index=False)

Import perplexity computed with uncollapsed Gibbs sampler:

In [None]:
os.chdir('/Users/Laura/Desktop/text_mining_hw2/data')
perp1 = pd.read_csv('./perplexity.csv') 
perp1 = np.array(perp1)
perp1 = perp1[:,1]

And compare the results by plotting the perplexity obtained with each algorithm against the samples taken:

In [None]:
plt.plot(perp1,   lw = 1., label = 'uncollapsed')
plt.plot(perp2, lw = 1., label = 'collapsed')
plt.legend(loc='center left', bbox_to_anchor=(0.65, 0.8))
plt.ylabel('perplexity')
plt.xlabel('samples')
plt.savefig('perplexities.png', bbox_inches='tight')


We can see that while the collapsed Gibbs sampler stabilizes rapidly, at 10 samples  and with a perplexity of 1130 more or less, the uncollapsed Gibbs sampler seems not to fully stabilize with even more than 75 samples.

#### b) After the burn-in period, construct estimates of the predictive distribution of theta for each document across a number of draws from the samplers. Are the average values of these predictive distributions similar in the uncollapsed and collapsed samplers? How variable are these predictive distributions in the two algorithms across sample draws?

In [None]:
ldaobj = topicmodels.LDA.LDAGibbs(stemmed, 10)
ldaobj.sample(1000, 5, 100)


k = ldaobj.K
alpha =  ldaobj.alpha
docterms = ldaobj.dt_avg()
nm = np.array([len(doc) for doc in stemmed])
nm = nm.reshape((10252,1))
nmz = nm*docterms

theta_collapsed = (nmz+alpha)/(nm+k*alpha)

theta_uncoll = pd.read_csv('./theta_uncollapsed.csv') 
theta_uncoll.drop( 'Unnamed: 0',axis=1,inplace=True)
theta_uncoll = np.array(theta_uncoll)


We now proced to compare the average value of the predictive distributions in the uncollapsed and collapsed samplers. Also, we take a step more and compute the standard deviation and variance. 

In [None]:
theta_collapsed.mean()
theta_uncoll.mean()

# from collapsed gibbs sampler
m_c = np.array(theta_collapsed.mean(axis=0))
st_c= np.array(theta_collapsed.std(axis=0))
var_c = np.array(np.var(theta_collapsed, axis=0))

# from uncollapsed gibbs sampler
m_u = np.array(theta_uncoll.mean(axis=0))
st_u = np.array(theta_uncoll.std(axis=0))
var_u = np.array(np.var(theta_uncoll, axis=0))

basic_stats = pd.DataFrame(np.vstack((m_c,m_u,st_c,st_u,var_c,var_u)).T)
basic_stats.columns = ['mean_c','mean_unc','sd_c','sd_unc','var_c','var_u']
basic_stats

A few observations from these results are:
- The average value of the predictive distribution with the uncollapsed and collapsed gibbs samplers for each of the 10 topics are very similar, as well as the mean of the distribution which is 0.0999999 for both cases.
- The standard deviation across topics and for each model is rather different, being bigger for the uncollapsed sampler. 
- The variance is significantly much smaller for each topic when working with the collapsed gibbs sampler than with the uncollapsed sampler indicating that the predictive distributions for for the collapsed case are more peaked and thin.

Indeed, we can see such difference in the width of topic distributions for each sampler:

- Collapsed gibbs sampler predictive distributions:

In [None]:
# collapsed gibbs sampler

t1 = theta_collapsed[:,0]
t2 = theta_collapsed[:,1]
t3 = theta_collapsed[:,2]
t4 = theta_collapsed[:,3]
t5 = theta_collapsed[:,4]
t6 = theta_collapsed[:,5]
t7 = theta_collapsed[:,6]
t8 = theta_collapsed[:,7]
t9 = theta_collapsed[:,8]
t10 = theta_collapsed[:,9]

bins = np.linspace(0.03, 0.225, 70)

pyplot.hist(t1, bins, alpha=0.5, label='topic 1')
pyplot.hist(t2, bins, alpha=0.5, label='topic 2')
pyplot.hist(t3, bins, alpha=0.5, label='topic 3')
pyplot.hist(t4, bins, alpha=0.5, label='topic 4')
pyplot.hist(t5, bins, alpha=0.5, label='topic 5')
pyplot.hist(t6, bins, alpha=0.5, label='topic 6')
pyplot.hist(t7, bins, alpha=0.5, label='topic 7')
pyplot.hist(t8, bins, alpha=0.5, label='topic 8')
pyplot.hist(t9, bins, alpha=0.5, label='topic 9')
pyplot.legend(loc='upper right')
pyplot.show()

- Uncollapsed gibbs sampler predictive distributions:

In [None]:
# uncollapsed gibbs sampler

ut1 = theta_uncoll[:,0]
ut2 = theta_uncoll[:,1]
ut3 = theta_uncoll[:,2]
ut4 = theta_uncoll[:,3]
ut5 = theta_uncoll[:,4]
ut6 = theta_uncoll[:,5]
ut7 = theta_uncoll[:,6]
ut8 = theta_uncoll[:,7]
ut9 = theta_uncoll[:,8]
ut10 = theta_uncoll[:,9]

bins = np.linspace(0.03, 0.225, 70)

pyplot.hist(ut1, bins, alpha=0.5, label='topic 1')
pyplot.hist(ut2, bins, alpha=0.5, label='topic 2')
pyplot.hist(ut3, bins, alpha=0.5, label='topic 3')
pyplot.hist(ut4, bins, alpha=0.5, label='topic 4')
pyplot.hist(ut5, bins, alpha=0.5, label='topic 5')
pyplot.hist(ut6, bins, alpha=0.5, label='topic 6')
pyplot.hist(ut7, bins, alpha=0.5, label='topic 7')
pyplot.hist(ut8, bins, alpha=0.5, label='topic 8')
pyplot.hist(ut9, bins, alpha=0.5, label='topic 9')
pyplot.legend(loc='upper right')
pyplot.show()


### Exercise 3

In this exercise we are interested in comparing the classification performance of a penalized logistic regression when paragraphs (which is associated with one of two political parties) are represented as unigram counts over raw terms versus topic shares. We use training samples to estimate the relationship between document content and political party, and then assess its out-of-sample performance on held-out data.