In [1]:
import pandas as pd
import numpy as np
import sys
sys.path.append('../helpers_python')
from pre_processing import *
from lda_helpers import * 

import gensim
from gsdmm import MovieGroupProcess

%load_ext autoreload
%autoreload 2

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/robinjaccard/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/robinjaccard/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/robinjaccard/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/robinjaccard/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/robinjaccard/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
  from imp import reload


In [2]:
# Country whose tweets are analysed; passed to the data loader below.
country = 'France'

# Load data
# NOTE(review): load_data_lda comes from one of the star-imported helper modules;
# presumably it returns a DataFrame with a "clean" text column (later cells read
# df["clean"]) — confirm against the helper.
df = load_data_lda(country)

In [5]:
# Turn each cleaned text into a list of word tokens.
# Rows that are already lists are passed through untouched so that re-running
# this cell is a no-op; stringifying an already-tokenised row would split the
# list's repr (e.g. "['white',") into garbage tokens.
# NOTE(review): missing values still become the single token "nan", as before —
# confirm that the loader drops empty rows.
df["clean"] = df["clean"].apply(
    lambda text: text if isinstance(text, list) else str(text).split()
)

In [7]:
# One entry per tweet: a NumPy array holding the token lists
docs = df["clean"].to_numpy()

# Map every distinct token found in the documents to an integer id
dictionary = gensim.corpora.Dictionary(documents=docs)

In [9]:
# Vocabulary size (number of entries in the dictionary); passed to GSDMM's fit()
vocab_length = len(dictionary)

# Bag-of-words representation of every document, via Dictionary.doc2bow
bow_corpus = list(map(dictionary.doc2bow, docs))

In [10]:
# Configure the GSDMM model (MovieGroupProcess); it is fitted in the next cell
gsdmm = MovieGroupProcess(
    K=15,
    alpha=0.1,
    beta=0.3,
    n_iters=15,
)

In [11]:
# fit GSDMM model
# The fit is stochastic, so seed NumPy's global RNG first to make the cluster
# assignments repeatable across kernel restarts.
# NOTE(review): assumes the gsdmm package samples through numpy.random — confirm.
np.random.seed(42)
y = gsdmm.fit(docs, vocab_length)

In stage 0: transferred 742 clusters with 15 clusters populated
In stage 1: transferred 493 clusters with 15 clusters populated
In stage 2: transferred 403 clusters with 15 clusters populated
In stage 3: transferred 305 clusters with 15 clusters populated
In stage 4: transferred 286 clusters with 15 clusters populated
In stage 5: transferred 263 clusters with 15 clusters populated
In stage 6: transferred 283 clusters with 15 clusters populated
In stage 7: transferred 248 clusters with 15 clusters populated
In stage 8: transferred 283 clusters with 15 clusters populated
In stage 9: transferred 236 clusters with 14 clusters populated
In stage 10: transferred 233 clusters with 15 clusters populated
In stage 11: transferred 246 clusters with 14 clusters populated
In stage 12: transferred 232 clusters with 15 clusters populated
In stage 13: transferred 233 clusters with 14 clusters populated
In stage 14: transferred 237 clusters with 13 clusters populated


In [12]:
# How many documents ended up in each cluster
doc_count = np.array(gsdmm.cluster_doc_count)
print('Number of documents per topic :', doc_count)

# Cluster indices ordered from most to least populated (at most 15 of them)
top_index = np.argsort(doc_count)[::-1][:15]
print('Most important clusters (by number of docs inside):', top_index)

# define function to get top words per topic
def top_words(cluster_word_distribution, top_cluster, values):
    '''
    Prints the highest-count words of each requested cluster.

    cluster_word_distribution: container indexed by cluster id whose entries are word -> count dicts
    top_cluster: iterable of cluster ids, printed in the order given
    values: number of highest-count words to show per cluster

    '''
    def by_count(pair):
        # pair is a (word, count) tuple
        return pair[1]

    for cluster in top_cluster:
        ranked = list(cluster_word_distribution[cluster].items())
        # stable descending sort: words with equal counts keep their dict order
        ranked.sort(key=by_count, reverse=True)
        print("\nCluster %s : %s" % (cluster, ranked[:values]))

# Show the 10 highest-count words of every cluster, most populated cluster first
top_words(gsdmm.cluster_word_distribution, top_cluster=top_index, values=10)

Number of documents per topic : [ 33  24  77  78  88 102  30  61 298  29   0   0  16  48  16]
Most important clusters (by number of docs inside): [ 8  5  4  3  2  7 13  0  6  9  1 14 12 11 10]

Cluster 8 : [('white', 155), ('people', 88), ('black', 35), ('say', 32), ('go', 29), ('get', 27), ('u', 26), ('war', 23), ('anti', 23), ('want', 21)]

Cluster 5 : [('muslim', 27), ('attack', 15), ('terrorist', 13), ('police', 12), ('year', 12), ('new', 10), ('one', 10), ('islamic', 9), ('amp', 9), ('migrant', 9)]

Cluster 4 : [('trump', 22), ('cpac', 12), ('people', 10), ('every', 8), ('want', 8), ('time', 7), ('one', 7), ('president', 7), ('party', 7), ('berry', 6)]

Cluster 3 : [('one', 13), ('many', 10), ('people', 9), ('need', 8), ('country', 8), ('biden', 7), ('minister', 7), ('prime', 6), ('first', 6), ('get', 6)]

Cluster 2 : [('biden', 15), ('left', 13), ('border', 12), ('wing', 12), ('germany', 11), ('video', 10), ('u', 9), ('open', 8), ('german', 8), ('migrant', 8)]

Cluster 7 : [('amp

## Evaluation metric: topic coherence

In [13]:
from gensim.models import CoherenceModel

In [14]:
# define function to get words in topics
def get_topics_lists(model, top_clusters, n_words):
    '''
    Collects the highest-count words of each cluster as a list of word lists.

    model: object exposing cluster_word_distribution (e.g. a fitted gsdmm instance)
    top_clusters: iterable of cluster indices, processed in the order given
    n_words: maximum number of words kept per cluster

    Returns one list of words (highest count first) per cluster; clusters
    that have no words are left out.

    '''
    def count_of(item):
        # item is a (word, count) pair
        return item[1]

    def words_for(cluster):
        # rank the cluster's words by count, descending, and keep the first n_words
        ranked = sorted(model.cluster_word_distribution[cluster].items(),
                        key=count_of,
                        reverse=True)
        return [word for word, _ in ranked[:n_words]]

    word_lists = (words_for(cluster) for cluster in top_clusters)

    # empty word lists (clusters with no words) are dropped
    return [words for words in word_lists if words]

In [15]:
# Top 10 words of every non-empty cluster, most populated cluster first
topics = get_topics_lists(model=gsdmm, top_clusters=top_index, n_words=10)

In [16]:
# Score the extracted topics with gensim's 'c_v' topic-coherence measure
cm_gsdmm = CoherenceModel(
    topics=topics,
    dictionary=dictionary,
    corpus=bow_corpus,
    texts=docs,
    coherence='c_v',
)

# Compute the coherence score and show it
coherence_gsdmm = cm_gsdmm.get_coherence()
print(coherence_gsdmm)

0.35696819691336934


## Grid Search

In [17]:
# Candidate hyperparameter values for the grid search. The search cell fits one
# model per (alpha, beta, n_iters) combination, i.e. 4 * 4 * 4 = 64 fits.
alphas = [0.001, 0.01, 0.1, 0.2]
betas = [0.001, 0.01, 0.1, 0.2]
n_iters = [5, 10, 15, 20]

In [None]:
# Exhaustive search over (alpha, beta, n_iters) with K fixed at 10; every
# configuration is scored with the 'c_v' coherence of its top-10-word topics.
best_coherence = float("-inf")
best_params = None  # (alpha, beta, n_iters) of the best-scoring run so far
for alpha_i in alphas:
    for beta_j in betas:
        for n_iter in n_iters:
            # Same seed before every fit so that score differences come from the
            # hyperparameters rather than from sampling noise.
            # NOTE(review): assumes the gsdmm package samples through numpy.random — confirm.
            np.random.seed(42)
            gsdmm = MovieGroupProcess(K=10, alpha=alpha_i, beta=beta_j, n_iters=n_iter)
            y = gsdmm.fit(docs, vocab_length)
            doc_count = np.array(gsdmm.cluster_doc_count)
            # all clusters, most populated first
            top_index = doc_count.argsort()[::-1]
            topics = get_topics_lists(gsdmm, top_index, 10)
            cm_gsdmm = CoherenceModel(topics=topics,
                          dictionary=dictionary,
                          corpus=bow_corpus,
                          texts=docs,
                          coherence='c_v')

            # get coherence value
            coherence_gsdmm = cm_gsdmm.get_coherence()
            if coherence_gsdmm > best_coherence:
                best_coherence = coherence_gsdmm
                # remember which configuration produced the best score
                best_params = (alpha_i, beta_j, n_iter)
            # n_iters is printed too, otherwise the runs sharing one
            # (alpha, beta) pair cannot be told apart in the log
            print("alpha "+str(alpha_i)+" betas "+str(beta_j)+" n_iters "+str(n_iter))
            print("Coherence: "+str(coherence_gsdmm))
            print("Best coherence: "+str(best_coherence))
            print("Best (alpha, beta, n_iters): "+str(best_params))

In [20]:
# best model cv coherence
# NOTE(review): these look like values copied by hand from a grid-search run (the
# search cell in this notebook has no saved output) — confirm before relying on them.
# The next cell reassigns `alpha` and `beta`, so these two hold the c_v values
# only until that cell runs.
alpha = 0.001
beta = 0.2
iteration = 15
coherence_cv = 0.406

In [21]:
# best model u_mass coherence
# NOTE(review): this overwrites the `alpha` and `beta` recorded for the c_v model in
# the previous cell; the iteration count is stored as `iters` here but as
# `iteration` there.
# NOTE(review): the grid-search cell in this notebook scores with coherence='c_v';
# presumably these numbers come from a separate run with coherence='u_mass' — confirm.
alpha = 0.1
beta = 0.01
iters = 20
coherence_umass = -9.63