Notebook for Twitter Topic Modeling

In [1]:
#pip install git+https://github.com/rwalk/gsdmm.git

In [2]:
import gsdmm
import numpy as np
import pandas as pd
import pickle
import re

from gsdmm import MovieGroupProcess
from tqdm import tqdm

In [3]:
# Import dataset
tweets_df = pd.read_csv('dummy data_processed')
tweets_df.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,tweet_id,text,processed_text,stripped_text,text_lem,text_tokens
0,0,0,1630989944289398784,"I ""think"" I started my menstrual cycle yesterd...","i ""think"" i started my menstrual cycle yesterd...",think started menstrual cycle yesterday due ab...,think started menstrual cycle yesterday due ab...,"['think', 'started', 'menstrual', 'cycle', 'ye..."
1,1,1,1630974067875381265,cw menstruation ////\n.\n.\n.\ni have had my p...,cw menstruation //// . . . i have had my perio...,menstruation period years cycle always month w...,menstruation period years cycle always month w...,"['menstruation', 'period', 'years', 'cycle', '..."
2,2,2,1630965437709053952,Clubs must ensure that they enhance supportive...,clubs must ensure that they enhance supportive...,clubs must ensure enhance supportive measures ...,clubs must ensure enhance supportive measures ...,"['clubs', 'must', 'ensure', 'enhance', 'suppor..."
3,3,3,1630878576538013696,#EndPeriodShaming\nI used to think it was a no...,i used to think it was a normal narrative unt...,used think normal narrative saw orphan girl so...,used think normal narrative saw orphan girl so...,"['used', 'think', 'normal', 'narrative', 'saw'..."
4,4,4,1630863346143580161,"Let’s push for ending period stigma, period po...","let us push for ending period stigma, period p...",let push ending period stigma period poverty p...,let push ending period stigma period poverty p...,"['let', 'push', 'ending', 'period', 'stigma', ..."


In [4]:
# Create a single list of tweet tokens
docs = tweets_df['text_tokens'].tolist()
docs[:3]

["['think', 'started', 'menstrual', 'cycle', 'yesterday', 'due', 'ablation', 'surgery', 'working', 'bleeding', 'really', 'bizarre', 'cool', 'always', 'regular', 'track', 'right', 'time', 'period', 'part', 'one', 'still…']",
 "['menstruation', 'period', 'years', 'cycle', 'always', 'month', 'will', 'brain', 'stop', 'convincing', 'pregnant', 'every', 'time', 'hit', 'day', 'cycle']",
 "['clubs', 'must', 'ensure', 'enhance', 'supportive', 'measures', 'women', 'matters', 'menstrual', 'health', 'hygiene', 'management', 'mhm', 'including', 'period', 'tracking', 'free', 'sanitary', 'pads', 'mhm', 'talks', 'internally', 'health', 'experts', 'etc']"]

In [5]:
# Train STTM model
mgp = MovieGroupProcess(K=10, alpha=0.1, beta=0.1, n_iters=15)
vocab = set(x for doc in docs for x in doc)
n_terms = len(vocab)
y = mgp.fit(docs, n_terms)

# Save model
with open('10clusters.model', 'wb') as f:
    pickle.dump(mgp, f)
    f.close()

In stage 0: transferred 87 clusters with 4 clusters populated
In stage 1: transferred 2 clusters with 2 clusters populated
In stage 2: transferred 0 clusters with 2 clusters populated
In stage 3: transferred 1 clusters with 2 clusters populated
In stage 4: transferred 1 clusters with 2 clusters populated
In stage 5: transferred 0 clusters with 2 clusters populated
In stage 6: transferred 0 clusters with 2 clusters populated
In stage 7: transferred 0 clusters with 2 clusters populated
In stage 8: transferred 0 clusters with 2 clusters populated
In stage 9: transferred 0 clusters with 2 clusters populated
In stage 10: transferred 0 clusters with 2 clusters populated
In stage 11: transferred 0 clusters with 2 clusters populated
In stage 12: transferred 1 clusters with 2 clusters populated
In stage 13: transferred 1 clusters with 2 clusters populated
In stage 14: transferred 0 clusters with 2 clusters populated


In [6]:
# Load in trained model 
filehandler = open('10clusters.model', 'rb')
mgp = pickle.load(filehandler)

In [7]:
# Define helper functions
def top_words(cluster_word_distribution, top_cluster, values):
    '''prints the top words in each cluster'''
    for cluster in top_cluster:
        sort_dicts =sorted(mgp.cluster_word_distribution[cluster].items(), key=lambda k: k[1], reverse=True)[:values]
        print('Cluster %s : %s'%(cluster,sort_dicts))
        print(' — — — — — — — — —')
        
def cluster_importance(mgp):
    '''returns a word-topic matrix[phi] where each value represents
    the word importance for that particular cluster;
    phi[i][w] would be the importance of word w in topic i.
    '''
    n_z_w = mgp.cluster_word_distribution
    beta, V, K = mgp.beta, mgp.vocab_size, mgp.K
    phi = [{} for i in range(K)]
    for z in range(K):
        for w in n_z_w[z]:
            phi[z][w] = (n_z_w[z][w]+beta)/(sum(n_z_w[z].values())+V*beta)
    return phi

def topic_allocation(df, docs, mgp, topic_dict):
    '''allocates all topics to each document in original dataframe,
    adding two columns for cluster number and cluster description'''
    topic_allocations = []
    for doc in tqdm(docs):
        topic_label, score = mgp.choose_best_label(doc)
        topic_allocations.append(topic_label)

    df['cluster'] = topic_allocations

    df['topic_name'] = df.cluster.apply(lambda x: get_topic_name(x, topic_dict))
    print('Complete. Number of documents with topic allocated: {}'.format(len(df)))

def get_topic_name(doc, topic_dict):
    '''returns the topic name string value from a dictionary of topics'''
    topic_desc = topic_dict[doc]
    return topic_desc

In [8]:
doc_count = np.array(mgp.cluster_doc_count)
print('Number of documents per topic :', doc_count)
print('*'*20)

# topics sorted by the number of documents they are allocated to
top_index = doc_count.argsort()[-10:][::-1]
print('Most important clusters (by number of docs inside):', top_index)
print('*'*20)

# show the top 5 words in term frequency for each cluster 
topic_indices = np.arange(start=0, stop=len(doc_count), step=1)
top_words(mgp.cluster_word_distribution, topic_indices, 5)

Number of documents per topic : [ 0  0  0  0  0  0  0  0 87 14]
********************
Most important clusters (by number of docs inside): [8 9 7 6 5 4 3 2 1 0]
********************
Cluster 0 : []
 — — — — — — — — —
Cluster 1 : []
 — — — — — — — — —
Cluster 2 : []
 — — — — — — — — —
Cluster 3 : []
 — — — — — — — — —
Cluster 4 : []
 — — — — — — — — —
Cluster 5 : []
 — — — — — — — — —
Cluster 6 : []
 — — — — — — — — —
Cluster 7 : []
 — — — — — — — — —
Cluster 8 : [("'", 3456), (',', 1641), (' ', 1641), ('e', 1217), ('a', 809)]
 — — — — — — — — —
Cluster 9 : [("'", 748), (',', 360), (' ', 360), ('e', 263), ('a', 211)]
 — — — — — — — — —
