Notebook for Twitter Topic Modeling

In [1]:
#pip install git+https://github.com/rwalk/gsdmm.git

In [2]:
import gsdmm
import numpy as np
import pandas as pd
import pickle
import re

from gsdmm import MovieGroupProcess
from tqdm import tqdm

In [3]:
pd.set_option('display.max_colwidth', None)

# Import dataset
tweets_df = pd.read_csv('dummy data_processed')
tweets_df.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,tweet_id,text,processed_text,stripped_text,text_lem,text_tokens
0,0,0,1630989944289398784,"I ""think"" I started my menstrual cycle yesterday, but due to the ablation surgery working, I'm not bleeding at all!\n\nIt's really bizarre and cool! I have always been regular, so I can track that it's the right time. But the period part of it is just not there at all. (One still…","i ""think"" i started my menstrual cycle yesterday, but due to the ablation surgery working, i am not bleeding at all! it is really bizarre and cool! i have always been regular, so i can track that it is the right time. but the period part of it is just not there at all. (one still…",think started menstrual cycle yesterday due ablation surgery working bleeding really bizarre cool always regular track right time period part one still…,think started menstrual cycle yesterday due ablation surgery working bleeding really bizarre cool always regular track right time period part one still…,"['think', 'started', 'menstrual', 'cycle', 'yesterday', 'due', 'ablation', 'surgery', 'working', 'bleeding', 'really', 'bizarre', 'cool', 'always', 'regular', 'track', 'right', 'time', 'period', 'part', 'one', 'still…']"
1,1,1,1630974067875381265,cw menstruation ////\n.\n.\n.\ni have had my period for over 15 years. my cycle has always been over a month. when will my brain stop convincing myself that i’m pregnant every time i hit day 29 of my cycle.,cw menstruation //// . . . i have had my period for over 15 years. my cycle has always been over a month. when will my brain stop convincing myself that i am pregnant every time i hit day 29 of my cycle.,menstruation period years cycle always month will brain stop convincing pregnant every time hit day cycle,menstruation period years cycle always month will brain stop convincing pregnant every time hit day cycle,"['menstruation', 'period', 'years', 'cycle', 'always', 'month', 'will', 'brain', 'stop', 'convincing', 'pregnant', 'every', 'time', 'hit', 'day', 'cycle']"
2,2,2,1630965437709053952,"Clubs must ensure that they enhance supportive measures to our women on matters Menstrual Health Hygiene Management (MHM) including period tracking, free sanitary pads, MHM talks internally and with health experts, etc.\n\n#PlayersWelfareKE https://t.co/WyOkrwc8Wh","clubs must ensure that they enhance supportive measures to our women on matters menstrual health hygiene management (mhm) including period tracking, free sanitary pads, mhm talks internally and with health experts, etc.",clubs must ensure enhance supportive measures women matters menstrual health hygiene management mhm including period tracking free sanitary pads mhm talks internally health experts etc,clubs must ensure enhance supportive measures women matters menstrual health hygiene management mhm including period tracking free sanitary pads mhm talks internally health experts etc,"['clubs', 'must', 'ensure', 'enhance', 'supportive', 'measures', 'women', 'matters', 'menstrual', 'health', 'hygiene', 'management', 'mhm', 'including', 'period', 'tracking', 'free', 'sanitary', 'pads', 'mhm', 'talks', 'internally', 'health', 'experts', 'etc']"
3,3,3,1630878576538013696,"#EndPeriodShaming\nI used to think it was a normal narrative until I saw an orphan girl somewhere in the village who couldn't afford menstrual tools for her cycle and she was like ""I wish I was a boy, I wish I had parents""😭\nPads bring girls back to school. Pads end period poverty","i used to think it was a normal narrative until i saw an orphan girl somewhere in the village who could not afford menstrual tools for her cycle and she was like ""i wish i was a boy, i wish i had parents""😭 pads bring girls back to school. pads end period poverty",used think normal narrative saw orphan girl somewhere village afford menstrual tools cycle wish boy wish parents pads bring girls back school pads end period poverty,used think normal narrative saw orphan girl somewhere village afford menstrual tools cycle wish boy wish parents pads bring girls back school pads end period poverty,"['used', 'think', 'normal', 'narrative', 'saw', 'orphan', 'girl', 'somewhere', 'village', 'afford', 'menstrual', 'tools', 'cycle', 'wish', 'boy', 'wish', 'parents', 'pads', 'bring', 'girls', 'back', 'school', 'pads', 'end', 'period', 'poverty']"
4,4,4,1630863346143580161,"Let’s push for ending period stigma, period poverty, provision of free and affordable sanitary pads, and open education on healthy, safe and dignified menstrual practices.\n#KeepGirlsInSchool #EducationForAll #EndPeriodPoverty \n#EndMenstrualStigma https://t.co/Ri7uMMjQwP","let us push for ending period stigma, period poverty, provision of free and affordable sanitary pads, and open education on healthy, safe and dignified menstrual practices.",let push ending period stigma period poverty provision free affordable sanitary pads open education healthy safe dignified menstrual practices,let push ending period stigma period poverty provision free affordable sanitary pads open education healthy safe dignified menstrual practices,"['let', 'push', 'ending', 'period', 'stigma', 'period', 'poverty', 'provision', 'free', 'affordable', 'sanitary', 'pads', 'open', 'education', 'healthy', 'safe', 'dignified', 'menstrual', 'practices']"


In [4]:
# Create a single list of tweet tokens
docs = tweets_df['text_tokens'].tolist()
docs[:3]

["['think', 'started', 'menstrual', 'cycle', 'yesterday', 'due', 'ablation', 'surgery', 'working', 'bleeding', 'really', 'bizarre', 'cool', 'always', 'regular', 'track', 'right', 'time', 'period', 'part', 'one', 'still…']",
 "['menstruation', 'period', 'years', 'cycle', 'always', 'month', 'will', 'brain', 'stop', 'convincing', 'pregnant', 'every', 'time', 'hit', 'day', 'cycle']",
 "['clubs', 'must', 'ensure', 'enhance', 'supportive', 'measures', 'women', 'matters', 'menstrual', 'health', 'hygiene', 'management', 'mhm', 'including', 'period', 'tracking', 'free', 'sanitary', 'pads', 'mhm', 'talks', 'internally', 'health', 'experts', 'etc']"]

In [5]:
# Train STTM model
mgp = MovieGroupProcess(K=10, alpha=0.1, beta=0.1, n_iters=30)
vocab = set(x for doc in docs for x in doc)
n_terms = len(vocab)
y = mgp.fit(docs, n_terms)

# Save model
with open('10clusters.model', 'wb') as f:
    pickle.dump(mgp, f)
    f.close()

In stage 0: transferred 87 clusters with 4 clusters populated
In stage 1: transferred 7 clusters with 2 clusters populated
In stage 2: transferred 0 clusters with 2 clusters populated
In stage 3: transferred 0 clusters with 2 clusters populated
In stage 4: transferred 1 clusters with 2 clusters populated
In stage 5: transferred 2 clusters with 2 clusters populated
In stage 6: transferred 1 clusters with 2 clusters populated
In stage 7: transferred 0 clusters with 2 clusters populated
In stage 8: transferred 0 clusters with 2 clusters populated
In stage 9: transferred 0 clusters with 2 clusters populated
In stage 10: transferred 0 clusters with 2 clusters populated
In stage 11: transferred 0 clusters with 2 clusters populated
In stage 12: transferred 0 clusters with 2 clusters populated
In stage 13: transferred 0 clusters with 2 clusters populated
In stage 14: transferred 0 clusters with 2 clusters populated
In stage 15: transferred 2 clusters with 2 clusters populated
In stage 16: tran

In [6]:
# Load in trained model 
filehandler = open('10clusters.model', 'rb')
mgp = pickle.load(filehandler)

In [7]:
# Define helper functions

# Prints the top words in each cluster
def top_words(cluster_word_distribution, top_cluster, values):
    for cluster in top_cluster:
        sort_dicts =sorted(mgp.cluster_word_distribution[cluster].items(), key=lambda k: k[1], reverse=True)[:values]
        print('Cluster %s : %s'%(cluster,sort_dicts))
        print(' — — — — — — — — —')
        
# Returns a word-topic matrix[phi] where each value represents the word importance for that particular cluster; 
# phi[i][w] would be the importance of word w in topic i.
def cluster_importance(mgp):
    n_z_w = mgp.cluster_word_distribution
    beta, V, K = mgp.beta, mgp.vocab_size, mgp.K
    phi = [{} for i in range(K)]
    for z in range(K):
        for w in n_z_w[z]:
            phi[z][w] = (n_z_w[z][w]+beta)/(sum(n_z_w[z].values())+V*beta)
    return phi

# Allocates all topics to each document in original dataframe,
# adding two columns for cluster number and cluster description
def topic_allocation(df, docs, mgp, topic_dict):
    topic_allocations = []
    for doc in tqdm(docs):
        topic_label, score = mgp.choose_best_label(doc)
        topic_allocations.append(topic_label)

    df['cluster'] = topic_allocations

    df['topic_name'] = df.cluster.apply(lambda x: get_topic_name(x, topic_dict))
    print('Complete. Number of documents with topic allocated: {}'.format(len(df)))

# Returns the topic name string value from a dictionary of topics
def get_topic_name(doc, topic_dict):
    topic_desc = topic_dict[doc]
    return topic_desc

In [8]:
doc_count = np.array(mgp.cluster_doc_count)
print('Number of documents per topic :', doc_count)
print('*'*20)

# Topics sorted by the number of documents they are allocated to
top_index = doc_count.argsort()[-10:][::-1]
print('Most important clusters (by number of docs inside):', top_index)
print('*'*20)

# show the top 5 words in term frequency for each cluster 
topic_indices = np.arange(start=0, stop=len(doc_count), step=1)
top_words(mgp.cluster_word_distribution, topic_indices, 5)

Number of documents per topic : [87  0  0  0  0  0  0  0  0 14]
********************
Most important clusters (by number of docs inside): [0 9 8 7 6 5 4 3 2 1]
********************
Cluster 0 : [("'", 3456), (',', 1641), (' ', 1641), ('e', 1217), ('a', 809)]
 — — — — — — — — —
Cluster 1 : []
 — — — — — — — — —
Cluster 2 : []
 — — — — — — — — —
Cluster 3 : []
 — — — — — — — — —
Cluster 4 : []
 — — — — — — — — —
Cluster 5 : []
 — — — — — — — — —
Cluster 6 : []
 — — — — — — — — —
Cluster 7 : []
 — — — — — — — — —
Cluster 8 : []
 — — — — — — — — —
Cluster 9 : [("'", 748), (',', 360), (' ', 360), ('e', 263), ('a', 211)]
 — — — — — — — — —


In [9]:
# define dictionary topics in same sequential order
# as resulting clusters from gsdmm model 
topic_dict = {}
topic_names = ['topic 0',
               'topic 1',
               'topic 2',
               'topic 3',
               'topic 4',
               'topic 5',
               'topic 6',
               'topic 7',
               'topic 8',
               'topic 9']

for i, topic_num in enumerate(topic_indices):
    topic_dict[topic_num]=topic_names[i]
    
# allocate topics to original data frame 
topic_allocation(tweets_df, docs, mgp, topic_dict)

100%|████████████████████████████████████████| 101/101 [00:00<00:00, 225.52it/s]

Complete. Number of documents with topic allocated: 101





In [10]:
tweets_df.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,tweet_id,text,processed_text,stripped_text,text_lem,text_tokens,cluster,topic_name
0,0,0,1630989944289398784,"I ""think"" I started my menstrual cycle yesterday, but due to the ablation surgery working, I'm not bleeding at all!\n\nIt's really bizarre and cool! I have always been regular, so I can track that it's the right time. But the period part of it is just not there at all. (One still…","i ""think"" i started my menstrual cycle yesterday, but due to the ablation surgery working, i am not bleeding at all! it is really bizarre and cool! i have always been regular, so i can track that it is the right time. but the period part of it is just not there at all. (one still…",think started menstrual cycle yesterday due ablation surgery working bleeding really bizarre cool always regular track right time period part one still…,think started menstrual cycle yesterday due ablation surgery working bleeding really bizarre cool always regular track right time period part one still…,"['think', 'started', 'menstrual', 'cycle', 'yesterday', 'due', 'ablation', 'surgery', 'working', 'bleeding', 'really', 'bizarre', 'cool', 'always', 'regular', 'track', 'right', 'time', 'period', 'part', 'one', 'still…']",0,topic 0
1,1,1,1630974067875381265,cw menstruation ////\n.\n.\n.\ni have had my period for over 15 years. my cycle has always been over a month. when will my brain stop convincing myself that i’m pregnant every time i hit day 29 of my cycle.,cw menstruation //// . . . i have had my period for over 15 years. my cycle has always been over a month. when will my brain stop convincing myself that i am pregnant every time i hit day 29 of my cycle.,menstruation period years cycle always month will brain stop convincing pregnant every time hit day cycle,menstruation period years cycle always month will brain stop convincing pregnant every time hit day cycle,"['menstruation', 'period', 'years', 'cycle', 'always', 'month', 'will', 'brain', 'stop', 'convincing', 'pregnant', 'every', 'time', 'hit', 'day', 'cycle']",0,topic 0
2,2,2,1630965437709053952,"Clubs must ensure that they enhance supportive measures to our women on matters Menstrual Health Hygiene Management (MHM) including period tracking, free sanitary pads, MHM talks internally and with health experts, etc.\n\n#PlayersWelfareKE https://t.co/WyOkrwc8Wh","clubs must ensure that they enhance supportive measures to our women on matters menstrual health hygiene management (mhm) including period tracking, free sanitary pads, mhm talks internally and with health experts, etc.",clubs must ensure enhance supportive measures women matters menstrual health hygiene management mhm including period tracking free sanitary pads mhm talks internally health experts etc,clubs must ensure enhance supportive measures women matters menstrual health hygiene management mhm including period tracking free sanitary pads mhm talks internally health experts etc,"['clubs', 'must', 'ensure', 'enhance', 'supportive', 'measures', 'women', 'matters', 'menstrual', 'health', 'hygiene', 'management', 'mhm', 'including', 'period', 'tracking', 'free', 'sanitary', 'pads', 'mhm', 'talks', 'internally', 'health', 'experts', 'etc']",0,topic 0
3,3,3,1630878576538013696,"#EndPeriodShaming\nI used to think it was a normal narrative until I saw an orphan girl somewhere in the village who couldn't afford menstrual tools for her cycle and she was like ""I wish I was a boy, I wish I had parents""😭\nPads bring girls back to school. Pads end period poverty","i used to think it was a normal narrative until i saw an orphan girl somewhere in the village who could not afford menstrual tools for her cycle and she was like ""i wish i was a boy, i wish i had parents""😭 pads bring girls back to school. pads end period poverty",used think normal narrative saw orphan girl somewhere village afford menstrual tools cycle wish boy wish parents pads bring girls back school pads end period poverty,used think normal narrative saw orphan girl somewhere village afford menstrual tools cycle wish boy wish parents pads bring girls back school pads end period poverty,"['used', 'think', 'normal', 'narrative', 'saw', 'orphan', 'girl', 'somewhere', 'village', 'afford', 'menstrual', 'tools', 'cycle', 'wish', 'boy', 'wish', 'parents', 'pads', 'bring', 'girls', 'back', 'school', 'pads', 'end', 'period', 'poverty']",0,topic 0
4,4,4,1630863346143580161,"Let’s push for ending period stigma, period poverty, provision of free and affordable sanitary pads, and open education on healthy, safe and dignified menstrual practices.\n#KeepGirlsInSchool #EducationForAll #EndPeriodPoverty \n#EndMenstrualStigma https://t.co/Ri7uMMjQwP","let us push for ending period stigma, period poverty, provision of free and affordable sanitary pads, and open education on healthy, safe and dignified menstrual practices.",let push ending period stigma period poverty provision free affordable sanitary pads open education healthy safe dignified menstrual practices,let push ending period stigma period poverty provision free affordable sanitary pads open education healthy safe dignified menstrual practices,"['let', 'push', 'ending', 'period', 'stigma', 'period', 'poverty', 'provision', 'free', 'affordable', 'sanitary', 'pads', 'open', 'education', 'healthy', 'safe', 'dignified', 'menstrual', 'practices']",0,topic 0


In [11]:
# Export data for further analysis
tweets_df.to_csv("dummy data_topics")