Notebook for Twitter Topic Modeling

In [1]:
#pip install git+https://github.com/rwalk/gsdmm.git

In [2]:
import gsdmm
import numpy as np
import pandas as pd
import pickle
import re

from gsdmm import MovieGroupProcess
from tqdm import tqdm

In [3]:
pd.set_option('display.max_colwidth', None)

# Import dataset
tweets_df = pd.read_csv('tweet_processed')
tweets_df.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,tweet_id,tweet_date,text,processed_text,stripped_text,text_lem,text_tokens
0,0,0,1641476680660254726,2023-03-30 16:25:12+00:00,"This is amazing. I've been pretty lucky in that my periods themselves are pretty average -- still painful, but not crippling. But what was crippling was my PMDD and the migraines that were tied to my cycle. To be able to just have that time would be invaluable.","this is amazing. i have been pretty lucky in that my periods themselves are pretty average -- still painful, but not crippling. but what was crippling was my pmdd and the migraines that were tied to my cycle. to be able to just have that time would be invaluable.",amazing pretty lucky periods pretty average still painful crippling crippling pmdd migraines tied cycle able time invaluable,amazing pretty lucky periods pretty average still painful crippling crippling pmdd migraines tied cycle able time invaluable,"['amazing', 'pretty', 'lucky', 'periods', 'pretty', 'average', 'still', 'painful', 'crippling', 'crippling', 'pmdd', 'migraines', 'tied', 'cycle', 'able', 'time', 'invaluable']"
1,1,1,1641473383291400208,2023-03-30 16:12:06+00:00,"@LadySnArkansas Some girls start their menstrual periods in third grade. Teachers must discuss with the girl, give reassurance, provide sanitary pads, etc. Teachers must cope with many situations. This is one.","some girls start their menstrual periods in third grade. teachers must discuss with the girl, give reassurance, provide sanitary pads, etc. teachers must cope with many situations. this is one.",girls start menstrual periods third grade teachers must discuss girl give reassurance provide sanitary pads etc teachers must cope many situations one,girls start menstrual periods third grade teachers must discuss girl give reassurance provide sanitary pads etc teachers must cope many situations one,"['girls', 'start', 'menstrual', 'periods', 'third', 'grade', 'teachers', 'must', 'discuss', 'girl', 'give', 'reassurance', 'provide', 'sanitary', 'pads', 'etc', 'teachers', 'must', 'cope', 'many', 'situations', 'one']"
2,2,2,1641473086661828609,2023-03-30 16:10:55+00:00,@KiraAfter_Dark @RebeccadeLuca8 @CDPROJEKTRED Nope completely wrong. My opinion is women do need their time of period off from work. In some RARE cases keyword is rare here don't let it go over your head. In some RARE cases a woman's menstrual cycle might be painfullyagonizing. But for the other 99 percent. Stop being soft,nope completely wrong. my opinion is women do need their time of period off from work. in some rare cases keyword is rare here do not let it go over your head. in some rare cases a woman's menstrual cycle might be painfullyagonizing. but for the other 99 percent. stop being soft,nope completely wrong opinion women need time period work rare cases keyword rare let head rare cases woman menstrual cycle might painfullyagonizing percent stop soft,nope completely wrong opinion women need time period work rare cases keyword rare let head rare cases woman menstrual cycle might painfullyagonizing percent stop soft,"['nope', 'completely', 'wrong', 'opinion', 'women', 'need', 'time', 'period', 'work', 'rare', 'cases', 'keyword', 'rare', 'let', 'head', 'rare', 'cases', 'woman', 'menstrual', 'cycle', 'might', 'painfullyagonizing', 'percent', 'stop', 'soft']"
3,3,3,1641447805708546048,2023-03-30 14:30:28+00:00,Fluctuating hormone levels during your menstrual cycle can impact your complexion. Here are my #ProTips on caring for your skin during your period.\n\nhttps://t.co/ALEXR0HrUW,fluctuating hormone levels during your menstrual cycle can impact your complexion. here are my on caring for your skin during your period.,fluctuating hormone levels menstrual cycle impact complexion caring skin period,fluctuating hormone levels menstrual cycle impact complexion caring skin period,"['fluctuating', 'hormone', 'levels', 'menstrual', 'cycle', 'impact', 'complexion', 'caring', 'skin', 'period']"
4,4,4,1641439775759667203,2023-03-30 13:58:33+00:00,Scented Sanitary Pads' are a big No-No. Swipe left to know why\n\n#sanitarypads #periods #menstruation #sanitarynapkins #period #menstruationmatters #menstrualhealth\n#Scentedpads https://t.co/SZyN5I95ca,scented sanitary pads' are a big no-no. swipe left to know why,scented sanitary pads big swipe left know,scented sanitary pads big swipe left know,"['scented', 'sanitary', 'pads', 'big', 'swipe', 'left', 'know']"


In [4]:
# convert string of tokens into tokens list
tweets_df['text_tokens'] = tweets_df['text_tokens'].apply(lambda x: re.split('\s', x))

# Create a single list of tweet tokens
docs = tweets_df['text_tokens'].tolist()

docs[:3]

[["['amazing',",
  "'pretty',",
  "'lucky',",
  "'periods',",
  "'pretty',",
  "'average',",
  "'still',",
  "'painful',",
  "'crippling',",
  "'crippling',",
  "'pmdd',",
  "'migraines',",
  "'tied',",
  "'cycle',",
  "'able',",
  "'time',",
  "'invaluable']"],
 ["['girls',",
  "'start',",
  "'menstrual',",
  "'periods',",
  "'third',",
  "'grade',",
  "'teachers',",
  "'must',",
  "'discuss',",
  "'girl',",
  "'give',",
  "'reassurance',",
  "'provide',",
  "'sanitary',",
  "'pads',",
  "'etc',",
  "'teachers',",
  "'must',",
  "'cope',",
  "'many',",
  "'situations',",
  "'one']"],
 ["['nope',",
  "'completely',",
  "'wrong',",
  "'opinion',",
  "'women',",
  "'need',",
  "'time',",
  "'period',",
  "'work',",
  "'rare',",
  "'cases',",
  "'keyword',",
  "'rare',",
  "'let',",
  "'head',",
  "'rare',",
  "'cases',",
  "'woman',",
  "'menstrual',",
  "'cycle',",
  "'might',",
  "'painfullyagonizing',",
  "'percent',",
  "'stop',",
  "'soft']"]]

In [5]:
# Train STTM model
mgp = MovieGroupProcess(K=5, alpha=0.1, beta=0.1, n_iters=15)
vocab = set(x for doc in docs for x in doc)
n_terms = len(vocab)
y = mgp.fit(docs, n_terms)

# Save model
with open('clusters.model', 'wb') as f:
    pickle.dump(mgp, f)
    f.close()

In stage 0: transferred 10474 clusters with 5 clusters populated
In stage 1: transferred 6115 clusters with 5 clusters populated
In stage 2: transferred 2899 clusters with 5 clusters populated
In stage 3: transferred 2007 clusters with 5 clusters populated
In stage 4: transferred 1629 clusters with 5 clusters populated
In stage 5: transferred 1451 clusters with 5 clusters populated
In stage 6: transferred 1353 clusters with 5 clusters populated
In stage 7: transferred 1276 clusters with 5 clusters populated
In stage 8: transferred 1189 clusters with 5 clusters populated
In stage 9: transferred 1160 clusters with 5 clusters populated
In stage 10: transferred 1107 clusters with 5 clusters populated
In stage 11: transferred 1043 clusters with 5 clusters populated
In stage 12: transferred 1019 clusters with 5 clusters populated
In stage 13: transferred 1035 clusters with 5 clusters populated
In stage 14: transferred 1001 clusters with 5 clusters populated


In [6]:
# Load in trained model 
filehandler = open('clusters.model', 'rb')
mgp = pickle.load(filehandler)

In [7]:
# Define helper functions

# Prints the top words in each cluster
def top_words(cluster_word_distribution, top_cluster, values):
    for cluster in top_cluster:
        sort_dicts =sorted(mgp.cluster_word_distribution[cluster].items(), key=lambda k: k[1], reverse=True)[:values]
        print('Cluster %s : %s'%(cluster,sort_dicts))
        print(' — — — — — — — — —')
        
# Returns a word-topic matrix[phi] where each value represents the word importance for that particular cluster; 
# phi[i][w] would be the importance of word w in topic i.
def cluster_importance(mgp):
    n_z_w = mgp.cluster_word_distribution
    beta, V, K = mgp.beta, mgp.vocab_size, mgp.K
    phi = [{} for i in range(K)]
    for z in range(K):
        for w in n_z_w[z]:
            phi[z][w] = (n_z_w[z][w]+beta)/(sum(n_z_w[z].values())+V*beta)
    return phi

# Allocates all topics to each document in original dataframe,
# adding two columns for cluster number and cluster description
def topic_allocation(df, docs, mgp, topic_dict):
    topic_allocations = []
    for doc in tqdm(docs):
        topic_label, score = mgp.choose_best_label(doc)
        topic_allocations.append(topic_label)

    df['cluster'] = topic_allocations

    df['topic_name'] = df.cluster.apply(lambda x: get_topic_name(x, topic_dict))
    print('Complete. Number of documents with topic allocated: {}'.format(len(df)))

# Returns the topic name string value from a dictionary of topics
def get_topic_name(doc, topic_dict):
    topic_desc = topic_dict[doc]
    return topic_desc

In [8]:
doc_count = np.array(mgp.cluster_doc_count)
print('Number of documents per topic :', doc_count)
print('*'*20)

# Topics sorted by the number of documents they are allocated to
top_index = doc_count.argsort()[-5:][::-1]
print('Most important clusters (by number of docs inside):', top_index)
print('*'*20)

# show the top 20 words in term frequency for each cluster 
topic_indices = np.arange(start=0, stop=len(doc_count), step=1)
top_words(mgp.cluster_word_distribution, topic_indices, 20)

Number of documents per topic : [5453 2343 3150 3013  885]
********************
Most important clusters (by number of docs inside): [0 2 3 1 4]
********************
Cluster 0 : [("'cycle',", 5102), ("'period',", 4457), ("'menstrual',", 3190), ("'days',", 1106), ("'periods',", 1080), ("'now',", 765), ("'pms',", 714), ("'day',", 704), ("'week',", 607), ("'month',", 581), ("'got',", 566), ("'time',", 512), ("'last',", 512), ("'cycle']", 501), ("'every',", 496), ("'pain',", 474), ("'cramps',", 450), ("'weeks',", 439), ("'know',", 428), ("'will',", 423)]
 — — — — — — — — —
Cluster 1 : [("'sanitary',", 1756), ("'pads',", 1540), ("'period',", 1539), ("'menstrual',", 1162), ("'periods',", 590), ("'cycle',", 572), ("'products',", 518), ("'women',", 457), ("'tampons',", 437), ("'poverty',", 434), ("'use',", 304), ("'free',", 278), ("'menstruation',", 233), ("'cups',", 229), ("'tampon',", 224), ("'will',", 218), ("'pad',", 211), ("'one',", 197), ("'girls',", 187), ("'know',", 165)]
 — — — — — — —

In [9]:
# define dictionary topics in same sequential order
# as resulting clusters from gsdmm model 
topic_dict = {}
topic_names = ['Period Symptoms',
               'Period Products',
               'Menstrual Equity/Accesibility',
               'Menstrual Cycle Timing/Tracking',
               'Menstrual Unity',]

for i, topic_num in enumerate(topic_indices):
    topic_dict[topic_num]=topic_names[i]
    
# allocate topics to original data frame 
topic_allocation(tweets_df, docs, mgp, topic_dict)

100%|███████████████████████████████████| 14844/14844 [00:03<00:00, 3969.29it/s]

Complete. Number of documents with topic allocated: 14844





In [10]:
tweets_df.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,tweet_id,tweet_date,text,processed_text,stripped_text,text_lem,text_tokens,cluster,topic_name
0,0,0,1641476680660254726,2023-03-30 16:25:12+00:00,"This is amazing. I've been pretty lucky in that my periods themselves are pretty average -- still painful, but not crippling. But what was crippling was my PMDD and the migraines that were tied to my cycle. To be able to just have that time would be invaluable.","this is amazing. i have been pretty lucky in that my periods themselves are pretty average -- still painful, but not crippling. but what was crippling was my pmdd and the migraines that were tied to my cycle. to be able to just have that time would be invaluable.",amazing pretty lucky periods pretty average still painful crippling crippling pmdd migraines tied cycle able time invaluable,amazing pretty lucky periods pretty average still painful crippling crippling pmdd migraines tied cycle able time invaluable,"[['amazing',, 'pretty',, 'lucky',, 'periods',, 'pretty',, 'average',, 'still',, 'painful',, 'crippling',, 'crippling',, 'pmdd',, 'migraines',, 'tied',, 'cycle',, 'able',, 'time',, 'invaluable']]",0,Period Symptoms
1,1,1,1641473383291400208,2023-03-30 16:12:06+00:00,"@LadySnArkansas Some girls start their menstrual periods in third grade. Teachers must discuss with the girl, give reassurance, provide sanitary pads, etc. Teachers must cope with many situations. This is one.","some girls start their menstrual periods in third grade. teachers must discuss with the girl, give reassurance, provide sanitary pads, etc. teachers must cope with many situations. this is one.",girls start menstrual periods third grade teachers must discuss girl give reassurance provide sanitary pads etc teachers must cope many situations one,girls start menstrual periods third grade teachers must discuss girl give reassurance provide sanitary pads etc teachers must cope many situations one,"[['girls',, 'start',, 'menstrual',, 'periods',, 'third',, 'grade',, 'teachers',, 'must',, 'discuss',, 'girl',, 'give',, 'reassurance',, 'provide',, 'sanitary',, 'pads',, 'etc',, 'teachers',, 'must',, 'cope',, 'many',, 'situations',, 'one']]",2,Menstrual Equity/Accesibility
2,2,2,1641473086661828609,2023-03-30 16:10:55+00:00,@KiraAfter_Dark @RebeccadeLuca8 @CDPROJEKTRED Nope completely wrong. My opinion is women do need their time of period off from work. In some RARE cases keyword is rare here don't let it go over your head. In some RARE cases a woman's menstrual cycle might be painfullyagonizing. But for the other 99 percent. Stop being soft,nope completely wrong. my opinion is women do need their time of period off from work. in some rare cases keyword is rare here do not let it go over your head. in some rare cases a woman's menstrual cycle might be painfullyagonizing. but for the other 99 percent. stop being soft,nope completely wrong opinion women need time period work rare cases keyword rare let head rare cases woman menstrual cycle might painfullyagonizing percent stop soft,nope completely wrong opinion women need time period work rare cases keyword rare let head rare cases woman menstrual cycle might painfullyagonizing percent stop soft,"[['nope',, 'completely',, 'wrong',, 'opinion',, 'women',, 'need',, 'time',, 'period',, 'work',, 'rare',, 'cases',, 'keyword',, 'rare',, 'let',, 'head',, 'rare',, 'cases',, 'woman',, 'menstrual',, 'cycle',, 'might',, 'painfullyagonizing',, 'percent',, 'stop',, 'soft']]",0,Period Symptoms
3,3,3,1641447805708546048,2023-03-30 14:30:28+00:00,Fluctuating hormone levels during your menstrual cycle can impact your complexion. Here are my #ProTips on caring for your skin during your period.\n\nhttps://t.co/ALEXR0HrUW,fluctuating hormone levels during your menstrual cycle can impact your complexion. here are my on caring for your skin during your period.,fluctuating hormone levels menstrual cycle impact complexion caring skin period,fluctuating hormone levels menstrual cycle impact complexion caring skin period,"[['fluctuating',, 'hormone',, 'levels',, 'menstrual',, 'cycle',, 'impact',, 'complexion',, 'caring',, 'skin',, 'period']]",3,Menstrual Cycle Timing/Tracking
4,4,4,1641439775759667203,2023-03-30 13:58:33+00:00,Scented Sanitary Pads' are a big No-No. Swipe left to know why\n\n#sanitarypads #periods #menstruation #sanitarynapkins #period #menstruationmatters #menstrualhealth\n#Scentedpads https://t.co/SZyN5I95ca,scented sanitary pads' are a big no-no. swipe left to know why,scented sanitary pads big swipe left know,scented sanitary pads big swipe left know,"[['scented',, 'sanitary',, 'pads',, 'big',, 'swipe',, 'left',, 'know']]",4,Menstrual Unity


In [11]:
# Export data for further analysis
tweets_df.to_csv("tweet_topics")