In [15]:
'3) Key Topic Modeling'

'''
This file takes the data that was prepared in the 1) Data Prep and uses it to extract the top 20 
key topics, by MBTI type, for the comments. 

The files does the following:
    - Loads libraries and User Defined Functions
    - Loads the Data and Creates a list of MBTI types
    - Loops over the comments data by MBTI and extracts the top 20 topics and prints them to the screen
'''

'\nThis file takes the data that was prepared in the 1) Data Prep and uses it to extract the top 20 \nkey topics, by MBTI type, for the comments. \n\nThe files does the following:\n    - Loads libraries and User Defined Functions\n    - Loads the Data and Creates a list of MBTI types\n    - Loops over the comments data by MBTI and extracts the top 20 topics and prints them to the screen\n'

In [16]:
#Import Libraries
import numpy as np
import pandas as pd
import pickle

from gensim import corpora, models
from normalization import normalize_corpus

In [17]:
#Load User Defined Functions
def print_topics_gensim(topic_model, total_topics=1,
                        weight_threshold=0.0001,
                        display_weights=False,
                        num_terms=None):
    
    for index in range(total_topics):
        topic = topic_model.show_topic(index)
        topic = [(word, round(wt,2)) 
                 for word, wt in topic 
                 if abs(wt) >= weight_threshold]
        if display_weights:
            print('Topic #'+str(index+1)+' with weights')
            print(topic[:num_terms] if num_terms else topic)
        else:
            print('Topic #'+str(index+1)+' without weights')
            tw = [term for term, wt in topic]
            print(tw[:num_terms] if num_terms else tw)
        print()

In [18]:
#Load the cleaned MBTI data
cleaned_mbti_token_cmtlvl = pd.read_pickle("cleaned_mbti_token_cmtlvl.pkl")
print(cleaned_mbti_token_cmtlvl.head())

mbti_list = cleaned_mbti_token_cmtlvl.iloc[:,3].values.tolist()
mbti_list = list(set(mbti_list))

print(mbti_list)

   user commentnum                                            comment  type  \
0     0         s1                                               'url  INFJ   
1     0         s2                                                url  INFJ   
2     0         s3  enfp and intj moments url sportscenter not top...  INFJ   
3     0         s4  What has been the most life-changing experienc...  INFJ   
4     0         s5               url url On repeat for most of today   INFJ   

   type_enc             0           1       2      3             4  ...   \
0         8           url        None    None   None          None  ...    
1         8           url        None    None   None          None  ...    
2         8          enfp        intj  moment    url  sportscenter  ...    
3         8  lifechanging  experience    life   None          None  ...    
4         8           url         url  repeat  today          None  ...    

     56    57    58    59    60    61    62    63    64    65  
0  N

In [21]:
#For each MBTI Type in the data, extract the top 20 topics 

total_topics = 20 #Number of topics

#Define the words to be removed from the lists
wordlist = ['url','infj','intj','infp','intp','enfj','entj','enfp','entp','isfj',
                'istj','isfp','istp','esfj','estj','esfp','estp','tapatalk']

for mbti in mbti_list:
    print('-----------------------------------------------------------------------------------')
    print('-----------------------------------------------------------------------------------')
    print('-----------------------------------------------------------------------------------')
    print(mbti)
    
    #Subset the data
    subset = cleaned_mbti_token_cmtlvl.loc[cleaned_mbti_token_cmtlvl['type'] == mbti]
    print(subset.shape)
    print()
    
    #Transform the data to list form
    features = subset.iloc[:,5:].values.tolist()
    print(features[0:2])
    print()

    #Remove common/superfuerlous words from the lists
    feature_none = []
    for x in features:
        y = list(filter(None.__ne__, x))
        z = [z for z in y if z not in wordlist]    
        feature_none.append(z)
        
    #Create a dictionary of the words
    dictionary = corpora.Dictionary(feature_none)
    print(len(dictionary.token2id))

    #Transform the document to a BOW
    corpus = [dictionary.doc2bow(text, allow_update=True) for text in feature_none]
    #print(corpus[:2])

    #Transform to TFIDF
    tfidf = models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]

    #Extract top topics using Latent Semantic Indexing
    lsi = models.LsiModel(corpus_tfidf, 
                          id2word=dictionary, 
                          num_topics=total_topics)

    #Print the top topics
    print_topics_gensim(topic_model=lsi,
                        total_topics=total_topics,
                        num_terms=15,
                        display_weights=False)

-----------------------------------------------------------------------------------
-----------------------------------------------------------------------------------
-----------------------------------------------------------------------------------
ISFP
(12971, 71)

[['paint', 'without', 'number', None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None], ['would', 'guess', 'istp', None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None


4475
Topic #1 without weights
['like', 'think', 'would', 'know', 'people', 'get', 'type', 'say', 'feel', 'one']

Topic #2 without weights
['welcome', 'forum', 'enjoy', 'new', 'proud', 'sarah', 'hi', 'home', 'hope', 'man']

Topic #3 without weights
['type', 'function', 'test', 'like', 'feel', 'use', 'fe', 'personality', 'take', 'ne']

Topic #4 without weights
['type', 'use', 'test', 'like', 'phone', 'sent', 'function', 'thanks', 'fe', 'take']

Topic #5 without weights
['like', 'type', 'feel', 'would', 'thread', 'well', 'post', 'want', 'think', 'lot']

Topic #6 without weights
['would', 'friend', 'say', 'like', 'time', 'use', 'phone', 'get', 'thanks', 'best']

Topic #7 without weights
['phone', 'use', 'test', 'sent', 'take', 'mean', 'would', 'love', 'get', 'us']

Topic #8 without weights
['friend', 'think', 'sure', 'thanks', 'thank', 'well', 'best', 'help', 'close', 'feel']

Topic #9 without weights
['type', 'help', 'fe', 'thanks', 'think', 'function', 'make', 'people', 'lot', 'get']

T


Topic #19 without weights
['laugh', 'well', 'sure', 'tell', 'feel', 'one', 'make', 'right', 'even', 'sound']

Topic #20 without weights
['well', 'sure', 'get', 'go', 'post', 'welcome', 'want', 'test', 'love', 'maybe']

-----------------------------------------------------------------------------------
-----------------------------------------------------------------------------------
-----------------------------------------------------------------------------------
ISTJ
(9870, 71)

[['newton', 'universal', 'gravity', 'law', 'mean', 'seriously', 'would', 'nothing', 'follow', 'law', 'dust', 'particle', 'space', None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None], ['well', 'money', 'time', 'object', 'would', 'backpack', 'way'


11955
Topic #1 without weights
['like', 'think', 'would', 'know', 'get', 'people', 'say', 'one', 'make', 'thing']

Topic #2 without weights
['type', 'think', 'welcome', 'want', 'like', 'say', 'know', 'would', 'get', 'personality']

Topic #3 without weights
['welcome', 'type', 'think', 'know', 'post', 'hello', 'get', 'say', 'love', 'back']

Topic #4 without weights
['know', 'would', 'say', 'welcome', 'want', 'get', 'make', 'question', 'thread', 'feel']

Topic #5 without weights
['like', 'know', 'want', 'feel', 'thread', 'post', 'sound', 'say', 'would', 'yes']

Topic #6 without weights
['like', 'get', 'know', 'type', 'thread', 'sound', 'good', 'yes', 'post', 'think']

Topic #7 without weights
['thread', 'think', 'know', 'would', 'welcome', 'post', 'say', 'go', 'get', 'people']

Topic #8 without weights
['think', 'yes', 'type', 'good', 'thread', 'post', 'people', 'personality', 'like', 'know']

Topic #9 without weights
['yes', 'thread', 'think', 'type', 'get', 'good', 'post', 'well', 'la

-----------------------------------------------------------------------------------
-----------------------------------------------------------------------------------
-----------------------------------------------------------------------------------
INTP
(63251, 71)

[['good', 'one', 'url', None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None], ['course', 'say', 'know', 'blessing', 'curse', None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None


[['want', 'go', 'trip', 'without', 'stay', 'behind', 'would', 'option', 'think', 'really', 'believe', 'one', 'unreasonable', 'still', 'continue', 'say', None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None], ['still', 'completely', 'awe', 'amazed', 'think', 'twice', 'leave', 'one', 'make', 'feel', 'say', 'want', 'tell', 'willing', 'work', 'extra', None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None]]

20082
Topic #1 without weights
['like', 'think', 'know', 'would', 'people', 'get', 'really', 'say', '

In [20]:
'''
Observations of the topics by MBTI type:
- Overall it appears the the same topics are extracted for each MBTI. This make sense since forum which the data is from
involves people from all types discusing topics. 
'''

'\nObservaations of the topics by MBTI type:\n\n'

In [23]:
#For each MBTI Type in the data, extract the top 20 topics 

total_topics = 20 #Number of topics

#Define the words to be removed from the lists
wordlist = ['url','infj','intj','infp','intp','enfj','entj','enfp','entp','isfj',
                'istj','isfp','istp','esfj','estj','esfp','estp','tapatalk']

print('-----------------------------------------------------------------------------------')
print('-----------------------------------------------------------------------------------')
print('-----------------------------------------------------------------------------------')
    
#Subset the data
subset = cleaned_mbti_token_cmtlvl
print(subset.shape)
print()

#Transform the data to list form
features = subset.iloc[:,5:].values.tolist()

#Remove common/superfuerlous words from the lists
feature_none = []
for x in features:
    y = list(filter(None.__ne__, x))
    z = [z for z in y if z not in wordlist]    
    feature_none.append(z)

#Create a dictionary of the words
dictionary = corpora.Dictionary(feature_none)
print(len(dictionary.token2id))

#Transform the document to a BOW
corpus = [dictionary.doc2bow(text, allow_update=True) for text in feature_none]
#print(corpus[:2])

#Transform to TFIDF
tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]

#Extract top topics using Latent Semantic Indexing
lsi = models.LsiModel(corpus_tfidf, 
                      id2word=dictionary, 
                      num_topics=total_topics)

#Print the top topics
print_topics_gensim(topic_model=lsi,
                        total_topics=total_topics,
                        num_terms=15,
                        display_weights=False)

-----------------------------------------------------------------------------------
-----------------------------------------------------------------------------------
-----------------------------------------------------------------------------------
ENFP
(421757, 71)

[['url', None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None], ['url', None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, Non

In [None]:
'''
Observations of the topics:
- Welcome to the forum 
- Feelings
- Relationships/Friends
- Thanks to other uses comments

'''