In [62]:
'3) Key Topic Modeling'

'''
This file takes the data that was prepared in the 1) Data Prep and uses it to extract the top 20 
key topics, by MBTI type, for the comments. 

The files does the following:
    - Loads libraries and User Defined Functions
    - Loads the Data and Creates a list of MBTI types
    - Loops over the comments data by MBTI and extracts the top 20 topics and prints them to the screen
'''

'\nThis file takes the data that was prepared in the 1) Data Prep and uses it to extract the top 20 \nkey topics, by MBTI type, for the comments. \n\nThe files does the following:\n    - Loads libraries and User Defined Functions\n    - Loads the Data and Creates a list of MBTI types\n    - Loops over the comments data by MBTI and extracts the top 20 topics and prints them to the screen\n'

In [63]:
#Import Libraries
import numpy as np
import pandas as pd
import pickle

from gensim import corpora, models
from normalization import normalize_corpus

In [64]:
#Load User Defined Functions
def print_topics_gensim(topic_model, total_topics=1,
                        weight_threshold=0.0001,
                        display_weights=False,
                        num_terms=None):
    
    for index in range(total_topics):
        topic = topic_model.show_topic(index)
        topic = [(word, round(wt,2)) 
                 for word, wt in topic 
                 if abs(wt) >= weight_threshold]
        if display_weights:
            print('Topic #'+str(index+1)+' with weights')
            print(topic[:num_terms] if num_terms else topic)
        else:
            print('Topic #'+str(index+1)+' without weights')
            tw = [term for term, wt in topic]
            print(tw[:num_terms] if num_terms else tw)
        print()

In [65]:
#Load the cleaned MBTI data
cleaned_mbti_token_userlvl = pd.read_pickle("cleaned_mbti_token_userlvl.pkl")
print(cleaned_mbti_token_userlvl.head())

mbti_list = cleaned_mbti_token_userlvl.iloc[:,0].values.tolist()
mbti_list = list(set(mbti_list))

print(mbti_list)

   type                                              posts  type_enc  \
0  INFJ  'http://www.youtube.com/watch?v=qsXHcwe3krw|||...         8   
1  ENTP  'I'm finding the lack of me in these posts ver...         3   
2  INTP  'Good one  _____   https://www.youtube.com/wat...        11   
3  INTJ  'Dear INTP,   I enjoyed our conversation the o...        10   
4  ENTJ  'You're fired.|||That's another silly misconce...         2   

                                             comment     0        1      2  \
0  'url url enfp and intj moments url sportscente...   url      url   enfp   
1  'I'm finding the lack of me in these posts ver...  find     lack   post   
2  'Good one url Of course to which I say I know;...  good      one    url   
3  'Dear INTP I enjoyed our conversation the othe...  dear     intp  enjoy   
4  'You're fired That's another silly misconcepti...  fire  another  silly   

               3         4         5  ...    886   887   888   889   890  \
0           intj    mo

In [69]:
#For each MBTI Type in the data, extract the top 20 topics 

total_topics = 20 #Number of topics

#Define the words to be removed from the lists
wordlist = ['url','infj','intj','infp','intp','enfj','entj','enfp','entp','isfj',
                'istj','isfp','istp','esfj','estj','esfp','estp','tapatalk']

for mbti in mbti_list:
    print('-----------------------------------------------------------------------------------')
    print('-----------------------------------------------------------------------------------')
    print('-----------------------------------------------------------------------------------')
    print(mbti)
    
    #Subset the data
    subset = cleaned_mbti_token_userlvl.loc[cleaned_mbti_token_userlvl['type'] == mbti]
    
    #Transform the data to list form
    features = subset.iloc[:,4:].values.tolist()

    #Remove common/superfuerlous words from the lists
    feature_none = []
    for x in features:
        y = list(filter(None.__ne__, x))
        z = [z for z in y if z not in wordlist]    
        feature_none.append(z)

    labels = cleaned_mbti_token_userlvl.iloc[:,[1,3]].values.tolist()

    #Create a dictionary of the words
    dictionary = corpora.Dictionary(feature_none)
    #print( dictionary.token2id)

    #Transform the document to a BOW
    corpus = [dictionary.doc2bow(text) for text in feature_none]
    #print(corpus[:2])

    #Transform to TFIDF
    tfidf = models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]

    #Extract top topics using Latent Semantic Indexing
    lsi = models.LsiModel(corpus_tfidf, 
                          id2word=dictionary, 
                          num_topics=total_topics)

    #Print the top topics
    print_topics_gensim(topic_model=lsi,
                        total_topics=total_topics,
                        num_terms=15,
                        display_weights=False)

-----------------------------------------------------------------------------------
-----------------------------------------------------------------------------------
-----------------------------------------------------------------------------------
ISFP
Topic #1 without weights
['function', 'dont', 'p', 'hata', 'thanks', 'pretty', 'yeah', 'school', 'sometimes', 'welcome']

Topic #2 without weights
['per', 'welcome', 'socionics', 'hug', 'function', 'se', 'fe', 'ne', 'si', 'met']

Topic #3 without weights
['per', 'welcome', 'hug', 'socionics', 'huge', 'function', 'hi', 'fe', 'hello', 'se']

Topic #4 without weights
['dont', 'u', 'husband', 'id', 'clot', 'character', 'heh', 'ill', 'yea', 'fe']

Topic #5 without weights
['socionics', 'sos', '3w2', 'ss', '2w3', '8w7', '7w8', 'sso', '7w6', '3w4']

Topic #6 without weights
['socionics', 'introvert', 'thanks', 'husband', 'dear', 'esfjs', 'song', 'met', 'extrovert', 'dont']

Topic #7 without weights
['socionics', 'hi', 'per', 'dont', 'hug', 

Topic #1 without weights
['function', 'ni', 'hata', 'fe', 'thank', 'welcome', 'relationship', 'dear', 'dont', 'yeah']

Topic #2 without weights
['function', 'ni', 'fe', 'ne', 'ti', 'si', 'se', 'te', 'user', 'dominant']

Topic #3 without weights
['sent', 'phone', 'llh343', 'send', 'z936l', 'samsonsg360a', 'use', 'sg920w8', 'emboli23', 'sg935f']

Topic #4 without weights
['1w2', 'sso', 'welcome', '2w1', 'sos', '5w6', '4w5', 'hello', 'ss', 'u']

Topic #5 without weights
['u', 'dont', 'welcome', '1w2', 'sso', '2w1', '5w6', 'hello', 'hi', 'sos']

Topic #6 without weights
['u', 'dont', 'welcome', 'sso', '1w2', 'tilt', 'hello', 'sos', '2w1', '5w6']

Topic #7 without weights
['dear', 'welcome', 'sincerely', 'hello', 'ni', 'si', 'hi', 'fe', 'function', 'ne']

Topic #8 without weights
['tilt', 'welcome', 'hello', 'u', 'hi', 'likely', 'dear', 'hata', 'obligatory', 'kitten']

Topic #9 without weights
['tilt', 'dear', 'dont', 'welcome', 'hi', 'hello', 'thank', '1w2', 'u', 'hata']

Topic #10 without

ENFJ
Topic #1 without weights
['hata', 'welcome', 'dont', 'fe', 'relationship', 'p', 'enfjs', 'date', 'school', 'hi']

Topic #2 without weights
['fe', 'ne', 'ni', 'welcome', 'dont', 'function', 'hata', 'ti', 'si', 'user']

Topic #3 without weights
['sent', 'phone', 'pad', 'loss', 'emboli122', 'welcome', 'emboli2', 'dont', 'ex', 'send']

Topic #4 without weights
['welcome', 'hi', 'blush', 'dont', 'sent', 'wink', 'u', 'proud', 'forum', 'bass']

Topic #5 without weights
['dont', 'ne', 'p', 'blush', 'yea', 'hata', 'sent', 'dan', 'ni', 'ss']

Topic #6 without weights
['date', 'u', 'dear', 'ex', 'hi', 'sent', 'fuck', 'hang', 'boyfriend', 'eat']

Topic #7 without weights
['dear', 'u', 'sincerely', 'welcome', 'dont', 'boyfriend', 'giggler', 'job', 'per', 'enfjs']

Topic #8 without weights
['dear', 'welcome', 'sincerely', 'hang', 'hi', 'book', 'movie', 'tongue', 'giggler', 'dont']

Topic #9 without weights
['blush', 'hata', 'input', 'disorder', 'clot', 'dear', 'cause', 'wink', 'u', 'giggler']



['guitar', 'health', 'somebody', 'fantasize', 'buttonhole', 'dont', 'gutta', 'wink', 'obvious', 'interview']

Topic #17 without weights
['interview', 'cough', 'character', 'college', 'via', 'jetplane48', 'movie', 'strength', 'extroverted', 'tomboy']

Topic #18 without weights
['interview', 'yea', 'jetplane48', 'via', 'interaction', 'bah', 'notice', 'connection', 'honestly', 'corinthian']

Topic #19 without weights
['bah', 'tomboy', 'cough', 'bear', 'phone', 'money', 'guitar', 'x', 'aristocratic', 'dream']

Topic #20 without weights
['bah', 'sex', 'estps', 'p', 'money', 'mark', 'tomboy', 'dizzy', 'masturbate', 'car']

-----------------------------------------------------------------------------------
-----------------------------------------------------------------------------------
-----------------------------------------------------------------------------------
INFP
Topic #1 without weights
['dont', 'hata', 'p', 'thank', 'function', 'type', 'relationship', 'welcome', 'school', 'yeah

In [67]:
'''
Observations of the topics by MBTI type:
- Across the MBTI types some common theme occur:
    Personality
    Relationship
    Music
    
- When reviewing across the MBTI types, many of the key topics are difficult to summaries. This is likely due to the wide 
variety of topics discussed on the target forum. When summarizing at the user level the various topics appear to be intermingled.
Reviewing the key topics using comment level data may provide more insights. 

'''

'\nObservaations of the topics by MBTI type:\n\n'