In [1]:
## Joscelin is following these steps: https://github.com/Sandeep-Panchal/Topic-Modeling-with-LDA/blob/master/Topic-Modeling-IPYNB/Topic%20Modeling%20with%20LDA.ipynb

import numpy as np
import pandas as pd
import random

import warnings
warnings.filterwarnings('ignore')

In [16]:
# Read the cleaned Twitter export, keep only the columns this notebook uses,
# and discard every row with a missing value in any of those columns.
df = pd.read_csv('data/twitter_data_clean_final.csv')[
    ["user_id", "user_bio", "tokens", "state", "region"]
].dropna()

# Work on an independent deep copy so that `df` is not modified by later cells.
import copy

df_new = copy.deepcopy(df)

# Custom stop words: tokens removed from every bio before vectorizing.
# Each entry must match a token exactly (tokens are produced by splitting on a
# single space), so entries must not carry surrounding whitespace -- the former
# ' let' entry could never match anything and has been corrected to 'let'.
to_remove = [
    'act', 'car', 'card', 'acct', 'acab', 'ada', 'admin', 'aew', 'age', 'ain', 'aka', 'alt', 'alum',
    'angeleno', 'answer', 'antonio', 'app',
    'area', 'ask', 'atl', 'ben', 'brady', 'brandon', 'chair', 'circle', 'clue', 'collab', 'com',
    'come', 'day', 'dedicate',
    'dev', 'dms', 'dont', 'dot', 'drive', 'driver', 'eat', 'email', 'eth', 'feed', 'form', 'fsu',
    'gas', 'giants',
    'google', 'group', 'hair', 'half', 'harris', 'head', 'hear', 'hip', 'hire', 'hit', 'hiv',
    'holder', 'host', 'hold',
    'huge', 'ill', 'info', 'inquire', 'inquires', 'internet', 'investor', 'investors', 'jeff',
    'jack', 'job', 'joe',
    'john', 'join', 'know', 'las', 'lake', 'let', 'lil', 'line', 'link', 'listen', 'little', 'llc',
    'los', 'main',
    'meet', 'member', 'miles', 'mma', 'multi', 'near', 'need', 'nice', 'nominate', 'non', 'nut',
    'okay', 'open',
    'order', 'padres', 'page', 'park', 'pin', 'plug', 'print', 'provide', 'providence', 'ram',
    'sag', 'say', 'sell', 'set',
    'stl', 'sth', 'sub', 'syndication', 'tell', 'tony', 'true', 'try', 'tryna', 'type', 'typos',
    'usa', 'use', 'valley',
    'verse', 'view', 'way', 'year', 'years', 'yrs',
]


            
def stringToList(string):
    """Split ``string`` on single space characters and return the pieces as a list.

    The separator is the literal ``" "``, so consecutive spaces (or an empty
    input) produce empty-string items in the result.
    """
    return string.split(" ")

# Turn each bio's space-separated token string into a list of words.
ls_tokens = [stringToList(token_string) for token_string in df_new.tokens]

# Drop every custom stop word from each token list (lists are updated in place).
for token_list in ls_tokens:
    token_list[:] = [tok for tok in token_list if tok not in to_remove]

df_new['ls_tokens'] = ls_tokens

# Re-join the filtered tokens into one space-separated string per bio,
# which is the text the vectorizer consumes.
df_new['new_tokens'] = [
    ' '.join(str(tok) for tok in token_list)
    for token_list in df_new['ls_tokens']
]
df_new

Unnamed: 0,user_id,user_bio,tokens,state,region,ls_tokens,new_tokens
0,1.180000e+18,"I'm a nerdy guy who loves art, music, wrestlin...",nerdy guy love art music wrestle comics poly,OH,Midwest,"[nerdy, guy, love, art, music, wrestle, comics...",nerdy guy love art music wrestle comics poly
1,1.440000e+18,No worries Lil Ms Sunshine cause I don't mess ...,worry lil sunshine cause mess woman moon civil...,PA,Northeast,"[worry, sunshine, cause, mess, woman, moon, ci...",worry sunshine cause mess woman moon civil rig...
2,1.219720e+08,I am a huge sports and music fan. I graduated ...,huge sport music fan graduate regis university...,CO,West,"[sport, music, fan, graduate, regis, universit...",sport music fan graduate regis university live...
3,1.390000e+18,"This is not, nor endorsed by, any Government E...",endorse government entity project guarantee wa...,WA,West,"[endorse, government, entity, project, guarant...",endorse government entity project guarantee wa...
4,2.349053e+07,I'm a writer and a cult film fanatic who loves...,writer cult film fanatic love politics,UT,West,"[writer, cult, film, fanatic, love, politics]",writer cult film fanatic love politics
...,...,...,...,...,...,...,...
1723,2.961228e+08,Christian. Husband. Father. Ph.D. Candidate @B...,christian husband father candidate mosquito re...,OH,Midwest,"[christian, husband, father, candidate, mosqui...",christian husband father candidate mosquito re...
1724,8.800000e+17,"MarTech, MBA, Foodie, Bilingual, Soccer, Boxin...",martech mba foodie bilingual soccer box gamer ...,CA,West,"[martech, mba, foodie, bilingual, soccer, box,...",martech mba foodie bilingual soccer box gamer ...
1725,1.153059e+09,"streams occasionally, will choose race cars ov...",stream occasionally choose race cars important...,KY,South,"[stream, occasionally, choose, race, cars, imp...",stream occasionally choose race cars important...
1726,2.432869e+08,Enjoy the practical wisdom of daily life. Prom...,enjoy practical wisdom daily life promote trut...,AZ,Southwest,"[enjoy, practical, wisdom, daily, life, promot...",enjoy practical wisdom daily life promote trut...


In [17]:
# Bag-of-words vectorizer (raw term counts, not TF-IDF).
from sklearn.feature_extraction.text import CountVectorizer

# Ignore English stop words, terms present in more than 90% of the bios,
# and terms that occur in fewer than 3 bios.
cv = CountVectorizer(stop_words='english', min_df=3, max_df=0.90)

# Learn the vocabulary and build the document-term count matrix in one pass.
cv_fit = cv.fit_transform(df_new['new_tokens'])

print('\nShape of the sparse matrix\n')
cv_fit


Shape of the sparse matrix



<1644x825 sparse matrix of type '<class 'numpy.int64'>'
	with 6348 stored elements in Compressed Sparse Row format>

In [25]:
# Latent Dirichlet Allocation implementation from scikit-learn.
from sklearn.decomposition import LatentDirichletAllocation

# Model with 55 topics; the fixed random_state makes repeated runs comparable.
lda = LatentDirichletAllocation(random_state=3, n_components=55)

In [26]:
%%time

# Progress message shown before the fit starts.
print('Fitting the vectorizer with the LDA')

# Fit the LDA model on the document-term count matrix produced by the
# CountVectorizer; as the cell's last expression, the fitted estimator is displayed.
lda.fit(cv_fit)

Fitting the vectorizer with the LDA
CPU times: user 3.7 s, sys: 26.1 ms, total: 3.72 s
Wall time: 3.92 s


LatentDirichletAllocation(n_components=55, random_state=3)

In [27]:
# Dimensions of the fitted topic-word matrix: one row per topic,
# one column per vocabulary term.
n_topics, n_terms = lda.components_.shape
print('Number of topic:', n_topics)
print('Number of column of the lda fit:', n_terms)

# Display the vectorizer configuration for reference.
cv

Number of topic: 55
Number of column of the lda fit: 825


CountVectorizer(max_df=0.9, min_df=3, stop_words='english')

In [28]:
# Vocabulary learned by the CountVectorizer (one entry per column of `cv_fit`).
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2,
# so prefer `get_feature_names_out()` and fall back only on older releases.
if hasattr(cv, 'get_feature_names_out'):
    # `.tolist()` converts the returned array into a list of plain Python
    # strings, matching what the old method returned.
    feature = cv.get_feature_names_out().tolist()
else:
    feature = cv.get_feature_names()

print('Length of feature names:', len(feature))

Length of feature names: 825


In [29]:
# For every topic, print the 50 vocabulary terms with the largest weights.
# The terms appear in ascending order of weight (strongest term last).
separator = '-' * 25
for topic_idx, weights in enumerate(lda.components_):
    print(f'top 50 words in topic {topic_idx}')
    print(separator)
    top_terms = [feature[pos] for pos in weights.argsort()[-50:]]
    print(top_terms, '\n\n')

In [30]:
# Project every bio onto the fitted topics. Despite its name, `df_final` is the
# array returned by `lda.transform` (not a DataFrame): one row per bio and one
# column per topic, as the printed shape below shows.
df_final = lda.transform(cv_fit)

print('Shape of the df_final:', df_final.shape)


Shape of the df_final: (1644, 55)


In [31]:
# Show the topic distribution of the first bio next to its text.
print('\nChecking the probabilitiy distribution of one text data belonging to the topic.\n')
# Use positional access (.iloc) for the text: `df_new` keeps the original CSV
# index, which has gaps after dropna, while `df_final` rows are positional.
# A label lookup with [0] only works while a row labelled 0 happens to exist.
print('Few words from 1st row:', df_new.new_tokens.iloc[0][:88],'\n')
print('Probability distribution:', df_final[0])


Checking the probabilitiy distribution of one text data belonging to the topic.

Few words from 1st row: nerdy guy love art music wrestle comics poly 

Probability distribution: [0.0025974  0.0025974  0.0025974  0.0025974  0.0025974  0.0025974
 0.0025974  0.0025974  0.0025974  0.0025974  0.0025974  0.0025974
 0.0025974  0.0025974  0.0025974  0.0025974  0.0025974  0.0025974
 0.0025974  0.0025974  0.0025974  0.0025974  0.0025974  0.0025974
 0.19548862 0.0025974  0.14927643 0.0025974  0.0025974  0.0025974
 0.0025974  0.0025974  0.1535443  0.0025974  0.0025974  0.0025974
 0.0025974  0.0025974  0.0025974  0.0025974  0.0025974  0.0025974
 0.0025974  0.36922311 0.0025974  0.0025974  0.0025974  0.0025974
 0.0025974  0.0025974  0.0025974  0.0025974  0.0025974  0.0025974
 0.0025974 ]


In [32]:
# Most probable topic for the first bio and its probability, rounded to two decimals.
first_row_probs = df_final[0]
best_topic = first_row_probs.argmax()
prob = first_row_probs[best_topic].round(2)

print('Bio belonging to the topic', best_topic, 'with the probability of', prob)

Bio belonging to the topic 43 with the probability of 0.37


In [33]:
# Hard-assign each bio to the topic with the highest probability in its row.
df_new['cluster'] = np.argmax(df_final, axis=1)

df_new.head()

Unnamed: 0,user_id,user_bio,tokens,state,region,ls_tokens,new_tokens,cluster
0,1.18e+18,"I'm a nerdy guy who loves art, music, wrestlin...",nerdy guy love art music wrestle comics poly,OH,Midwest,"[nerdy, guy, love, art, music, wrestle, comics...",nerdy guy love art music wrestle comics poly,43
1,1.44e+18,No worries Lil Ms Sunshine cause I don't mess ...,worry lil sunshine cause mess woman moon civil...,PA,Northeast,"[worry, sunshine, cause, mess, woman, moon, ci...",worry sunshine cause mess woman moon civil rig...,52
2,121972000.0,I am a huge sports and music fan. I graduated ...,huge sport music fan graduate regis university...,CO,West,"[sport, music, fan, graduate, regis, universit...",sport music fan graduate regis university live...,38
3,1.39e+18,"This is not, nor endorsed by, any Government E...",endorse government entity project guarantee wa...,WA,West,"[endorse, government, entity, project, guarant...",endorse government entity project guarantee wa...,9
4,23490530.0,I'm a writer and a cult film fanatic who loves...,writer cult film fanatic love politics,UT,West,"[writer, cult, film, fanatic, love, politics]",writer cult film fanatic love politics,21


In [34]:
# Lookup table from topic number to a genre name.
# Joscelin used arbitrary placeholder genres here, just to check that the
# whole pipeline runs end to end.
topic_label = {
    0: 'Pop',
    1: 'Latin',
    2: 'R&B',
    3: 'Rock',
    4: 'HipHop',
    5: 'HipHop',
    6: 'Christian/Gospel',
    7: 'EDM',
    8: 'Children',
    9: 'Classical',
    10: 'World',
}

# Translate each bio's cluster number into its genre label; clusters that have
# no entry in `topic_label` end up with a missing value.
df_new['genre'] = df_new['cluster'].map(topic_label)

# Preview the first rows with the new column.
df_new.head()

Unnamed: 0,user_id,user_bio,tokens,state,region,ls_tokens,new_tokens,cluster,genre
0,1.18e+18,"I'm a nerdy guy who loves art, music, wrestlin...",nerdy guy love art music wrestle comics poly,OH,Midwest,"[nerdy, guy, love, art, music, wrestle, comics...",nerdy guy love art music wrestle comics poly,43,
1,1.44e+18,No worries Lil Ms Sunshine cause I don't mess ...,worry lil sunshine cause mess woman moon civil...,PA,Northeast,"[worry, sunshine, cause, mess, woman, moon, ci...",worry sunshine cause mess woman moon civil rig...,52,
2,121972000.0,I am a huge sports and music fan. I graduated ...,huge sport music fan graduate regis university...,CO,West,"[sport, music, fan, graduate, regis, universit...",sport music fan graduate regis university live...,38,
3,1.39e+18,"This is not, nor endorsed by, any Government E...",endorse government entity project guarantee wa...,WA,West,"[endorse, government, entity, project, guarant...",endorse government entity project guarantee wa...,9,Classical
4,23490530.0,I'm a writer and a cult film fanatic who loves...,writer cult film fanatic love politics,UT,West,"[writer, cult, film, fanatic, love, politics]",writer cult film fanatic love politics,21,
