In [1]:
## Joscelin is following these steps: https://github.com/Sandeep-Panchal/Topic-Modeling-with-LDA/blob/master/Topic-Modeling-IPYNB/Topic%20Modeling%20with%20LDA.ipynb

import numpy as np
import pandas as pd
import random

import warnings
# Silence every warning for the rest of the session. This keeps cell output tidy,
# but it also hides deprecation warnings raised by the libraries used below.
warnings.filterwarnings('ignore')

In [2]:
# Load the cleaned Twitter bios, keep only the columns used for topic modeling,
# and drop rows with any missing value. Chaining (instead of `dropna(inplace=True)`)
# keeps the cell idempotent and never mutates an intermediate slice.
df = (
    pd.read_csv('data/twitter_data_clean_final.csv')
    .loc[:, ["user_id", "user_bio", "tokens", "state", "region"]]
    .dropna()
)

# NOTE: dropna() keeps the original row labels, so the index has gaps. Use
# positional access (.iloc) when lining rows up with NumPy arrays.

# `copy` stays imported in case other cells of the notebook rely on it.
import copy

# Work on an independent copy so that columns added to `df_new` never touch `df`.
# DataFrame.copy() is deep by default, matching the previous copy.deepcopy(df).
df_new = df.copy()
df_new

Unnamed: 0,user_id,user_bio,tokens,state,region
0,1.180000e+18,"I'm a nerdy guy who loves art, music, wrestlin...",nerdy guy love art music wrestle comics poly,OH,Midwest
1,1.440000e+18,No worries Lil Ms Sunshine cause I don't mess ...,worry lil sunshine cause mess woman moon civil...,PA,Northeast
2,1.219720e+08,I am a huge sports and music fan. I graduated ...,huge sport music fan graduate regis university...,CO,West
3,1.390000e+18,"This is not, nor endorsed by, any Government E...",endorse government entity project guarantee wa...,WA,West
4,2.349053e+07,I'm a writer and a cult film fanatic who loves...,writer cult film fanatic love politics,UT,West
...,...,...,...,...,...
1723,2.961228e+08,Christian. Husband. Father. Ph.D. Candidate @B...,christian husband father candidate mosquito re...,OH,Midwest
1724,8.800000e+17,"MarTech, MBA, Foodie, Bilingual, Soccer, Boxin...",martech mba foodie bilingual soccer box gamer ...,CA,West
1725,1.153059e+09,"streams occasionally, will choose race cars ov...",stream occasionally choose race cars important...,KY,South
1726,2.432869e+08,Enjoy the practical wisdom of daily life. Prom...,enjoy practical wisdom daily life promote trut...,AZ,Southwest


In [3]:
# Bag-of-words term counts (CountVectorizer yields raw counts, not TF-IDF weights)
from sklearn.feature_extraction.text import CountVectorizer

# Vectorizer setup: skip terms found in more than 90% of the bios or in fewer
# than 3 of them, and remove scikit-learn's built-in English stop words.
cv = CountVectorizer(
    stop_words='english',
    min_df=3,
    max_df=0.90,
)

# Learn the vocabulary from the token strings and build the document-term matrix
cv_fit = cv.fit_transform(df_new['tokens'])

print('\nShape of the sparse matrix\n')
cv_fit


Shape of the sparse matrix



<1644x955 sparse matrix of type '<class 'numpy.int64'>'
	with 7163 stored elements in Compressed Sparse Row format>

In [4]:
# Latent Dirichlet Allocation estimator from scikit-learn
from sklearn.decomposition import LatentDirichletAllocation

# Unfitted LDA model with 10 topics; the fixed random_state keeps runs repeatable
lda = LatentDirichletAllocation(
    random_state=1,
    n_components=10,
)

In [5]:
%%time

# Fit the LDA model on the document-term count matrix; the %%time cell magic
# above reports how long the whole cell takes.
print('Fitting the vectorizer with the LDA')

# The value of this last expression is what the notebook shows as the cell output.
lda.fit(cv_fit)

Fitting the vectorizer with the LDA
CPU times: user 4.09 s, sys: 34.3 ms, total: 4.13 s
Wall time: 4.17 s


LatentDirichletAllocation(random_state=1)

In [6]:
# components_ has one row per topic and one column per vocabulary term
n_topics, n_terms = lda.components_.shape

print('Number of topic:', n_topics)
print('Number of column of the lda fit:', n_terms)

Number of topic: 10
Number of column of the lda fit: 955


In [7]:
# Vocabulary terms, in the same order as the columns of the document-term matrix.
# CountVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2; get_feature_names_out() is its replacement. The old method is
# used only on versions that predate the new one.
if hasattr(cv, 'get_feature_names_out'):
    # .tolist() converts the NumPy array into a plain list of Python strings,
    # the same type the old method returned.
    feature = cv.get_feature_names_out().tolist()
else:
    feature = cv.get_feature_names()

print('Length of feature names:', len(feature))

Length of feature names: 955


In [8]:
# For every topic, list the 50 highest-weighted vocabulary terms.
# argsort() is ascending, so the strongest term is printed last.
for topic_idx, term_weights in enumerate(lda.components_):
    strongest_positions = term_weights.argsort()[-50:]
    strongest_terms = [feature[pos] for pos in strongest_positions]

    print('top 50 words in topic {}'.format(topic_idx))
    print('-' * 25)
    print(strongest_terms, '\n\n')

top 50 words in topic 0
-------------------------
['ucla', 'publish', 'woman', 'boy', 'fight', 'student', 'ceo', 'state', 'wrestle', 'play', 'chicago', 'family', 'instagram', 'jesus', 'right', 'women', 'media', 'designer', 'love', 'know', 'film', 'journalist', 'inquiries', 'podcaster', 'producer', 'good', 'years', 'fan', 'alum', 'city', 'texas', 'long', 'business', 'photography', 'win', 'college', 'blm', 'owner', 'university', 'book', 'coach', 'podcast', 'things', 'author', 'man', 'writer', 'time', 'sport', 'com', 'host'] 


top 50 words in topic 1
-------------------------
['look', 'peace', 'single', 'professional', 'future', 'aficionado', 'eth', 'biden', 'near', 'old', 'run', 'listen', 'bless', 'bring', 'forever', 'let', 'long', 'art', 'fan', 'pay', 'day', 'enjoy', 'save', 'florida', 'everyday', 'school', 'arts', 'twitch', 'writer', 'life', 'member', 'business', 'partner', 'hate', 'study', 'shit', 'people', 'designer', 'photographer', 'like', 'play', 'email', 'guy', 'com', 'world', '

In [9]:
# Run the fitted LDA model over the count matrix to get one row of topic weights
# per document. NOTE: despite its name, df_final is indexed positionally like a
# 2-D array (it is used via .shape, df_final[0] and .argmax(axis=1)), not like a
# pandas DataFrame.
df_final = lda.transform(cv_fit)

print('Shape of the df_final:', df_final.shape)

Shape of the df_final: (1644, 10)


In [10]:
# Show the topic distribution of the first bio next to its text.
print('\nChecking the probabilitiy distribution of one text data belonging to the topic.\n')
# df_final is indexed by position, while df_new still carries the row labels left
# over after dropna(), so label 0 is not guaranteed to be the first row. .iloc[0]
# reads the same (first) row that df_final[0] describes.
print('Few words from 1st row:', df_new.tokens.iloc[0][:88],'\n')
print('Probability distribution:', df_final[0])


Checking the probabilitiy distribution of one text data belonging to the topic.

Few words from 1st row: nerdy guy love art music wrestle comics poly 

Probability distribution: [0.01428794 0.01429119 0.01428628 0.01428837 0.01428778 0.01428781
 0.87140375 0.01428577 0.01429159 0.01428953]


In [11]:
# Topic weights of the first document, its strongest topic, and that topic's weight
first_doc_weights = df_final[0]
dominant_topic = first_doc_weights.argmax()
prob = first_doc_weights[dominant_topic].round(2)

print('Bio belonging to the topic', dominant_topic, 'with the probability of', prob)

Bio belonging to the topic 6 with the probability of 0.87


In [12]:
# Assign every bio the id of its highest-weighted topic (row-wise argmax)
df_new['topic'] = np.argmax(df_final, axis=1)

df_new.head()

Unnamed: 0,user_id,user_bio,tokens,state,region,topic
0,1.18e+18,"I'm a nerdy guy who loves art, music, wrestlin...",nerdy guy love art music wrestle comics poly,OH,Midwest,6
1,1.44e+18,No worries Lil Ms Sunshine cause I don't mess ...,worry lil sunshine cause mess woman moon civil...,PA,Northeast,0
2,121972000.0,I am a huge sports and music fan. I graduated ...,huge sport music fan graduate regis university...,CO,West,9
3,1.39e+18,"This is not, nor endorsed by, any Government E...",endorse government entity project guarantee wa...,WA,West,2
4,23490530.0,I'm a writer and a cult film fanatic who loves...,writer cult film fanatic love politics,UT,West,9


In [13]:
# Dictionary with topic numbers as keys and topic names as values.
# Joscelin used random genres here to see if everything is working.
# NOTE(review): 'HipHop' is assigned to both topic 4 and topic 5, and key 10 can
# never match a model with 10 topics (ids 0-9) -- confirm the intended labels.
topic_label = {
    0: 'Pop',
    1: 'Latin',
    2: 'R&B',
    3: 'Rock',
    4: 'HipHop',
    5: 'HipHop',
    6: 'Christian/Gospel',
    7: 'EDM',
    8: 'Children',
    9: 'Classical',
    10: 'World',
}

# Series.map() silently produces NaN for ids missing from the dictionary, so
# make sure every assigned topic id has a label before mapping.
unlabeled_topics = sorted(set(df_new['topic']) - set(topic_label))
assert not unlabeled_topics, 'No label defined for topic id(s): {}'.format(unlabeled_topics)

# Map the dictionary onto the dataframe to get the labels.
df_new['topic_name'] = df_new['topic'].map(topic_label)

# head of the dataframe
df_new.head()

Unnamed: 0,user_id,user_bio,tokens,state,region,topic,topic_name
0,1.18e+18,"I'm a nerdy guy who loves art, music, wrestlin...",nerdy guy love art music wrestle comics poly,OH,Midwest,6,Christian/Gospel
1,1.44e+18,No worries Lil Ms Sunshine cause I don't mess ...,worry lil sunshine cause mess woman moon civil...,PA,Northeast,0,Pop
2,121972000.0,I am a huge sports and music fan. I graduated ...,huge sport music fan graduate regis university...,CO,West,9,Classical
3,1.39e+18,"This is not, nor endorsed by, any Government E...",endorse government entity project guarantee wa...,WA,West,2,R&B
4,23490530.0,I'm a writer and a cult film fanatic who loves...,writer cult film fanatic love politics,UT,West,9,Classical
