In [1]:
import pandas as pd
import numpy as np

In [2]:
# Location of the combined reviews CSV (machine-specific absolute path).
DATA_PATH = "L:\\Workspace\\Datasets\\ReviewsDatasets\\all_combined.csv"

# Load the raw reviews and show (rows, columns) as the cell output.
df = pd.read_csv(DATA_PATH)
df.shape

(200000, 4)

In [5]:
# Flag every row whose review text repeats an earlier row, then count the flags.
repeated_content = df.duplicated(subset=['content'])
repeated_content.sum()

np.int64(70958)

In [7]:
# Keep only the first occurrence of each distinct review text.
# Reassign the result instead of mutating with inplace=True: it has no
# performance benefit and makes the frame's state harder to follow.
df = df.drop_duplicates(subset=['content'])

In [8]:
# Sanity check after de-duplication: no review text should repeat any more.
repeated_content = df.duplicated(subset=['content'])
repeated_content.sum()

np.int64(0)

In [9]:
# Missing values per column (isna is the canonical name; isnull is its alias).
df.isna().sum()

reviewId    0
content     1
score       0
app         0
dtype: int64

In [10]:
# Missing values in the review-text column only.
df['content'].isna().sum()

np.int64(1)

In [12]:
# Remove rows that have no review text; they cannot be vectorized.
# Reassign the result instead of mutating with inplace=True.
df = df.dropna(subset=['content'])

In [13]:
# Sanity check after dropping rows: every column should report zero missing values.
df.isna().sum()

reviewId    0
content     0
score       0
app         0
dtype: int64

In [14]:
# Rows and columns left after removing duplicate and missing review texts.
df.shape

(129041, 4)

In [15]:
# Preview the first rows of the cleaned frame.
df.head()

Unnamed: 0,reviewId,content,score,app
0,6aa90e7d-be9d-4615-9332-dfca8062c77b,awesome,5,Facebook
1,fe84588e-aacf-448e-8e9a-b8d77ce01c59,good,5,Facebook
2,8038ade0-929b-4447-b02d-344d146685fd,i would say it's great but meta just keeps sus...,2,Facebook
3,39b156a8-4f22-491c-b90a-a19eddbc38e7,from 2012 my fb page in active mode ..Facebook...,5,Facebook
4,048e8ffd-5b5c-447b-b9d0-4da6b6f73581,I think very good,1,Facebook


In [16]:
# Topic modeling only needs the review text, so narrow `df` down to the
# `content` column; from here on `df` is a Series, not a DataFrame.
# The isinstance guard keeps this cell re-runnable: without it, a second
# execution would look up 'content' on the Series and raise a KeyError.
if isinstance(df, pd.DataFrame):
    df = df['content']

In [22]:
# `df` now holds a single column, so its shape is one-dimensional.
df.shape

(129041,)

In [19]:
# Only the review text (`content`) is required for topic modeling,
# so `df` holds just that column at this point.
df.head()

0                                              awesome
1                                                 good
2    i would say it's great but meta just keeps sus...
3    from 2012 my fb page in active mode ..Facebook...
4                                    I think very good
Name: content, dtype: object

We will start with LDA -> Count vectorizer + LDA. 

In [20]:
from sklearn.feature_extraction.text import CountVectorizer

In [21]:
# Bag-of-words settings: drop English stop words, terms that occur in more
# than 95% of the reviews (max_df) and terms that occur in fewer than
# 2 reviews (min_df).
cv = CountVectorizer(
    stop_words="english",
    min_df=2,
    max_df=0.95,
)

In [23]:
# Learn the vocabulary from the review texts and build the document-term
# count matrix (one row per review, one column per vocabulary term).
dtm = cv.fit_transform(df)

In [24]:
# (number of reviews, vocabulary size)
dtm.shape

(129041, 20032)

In [25]:
# Next: fit an LDA topic model on the document-term matrix.

In [26]:
from sklearn.decomposition import LatentDirichletAllocation

In [27]:
# LDA model with 10 topics; the fixed random_state makes the fit repeatable.
lda = LatentDirichletAllocation(
    n_components=10,
    random_state=42,
)

In [28]:
# Fit the 10-topic model on the document-term matrix.
lda.fit(dtm)

In [29]:
# Number of topic rows in the fitted component matrix.
lda.components_.shape[0]

10

In [30]:
# (number of topics, vocabulary size): one weight per term for every topic.
lda.components_.shape

(10, 20032)

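### OK, so the vocabulary has 20,032 words, and `lda.components_` holds one weight per word for each of the 10 topics. These weights are unnormalized pseudo-counts; normalizing a row gives that topic's probability distribution over words.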

#### Let's start with the first topic (index 0): which words have the highest probability of appearing in it?

In [31]:
# Row 0 of components_: the per-word weights of the first topic.
topic_one = lda.components_[0]

In [35]:
# argsort() returns vocabulary indices ordered from lowest to highest weight,
# so the last 30 entries are the 30 highest-weighted words of this topic,
# listed in ascending order (the strongest word comes last).
top_30_topic_one_words = topic_one.argsort()[-30:]
top_30_topic_one_words

array([ 6035, 10205, 10065, 17099, 11527,  6022,  3440,  9366,  3434,
       12259, 17377,  4950,  5241, 16865,  8633, 15113,   808, 16848,
       17306, 10818,  6218, 12913,  1438, 12251,   936, 15340,  1297,
        3704, 11788, 14572])

OK, these are vocabulary indices, not words. To get the actual words, we look each index up in the array returned by `cv.get_feature_names_out()`.

In [34]:
# Map a vocabulary index back to its word. Take the index from the top-30
# array rather than hard-coding a number copied from an earlier output, which
# is only valid for one particular fitted vocabulary.
cv.get_feature_names_out()[top_30_topic_one_words[0]]

'feed'

In [37]:
# Build the index -> word lookup once; calling get_feature_names_out() inside
# the loop would rebuild the whole vocabulary array on every iteration.
feature_names = cv.get_feature_names_out()

# Print the first topic's top words, one per line (weakest of the 30 first).
for k in top_30_topic_one_words:
    print(feature_names[k])

feed
microsoft
memories
videos
page
feature
comments
like
comment
posts
way
don
edit
users
just
stories
ads
used
want
news
flipboard
read
articles
post
ai
super
app
content
people
skype


Now let's print the top 30 words for every topic.

In [41]:
# Build the index -> word lookup once instead of once per printed word.
feature_names = cv.get_feature_names_out()

# Iterate over the fitted topics themselves rather than a hard-coded
# range(10), so the loop stays correct if n_components changes.
for i, topic in enumerate(lda.components_):
    # Indices of the 30 highest-weighted words, in ascending order of weight.
    top_30_words = topic.argsort()[-30:]
    print("Topic - ", i)
    print([feature_names[x] for x in top_30_words])
    print()

Topic -  0
['feed', 'microsoft', 'memories', 'videos', 'page', 'feature', 'comments', 'like', 'comment', 'posts', 'way', 'don', 'edit', 'users', 'just', 'stories', 'ads', 'used', 'want', 'news', 'flipboard', 'read', 'articles', 'post', 'ai', 'super', 'app', 'content', 'people', 'skype']

Topic -  1
['google', 'using', 'watching', 'movie', 'let', 'just', 'live', 'tv', 'want', 'line', 'device', 'don', 'shows', 'password', 'sign', 'email', 'love', 'help', 'code', 'new', 'log', 'movies', 'old', 'login', 'tiktok', 'watch', 'number', 'account', 'netflix', 'phone']

Topic -  2
['line', 'suspended', 'chat', 'log', 'whatsapp', 'use', 'phone', 'know', 'try', 'update', 'tried', 'got', 'just', 'reason', 'help', 'facebook', 'error', 'open', 'don', 'issue', 'send', 'support', 'viber', 'new', 'message', 'messages', 'fix', 'problem', 'app', 'account']

Topic -  3
['levels', 'interesting', 'played', 'mind', 'surfers', 'wonderful', 'aap', 'enjoy', 'subway', 'pass', 'great', 'just', 'level', 'reliable', 

In [42]:
# Second LDA model with fewer topics (6), using the same fixed random_state.
lda_2 = LatentDirichletAllocation(
    n_components=6,
    random_state=42,
)

In [43]:
# Fit the 6-topic model on the same document-term matrix.
lda_2.fit(dtm)

In [44]:
# Build the index -> word lookup once instead of once per printed word.
feature_names = cv.get_feature_names_out()

# Show the 30 highest-weighted words of every topic in the 6-topic model.
for i, topic in enumerate(lda_2.components_):
    # Indices of the 30 highest-weighted words, in ascending order of weight.
    top_30_words = topic.argsort()[-30:]
    print("Topic - ", i)
    print([feature_names[x] for x in top_30_words])
    print()

Topic -  0
['love', 'users', 'page', 'used', 'platform', 'information', 'videos', 'way', 'feature', 'option', 'stories', 'want', 'great', 'just', 'user', 'ads', 'post', 'read', 'flipboard', 'social', 'media', 'articles', 'like', 'super', 'ai', 'content', 'people', 'skype', 'news', 'app']

Topic -  1
['want', 'just', 'device', 'movies', 'really', 'use', 'helpful', 'work', 'sign', 'service', 'don', 'password', 'code', 'email', 'new', 'line', 'help', 'old', 'watch', 'log', 'login', 'tiktok', 'love', 'app', 'number', 'netflix', 'phone', 'nice', 'account', 'good']

Topic -  2
['messenger', 'reason', 'don', 'thank', 'help', 'just', 'support', 'line', 'like', 'viber', 'error', 'add', 'whatsapp', 'open', 'send', 'issue', 'new', 'message', 'facebook', 'friends', 'chat', 'messages', 'update', 'easy', 'fix', 'problem', 'use', 'best', 'account', 'app']

Topic -  3
['fast', 'interesting', 'experience', 'aap', 'enjoy', 'video', 'games', 'happy', 'useful', 'perfect', 'cool', 'easy', 'app', 'time', 'q