In [23]:
import json
import pandas as pd
from tqdm import tqdm

import seaborn as sns
import matplotlib.pyplot as plt

from pygments import highlight
from pygments.lexers import JsonLexer
from pygments.formatters import TerminalFormatter

from google_play_scraper import Sort, reviews, app

%matplotlib inline
%config InlineBackend.figure_format='retina'

sns.set(style='whitegrid', palette='muted', font_scale=1.2)

app_packages = [
    'com.google.android.apps.meetings',
    'com.instagram.android',
    'com.microsoft.teams',
    'com.netflix.mediaclient',
    'com.zhiliaoapp.musically',
    'com.whatsapp',
    'us.zoom.videomeetings'
]

app_infos = []

for ap in tqdm(app_packages):
  info = app(ap, lang='en', country='us')
  del info['comments']
  app_infos.append(info)    

app_infos_df = pd.DataFrame(app_infos)
app_infos_df.to_csv('apps.csv', index=None, header=True)

app_reviews = []

for ap in tqdm(app_packages):
  for score in list(range(1, 6)):
    for sort_order in [Sort.NEWEST]:
      rvs, _ = reviews(
        ap,
        lang='en',
        country='us',
        sort=sort_order,
        count= 10000,
        filter_score_with=score
      )
      for r in rvs:
        r['sortOrder'] = 'most_relevant' if sort_order == Sort.MOST_RELEVANT else 'newest'
        r['appId'] = ap
      app_reviews.extend(rvs)

app_reviews_df = pd.DataFrame(app_reviews)
app_reviews_df.to_csv('reviews.csv', index=None, header=True)

100%|████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:09<00:00,  1.33s/it]
100%|███████████████████████████████████████████████████████████████████████████████████| 7/7 [42:37<00:00, 365.32s/it]


In [9]:
import pandas as pd
data = pd.read_csv("reviews_v2.csv")
data_text = data[['content']]
data_text['index'] = data_text.index
documents = data_text

import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer
import numpy as np

def lemmatize_only(text):
    return WordNetLemmatizer().lemmatize(text, pos='v')

def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(str(text).encode('ascii',errors='ignore').decode()):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(lemmatize_only(token))
    return result

processed_docs = documents['content'].map(preprocess)
dictionary = gensim.corpora.Dictionary(processed_docs)
dictionary.filter_extremes(no_below=5, no_above=0.5, keep_n=5000)
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]

from gensim import corpora, models
tfidf = models.TfidfModel(bow_corpus)
corpus_tfidf = tfidf[bow_corpus]

lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=10, id2word=dictionary, passes=2, workers=2)

print('\nBag of Words Model:\n')
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))
    
lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics=10, id2word=dictionary, passes=2, workers=4)

print('\nTF-IDF Model:\n')
for idx, topic in lda_model_tfidf.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))


Bag of Words Model:

Topic: 0 
Words: 0.082*"update" + 0.081*"work" + 0.055*"great" + 0.036*"issue" + 0.027*"network" + 0.025*"problem" + 0.020*"connection" + 0.017*"properly" + 0.017*"version" + 0.016*"time"
Topic: 1 
Words: 0.137*"class" + 0.095*"online" + 0.059*"super" + 0.038*"download" + 0.037*"help" + 0.035*"thank" + 0.032*"study" + 0.026*"school" + 0.025*"students" + 0.017*"attend"
Topic: 2 
Words: 0.051*"worst" + 0.041*"problem" + 0.027*"solve" + 0.025*"tiktok" + 0.024*"people" + 0.022*"show" + 0.021*"like" + 0.018*"problems" + 0.018*"time" + 0.015*"remove"
Topic: 3 
Words: 0.065*"time" + 0.053*"data" + 0.040*"easy" + 0.040*"connect" + 0.034*"experience" + 0.033*"take" + 0.028*"phone" + 0.028*"disconnect" + 0.025*"waste" + 0.021*"mobile"
Topic: 4 
Words: 0.301*"nice" + 0.052*"meet" + 0.042*"join" + 0.031*"sign" + 0.022*"open" + 0.022*"account" + 0.021*"say" + 0.019*"time" + 0.016*"error" + 0.015*"problem"
Topic: 5 
Words: 0.171*"best" + 0.138*"love" + 0.081*"screen" + 0.064*"e