In [None]:
import json
import pandas as pd
from tqdm import tqdm

import seaborn as sns
import matplotlib.pyplot as plt

from pygments import highlight
from pygments.lexers import JsonLexer
from pygments.formatters import TerminalFormatter

from google_play_scraper import Sort, reviews, app

%matplotlib inline
%config InlineBackend.figure_format='retina'

# Seaborn styling for the notebook's plots: whitegrid style, muted palette,
# fonts scaled by 1.2.
sns.set(style='whitegrid', palette='muted', font_scale=1.2)

# Package identifiers of the apps to scrape; each entry is passed to
# google_play_scraper's `app()` and `reviews()` in the loops that follow.
app_packages = [
    'com.google.android.apps.meetings',
    'com.instagram.android',
    'com.microsoft.teams',
    'com.zhiliaoapp.musically',
    'com.whatsapp',
    'us.zoom.videomeetings'
]

# Fetch the store metadata of every package and persist it as apps.csv.
app_infos = []

for ap in tqdm(app_packages):
  info = app(ap, lang='en', country='us')
  # Drop the 'comments' entry before tabulating. pop() with a default
  # tolerates a response that lacks the key, whereas `del` would raise
  # KeyError and abort the whole scrape.
  info.pop('comments', None)
  app_infos.append(info)

app_infos_df = pd.DataFrame(app_infos)
# index=False: do not write the positional row index as a CSV column.
app_infos_df.to_csv('apps.csv', index=False, header=True)

# Collect reviews for every app, one request per star rating (1-5), and dump
# the combined result to reviews.csv.
app_reviews = []

for ap in tqdm(app_packages):
  for score in range(1, 6):
    for sort_order in (Sort.NEWEST,):
      fetched, _ = reviews(
        ap,
        lang='en',
        country='us',
        sort=sort_order,
        count=10000,
        filter_score_with=score,
      )
      # The label depends only on the sort order, so compute it once per request.
      order_label = 'most_relevant' if sort_order == Sort.MOST_RELEVANT else 'newest'
      for review in fetched:
        # Tag each review with how it was fetched and which app it belongs to.
        review.update(sortOrder=order_label, appId=ap)
      app_reviews.extend(fetched)

app_reviews_df = pd.DataFrame(app_reviews)
app_reviews_df.to_csv('reviews.csv', index=None, header=True)

In [3]:
import pandas as pd
# Load the review table used for topic modelling.
# NOTE(review): the scraping cell writes 'reviews.csv', not this file --
# confirm how 'reviews_tiktok_v2.csv' is produced.
data = pd.read_csv("reviews_tiktok_v2.csv")
# Take an explicit copy: adding a column to the bare data[['content']]
# selection triggers pandas' SettingWithCopyWarning.
data_text = data[['content']].copy()
data_text['index'] = data_text.index
documents = data_text

import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
import numpy as np
import nltk

# Custom stopword list consulted by preprocess(): any token found here is
# dropped before POS tagging. Notes for maintainers:
#   * preprocess() lowercases tokens and keeps only str.isalpha() tokens
#     before the lookup, so entries containing capitals ('forIN', 'isZ') or
#     apostrophes ("aren't", "'ll", ...) can never match.
#   * Several words appear more than once; duplicates do not affect the
#     membership test.
my_stopwords = ['enjoyable','terrible','beautiful','option','guess','omg','ali','enjoy','gay','sucks','things','alot','kinda','wo','plz','open','soo','day','meh','eh','tho','ha','life','pretty','kids','stuff','idk','lol','bekar','dont','hate','aap','a','amazing','am','and','application','app','apps','awesome','bad','best','better','bhai','but',
'cool','ever','excellent','experience','fabulous','fantastic','far','fine','forIN','fun','fuy','gd','good','google',
'great','gud','hai','happy','hi','in','i','instagram','insta','isZ','it','its','just','khan','k','kumar','like',
'loved','lovely','love','mast','me','messenger','much','my','nice','no','not','nyc','okay','ok','one','op','osm',
'perfect','sometimes','so','supe','super','supper','thank','thanks','the','this','tik','tiktok','tok','to','use','very',
'well','whatsapp','whats','what','with','wonderful','worst','wow','you','zoom','is','can','for','be','also','was','now',
'are','on','of','as','than','if','because','or','do','some','will','all','us','could','have','an','when','always','more',
'using','how','otherwise','we','from','your','you','appp','such','that','too','ap','ne','new','should','thing','there',
'android','awsome','lot','get','superb','really','yes','no','outstanding','keep','need','must','he','make','ca','see',
'by','at','has','been',"a","about","above","after","again","against","ain","all","am","an","and","any","are","aren",
"aren't","as","at","be","because","been","before","being","below","between","both","but","by","can","couldn",
"couldn't","d","did","didn","didn't","do","does","doesn","doesn't","doing","don","don't","down","during","each","few",
"for","from","further","had","hadn","hadn't","has","hasn","hasn't","have","haven","haven't","having","he","her","here",
"hers","herself","him","himself","his","how","i","if","in","into","is","isn","isn't","it","it's","its","itself","just",
"ll","m","ma","me","mightn","mightn't","more","most","mustn","mustn't","my","myself","needn","needn't","no","nor","not",
"now","o","of","off","on","once","only","or","other","our","ours","ourselves","out","over","own","re","s","same","shan",
"shan't","she","she's","should","should've","shouldn","shouldn't","so","some","such","t","than","that","that'll","the",
"their","theirs","them","themselves","then","there","these","they","this","those","through","to","too","under","until",
"up","ve","very","was","wasn","wasn't","we","were","weren","weren't","what","when","where","which","while","who","whom",
"why","will","with","won","won't","wouldn","wouldn't","y","you","you'd","you'll","you're","you've","your","yours",
"yourself","yourselves","could","he'd","he'll","he's","here's","how's","i'd","i'll","i'm","i've","let's","ought",
"she'd","she'll","that's","there's","they'd","they'll","they're","they've","we'd","we'll","we're","we've","what's",
"when's","where's","who's","why's","would","able","abst","accordance","according","accordingly","across","act",
"actually","added","adj","affected","affecting","affects","afterwards","ah","almost","alone","along","already","also",
"although","always","among","amongst","announce","another","anybody","anyhow","anymore","anyone","anything","anyway",
"anyways","anywhere","apparently","approximately","arent","arise","around","aside","ask","asking","auth","available",
"away","awfully","b","back","became","become","becomes","becoming","beforehand","begin","beginning","beginnings",
"begins","behind","believe","beside","besides","beyond","biol","brief","briefly","c","ca","came","cannot","can't",
"cause","causes","certain","certainly","co","com","come","comes","contain","containing","contains","couldnt","date",
"different","done","downwards","due","e","ed","edu","effect","eg","eight","eighty","either","else","elsewhere","end",
"ending","enough","especially","et","etc","even","ever","every","everybody","everyone","everything","everywhere","ex",
"except","f","far","ff","fifth","first","five","fix","followed","following","follows","former","formerly","forth",
"found","four","furthermore","g","gave","get","gets","getting","give","given","gives","giving","go","goes","gone",
"got","gotten","h","happens","hardly","hed","hence","hereafter","hereby","herein","heres","hereupon","hes","hi","hid",
"hither","home","howbeit","however","hundred","id","ie","im","immediate","immediately","importance","important","inc",
"indeed","index","information","instead","invention","inward","itd","it'll","j","k","keep","keeps","kept","kg","km",
"know","known","knows","l","largely","last","lately","later","latter","latterly","least","less","lest","let","lets",
"like","liked","likely","line","little","'ll","look","looking","looks","ltd","made","mainly","make","makes","many",
"may","maybe","mean","means","meantime","meanwhile","merely","mg","might","million","miss","ml","moreover","mostly",
"mr","mrs","much","mug","must","n","na","name","namely","nay","nd","near","nearly","necessarily","necessary","need",
"needs","neither","never","nevertheless","new","next","nine","ninety","nobody","non","none","nonetheless","noone",
"normally","nos","noted","nothing","nowhere","obtain","obtained","obviously","often","oh","ok","okay","old","omitted",
"one","ones","onto","ord","others","otherwise","outside","overall","owing","p","page","pages","part","particular",
"particularly","past","per","perhaps","placed","please","plus","poorly","possible","possibly","potentially","pp",
"predominantly","present","previously","primarily","probably","promptly","proud","provides","put","q","que","quickly",
"quite","qv","r","ran","rather","rd","readily","really","recent","recently","ref","refs","regarding","regardless",
"regards","related","relatively","research","respectively","resulted","resulting","results","right","run","said","saw",
"say","saying","says","sec","section","see","seeing","seem","seemed","seeming","seems","seen","self","selves","sent",
"seven","several","shall","shed","shes","show","showed","shown","showns","shows","significant","significantly",
"similar","similarly","since","six","slightly","somebody","somehow","someone","somethan","something","sometime",
"sometimes","somewhat","somewhere","soon","sorry","specifically","specified","specify","specifying","still","stop",
"strongly","sub","substantially","successfully","sufficiently","suggest","sup","sure","take","taken","taking","tell",
"tends","th","thank","thanks","thanx","thats","that've","thence","thereafter","thereby","thered","therefore","therein",
"there'll","thereof","therere","theres","thereto","thereupon","there've","theyd","theyre","think","thou","though",
"thoughh","thousand","throug","throughout","thru","thus","til","tip","together","took","toward","towards","tried",
"tries","truly","try","trying","ts","twice","two","u","un","unfortunately","unless","unlike","unlikely","unto","upon",
"ups","us","use","used","useful","usefully","usefulness","uses","using","usually","v","value","various","'ve","via",
"viz","vol","vols","vs","w","want","wants","wasnt","way","wed","welcome","went","werent","whatever","what'll","whats",
"whence","whenever","whereafter","whereas","whereby","wherein","wheres","whereupon","wherever","whether","whim",
"whither","whod","whoever","whole","who'll","whomever","whos","whose","widely","willing","wish","within","without",
"wont","words","world","wouldnt","www","x","yes","yet","youd","youre","z","zero","a's","ain't","allow","allows",
"apart","appear","appreciate","appropriate","associated","best","better","c'mon","c's","cant","changes","clearly",
"concerning","consequently","consider","considering","corresponding","course","currently","definitely","described",
"despite","entirely","exactly","example","going","greetings","hello","help","hopefully","ignored","inasmuch",
"indicate","indicated","indicates","inner","insofar","it'd","keep","keeps","novel","presumably","reasonably",
"second","secondly","sensible","serious","seriously","sure","t's","third","thorough","thoroughly","three","well","wonder"
]


def preprocess(text):
    """Tokenise one review and return its POS-tagged content words.

    The text is coerced to ``str``, stripped of non-ASCII characters,
    tokenised with ``nltk.word_tokenize`` and lowercased. A token is kept only
    if it is purely alphabetic, longer than one character and absent from
    ``my_stopwords``. The survivors are tagged with ``nltk.pos_tag`` and each
    word is fused with its tag (e.g. ``account`` + ``NN`` -> ``'accountNN'``).

    Args:
        text: Raw review content; anything ``str()`` accepts. ``None`` and
            float NaN (a missing cell after ``pd.read_csv``) are treated as an
            empty review.

    Returns:
        list[str]: ``word + tag`` strings in their original order; empty when
        nothing survives the filters.
    """
    # A missing review must not be stringified: str(NaN) is 'nan', which
    # passes every filter below and would leak a bogus token into the corpus.
    if text is None or (isinstance(text, float) and text != text):
        return []

    ascii_text = str(text).encode('ascii', errors='ignore').decode()
    tokens = [token.lower() for token in nltk.word_tokenize(ascii_text)]
    words = [
        word for word in tokens
        if word.isalpha() and word not in my_stopwords and len(word) > 1
    ]
    # pos_tag yields (word, tag) pairs; concatenate each pair into one token.
    return [word + tag for word, tag in nltk.pos_tag(words)]

# --- Phrase detection: merge frequent word pairs, then triples -------------
processed_docs = documents['content'].map(preprocess)
bigram = gensim.models.Phrases(processed_docs, min_count=1, threshold=1)
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram = gensim.models.Phrases(bigram[processed_docs], threshold=1)
trigram_mod = gensim.models.phrases.Phraser(trigram)
trigram_docs = [trigram_mod[bigram_mod[doc]] for doc in processed_docs]

# --- Vocabulary and bag-of-words corpus ------------------------------------
dictionary = gensim.corpora.Dictionary(trigram_docs)
# Keep tokens seen in at least 5 documents and in at most 50% of them,
# capped at the 5000 most frequent.
dictionary.filter_extremes(no_below=5, no_above=0.5, keep_n=5000)
bow_corpus = [dictionary.doc2bow(doc) for doc in trigram_docs]

# --- TF-IDF weighting -------------------------------------------------------
from gensim import corpora, models
tfidf = models.TfidfModel(bow_corpus)
corpus_tfidf = tfidf[bow_corpus]

# --- LDA on the TF-IDF corpus ----------------------------------------------
# Fix the seed so the topics can be reproduced on a fresh kernel.
# NOTE(review): multi-worker training may still show small run-to-run
# differences -- confirm if exact reproducibility is required.
RANDOM_SEED = 42
lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics=7, minimum_probability=0.01, minimum_phi_value=0.01,
                                             id2word=dictionary, alpha='symmetric', per_word_topics=True, passes=5, workers=5,
                                             random_state=RANDOM_SEED)

print('\nTF-IDF Model:\n')
for idx, topic in lda_model_tfidf.print_topics(-1):
    print('Topic: {} \nWords: {}\n'.format(idx, topic))

# log_perplexity() returns the per-word likelihood bound (higher is better),
# not the perplexity itself. Perplexity is 2 ** (-bound); lower is better.
per_word_bound = lda_model_tfidf.log_perplexity(corpus_tfidf)
print('\nPer-word likelihood bound: ', per_word_bound)
print('Perplexity: ', 2 ** (-per_word_bound))

# Compute Coherence Score
coherence_model_lda = gensim.models.CoherenceModel(model=lda_model_tfidf, texts=trigram_docs, dictionary=dictionary, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)



TF-IDF Model:

Topic: 0 
Words: 0.024*"accountNN" + 0.021*"famousJJ" + 0.020*"timeNN_passNN" + 0.015*"updateNN" + 0.014*"contentNN" + 0.013*"beautifulNN" + 0.011*"enjoyableJJ" + 0.010*"slowVB" + 0.010*"funnyJJ_videosNNS" + 0.008*"terribleJJ"

Topic: 1 
Words: 0.031*"entertainmentNN" + 0.025*"timeNN" + 0.021*"starNN" + 0.014*"addictiveJJ" + 0.012*"friendsNNS" + 0.011*"addictedVBN" + 0.008*"jobNN" + 0.008*"musicallyRB" + 0.007*"optionNN" + 0.007*"danceNN"

Topic: 2 
Words: 0.044*"entertainingVBG" + 0.022*"trashNN" + 0.016*"watchNN" + 0.016*"interestingVBG" + 0.013*"wasteNN_timeNN" + 0.012*"viralJJ" + 0.012*"annoyingVBG" + 0.009*"bitNN" + 0.008*"reasonNN" + 0.007*"talentNN"

Topic: 3 
Words: 0.033*"videoNN" + 0.032*"peopleNNS" + 0.017*"workingVBG" + 0.016*"addictingVBG" + 0.016*"toxicNN" + 0.014*"problemNN" + 0.013*"dataNNS" + 0.013*"gameNN" + 0.010*"stupidJJ" + 0.010*"problemsNNS"

Topic: 4 
Words: 0.027*"followersNNS" + 0.022*"starsNNS" + 0.019*"workNN" + 0.013*"easyJJ" + 0.012*"likesN