# library

In [2]:
# scraping tools
from google_play_scraper import app, Sort, reviews
# from app_store_scraper import AppStore

# dataframe
import pandas as pd
import numpy as np
# Show every row and column when a frame is displayed.
# The full option names are spelled out on purpose: pandas resolves shortened
# keys by pattern matching, which can start failing (ambiguous match) if a
# similarly named option is added in a later release.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
# to plot
import matplotlib.pyplot as plt 

# Vietnamese NLP
from underthesea import sentiment     # check sentence sentiment
from underthesea import classify      # check sentence topic
from underthesea import word_tokenize # words segmentation


# scraping

In [10]:
# link to scrape: https://play.google.com/store/apps/details?id=com.ss.android.ugc.trill&hl=vi-VN

In [3]:
# Pull the newest Google Play reviews for the TikTok package
# (com.ss.android.ugc.trill), requesting up to 100000 entries with no
# star-rating filter. The call yields the review records plus a continuation
# token for fetching further pages.
# NOTE(review): the reference link in the cell above uses hl=vi-VN while this
# call requests lang='en' -- confirm which review language is intended.
result, continuation_token = reviews(
    'com.ss.android.ugc.trill',
    count=100000,
    sort=Sort.NEWEST,
    lang='en',
    filter_score_with=None,
    # country='vn',
)

In [4]:
# Convert the scraped reviews to a DataFrame.
# `result` is a list with one dict per review, so the DataFrame constructor
# expands the dict keys into columns directly. This replaces the earlier
# object-array -> single 'review' column -> pop/tolist -> join round trip,
# which produced the same frame with extra copies.
data = pd.DataFrame(result)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 11 columns):
 #   Column                Non-Null Count   Dtype         
---  ------                --------------   -----         
 0   reviewId              100000 non-null  object        
 1   userName              100000 non-null  object        
 2   userImage             100000 non-null  object        
 3   content               99987 non-null   object        
 4   score                 100000 non-null  int64         
 5   thumbsUpCount         100000 non-null  int64         
 6   reviewCreatedVersion  67115 non-null   object        
 7   at                    100000 non-null  datetime64[ns]
 8   replyContent          1409 non-null    object        
 9   repliedAt             1409 non-null    datetime64[ns]
 10  appVersion            67115 non-null   object        
dtypes: datetime64[ns](2), int64(2), object(7)
memory usage: 8.4+ MB


In [5]:
# Count how many reviews fall in each calendar year of the 'at' timestamp.
(
    data['at']
    .dt.year
    .value_counts()
)

at
2024    75191
2023    24809
Name: count, dtype: int64

In [6]:
# Keep only the reviews whose 'at' timestamp falls in 2024, then print a
# summary of the resulting frame (row count, columns, null counts, dtypes).
review_2024 = data.loc[data['at'].dt.year.eq(2024)]
review_2024.info()

<class 'pandas.core.frame.DataFrame'>
Index: 75191 entries, 0 to 75190
Data columns (total 11 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   reviewId              75191 non-null  object        
 1   userName              75191 non-null  object        
 2   userImage             75191 non-null  object        
 3   content               75179 non-null  object        
 4   score                 75191 non-null  int64         
 5   thumbsUpCount         75191 non-null  int64         
 6   reviewCreatedVersion  49613 non-null  object        
 7   at                    75191 non-null  datetime64[ns]
 8   replyContent          1384 non-null   object        
 9   repliedAt             1384 non-null   datetime64[ns]
 10  appVersion            49613 non-null  object        
dtypes: datetime64[ns](2), int64(2), object(7)
memory usage: 6.9+ MB


In [7]:
# Persist the 2024 reviews to CSV in the working directory; the row index is
# not written out.
review_2024.to_csv(
    path_or_buf='review_2024_en_tiktok.csv',
    index=False,
)

In [8]:
# Preview the first five rows of the 2024 subset.
review_2024.head(n=5)

Unnamed: 0,reviewId,userName,userImage,content,score,thumbsUpCount,reviewCreatedVersion,at,replyContent,repliedAt,appVersion
0,b9129a36-6af3-43c6-9f44-bf29fc7b2edb,A Google user,https://play-lh.googleusercontent.com/EGemoI2N...,Boleh lah,4,0,35.9.4,2024-08-11 10:02:53,,NaT,35.9.4
1,18d26635-c231-42a8-951a-08cc558ab832,A Google user,https://play-lh.googleusercontent.com/EGemoI2N...,it's kinda odd cause I don't see who like my r...,3,0,35.6.3,2024-08-11 09:59:08,,NaT,35.6.3
2,aa0a34bf-286f-4b21-a543-d7bdd27f793b,A Google user,https://play-lh.googleusercontent.com/EGemoI2N...,Pls give me dark mode im begging you,3,0,35.9.4,2024-08-11 09:57:57,,NaT,35.9.4
3,42973f46-4f63-47de-a6f7-a85266b2b035,A Google user,https://play-lh.googleusercontent.com/EGemoI2N...,It's fun I can watch videos I want,5,0,,2024-08-11 09:57:04,,NaT,
4,eff5d04a-91da-4f1c-9531-5d182429844c,A Google user,https://play-lh.googleusercontent.com/EGemoI2N...,Its so amazing❤️❤️❤️,5,0,35.9.4,2024-08-11 09:56:30,,NaT,35.9.4


# testing NLP tools

In [18]:
# Smoke-test underthesea's sentiment() on a mixed-opinion Vietnamese sentence
# (roughly: "delivery was fast but the product quality is very bad").
sentiment(
    'thấy giao hàng nhanh nhưng chất lượng hàng thì tệ lắm'
)

'negative'

In [41]:
# Smoke-test underthesea's classify() on a complaint about content moderation
# (roughly: "the moderation does not seem very good").
# NOTE(review): the recorded output for this cell is ['suc_khoe'], which does
# not obviously relate to the sentence -- verify the topic labels before
# relying on them for app reviews.
classify(
    'thấy kiểm duyệt không được tốt cho lắm'
)

['suc_khoe']

In [43]:
# Smoke-test underthesea's word_tokenize() on an informal, review-like
# sentence to see how multi-syllable words and slang are segmented.
word_tokenize(
    'kiểm duyệt rất là tệ, tik tok thì cũng hay đó nhiều video vui, nhưng mà nhiều thứ khác thì như cccccc'
)

['kiểm duyệt',
 'rất',
 'là',
 'tệ',
 ',',
 'tik tok',
 'thì',
 'cũng',
 'hay',
 'đó',
 'nhiều',
 'video',
 'vui',
 ',',
 'nhưng mà',
 'nhiều',
 'thứ',
 'khác',
 'thì',
 'như',
 'cccccc']