Source: https://www.kaggle.com/code/ashishkumarak/google-play-reviews-scraping-daily-update/notebook

## 1. Get the data from the Google Play Store

In [1]:
import google_play_scraper

In [2]:
# Google Play package id of the app whose reviews are scraped.
# Store listing: https://play.google.com/store/apps/details?id=com.mobile.legends&hl=en
app_id = "com.mobile.legends"

In [3]:
from google_play_scraper import Sort
from google_play_scraper.constants.element import ElementSpecs
from google_play_scraper.constants.regex import Regex
from google_play_scraper.constants.request import Formats
from google_play_scraper.utils.request import post

import pandas as pd
from datetime import datetime
from tqdm import tqdm
import time
import json
from time import sleep
from typing import List, Optional, Tuple

In [4]:
# Largest number of reviews requested in a single fetch: `reviews` clamps each
# request to this value and `reviews_all` passes it as its per-call `count`.
MAX_COUNT_EACH_FETCH = 199


class _ContinuationToken:
    __slots__ = (
        "token",
        "lang",
        "country",
        "sort",
        "count",
        "filter_score_with",
        "filter_device_with",
    )

    def __init__(
        self, token, lang, country, sort, count, filter_score_with, filter_device_with
    ):
        self.token = token
        self.lang = lang
        self.country = country
        self.sort = sort
        self.count = count
        self.filter_score_with = filter_score_with
        self.filter_device_with = filter_device_with


def _fetch_review_items(
    url: str,
    app_id: str,
    sort: int,
    count: int,
    filter_score_with: Optional[int],
    filter_device_with: Optional[int],
    pagination_token: Optional[str],
):
    """Request one page of reviews and return ``(review_items, next_token)``.

    The request body is built with ``Formats.Reviews.build_body`` and sent
    to ``url`` through ``post`` as a form-urlencoded payload.

    Args:
        url: Endpoint the request is posted to.
        app_id: Package id of the app whose reviews are requested.
        sort: Sort order, already converted to its integer value.
        count: Number of reviews to ask for in this request.
        filter_score_with: Score filter, or ``None`` (sent as ``"null"``).
        filter_device_with: Device filter, or ``None`` (sent as ``"null"``).
        pagination_token: Token of the page to fetch, or ``None``.

    Returns:
        A 2-tuple: the first element of the decoded payload (the review
        items) and the last element of its second-to-last entry, which
        `reviews` uses as the token for the following page.

    Raises:
        IndexError, TypeError: When the response does not have the expected
            structure (no regex match, missing entries, ``None`` where a
            sequence is expected). `reviews` catches both.
    """
    dom = post(
        url,
        Formats.Reviews.build_body(
            app_id,
            sort,
            count,
            "null" if filter_score_with is None else filter_score_with,
            "null" if filter_device_with is None else filter_device_with,
            pagination_token,
        ),
        {"content-type": "application/x-www-form-urlencoded"},
    )
    match = json.loads(Regex.REVIEWS.findall(dom)[0])

    # The review data is itself a JSON string nested in the outer document.
    # Decode it once and reuse it for both the items and the next token.
    payload = json.loads(match[0][2])

    return payload[0], payload[-2][-1]


def reviews(
    app_id: str,
    lang: str = "en",
    country: str = "us",
    sort: Sort = Sort.MOST_RELEVANT,
    count: int = 100,
    filter_score_with: Optional[int] = None,
    filter_device_with: Optional[int] = None,
    continuation_token: Optional[_ContinuationToken] = None,
    max_retries: int = 5,
    retry_delay_seconds: float = 1.0,
) -> Tuple[List[dict], _ContinuationToken]:
    """Fetch up to ``count`` reviews, optionally resuming a previous call.

    Args:
        app_id: Package id of the app whose reviews are requested.
        lang: Language code used to build the request URL.
        country: Country code used to build the request URL.
        sort: Sort order; its ``.value`` is what is sent.
        count: Total number of reviews wanted from this call. Requests are
            issued in pages of at most ``MAX_COUNT_EACH_FETCH``.
        filter_score_with: Optional score filter.
        filter_device_with: Optional device filter.
        continuation_token: Token returned by a previous call. When given,
            its stored ``lang``, ``country``, ``sort``, ``count`` and filters
            override the corresponding arguments.
        max_retries: How many times in a row a failing page request is
            retried before this call gives up.
        retry_delay_seconds: Pause between retries; a falsy value disables it.

    Returns:
        ``(result, token)``: the list of extracted review dicts and a
        `_ContinuationToken` for the next call. ``token.token`` is ``None``
        once the fetch signals there are no further pages. If the retries
        are exhausted, the reviews collected so far are returned and the
        token still points at the page that failed, so the caller can
        resume later.
    """
    sort = sort.value

    if continuation_token is not None:
        token = continuation_token.token

        if token is None:
            # The previous call already reached the end of the listing.
            return (
                [],
                continuation_token,
            )

        lang = continuation_token.lang
        country = continuation_token.country
        sort = continuation_token.sort
        count = continuation_token.count
        filter_score_with = continuation_token.filter_score_with
        filter_device_with = continuation_token.filter_device_with
    else:
        token = None

    url = Formats.Reviews.build(lang=lang, country=country)

    result = []
    consecutive_failures = 0

    # Also stops when the remaining count is not positive, so a request is
    # never issued with a zero or negative size.
    while len(result) < count:
        _fetch_count = min(count - len(result), MAX_COUNT_EACH_FETCH)

        try:
            review_items, token = _fetch_review_items(
                url,
                app_id,
                sort,
                _fetch_count,
                filter_score_with,
                filter_device_with,
                token,
            )
        except (TypeError, IndexError):
            # The tuple assignment above never happened, so `token` still
            # identifies the page that failed and the retry re-requests
            # exactly that page. Reading `continuation_token.token` here
            # would raise AttributeError on a first call (no continuation
            # token) and would re-fetch pages already collected in this call.
            consecutive_failures += 1
            if consecutive_failures > max_retries:
                break
            if retry_delay_seconds:
                sleep(retry_delay_seconds)
            continue

        consecutive_failures = 0

        for review in review_items:
            result.append(
                {
                    k: spec.extract_content(review)
                    for k, spec in ElementSpecs.Review.items()
                }
            )

        # A list in the token position marks the last page.
        if isinstance(token, list):
            token = None
            break

    return (
        result,
        _ContinuationToken(
            token, lang, country, sort, count, filter_score_with, filter_device_with
        ),
    )


def reviews_all(app_id: str, sleep_milliseconds: int = 0, **kwargs) -> list:
    """Collect every review `reviews` can return for ``app_id``.

    Calls `reviews` repeatedly with ``count=MAX_COUNT_EACH_FETCH``, feeding
    each returned continuation token into the next call, until a returned
    token has ``token is None``.

    Args:
        app_id: Package id of the app whose reviews are requested.
        sleep_milliseconds: Pause between consecutive calls, in
            milliseconds; a falsy value means no pause.
        **kwargs: Forwarded to `reviews`. Any ``count`` or
            ``continuation_token`` entry is discarded because this function
            manages both itself.

    Returns:
        A single list with the reviews of all calls, in fetch order.
    """
    for managed_key in ("count", "continuation_token"):
        kwargs.pop(managed_key, None)

    collected = []
    resume_from = None

    while True:
        page, resume_from = reviews(
            app_id,
            count=MAX_COUNT_EACH_FETCH,
            continuation_token=resume_from,
            **kwargs
        )
        collected.extend(page)

        # No token left means the listing is exhausted.
        if resume_from.token is None:
            return collected

        if sleep_milliseconds:
            sleep(sleep_milliseconds / 1000)

In [5]:
# Target number of reviews to collect: the scraping loop in the next cell keeps
# fetching until at least this many are gathered or a fetch returns nothing.
reviews_count = 25000

In [6]:
# Collected review dicts, and the pagination state carried from call to call.
result = []
continuation_token = None


with tqdm(total=reviews_count, position=0, leave=True) as pbar:
    while True:
        # Stop as soon as the target number of reviews has been reached.
        if len(result) >= reviews_count:
            break

        new_result, continuation_token = reviews(
            app_id,
            lang="en",  # The language of review
            country="ph",  # Country for which you want to scrape
            sort=Sort.NEWEST,
            count=199,  # No need to change this
            filter_score_with=None,
            continuation_token=continuation_token,
        )

        # An empty batch means nothing more was returned; stop fetching.
        if not new_result:
            break

        result += new_result
        pbar.update(len(new_result))

25074it [01:39, 251.19it/s]                           


In [7]:
# One row per scraped review dict.
df = pd.DataFrame(data=result)

# Preview the first five rows (the default for head()).
df.head()

Unnamed: 0,reviewId,userName,userImage,content,score,thumbsUpCount,reviewCreatedVersion,at,replyContent,repliedAt,appVersion
0,daf84a04-08ad-471a-a3bd-eaa21397608d,Luffy Taro,https://play-lh.googleusercontent.com/a/ACg8oc...,THIS GANE IS NOT GOOD BECAUSE THERE ARE TOO MA...,1,0,,2024-06-09 02:31:45,,NaT,
1,5480c618-6f9e-4f8d-a041-0009c38fa8c7,Bernalyn Castillo (Derf Martinez Mom),https://play-lh.googleusercontent.com/a/ACg8oc...,Never play this because he scammed me igot Leg...,1,0,,2024-06-09 02:31:28,,NaT,
2,3b273c54-a3b5-4c54-aa87-9ee36bdc68e8,Ian Wolf Valdez,https://play-lh.googleusercontent.com/a-/ALV-U...,Wala lang nakakainis lose streak,5,0,,2024-06-09 02:30:08,,NaT,
3,f4e595cc-82bf-4ab5-abb0-9ff487f46000,Julie ann Fulgencio,https://play-lh.googleusercontent.com/a-/ALV-U...,Very nice game,1,0,1.8.79.9552,2024-06-09 02:27:47,,NaT,1.8.79.9552
4,e550291c-9817-4480-a1ec-fea09b789930,Randz Garcia,https://play-lh.googleusercontent.com/a/ACg8oc...,0/5 trash teammates always 5man team enemy bas...,1,0,,2024-06-09 02:26:52,,NaT,


In [8]:
# Show the column labels of the scraped-reviews DataFrame.
df.columns

Index(['reviewId', 'userName', 'userImage', 'content', 'score',
       'thumbsUpCount', 'reviewCreatedVersion', 'at', 'replyContent',
       'repliedAt', 'appVersion'],
      dtype='object')

In [9]:
# Keep only the eight columns listed below; userImage, replyContent and
# repliedAt are left out.
df = df.loc[
    :,
    [
        "reviewId",
        "userName",
        "content",
        "score",
        "thumbsUpCount",
        "reviewCreatedVersion",
        "at",
        "appVersion",
    ],
]

In [10]:
# Check the (rows, columns) size of the DataFrame after the column selection.
df.shape

(25074, 8)

In [11]:
# Preview the first rows of the trimmed DataFrame.
df.head()

Unnamed: 0,reviewId,userName,content,score,thumbsUpCount,reviewCreatedVersion,at,appVersion
0,daf84a04-08ad-471a-a3bd-eaa21397608d,Luffy Taro,THIS GANE IS NOT GOOD BECAUSE THERE ARE TOO MA...,1,0,,2024-06-09 02:31:45,
1,5480c618-6f9e-4f8d-a041-0009c38fa8c7,Bernalyn Castillo (Derf Martinez Mom),Never play this because he scammed me igot Leg...,1,0,,2024-06-09 02:31:28,
2,3b273c54-a3b5-4c54-aa87-9ee36bdc68e8,Ian Wolf Valdez,Wala lang nakakainis lose streak,5,0,,2024-06-09 02:30:08,
3,f4e595cc-82bf-4ab5-abb0-9ff487f46000,Julie ann Fulgencio,Very nice game,1,0,1.8.79.9552,2024-06-09 02:27:47,1.8.79.9552
4,e550291c-9817-4480-a1ec-fea09b789930,Randz Garcia,0/5 trash teammates always 5man team enemy bas...,1,0,,2024-06-09 02:26:52,


In [12]:
# Save the trimmed DataFrame as raw_mobilelegends.csv (relative path).
# NOTE(review): `index` is not passed, so pandas' default applies and the row
# index is written as an unnamed first column — confirm that readers of this
# file expect it.
df.to_csv("raw_mobilelegends.csv")