In [None]:
!pip install -qq google-play-scraper

In [1]:
import json
import pandas as pd
from tqdm import tqdm

import seaborn as sns
import matplotlib.pyplot as plt

from pygments import highlight
from pygments.lexers import JsonLexer
from pygments.formatters import TerminalFormatter

from google_play_scraper import Sort, reviews, app

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Ask the inline backend for the 'retina' figure format.
%config InlineBackend.figure_format='retina'

# Global seaborn styling (grid style, colour palette, font scale) for all plots below.
sns.set(style='whitegrid', palette='muted', font_scale=1.2)

In [2]:
import urllib.request, urllib.error, urllib.parse
from bs4 import BeautifulSoup

In [3]:
# Play Store category slugs, each suffixed with the English-locale query string
# so it can be appended directly to the category base URL.
categories = [
    f"{slug}?hl=en"
    for slug in (
        "APPLICATION",
        "GAME",
        "ART_AND_DESIGN",
        "AUTO_AND_VEHICLES",
        "BEAUTY",
        "BOOKS_AND_REFERENCE",
        "BUSINESS",
        "COMICS",
        "COMMUNICATION",
        "DATING",
        "EDUCATION",
        "ENTERTAINMENT",
        "EVENTS",
        "FINANCE",
        "FOOD_AND_DRINK",
        "HEALTH_AND_FITNESS",
        "HOUSE_AND_HOME",
        "LIFESTYLE",
        "MAPS_AND_NAVIGATION",
        "MEDICAL",
        "MUSIC_AND_AUDIO",
        "NEWS_AND_MAGAZINES",
        "PARENTING",
        "PERSONALIZATION",
        "PHOTOGRAPHY",
        "PRODUCTIVITY",
        "SHOPPING",
        "SOCIAL",
        "SPORTS",
        "TOOLS",
        "TRAVEL_AND_LOCAL",
        "VIDEO_PLAYERS",
        "WEATHER",
    )
]

In [4]:
# Collect the app ids linked from each Play Store category page.
# `app_packages` holds one list of ids per scraped category.
app_packages = []

url = "https://play.google.com/store/apps/category/"
details_marker = "/store/apps/details?id="

for cat in categories[:10]:  # only the first 10 categories are scraped
    print(cat)
    # Close the HTTP response deterministically instead of leaving it open.
    with urllib.request.urlopen(url + cat) as response:
        html = response.read()
    soup = BeautifulSoup(html, 'html.parser')

    ids = []
    seen = set()  # O(1) duplicate check; `ids` keeps first-seen order
    # href=True skips anchors that have no href attribute at all, which
    # would otherwise fail when the link text is searched.
    for tag in soup.find_all('a', href=True):
        _, found, tail = tag['href'].partition(details_marker)
        if not found:
            continue
        # Keep only the package id: drop trailing query parameters or fragments.
        app_id = tail.split('&', 1)[0].split('#', 1)[0]
        if app_id and app_id not in seen:
            seen.add(app_id)
            ids.append(app_id)
    app_packages.append(ids)

APPLICATION?hl=en
GAME?hl=en
ART_AND_DESIGN?hl=en
AUTO_AND_VEHICLES?hl=en
BEAUTY?hl=en
BOOKS_AND_REFERENCE?hl=en
BUSINESS?hl=en
COMICS?hl=en
COMMUNICATION?hl=en
DATING?hl=en


In [5]:
# Flatten the per-category lists into a single list of app ids.
# Plain strings are passed through unchanged, so re-running this cell on an
# already-flattened list does not split every id into single characters.
app_packages = [
    app_id
    for group in app_packages
    for app_id in ([group] if isinstance(group, str) else group)
]

In [6]:
# Sanity check: total number of app ids collected.
len(app_packages)

496

In [8]:
# Fetch the store metadata for every collected app id (one request per app).
app_infos = []

for ap in tqdm(app_packages):
    info = app(ap, lang='en', country='us')
    # Drop the 'comments' entry before storing; pop() with a default tolerates
    # a response without that key instead of raising KeyError.
    info.pop('comments', None)
    app_infos.append(info)

100%|██████████| 496/496 [01:18<00:00,  6.28it/s]


In [13]:
def print_json(json_object):
    """Print ``json_object`` as indented, key-sorted, syntax-highlighted JSON.

    Values the ``json`` module cannot encode natively are converted with
    ``str`` before the text is highlighted for terminal display.
    """
    rendered = json.dumps(
        json_object,
        default=str,
        sort_keys=True,
        indent=2,
    )
    colourised = highlight(rendered, JsonLexer(), TerminalFormatter())
    print(colourised)

In [None]:
# Inspect the first fetched app record to see which fields are available.
print_json(app_infos[0])

In [None]:
def format_title(title):
    """Return a short label for ``title``, at most 10 characters long.

    The title is cut at its first ':'; if it has no ':', it is cut at its
    first '-' instead. Whatever remains is truncated to 10 characters.
    """
    cut = title.find(':')
    if cut == -1:
        # No colon present: fall back to a dash as the separator.
        cut = title.find('-')
    head = title if cut == -1 else title[:cut]
    return head[:10]
        

# Preview the icons of the first few apps in a two-row grid.
# The grid is capped: sizing it from len(app_infos) directly creates one
# subplot (and one icon load) per scraped app, which is unusable for hundreds
# of apps on a 10x3 inch figure.
MAX_PREVIEW_ICONS = 14
n_preview = min(len(app_infos), MAX_PREVIEW_ICONS)
n_cols = max(1, n_preview // 2)  # plt.subplots() needs at least one column

fig, axs = plt.subplots(2, n_cols, figsize=(10, 3))
for i, ax in enumerate(axs.flat):
    ax.axis('off')
    if i >= n_preview:
        continue  # fewer apps than grid cells: leave this cell blank
    ai = app_infos[i]
    # NOTE(review): ai['icon'] is presumably an image URL; recent Matplotlib
    # releases may reject URLs in imread() - confirm against the installed version.
    img = plt.imread(ai['icon'])
    ax.imshow(img)
    ax.set_title(format_title(ai['title']))

In [9]:
# Build one DataFrame row per fetched app record and preview the first two rows.
app_infos_df = pd.DataFrame(app_infos)
app_infos_df.head(n=2)

Unnamed: 0,title,description,descriptionHTML,summary,summaryHTML,installs,minInstalls,score,ratings,reviews,...,contentRatingDescription,adSupported,containsAds,released,updated,version,recentChanges,recentChangesHTML,appId,url
0,Facebook,Keeping up with friends is faster and easier t...,Keeping up with friends is faster and easier t...,"Find friends, watch live videos, play games & ...","Find friends, watch live videos, play games &a...","5,000,000,000+",5000000000,4.191038,101984168.0,29578125.0,...,,True,True,,1596500416,281.0.0.36.124,,,com.facebook.katana,https://play.google.com/store/apps/details?id=...
1,WhatsApp Messenger,WhatsApp from Facebook\r\n\r\nWhatsApp Messeng...,WhatsApp from Facebook<br><br>WhatsApp Messeng...,Simple. Personal. Secure.,Simple. Personal. Secure.,"5,000,000,000+",5000000000,4.306773,117936399.0,34487021.0,...,,,,"Oct 18, 2010",1596495381,2.20.196.16,* WhatsApp now supports animated stickers. You...,* WhatsApp now supports animated stickers. You...,com.whatsapp,https://play.google.com/store/apps/details?id=...


In [10]:
# Persist the app metadata with a header row and without the row index.
app_infos_df.to_csv('apps.csv', index=False)

# Get App Reviews

In [11]:
# Download reviews for every app, per star rating and per sort order, and tag
# each review with the sort order and app id it was fetched for.
app_reviews = []

# Sort orders to query, paired with the label stored on each review.
sort_orders = [
    (Sort.MOST_RELEVANT, 'most_relevant'),
    (Sort.NEWEST, 'newest'),
]

for ap in tqdm(app_packages):
    for score in range(1, 6):
        # Ratings 1-2 count as negative, 3 as neutral and 4-5 as positive, so
        # the neutral rating gets a doubled quota to keep the classes balanced.
        review_count = 200 if score == 3 else 100

        for sort_order, sort_label in sort_orders:
            # reviews() returns a tuple; only its first element (the review list) is kept.
            rvs = reviews(
                ap,
                lang='en',
                country='us',
                sort=sort_order,
                count=review_count,
                filter_score_with=score,
            )[0]

            for r in rvs:
                r.update(sortOrder=sort_label, appId=ap)

            app_reviews.extend(rvs)
        

100%|██████████| 496/496 [31:33<00:00,  3.82s/it]


In [14]:
# Inspect the first collected review to see which fields are available.
print_json(app_reviews[0])

{
  [94m"appId"[39;49;00m: [33m"com.facebook.katana"[39;49;00m,
  [94m"at"[39;49;00m: [33m"2020-07-30 16:21:12"[39;49;00m,
  [94m"content"[39;49;00m: [33m"I wish the layout would return to the original layout 10 years ago as well. I really dislike the fact that I cannot see all of my friends but only who the algorithm chooses. You should see ALL your friends in the news feed. And I also dislike the fact that Facebook has strayed so far from \"social connection\" to rag magazine information, copy and paste of anything they find, an outright lies. I didn't join Facebook to read the newspaper or watch the news, or read lies shared."[39;49;00m,
  [94m"repliedAt"[39;49;00m: [34mnull[39;49;00m,
  [94m"replyContent"[39;49;00m: [34mnull[39;49;00m,
  [94m"reviewCreatedVersion"[39;49;00m: [33m"280.0.0.48.122"[39;49;00m,
  [94m"reviewId"[39;49;00m: [33m"gp:AOqpTOH3t9JKv4GwMvVB6oNsyTqHwZkwpxnOWFn6-MzFaqABjOEc6eNfFfyw5wzyUmalju5e9IIbl9jkGC5Ghg"[39;49;00m,
  [94m"score"

In [15]:
# Build one DataFrame row per collected review and report (rows, columns).
app_reviews_df = pd.DataFrame(app_reviews)
app_reviews_df.shape

(448748, 12)

In [16]:
# Preview the first two review rows.
app_reviews_df.head(n=2)

Unnamed: 0,reviewId,userName,userImage,content,score,thumbsUpCount,reviewCreatedVersion,at,replyContent,repliedAt,sortOrder,appId
0,gp:AOqpTOH3t9JKv4GwMvVB6oNsyTqHwZkwpxnOWFn6-Mz...,Denel SantaLucia,https://lh3.googleusercontent.com/a-/AOh14GhNs...,I wish the layout would return to the original...,1,2650,280.0.0.48.122,2020-07-30 16:21:12,,NaT,most_relevant,com.facebook.katana
1,gp:AOqpTOFsi-H0rtSiiiOKk05JRCLlKjgOsrao1jukcm_...,MySelf1,https://lh3.googleusercontent.com/-K3tM-QB4xzU...,"I don't even know what to write, Facebook, whi...",1,2280,281.0.0.36.124,2020-08-04 07:54:57,,NaT,most_relevant,com.facebook.katana


In [17]:
# Persist the reviews with a header row and without the row index.
app_reviews_df.to_csv("reviews.csv", index=False)