In [None]:
!pip install -qq google-play-scraper

In [1]:
import io
import json

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from google_play_scraper import Sort, reviews, app
from pygments import highlight
from pygments.formatters import TerminalFormatter
from pygments.lexers import JsonLexer
from tqdm import tqdm

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Ask the inline backend for the 'retina' figure format.
%config InlineBackend.figure_format='retina'

# Apply the seaborn style, palette and font scale used by the plots below.
sns.set(style='whitegrid', palette='muted', font_scale=1.2)

In [2]:
import urllib.request, urllib.error, urllib.parse
from bs4 import BeautifulSoup

In [3]:
# Play Store category slugs, each suffixed with the English-locale query
# string so it can be appended directly to the category base URL.
categories = [
    f"{slug}?hl=en"
    for slug in (
        "APPLICATION", "GAME", "ART_AND_DESIGN", "AUTO_AND_VEHICLES",
        "BEAUTY", "BOOKS_AND_REFERENCE", "BUSINESS", "COMICS",
        "COMMUNICATION", "DATING", "EDUCATION", "ENTERTAINMENT", "EVENTS",
        "FINANCE", "FOOD_AND_DRINK", "HEALTH_AND_FITNESS", "HOUSE_AND_HOME",
        "LIFESTYLE", "MAPS_AND_NAVIGATION", "MEDICAL", "MUSIC_AND_AUDIO",
        "NEWS_AND_MAGAZINES", "PARENTING", "PERSONALIZATION", "PHOTOGRAPHY",
        "PRODUCTIVITY", "SHOPPING", "SOCIAL", "SPORTS", "TOOLS",
        "TRAVEL_AND_LOCAL", "VIDEO_PLAYERS", "WEATHER",
    )
]

In [5]:
# Collect the package ids linked from each category landing page.
# `app_packages` ends up holding one list of ids per scraped category.
app_packages = []

url = "https://play.google.com/store/apps/category/"
details_marker = "/store/apps/details?id="

# Only the first 10 categories are scraped here.
for cat in categories[:10]:
    print(cat)
    # Close the HTTP response deterministically instead of leaking it.
    with urllib.request.urlopen(url + cat) as response:
        html = response.read()
    soup = BeautifulSoup(html, 'html.parser')

    ids = []
    seen = set()  # O(1) membership test; `ids` keeps first-seen order
    for tag in soup('a'):
        href = tag.get('href')
        # Anchors without an href return None; skip them instead of crashing.
        if not href or details_marker not in href:
            continue
        # Read the id from the query string rather than slicing at a fixed
        # offset, which breaks on absolute URLs and keeps trailing parameters.
        query = urllib.parse.urlparse(href).query
        package_id = urllib.parse.parse_qs(query).get('id', [''])[0]
        if package_id and package_id not in seen:
            seen.add(package_id)
            ids.append(package_id)
    app_packages.append(ids)

APPLICATION?hl=en
GAME?hl=en
ART_AND_DESIGN?hl=en
AUTO_AND_VEHICLES?hl=en
BEAUTY?hl=en
BOOKS_AND_REFERENCE?hl=en
BUSINESS?hl=en
COMICS?hl=en
COMMUNICATION?hl=en
DATING?hl=en


In [6]:
# Flatten the per-category lists into a single list of package ids.
# A bare string is kept whole, so re-running this cell on an already flat
# list does not split every package id into single characters.
app_packages = [
    package_id
    for group in app_packages
    for package_id in ([group] if isinstance(group, str) else group)
]

In [7]:
# Number of entries currently held in app_packages.
len(app_packages)

470

In [8]:
# Fetch the store metadata dict for every collected package id.
app_infos = []

for ap in tqdm(app_packages):
    info = app(ap, lang='en', country='us')
    # Drop the embedded 'comments' entry before storing the record; pop() with
    # a default does not fail when a response carries no such key.
    info.pop('comments', None)
    app_infos.append(info)

100%|██████████| 470/470 [02:56<00:00,  2.66it/s]


In [9]:
def print_json(json_object):
    """Print ``json_object`` as indented, key-sorted, syntax-highlighted JSON.

    Values that ``json`` cannot serialise natively are converted with ``str``.
    """
    rendered = json.dumps(json_object, default=str, sort_keys=True, indent=2)
    colored = highlight(rendered, JsonLexer(), TerminalFormatter())
    print(colored)

In [10]:
# Pretty-print the first fetched app record as a sample of the schema.
print_json(app_infos[0])

{
  [94m"adSupported"[39;49;00m: [34mnull[39;49;00m,
  [94m"androidVersion"[39;49;00m: [33m"4.0.3"[39;49;00m,
  [94m"androidVersionText"[39;49;00m: [33m"4.0.3 and up"[39;49;00m,
  [94m"appId"[39;49;00m: [33m"com.whatsapp"[39;49;00m,
  [94m"containsAds"[39;49;00m: [34mnull[39;49;00m,
  [94m"contentRating"[39;49;00m: [33m"Everyone"[39;49;00m,
  [94m"contentRatingDescription"[39;49;00m: [34mnull[39;49;00m,
  [94m"currency"[39;49;00m: [33m"USD"[39;49;00m,
  [94m"description"[39;49;00m: [33m"WhatsApp from Facebook\r\n\r\nWhatsApp Messenger is a FREE messaging app available for Android and other smartphones. WhatsApp uses your phone's Internet connection (4G/3G/2G/EDGE or Wi-Fi, as available) to let you message and call friends and family. Switch from SMS to WhatsApp to send and receive messages, calls, photos, videos, documents, and Voice Messages.\r\n\r\nWHY USE WHATSAPP:\r\n\r\n\u2022 NO FEES: WhatsApp uses your phone's Internet connection (4G/3G/2G/EDGE

In [None]:
def format_title(title):
    """Return a short label for ``title``.

    The title is cut at its first ':' or, when it has no ':', at its first
    '-'; the result is then truncated to at most 10 characters.
    """
    for separator in (':', '-'):
        cut = title.find(separator)
        if cut != -1:
            return title[:cut][:10]
    return title[:10]
        

# Preview the icons of the first few apps in a two-row grid.
# The grid is capped because sizing it from every scraped app would create
# hundreds of subplots and trigger one download per app.
n_preview = min(len(app_infos), 16)
n_cols = max(1, (n_preview + 1) // 2)  # ceil, so an odd count keeps its last icon

fig, axs = plt.subplots(2, n_cols, figsize=(10, 3))
for ax in axs.flat:
    ax.axis('off')  # also blanks any cell left unused

for ai, ax in zip(app_infos[:n_preview], axs.flat):
    # plt.imread no longer accepts URLs directly, so download the bytes and
    # hand imread a seekable in-memory file instead.
    with urllib.request.urlopen(ai['icon']) as response:
        img = plt.imread(io.BytesIO(response.read()))
    ax.imshow(img)
    ax.set_title(format_title(ai['title']))

In [11]:
# Build a DataFrame from the collected app records and show its first two rows.
app_infos_df = pd.DataFrame(app_infos)
app_infos_df.head(n=2)

Unnamed: 0,title,description,descriptionHTML,summary,summaryHTML,installs,minInstalls,score,ratings,reviews,...,contentRatingDescription,adSupported,containsAds,released,updated,version,recentChanges,recentChangesHTML,appId,url
0,WhatsApp Messenger,WhatsApp from Facebook\r\n\r\nWhatsApp Messeng...,WhatsApp from Facebook<br><br>WhatsApp Messeng...,Simple. Personal. Secure.,Simple. Personal. Secure.,"5,000,000,000+",5000000000,4.306773,117936090.0,34486942,...,,,,"Oct 18, 2010",1596495381,2.20.196.16,* WhatsApp now supports animated stickers. You...,* WhatsApp now supports animated stickers. You...,com.whatsapp,https://play.google.com/store/apps/details?id=...
1,YouTube,Get the official YouTube app for Android phone...,Get the official YouTube app for Android phone...,Enjoy your favorite videos and channels with t...,Enjoy your favorite videos and channels with t...,"5,000,000,000+",5000000000,4.098671,78981063.0,28210352,...,,True,True,"Oct 20, 2010",1596565847,Varies with device,,,com.google.android.youtube,https://play.google.com/store/apps/details?id=...


In [12]:
# Write the app metadata to disk with a header row and without the row index.
app_infos_df.to_csv('apps.csv', header=True, index=False)

# Get App Reviews

In [13]:
app_reviews = []

# Each sort order is paired with the label recorded on the reviews it returns.
sort_labels = [
    (Sort.MOST_RELEVANT, 'most_relevant'),
    (Sort.NEWEST, 'newest'),
]

for ap in tqdm(app_packages):
    for score in range(1, 6):
        # Scores 1/2 are negative, 3 is neutral and 4/5 are positive, so the
        # single neutral score asks for twice as many reviews to balance the
        # three classes.
        wanted = 100 if score != 3 else 200

        for sort_order, label in sort_labels:
            # reviews() returns a tuple; only its first element is used.
            batch = reviews(
                ap,
                lang='en',
                country='us',
                sort=sort_order,
                count=wanted,
                filter_score_with=score,
            )[0]

            # Tag every review with the query that produced it.
            for review in batch:
                review['sortOrder'] = label
                review['appId'] = ap

            app_reviews.extend(batch)
        

  1%|▏         | 6/470 [00:54<1:10:17,  9.09s/it]


KeyboardInterrupt: 

In [None]:
# Pretty-print the first collected review as a sample of the schema.
print_json(app_reviews[0])

In [None]:
# Build a DataFrame from the collected reviews and report its (rows, columns) shape.
app_reviews_df = pd.DataFrame(app_reviews)
app_reviews_df.shape

In [None]:
# Show the first two review rows.
app_reviews_df.head(n=2)

In [None]:
# Write the reviews to disk with a header row and without the row index.
app_reviews_df.to_csv("reviews.csv", header=True, index=False)