## ⛏️ Dataset Scraping

Approach to retrieve apps:
1. Search for a keyword on the Google Play Store.
1. Repeat the process, changing the ***country*** option of the *google_play_scraper* library.
1. Remove duplicates.
1. Retrieve additional information from AndroZoo.

***Note:*** Manual analysis is still required, because some apps disguise themselves as something else (e.g., fake calculator apps that hide photos).

#### Import

In [None]:
# IMPORT
from    google_play_scraper import search,app
from    langdetect          import detect
from    tqdm                import tqdm
import  pandas              as pd
import  numpy               as np
import  langdetect
import  os

In [None]:
# Initialize tqdm for pandas.
# This registers tqdm's progress-bar helpers (e.g. `progress_apply`) on pandas objects.
tqdm.pandas()

In [None]:
# Visual marker for the beginning of the run in the notebook output.
print("⚡ START ⚡")

#### Parameters

The AndroZoo CSV file is very large and continuously updated, so it is not included here and must be downloaded from the official website: https://androzoo.uni.lu/lists

In [None]:
# Location of the locally downloaded AndroZoo list.
ANDROZOO_PATH = '0_androzoo.csv'

# Column names given to the AndroZoo CSV when it is read (applied positionally).
ANDROZOO_COLUMNS = [
    'sha256', 'sha1', 'md5',
    'dex_date', 'apk_size', 'pkgName',
    'vercode', 'vt_detection', 'vt_scan_date',
    'dex_size', 'markets',
]

Parameters to perform the search.

In [None]:
# Keyword to be searched
KEYWORD   = "weather"

# List of countries to be used when searching on Google Play.
# Exactly one COUNTRIES assignment should be active; the others are kept as
# ready-made alternatives of different sizes.
# 50
# COUNTRIES = ['US', 'CA', 'MX', 'GB', 'FR', 'DE', 'IT', 'ES', 'PT', 'NL', 'BE', 'IE', 'CH', 'AT', 'SE', 'NO', 'DK', 'FI', 'IS', 'GR', 'JP', 'CN', 'KR', 'IN', 'RU', 'BR', 'AR', 'CL', 'PE', 'CO', 'SA', 'AE', 'IL', 'EG', 'ZA', 'NG', 'KE', 'AU', 'NZ', 'FJ','KZ', 'UA', 'PL', 'CZ', 'HU', 'SK', 'RO', 'BG', 'HR', 'RS']

# ~100 (97 unique codes). 'GR' and 'IS' used to appear twice in this list,
# which only repeated the same Google Play search; each code is now listed once.
COUNTRIES = [
    'US', 'CA', 'MX', 'GB', 'FR', 'DE', 'IT', 'ES', 'PT', 'NL',
    'BE', 'IE', 'CH', 'AT', 'SE', 'NO', 'DK', 'FI', 'IS', 'GR',
    'JP', 'CN', 'KR', 'IN', 'RU', 'BR', 'AR', 'CL', 'PE', 'CO',
    'SA', 'AE', 'IL', 'EG', 'ZA', 'NG', 'KE', 'AU', 'NZ', 'FJ',
    'KZ', 'UA', 'PL', 'CZ', 'HU', 'SK', 'RO', 'BG', 'HR', 'RS',
    'SI', 'LT', 'LV', 'EE', 'BY', 'MD', 'AZ', 'GE', 'AM', 'UZ',
    'TM', 'TJ', 'KG', 'TR', 'CY', 'MT', 'VA', 'MC', 'AD', 'LU',
    'SM', 'LI', 'FO', 'GL', 'AX', 'BZ', 'GT', 'HN', 'SV', 'NI',
    'CR', 'PA', 'JM', 'HT', 'DO', 'CU', 'TT', 'BS', 'BB', 'GD',
    'AG', 'LC', 'VC', 'DM', 'KN', 'GY', 'SR',
]

# 250
# COUNTRIES = ['AF', 'AX', 'AL', 'DZ', 'AS', 'AD', 'AO', 'AI', 'AQ', 'AG', 'AR', 'AM', 'AW', 'AU', 'AT', 'AZ', 'BS', 'BH', 'BD', 'BB', 'BY', 'BE', 'BZ', 'BJ', 'BM', 'BT', 'BO', 'BQ', 'BA', 'BW', 'BV', 'BR', 'IO', 'BN', 'BG', 'BF', 'BI', 'CV', 'KH', 'CM', 'CA', 'KY', 'CF', 'TD', 'CL', 'CN', 'CX', 'CC', 'CO', 'KM', 'CG', 'CD', 'CK', 'CR', 'CI', 'HR', 'CU', 'CW', 'CY', 'CZ', 'DK', 'DJ', 'DM', 'DO', 'EC', 'EG', 'SV', 'GQ', 'ER', 'EE', 'ET', 'FK', 'FO', 'FJ', 'FI', 'FR', 'GF', 'PF', 'TF', 'GA', 'GM', 'GE', 'DE', 'GH', 'GI', 'GR', 'GL', 'GD', 'GP', 'GU', 'GT', 'GG', 'GN', 'GW', 'GY', 'HT', 'HM', 'VA', 'HN', 'HK', 'HU', 'IS', 'IN', 'ID', 'IR', 'IQ', 'IE', 'IM', 'IL', 'IT', 'JM', 'JP', 'JE', 'JO', 'KZ', 'KE', 'KI', 'KP', 'KR', 'KW', 'KG', 'LA', 'LV', 'LB', 'LS', 'LR', 'LY', 'LI', 'LT', 'LU', 'MO', 'MK', 'MG', 'MW', 'MY', 'MV', 'ML', 'MT', 'MH', 'MQ', 'MR', 'MU', 'YT', 'MX', 'FM', 'MD', 'MC', 'MN', 'ME', 'MS', 'MA', 'MZ', 'MM', 'NA', 'NR', 'NP', 'NL', 'NC', 'NZ', 'NI', 'NE', 'NG', 'NU', 'NF', 'MP', 'NO', 'OM', 'PK', 'PW', 'PS', 'PA', 'PG', 'PY', 'PE', 'PH', 'PN', 'PL', 'PT', 'PR', 'QA', 'RE', 'RO', 'RU', 'RW', 'BL', 'SH', 'KN', 'LC', 'MF', 'PM', 'VC', 'WS', 'SM', 'ST', 'SA', 'SN', 'RS', 'SC', 'SL', 'SG', 'SX', 'SK', 'SI', 'SB', 'SO', 'ZA', 'GS', 'SS', 'ES', 'LK', 'SD', 'SR', 'SJ', 'SZ', 'SE', 'CH', 'SY', 'TW', 'TJ', 'TZ', 'TH', 'TL', 'TG', 'TK', 'TO', 'TT', 'TN', 'TR', 'TM', 'TC', 'TV', 'UG', 'UA', 'AE', 'GB', 'US', 'UM', 'UY', 'UZ', 'VU','VE','VN','YE','ZM','ZW']

Paths

In [None]:
# CSV that maps each Google Play category to its categoryID
CATEGORIES_PATH = '2_categoriesMapping.csv'

# Destination of the scraping result; one sub-folder per searched keyword
SEARCH_OUTPUT_PATH = './ScrapingOutput/{}/search.csv'.format(KEYWORD)

In [None]:
# Make sure the output folder exists before anything is written to it.
# The folder is derived from SEARCH_OUTPUT_PATH so that it always matches the
# location of the saved file. `exist_ok=True` replaces the separate existence
# check: there is no gap between checking and creating, and re-running the
# cell is harmless.
os.makedirs(os.path.dirname(SEARCH_OUTPUT_PATH), exist_ok=True)

### 1. Load Androzoo

In [None]:
# 1. Load Androzoo
# Passing `names=` makes pandas treat every line of the file as data, so the
# columns already carry the names in ANDROZOO_COLUMNS (including 'pkgName');
# no rename is needed afterwards.
androzooDF = pd.read_csv(ANDROZOO_PATH, names=ANDROZOO_COLUMNS)

# If the downloaded list still starts with its own header line, that line has
# been loaded as a bogus first app: drop it so it is neither counted nor merged.
# A file without a header line is left untouched.
if not androzooDF.empty and androzooDF.iloc[0]['sha256'] == 'sha256':
    androzooDF = androzooDF.iloc[1:]

print("#️⃣ Apps: {}\n".format(androzooDF.shape[0]))

### 2. Perform scraping on Google Play

In [None]:
# Decide whether an app description should be kept as English text
def _isEnglishDescription(text):
    """Return True only when `text` is a non-blank string detected as English.

    Missing (non-string) or blank descriptions, and texts on which langdetect
    raises because it cannot extract any language features, are treated as
    non-English so that a single bad row cannot abort the whole scraping run.
    """
    if not isinstance(text, str) or not text.strip():
        return False
    try:
        return detect(text) == "en"
    except langdetect.LangDetectException:
        return False


# Perform search on Google Play given a keyword
def searchOnGooglePlayStore(keyword, countries):
    """Search Google Play for `keyword` in every country and keep English apps.

    Parameters:
        keyword:   text to search for on Google Play.
        countries: iterable of country codes passed to the scraper's
                   `country` option. Repeated codes are searched only once.

    Returns:
        A DataFrame with the columns 'pkgName', 'genre' and 'description',
        one row per distinct app whose description is detected as English.
        When no country returns any result, an empty DataFrame with those
        three columns is returned.
    """
    outputColumns = ["pkgName", "genre", "description"]

    # Search on Google Play Store
    print("🔍 Searching on Google Play Store")
    countryFrames = []
    # dict.fromkeys removes repeated country codes while keeping their order,
    # so the same request is never sent twice.
    for country in dict.fromkeys(countries):
        # Use the `keyword` argument (not the global KEYWORD) so the function
        # really searches for what the caller asked.
        hits = search(keyword, lang="en", country=country, n_hits=30)
        if hits:
            countryFrames.append(pd.DataFrame(hits))

    # Nothing found at all: return an empty, correctly shaped result instead
    # of failing on the missing 'description' column.
    if not countryFrames:
        print("#️⃣ {: <30}: {}".format("Search", 0))
        return pd.DataFrame(columns=outputColumns)

    # One concat at the end instead of growing a DataFrame inside the loop
    searchDF = pd.concat(countryFrames, ignore_index=True)
    print("#️⃣ {: <30}: {}".format("Search", searchDF.shape[0]))

    # Clean description: collapse line breaks into spaces
    searchDF['description'] = searchDF['description'].str.replace(r"\r\n|\r|\n", ' ', regex=True)

    # Remove duplicates
    print("\n⛏️ Removing Duplicates")
    searchDF = searchDF.drop_duplicates(subset='appId')
    searchDF = searchDF.loc[:, ["appId", "genre", "description"]]
    searchDF = searchDF.rename(columns={'appId': 'pkgName'})
    # drop=True: the old positional index carries no information and must not
    # leak into the result as an extra 'index' column.
    searchDF = searchDF.reset_index(drop=True)
    print("#️⃣ {: <30}: {}".format("Duplicates Removed", searchDF.shape[0]))

    # Remove non english
    print("\n⛏️ Removing Non English")
    englishMask = searchDF['description'].map(_isEnglishDescription).astype(bool)
    searchDF = searchDF[englishMask]
    print("#️⃣ {: <30}: {}".format("Non English Removed", searchDF.shape[0]))

    return searchDF

# Get info from AndroZoo
def getInfoFromAndrozoo(androzooDF, searchDF):
    """Attach the AndroZoo record with the highest `dex_date` to each app.

    Parameters:
        androzooDF: AndroZoo list; must provide 'pkgName', 'sha256' and
                    'dex_date'.
        searchDF:   scraped apps; must provide 'pkgName', 'genre' and
                    'description'.

    Returns:
        A DataFrame indexed by 'sha256' with the columns 'pkgName', 'genre'
        and 'description'. Apps that do not appear in AndroZoo are dropped
        (inner join), and each remaining app appears exactly once.
    """
    print("\n⛏️ Retrieving info from AndroZoo")
    # Merge the DataFrames: only apps known to AndroZoo survive
    mergedDF = pd.merge(searchDF, androzooDF, on='pkgName', how='inner')

    # Reorganize the info: sort by dex_date descending so that
    # drop_duplicates keeps, for every package, the row with the highest date.
    mergedDF = mergedDF.sort_values(by='dex_date', ascending=False)
    mergedDF = mergedDF.drop_duplicates(subset='pkgName')
    mergedDF = mergedDF[['sha256', 'pkgName', 'genre', 'description']]
    mergedDF = mergedDF.set_index(['sha256'])

    # Report the number of apps actually matched, not the size of the input
    print("#️⃣ {: <30}: {}".format("AndroZoo Info collected", mergedDF.shape[0]))

    return mergedDF

# Map categories to categoryIDs
def mapCategories(searchDF, categoriesDF):
    """Replace the 'genre' column with the matching 'categoryID'.

    Parameters:
        searchDF:     DataFrame with a 'genre' column.
        categoriesDF: mapping table with the columns 'category' and
                      'categoryID'.

    Returns:
        A new DataFrame in which 'genre' has been renamed to 'categoryID'
        (same column position) and holds the mapped IDs. Genres missing from
        the mapping table become NaN. The DataFrame passed in is left
        unchanged.
    """
    # Create a dict: category name -> category ID
    categoryMap = dict(zip(categoriesDF['category'], categoriesDF['categoryID']))

    # Rename first: rename() returns a new DataFrame, so the mapping below is
    # written to that copy and never to the caller's data.
    mappedDF = searchDF.rename(columns={'genre': 'categoryID'})

    # Replace values using the map
    mappedDF['categoryID'] = mappedDF['categoryID'].map(categoryMap)
    return mappedDF

In [None]:
# Step 1: scrape Google Play for the keyword in every configured country
searchDF = searchOnGooglePlayStore(KEYWORD, COUNTRIES)

# Step 2: enrich the scraped apps with their AndroZoo record
searchDF = getInfoFromAndrozoo(androzooDF, searchDF)

# Step 3: translate the Google Play genres into category IDs
categoriesDF = pd.read_csv(CATEGORIES_PATH, index_col=False)
searchDF = mapCategories(searchDF, categoriesDF)

### 3. Save Results

In [None]:
# Save everything
# to_csv writes the DataFrame index as the first CSV column by default.
searchDF.to_csv(SEARCH_OUTPUT_PATH)

In [None]:
# Visual marker for the end of the run in the notebook output.
print("\n🔚 END")