## 🔢 Description Crawler

In [1]:
# Imports
import numpy    as np
import pandas   as pd
import google_play_scraper 
import langdetect
import datetime
import requests
import time
import json
import os

#### Initialization

In [2]:
# Capture the start time once so the timestamp shown here is exactly the value
# later used to compute the total run time.
startTime = datetime.datetime.now()
print("⚡ Start - {} ⚡\n".format(startTime))

⚡ Start - 2024-07-20 17:09:28.802943 ⚡



#### 📥 1) Load Data 

In [3]:
DATA_PATH = "../../0_Data/3_MalCatSet.csv"

# Number of apps to keep for a quick test run; set to None to process every app.
SAMPLE_SIZE = 3

# Read the data
appsDF = pd.read_csv(DATA_PATH)

# TEST: optionally restrict the run to the first SAMPLE_SIZE apps.
# .copy() turns the subset into an independent frame rather than a slice of the
# loaded data, so columns added in later cells are not written to a slice
# (avoids pandas' SettingWithCopyWarning).
if SAMPLE_SIZE is not None:
    appsDF = appsDF.head(SAMPLE_SIZE).copy()

print("--- #️⃣ Apps: {} ".format(appsDF.shape[0]))

--- #️⃣ Apps: 3 


In [4]:
# Preview the first rows of the loaded apps (head() defaults to 5 rows)
appsDF.head()

Unnamed: 0,sha256,pkgName,classID
0,24D3490CF23842A791CBB5B10F1427808F4B163F9C4927...,com.rytong.airchina,Airlines
1,2D3D869A1DF82ACDCABBB08277ADECB8E5B64DB4DB516C...,com.rytong.hnair,Airlines
2,69C81403C28E86FE90975BAB668DD9F3FDC4330717A484...,com.nepel.scandriveanti,Antivirus


In [5]:
def getGoogleEnglishDescription(pkgName):
    """Fetch the Google Play description of an app, keeping it only if English.

    Parameters
    ----------
    pkgName : package name passed to ``google_play_scraper.app`` (queried with
        ``lang='en'`` and ``country='us'``).

    Returns
    -------
    pd.Series with a single entry labelled 'description'. The value is the
    description with newlines replaced by spaces and carriage returns removed
    when ``langdetect`` reports "en"; otherwise it is NaN (no result, no
    description, non-English text, detection failure, or any other error
    raised during the lookup).
    """
    def _asSeries(value):
        # Every outcome is reported in the same one-element Series layout.
        return pd.Series([value], index=['description'])

    try:
        # Use googlePlayScraper Library
        appInfo = google_play_scraper.app(pkgName, lang='en', country='us')

        # Nothing usable came back from the store
        if appInfo is None or appInfo['description'] is None:
            return _asSeries(np.nan)

        rawText = appInfo['description']

        # Use langdetect to identify the language of the description
        try:
            detected = langdetect.detect(rawText)
        except langdetect.LangDetectException:
            return _asSeries(np.nan)

        # Only English descriptions are kept
        if detected != "en":
            return _asSeries(np.nan)

        flattened = rawText.replace('\n', ' ').replace('\r', '')
        return _asSeries(flattened)
    except Exception:
        # Best effort: any failure during the lookup counts as "no description"
        return _asSeries(np.nan)

In [6]:
# Look up every package on Google Play, then store the results as a new column
gpDescriptions = appsDF['pkgName'].apply(getGoogleEnglishDescription)
appsDF['gpDescription'] = gpDescriptions
appsDF.head(3)

Unnamed: 0,sha256,pkgName,classID,gpDescription
0,24D3490CF23842A791CBB5B10F1427808F4B163F9C4927...,com.rytong.airchina,Airlines,Air China is China’s only national flag carrie...
1,2D3D869A1DF82ACDCABBB08277ADECB8E5B64DB4DB516C...,com.rytong.hnair,Airlines,Hainan Airlines Company Profile Hainan Airlin...
2,69C81403C28E86FE90975BAB668DD9F3FDC4330717A484...,com.nepel.scandriveanti,Antivirus,


In [7]:
def getDescriptionFromMetaDataAndroZoo(pkgName, retries=5, delay=10, timeout=30):
    """Fetch an app description from the AndroZoo Google Play metadata API.

    Parameters
    ----------
    pkgName : package name appended to the ``get_gp_metadata`` endpoint URL.
    retries : maximum number of requests to send before giving up.
    delay   : seconds to wait between two attempts.
    timeout : seconds to wait for the server on each request, so a stalled
        connection cannot block the crawl indefinitely.

    Returns
    -------
    The ``descriptionHtml`` field of the first metadata entry in the response,
    or None when the request fails, the retries are exhausted, or the response
    body does not have the expected structure.

    Notes
    -----
    The API key is read from the ``ANDROZOO_API_KEY`` environment variable.
    Responses with status 502/503 and network-level errors are retried; any
    other non-200 status gives up immediately.
    """
    url = 'https://androzoo.uni.lu/api/get_gp_metadata/{}'.format(pkgName)
    params = {'apikey': os.getenv('ANDROZOO_API_KEY')}

    for attempt in range(1, retries + 1):
        try:
            response = requests.get(url, params=params, timeout=timeout)
        except requests.RequestException as error:
            # Connection problems and timeouts are treated as temporary
            failure = "error {!r}".format(error)
        else:
            # Return Description
            if response.status_code == 200:
                try:
                    return response.json()[0]['descriptionHtml']
                except (ValueError, IndexError, KeyError, TypeError):
                    # Body is not JSON, is empty, or lacks the expected field
                    print("Unexpected response content for {}. No description available.".format(pkgName))
                    return None

            # Any status other than 502/503 is not retried
            if response.status_code not in [502, 503]:
                print(f"Request failed with status code {response.status_code}. No retries left.")
                return None

            failure = f"status code {response.status_code}"

        # Retry: only wait when another attempt will actually follow
        if attempt < retries:
            print(f"Attempt {attempt} failed with {failure}. Retrying in {delay} seconds...")
            time.sleep(delay)
        else:
            print(f"Attempt {attempt} failed with {failure}.")

    print("Max retries exceeded. Request failed.")
    return None

In [8]:
# Query AndroZoo for every package, then store the results as a new column
azDescriptions = appsDF['pkgName'].apply(getDescriptionFromMetaDataAndroZoo)
appsDF['azDescription'] = azDescriptions
appsDF.head(3)

Attempt 1 failed with status code 502. Retrying in 10 seconds...
Attempt 1 failed with status code 502. Retrying in 10 seconds...
Attempt 2 failed with status code 503. Retrying in 10 seconds...
Attempt 3 failed with status code 503. Retrying in 10 seconds...
Attempt 4 failed with status code 503. Retrying in 10 seconds...
Attempt 5 failed with status code 503. Retrying in 10 seconds...
Max retries exceeded. Request failed.


Unnamed: 0,sha256,pkgName,classID,gpDescription,azDescription
0,24D3490CF23842A791CBB5B10F1427808F4B163F9C4927...,com.rytong.airchina,Airlines,Air China is China’s only national flag carrie...,Air China is China’s only national flag carrie...
1,2D3D869A1DF82ACDCABBB08277ADECB8E5B64DB4DB516C...,com.rytong.hnair,Airlines,Hainan Airlines Company Profile Hainan Airlin...,Hainan Airlines was founded in 1993 in Hainan ...
2,69C81403C28E86FE90975BAB668DD9F3FDC4330717A484...,com.nepel.scandriveanti,Antivirus,,


### Filter the data.

In [9]:
# Build 'description': take the Google Play text and fall back to the AndroZoo
# one wherever the Google Play value is missing
appsDF['description'] = appsDF['gpDescription'].fillna(appsDF['azDescription'])
print("--- #️⃣ Apps: {} ".format(len(appsDF)))

# Keep only the apps for which at least one source provided a description
appsDF = appsDF[appsDF['description'].notna()]
print("--- #️⃣ Apps: {} ".format(len(appsDF)))

# The per-source columns are no longer needed once they are merged
appsDF = appsDF.drop(['gpDescription', 'azDescription'], axis=1)
appsDF.head(3)

--- #️⃣ Apps: 3 
--- #️⃣ Apps: 2 


Unnamed: 0,sha256,pkgName,classID,description
0,24D3490CF23842A791CBB5B10F1427808F4B163F9C4927...,com.rytong.airchina,Airlines,Air China is China’s only national flag carrie...
1,2D3D869A1DF82ACDCABBB08277ADECB8E5B64DB4DB516C...,com.rytong.hnair,Airlines,Hainan Airlines Company Profile Hainan Airlin...


In [11]:
OUTPUT_PATH = "../TmpData/3_MalCatSet_Description.csv"

# Create the output folder if it is missing instead of failing inside to_csv
os.makedirs(os.path.dirname(OUTPUT_PATH), exist_ok=True)

# index=False: the row index carries no information of its own here, and
# writing it adds one more unnamed column to the CSV (the loaded data already
# shows such an 'Unnamed: 0' column).
appsDF.to_csv(OUTPUT_PATH, index=False)

##### 🔚 End

In [10]:
endTime = datetime.datetime.now()
print("\n🔚 --- End - {} --- 🔚".format(endTime))

# endTime - startTime is a timedelta; split its length in seconds into whole
# minutes and the remaining seconds
totalTime = endTime - startTime
minutes, seconds = divmod(totalTime.total_seconds(), 60)
print("⏱️ --- Time: {:02d} minutes and {:02d} seconds --- ⏱️".format(int(minutes), int(seconds)))


🔚 --- End - 2024-07-20 17:12:14.988264 --- 🔚
⏱️ --- Time: 02 minutes and 46 seconds --- ⏱️
