## 🔢 Description Crawler

In [1]:
# Imports
import numpy    as np
import pandas   as pd
import google_play_scraper 
import langdetect
import datetime
import requests
import time
import json
import os

#### Initialization

In [2]:
# Read the clock once so the timestamp shown to the reader is exactly the
# value later used to compute the total run time.
startTime = datetime.datetime.now()
print("⚡ Start - {} ⚡\n".format(startTime))

⚡ Start - 2024-07-25 08:48:46.907032 ⚡



#### 📥 1) Load Data 

In [3]:
# Load the two source listings of samples
az16DF = pd.read_csv("../../0_Data/azFiltered16.csv")
mudflowDF = pd.read_csv("../../0_Data/4_MudFlow.csv")

# Keep only the sample hash and the package name, and align the column name of
# the first listing ('pkg_name') with the second one ('pkgName').
# rename() returns a new frame, so no in-place mutation is needed.
az16DF = az16DF[['sha256', 'pkg_name']].rename(columns={'pkg_name': 'pkgName'})
mudflowDF = mudflowDF[['sha256', 'pkgName']]

# Stack both listings. ignore_index=True gives the result a fresh 0..n-1 index;
# without it each input keeps its own 0-based labels and the combined frame
# ends up with duplicate index labels.
maliciousDF = pd.concat([az16DF, mudflowDF], axis=0, ignore_index=True)

# Print sizes of dataframes with emojis
print(f"📄 Size of az16DF      : {az16DF.shape}")
print(f"📄 Size of mudflowDF   : {mudflowDF.shape}")
print(f"📄 Size of maliciousDF : {maliciousDF.shape}")

# Remove duplicates based on sha256 column (the first occurrence is kept)
maliciousDF = maliciousDF.drop_duplicates(subset='sha256')

# Print size of maliciousDF after removing duplicates
print(f"📄 Size of maliciousDF : {maliciousDF.shape}")

# Display the first 3 rows of the resulting dataframe
maliciousDF.head(3)

📄 Size of az16DF      : (15342, 2)
📄 Size of mudflowDF   : (8038, 2)
📄 Size of maliciousDF : (23380, 2)
📄 Size of maliciousDF : (23380, 2)


Unnamed: 0,sha256,pkgName
0,00023BCFC14CE3027A542987394A97C64B6C1B9304090F...,com.kuyhaa.android.auto.call.recorder.acr.lite...
1,000447D7EBEC825293D013A4DB671AFBDB01618D46AE57...,com.greenncardd.senegalgospelmusic
2,00044AE8A85BC29EA36DB54F444D888E56425B0EC6CF0F...,wiki.no.mans.sky.game


#### GP — Google Play descriptions

In [4]:
def getGoogleEnglishDescription(pkgName):
	"""Fetch an app's Google Play description, keeping it only if it is English.

	Parameters
	----------
	pkgName : str
		Package name passed to ``google_play_scraper.app`` (queried with
		``lang='en'`` and ``country='us'``).

	Returns
	-------
	pd.Series
		One-element Series indexed by ``'description'``. It holds the
		description with newlines replaced by spaces and carriage returns
		removed when langdetect classifies the text as English, and ``np.nan``
		in every other case: missing or empty description, non-English text,
		language-detection failure, or any exception raised during the lookup.
	"""
	missing = pd.Series([np.nan], index=['description'])

	try:
		# Use googlePlayScraper Library
		result = google_play_scraper.app(pkgName, lang='en', country='us')

		description = result.get('description') if result else None
		if not description:
			return missing

		# langdetect is randomised by default and may label the same text
		# differently between runs; pinning the seed makes the filter
		# reproducible.
		langdetect.DetectorFactory.seed = 0

		# Keep the description only if it is detected as English
		if langdetect.detect(description) != "en":
			return missing

		cleaned = description.replace('\n', ' ').replace('\r', '')
		return pd.Series([cleaned], index=['description'])
	except Exception:
		# Deliberately best-effort: a failed lookup or a langdetect error
		# (LangDetectException) simply yields "no description" for this
		# package instead of aborting the whole crawl.
		return missing

In [5]:
# Rows are de-duplicated by sha256 only, so the same package name can appear on
# several rows. Each lookup is a network request, so query every distinct
# package name once and map the results back onto the rows.
uniquePkgNames = maliciousDF['pkgName'].drop_duplicates()

# getGoogleEnglishDescription returns a one-element Series indexed by
# 'description'; pull the scalar out explicitly instead of relying on pandas
# to coerce a one-column DataFrame into a column.
descriptionByPkg = {
	pkg: getGoogleEnglishDescription(pkg)['description']
	for pkg in uniquePkgNames
}

maliciousDF['gpDescription'] = maliciousDF['pkgName'].map(descriptionByPkg)
maliciousDF.head(3)

Unnamed: 0,sha256,pkgName,gpDescription
0,00023BCFC14CE3027A542987394A97C64B6C1B9304090F...,com.kuyhaa.android.auto.call.recorder.acr.lite...,
1,000447D7EBEC825293D013A4DB671AFBDB01618D46AE57...,com.greenncardd.senegalgospelmusic,
2,00044AE8A85BC29EA36DB54F444D888E56425B0EC6CF0F...,wiki.no.mans.sky.game,


#### AZ — AndroZoo metadata descriptions (currently disabled)

In [6]:
# def getDescriptionFromMetaDataAndroZoo(pkgName, retries=10, delay=20):
# 	url = 'https://androzoo.uni.lu/api/get_gp_metadata/{}'.format(pkgName)
# 	params = {'apikey': os.getenv('ANDROZOO_API_KEY')}
	
# 	print("--- 📦 PKG NAME: {}".format(pkgName))
	
# 	attempt = 0
# 	while attempt < retries:
# 		print(f"--- ▶️ Attempt #{attempt}")
# 		response = requests.get(url, params=params)
		
# 		# Return Description
# 		if response.status_code == 200:
# 			print(response.json)
# 			return response.json()[0]['descriptionHtml']
# 		# Retry 
# 		elif response.status_code in [502, 503, 400]:
# 			attempt += 1
# 			print(f"--- ❌ Attempt {attempt} failed with status code {response.status_code}. Retrying in {delay} seconds...")
# 			time.sleep(delay)
# 		else:
# 			print(f"--- ❌ Request failed with status code {response.status_code}. No retries left.")
# 			return None
	
# 	print("--- ☹️ Max retries exceeded. Request failed.")
# 	return None

In [7]:
# appsDF['azDescription'] = appsDF['pkgName'].apply(getDescriptionFromMetaDataAndroZoo)
# appsDF.head(3)

### Filter the data.

In [8]:
# # Create the 'description' column
# appsDF['googlePlayDescription'] = appsDF['gpDescription'].fillna(appsDF['azDescription'])
# print("--- #️⃣ Apps: {} ".format(appsDF.shape[0]))

# # Drop rows where both 'gpDescription' and 'azDescription' are NaN
# appsDF = appsDF.dropna(subset=['description'])
# print("--- #️⃣ Apps: {} ".format(appsDF.shape[0]))

# appsDF = appsDF.drop(columns=['gpDescription', 'azDescription'])
# appsDF.head(3)

In [9]:
print("--- #️⃣ malicious: {} ".format(maliciousDF.shape[0]))

# Rename the scraped column to its final name and keep only the rows that have
# a description. Renaming (instead of copying the column and dropping the old
# one) keeps this cell re-runnable: on a second run the rename is a no-op
# rather than a KeyError on the already-removed 'gpDescription' column.
maliciousDF = (
	maliciousDF
	.rename(columns={'gpDescription': 'googlePlayDescription'})
	.dropna(subset=['googlePlayDescription'])
)
print("--- #️⃣ malicious: {} ".format(maliciousDF.shape[0]))

maliciousDF.head(3)

--- #️⃣ malicious: 23380 
--- #️⃣ malicious: 1061 


Unnamed: 0,sha256,pkgName,googlePlayDescription
9,001E53DDA101AF19670FBD7FB5D348B21641036240895F...,es.aroundpixels.hsk3lite,Don't worry: we know that <b>studying Chinese ...
13,002557D37607615083739ABEB8D6FDC05CD27EC93C1104...,com.onesports.score,AiScore is your LIVESCORE EXPERT. More informa...
17,00280B2E4FA921E570467396A380FA4248AFA10372B97E...,com.perception.soc.en,"Play the best classic MMORPG, only in Empire O..."


In [10]:
# to_csv does not create missing folders, so make sure the output directory
# exists before writing (no-op when it is already there).
os.makedirs("../TmpData", exist_ok=True)
maliciousDF.to_csv("../TmpData/erMaligno_Description.csv", index=False)

##### 🔚 End

In [11]:
endTime = datetime.datetime.now()
print("\n🔚 --- End - {} --- 🔚".format(endTime))

# endTime and startTime are datetime objects, so their difference is a
# timedelta. Split its length in seconds into whole minutes and the
# remaining seconds in a single step.
totalTime = endTime - startTime
minutes, seconds = divmod(totalTime.total_seconds(), 60)
print("⏱️ --- Time: {:02d} minutes and {:02d} seconds --- ⏱️".format(int(minutes), int(seconds)))


🔚 --- End - 2024-07-25 10:00:19.734634 --- 🔚
⏱️ --- Time: 71 minutes and 32 seconds --- ⏱️
