## 🔢 Description Crawler

In [9]:
# Imports
import numpy    as np
import pandas   as pd
import google_play_scraper 
import langdetect
import datetime
import subprocess
import requests
import time
import json
import gzip
import os

#### Initialization

In [10]:
# Read the clock once so that the timestamp shown to the reader and the value
# later used to compute the total run time are the same instant.
startTime = datetime.datetime.now()
print("⚡ Start - {} ⚡\n".format(startTime))

⚡ Start - 2024-07-23 14:40:57.121651 ⚡



#### 📥 1) Load Data 

In [11]:
# Number of rows of the combined frame that the rest of the notebook works on.
# NOTE(review): 2 looks like a debugging sample size - raise it (or drop the
# final `.head(...)`) to crawl the whole data set.
N_SAMPLES = 2

# Load both sources, keep only the hash and package-name columns, and align
# the column names (`pkg_name` -> `pkgName`) so the frames can be stacked.
# `rename` returns a new frame instead of mutating a column slice in place.
az16DF = (
    pd.read_csv("../../0_Data/azFiltered16.csv")[['sha256', 'pkg_name']]
    .rename(columns={'pkg_name': 'pkgName'})
)
mudflowDF = pd.read_csv("../../0_Data/4_MudFlow.csv")[['sha256', 'pkgName']]

# Stack the two frames; `ignore_index=True` gives the result a fresh 0..n-1
# index instead of repeating the row labels of both inputs.
maliciousDF = pd.concat([az16DF, mudflowDF], axis=0, ignore_index=True)

# Print sizes of dataframes with emojis
print(f"📄 Size of az16DF      : {az16DF.shape}")
print(f"📄 Size of mudflowDF   : {mudflowDF.shape}")
print(f"📄 Size of maliciousDF : {maliciousDF.shape}")

# Keep a single row per APK hash (the first occurrence wins)
maliciousDF = maliciousDF.drop_duplicates(subset='sha256')

# Print size of maliciousDF after removing duplicates
print(f"📄 Size of maliciousDF : {maliciousDF.shape}")

# Restrict the frame to its first N_SAMPLES rows. This does not merely display
# them: every later cell only sees this truncated frame.
maliciousDF = maliciousDF.head(N_SAMPLES)

📄 Size of az16DF      : (15342, 2)
📄 Size of mudflowDF   : (8038, 2)
📄 Size of maliciousDF : (23380, 2)
📄 Size of maliciousDF : (23380, 2)


#### 📥 2) Get the descriptions from AZ-META

In [12]:
# Location of the gzip-compressed metadata dump that is decompressed and
# filtered by package name further down (presumably one JSON record per
# line, judging by the `.jsonl.gz` suffix).
AZ_PATH = "../../0_Data/gp-metadata-full.jsonl.gz"

In [13]:
# Package names to look up in the metadata dump. Rows were de-duplicated by
# `sha256` only, so the same package name can occur several times; missing
# names can never match a `docid` either. Drop both, keeping first-seen order.
pkgNamesList = maliciousDF["pkgName"].dropna().drop_duplicates().to_list()

In [14]:
# Show the package names that are handed to the jq filter below
print(pkgNamesList)

['com.kuyhaa.android.auto.call.recorder.acr.lite.autocallrecorder.voicecallrecorder.tools', 'com.greenncardd.senegalgospelmusic']


In [15]:
# Stream the dump through `gzip -cd | jq` so that the decompressed file never
# has to be held in memory by the notebook.
gzip_process = subprocess.Popen(
    ["gzip", "-cd", AZ_PATH],
    stdout=subprocess.PIPE,
    stderr=subprocess.PIPE
)

jq_process = None
try:
    # jq keeps every record whose `docid` is one of the requested package
    # names and emits "<docid>,<descriptionHtml>" as one JSON string per line.
    # NOTE(review): the whole name list travels as a single command-line
    # argument; operating systems cap the size of one argument, so with the
    # full package list this may fail to start - consider `--slurpfile` with
    # a temporary file in that case.
    jq_process = subprocess.Popen(
        ["jq",
         "--argjson",
         "pkgNames",
         json.dumps(pkgNamesList),
         'select(any(.docid == $pkgNames[]; .)) | .docid + "," + .descriptionHtml'],
        stdin=gzip_process.stdout,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE
    )

    # Drop the parent's handle on the read end of the pipe: jq is now the only
    # reader, so gzip is told (broken pipe) if jq exits early instead of
    # blocking forever on a full pipe.
    gzip_process.stdout.close()

    # Collect jq's output, then gzip's diagnostics and exit status
    output, error = jq_process.communicate()
    gzip_error = gzip_process.stderr.read()
    gzip_process.wait()
finally:
    # Runs on success, on errors and on a kernel interrupt alike: make sure no
    # child process is left running and the remaining pipe ends are closed.
    for process in (jq_process, gzip_process):
        if process is not None and process.poll() is None:
            process.kill()
            process.wait()
    gzip_process.stdout.close()
    gzip_process.stderr.close()

if jq_process.returncode == 0:
    # jq prints JSON strings; strip every double quote (this also removes
    # quotes that belong to the description text itself).
    data = output.decode("utf-8").replace("\"", "")
    if gzip_process.returncode != 0:
        # jq can succeed on truncated input, so surface gzip problems
        # (missing or corrupt archive) instead of silently returning less.
        print(
            f"Warning: gzip exited with status {gzip_process.returncode}, "
            f"output may be incomplete: {gzip_error.decode('utf-8', errors='replace')}"
        )
else:
    print(f"Error: {error.decode('utf-8')}")
    # Always define `data` so the next cell neither fails with a NameError nor
    # shows a stale value from an earlier run.
    data = ""

KeyboardInterrupt: 

In [None]:
# Show the "<docid>,<descriptionHtml>" lines produced by the jq filter
print(data)

##### 🔚 End

In [None]:
# Stop the clock and report when the run finished
endTime = datetime.datetime.now()
print("\n🔚 --- End - {} --- 🔚".format(endTime))

# endTime and startTime are datetime objects, so their difference is a
# timedelta; split its length in seconds into whole minutes plus the
# remaining seconds in one step.
totalTime = endTime - startTime
minutes, seconds = divmod(totalTime.total_seconds(), 60)
print("⏱️ --- Time: {:02d} minutes and {:02d} seconds --- ⏱️".format(int(minutes), int(seconds)))