## 🔢 Description Crawler

In [None]:
# Imports
import numpy    as np
import pandas   as pd
import datetime
import tempfile
import subprocess
import requests
import time
import json
import gzip
import os

#### Initialization

In [None]:
# Announce the start of the run with the current wall-clock time
print(f"⚡ Start - {datetime.datetime.now()} ⚡\n")

# Reference timestamp; the final cell subtracts it to report the elapsed time
startTime = datetime.datetime.now()

#### 📥 1) Load Data 

In [None]:
# Load each source and keep only the hash and package-name columns;
# the AZ file calls the package column 'pkg_name', so align it to 'pkgName'
az16DF = (
    pd.read_csv("../../0_Data/azFiltered16.csv")[['sha256', 'pkg_name']]
    .rename(columns={'pkg_name': 'pkgName'})
)
mudflowDF = pd.read_csv("../../0_Data/4_MudFlow.csv")[['sha256', 'pkgName']]

# Stack both sources row-wise into a single frame
maliciousDF = pd.concat([az16DF, mudflowDF], axis=0)

# Report the shapes before de-duplication
print(f"📄 Size of az16DF      : {az16DF.shape}")
print(f"📄 Size of mudflowDF   : {mudflowDF.shape}")
print(f"📄 Size of maliciousDF : {maliciousDF.shape}")

# Keep only the first row seen for every sha256
maliciousDF = maliciousDF.drop_duplicates(subset='sha256')

# Report the shape again now that duplicates are gone
print(f"📄 Size of maliciousDF : {maliciousDF.shape}")

# Preview the first 2 rows (rendered as the cell output)
maliciousDF.head(2)

Get the app descriptions from AZ-META

In [None]:
# Define file paths (relative, so they resolve against the current working directory)
# Input file handed to jq; presumably one JSON record per line, going by the .jsonl extension — confirm
AZ_PATH = "../../0_Data/gp-metadata-full.jsonl"
# Output CSV that receives the extracted pkgName / descriptionHtml pairs
OUTPUT_CSV_PATH = "descriptions.csv"

In [None]:
# Raw package-name column: one entry per row of maliciousDF, duplicates included
pkgNamesList = maliciousDF["pkgName"].to_list()
print(len(pkgNamesList))

# Keep every package name once, in first-seen order (a set would give a different
# order on every run), and drop missing values: json.dump would otherwise write
# them as the non-standard token NaN into the list handed to jq
pkgNamesList = maliciousDF["pkgName"].dropna().unique().tolist()
print(len(pkgNamesList))

In [None]:
# Write the wanted package names to a temporary JSON file so that jq can load
# them as the $pkgNames variable
with tempfile.NamedTemporaryFile(delete=False, mode='w', encoding='utf-8') as temp_file:
    json.dump(pkgNamesList, temp_file)
    temp_file_path = temp_file.name

# Run jq over the metadata file and capture its output.
# - --slurpfile replaces --argfile, which the jq manual says not to use; it wraps
#   the file content in an array, hence $pkgNames[0] to reach the list itself.
# - Each matching record is emitted as a compact JSON array [docid, descriptionHtml],
#   one per line, so both fields can be decoded losslessly below.
try:
    byteResult = subprocess.check_output([
        "jq",
        "--compact-output",
        "--slurpfile", "pkgNames", temp_file_path,
        'select(any(.docid == $pkgNames[0][]; .)) | [.docid, .descriptionHtml]',
        AZ_PATH,
    ])
except subprocess.CalledProcessError as e:
    print(f"Error running jq: {e}")
    raise
finally:
    # Clean up temporary file whether or not jq succeeded
    os.remove(temp_file_path)

# One JSON array per line. Split on '\n' only: jq escapes control characters
# inside strings, whereas str.splitlines() would also break on Unicode line
# separators that may legitimately occur in a description.
rows = byteResult.decode("utf-8").split('\n')

# Prepare lists to hold the data
pkgNames = []
descriptions = []

# Decode each row with the JSON parser instead of stripping quotes and splitting
# on the first comma: this keeps quotes that belong to the HTML and turns JSON
# escapes (\n, \", \uXXXX) back into the characters they stand for
for row in rows:
    if not row:
        continue
    pkgName, description = json.loads(row)
    pkgNames.append(pkgName)
    # A record without descriptionHtml yields null; store it as an empty string
    descriptions.append("" if description is None else description)

# Create a DataFrame from the lists
result_df = pd.DataFrame({
    'pkgName': pkgNames,
    'descriptionHtml': descriptions
})

result_df.to_csv(OUTPUT_CSV_PATH, index=False)
print(f"DataFrame saved to {OUTPUT_CSV_PATH}")

##### 🔚 End

In [None]:
# Record and announce when the run finished
endTime = datetime.datetime.now()
print(f"\n🔚 --- End - {endTime} --- 🔚")

# endTime and startTime are datetime objects, so their difference is a timedelta;
# split its length in seconds into whole minutes and the remaining seconds
totalTime = endTime - startTime
minutes, seconds = divmod(totalTime.total_seconds(), 60)
print(f"⏱️ --- Time: {int(minutes):02d} minutes and {int(seconds):02d} seconds --- ⏱️")