## ⛏️ App Libraries - Preprocessing

Preprocess the list of App Libraries.

#### Imports

In [None]:
# IMPORT
from   tqdm                 import tqdm
import pandas               as pd
import numpy                as np
import ast

import appLibrariesUtils

In [None]:
# Initialize TQDM library for Pandas
# (this registers the `progress_apply` method on pandas objects; the cells below
#  call it instead of `apply` so that long per-row operations show a progress bar)
tqdm.pandas()

In [None]:
# Banner printed once to mark the beginning of the notebook run in the output log
print("⚡ START ⚡\n")

#### Parameters

In [None]:
# `os` is required by the folder check below; it is imported here because the
# Imports cell does not provide it (without it this cell fails with a NameError).
import os

# Input dataset (CSV) read in section 1
INPUT_PATH  = "../TMP/4f_AppLibrariesData.csv"

# Output Path: preprocessed dataset (CSV) written in section 3
OUTPUT_PATH = "../TMP/4f_AppLibrariesDataPreprocessed.csv"

# Folder hosting both files above; create it if missing so that the final
# `to_csv` does not fail on a non-existent directory.
TMP_PATH = "../TMP"
if not os.path.exists(TMP_PATH):
    os.makedirs(TMP_PATH)
    print("📁🆕 Folder created       :", TMP_PATH)
else:
    print("📁✅ Folder already exists:", TMP_PATH)

### 1. Load Data

In [None]:
# Read the input CSV; `index_col=False` tells pandas not to use the first
# column as the row index.
appsDF = pd.read_csv(INPUT_PATH, index_col=False)

# Report how many rows (apps) were loaded
print(f"#️⃣ Apps: {len(appsDF)}")

# Preview of the first rows (shown as the cell output)
appsDF.head(n=3)

In [None]:
# Cells of 'appRawLibraries' holding the literal string "None" are turned into
# real missing values (NaN) ...
appsDF['appRawLibraries'] = appsDF['appRawLibraries'].replace(
    to_replace='None',
    value=np.nan,
)

# ... so that every row without data in the 'appRawLibraries' column can be
# dropped in one go (the DataFrame is modified in place).
appsDF.dropna(axis=0, subset=['appRawLibraries'], inplace=True)

In [None]:
# The 'appRawLibraries' cells come out of the CSV as text. `ast.literal_eval`
# parses each cell back into the Python literal it spells out (presumably a
# list, as the message below says — verify against the input file). Only
# literals are evaluated, never arbitrary expressions.
print("\n🔨 Reading data as lists")
appsDF['appRawLibraries'] = appsDF['appRawLibraries'].progress_apply(ast.literal_eval) 

### 2. Get App Libraries

In [None]:
# Whitelist of libraries: file name and its content loaded through the helper module
WHITELIST_LIBRARIES    = "libraries.txt"
whitelistLibrariesList = appLibrariesUtils.loadTxtFile(WHITELIST_LIBRARIES)

# Whitelist of system libraries: file name and its content loaded the same way
SYSTEM_WHITELIST_LIBRARIES   = "systems.txt"
systemWhitelistLibrariesList = appLibrariesUtils.loadTxtFile(SYSTEM_WHITELIST_LIBRARIES)

In [None]:
# Derive the 'appLibraries' column: each row's raw libraries are passed to
# `getLibraries` together with the libraries whitelist.
print("\n🔨 Retrieving libraries")
appsDF['appLibraries'] = appsDF['appRawLibraries'].progress_apply(
    lambda rawLibs: appLibrariesUtils.getLibraries(rawLibs, whitelistLibrariesList)
)

# Derive the 'appSystemLibraries' column: same input column, this time passed
# to `getSystemLibraries` together with the system libraries whitelist.
print("\n🔨 Retrieving system libraries")
appsDF['appSystemLibraries'] = appsDF['appRawLibraries'].progress_apply(
    lambda rawLibs: appLibrariesUtils.getSystemLibraries(rawLibs, systemWhitelistLibrariesList)
)

### 3. Save everything

In [None]:
# Keep only the columns that are exported (the raw libraries column is left out)
appsDF = appsDF[[
    "sha256",
    "classID",
    "appLibraries",
    "appSystemLibraries",
]]

# Write the preprocessed dataset to disk, without the row index
appsDF.to_csv(OUTPUT_PATH, index=False)

# Preview of the saved data (shown as the cell output)
appsDF.head(n=5)