## ⛏️ App Permissions - Embedding

### Imports

In [None]:
#IMPORT
from sklearn.feature_extraction.text    import CountVectorizer
from sklearn.feature_extraction.text    import TfidfVectorizer
from   tqdm                             import tqdm
import pandas                           as pd
import numpy                            as np
import ast
import os

In [None]:
# Register tqdm with pandas so that Series/DataFrame objects gain `progress_apply`,
# a drop-in replacement for `apply` that displays a progress bar
tqdm.pandas()

In [None]:
# Banner marking the beginning of the notebook output
print("⚡ START ⚡")

### Parameters

In [None]:
# Input CSV holding the permissions of every app (ground-truth dataset)
INPUT_PATH  = "../TMP/4b_AppPermissionsData.csv"

# Destination CSV for the computed permission feature vectors
OUTPUT_PATH = "../TMP/4b_AppPermissionsFeatures.csv"

# Working folder that contains both files; create it when it is missing
TMP_PATH = "../TMP"
if os.path.exists(TMP_PATH):
    print("📁✅ Folder already exists:", TMP_PATH)
else:
    os.makedirs(TMP_PATH)
    print("📁🆕 Folder created       :", TMP_PATH)

In [None]:
# Separator used to join each permission list into one string before vectorization;
# the vectorizer's tokenizer splits on the same string to recover the single permissions
DELIMITER = "&&&"

### 1. Load Data

In [None]:
# Load the permissions dataset; index_col=False tells pandas not to use the
# first column as the index
appsDF = pd.read_csv(INPUT_PATH, index_col=False)

# Report how many apps (rows) were loaded
print("#️⃣ Apps: {}".format(len(appsDF.index)))

# Preview the first rows (rendered as the cell output)
appsDF.head(3)

In [None]:
print("\n🔨 1. Reading data as lists")

# Each cell of the permission columns is parsed with ast.literal_eval, which turns the
# textual literal read from the CSV back into a Python object (progress bar via tqdm)
for permissionsColumn in ('usedPermissions', 'requestedPermissions'):
    appsDF[permissionsColumn] = appsDF[permissionsColumn].progress_apply(ast.literal_eval)

### 2. Embedding

In [None]:
def getAvgLen(appsDF, column):
    """Return the average `len()` of the entries stored in `column`.

    The lengths of all entries are summed and divided by the number of
    non-null entries of the column (`Series.count()`).

    Parameters:
        appsDF: DataFrame containing the column to measure.
        column: Name of a column whose entries support `len()`.

    Returns:
        The sum of the entry lengths divided by the non-null entry count.
    """
    entries = appsDF[column]
    lengths = entries.apply(len)
    return lengths.sum() / entries.count()

# Report the average number of entries per app for both permission columns
avgUsedLen = getAvgLen(appsDF, 'usedPermissions')
print("📐 Avg Used Permissions Len      : {}".format(avgUsedLen))

avgRequestedLen = getAvgLen(appsDF, 'requestedPermissions')
print("📐 Avg Requested Permissions Len : {}".format(avgRequestedLen))

In [None]:
# Create an instance of Vectorizer to transform the permissions into TF-IDF feature vectors.
# Every permission list is joined into a single DELIMITER-separated string and the custom
# tokenizer splits it back on DELIMITER, so each permission is handled as one token.
# token_pattern is set to None because it is ignored once a tokenizer is supplied; leaving
# the default in place only makes scikit-learn emit a UserWarning on every fit.
# NOTE(review): an empty permission list is joined into "" and "".split(DELIMITER) yields [""],
# so the empty string is counted as a token for such apps - confirm this is intended.
vectorizer = TfidfVectorizer(tokenizer = lambda text: text.split(DELIMITER), token_pattern = None, max_features = 1536)

# Vectorize the Permissions.
# fit_transform re-fits the vectorizer on every call, so each column gets its own vocabulary
# (capped at max_features terms) and, after the loop, `vectorizer` only holds the state fitted
# on 'requestedPermissions'. The sparse result is stored as one dense list per app.
for permissionsColumn in ('usedPermissions', 'requestedPermissions'):
    documents = [DELIMITER.join(lst) for lst in appsDF[permissionsColumn].values]
    appsDF[permissionsColumn + 'Features'] = vectorizer.fit_transform(documents).toarray().tolist()

# Length of the resulting feature vectors (read from the first app)
print("📐 FV Used Permissions Len      : {}".format(len(appsDF.loc[0,'usedPermissionsFeatures'])))
print("📐 FV Requested Permissions Len : {}".format(len(appsDF.loc[0,'requestedPermissionsFeatures'])))

### 3. Save Everything

In [None]:
# Keep only the columns that belong in the output file
outputColumns = ['sha256', 'classID', 'usedPermissionsFeatures', 'requestedPermissionsFeatures']
appsDF = appsDF[outputColumns]

# Write the result to disk without the DataFrame index
appsDF.to_csv(OUTPUT_PATH, index=False)

In [None]:
# Banner marking the end of the notebook output
print("\n🔚 END \n")