## ⛏️ DexRay - Clustering

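Notebook that clusters apps from their DexRay image representations using KMeans and scores the clustering against the ground-truth classes with the Adjusted Rand Index (ARI).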

In [None]:
# IMPORT
from   sklearn.metrics  import adjusted_rand_score
from   sklearn.cluster  import KMeans
from   tqdm             import tqdm
import pandas           as pd
import numpy            as np
import ast

In [None]:
# Register tqdm with pandas so that the progress_apply call used later in this
# notebook is available and shows a progress bar.
tqdm.pandas()

In [None]:
# Mark the beginning of the notebook run in the output
print("⚡ START ⚡")

#### Parameters

In [None]:
# Where to temporarily store APK files.
# NOTE(review): APK_PATH is not referenced anywhere else in the visible
# notebook - confirm whether it is still needed here.
APK_PATH    = "../../../../0_Data/APKS/"

# Ground-truth dataset: the CSV loaded into appsDF. The notebook reads its
# 'sha256', 'classID' and 'features' columns.
INPUT_PATH  = "../TMP/3b_DexrayData.csv"

# Output path: the CSV that receives the clustering result
OUTPUT_PATH = "../3b_DexrayClusteringLabels.csv"

In [None]:
# Seed passed as random_state to KMeans so that the clustering is repeatable
RANDOM_SEED = 151836

### 1. Load Data

In [None]:
# Read the dataset from disk; index_col=False keeps pandas from using the
# first CSV column as the row index.
appsDF = pd.read_csv(INPUT_PATH, index_col=False)
print(f"#️⃣ Apps: {len(appsDF)}")

# Preview the first rows (rendered as the cell output)
appsDF.head(3)

In [None]:
# Parse every entry of the 'features' column with ast.literal_eval, which only
# accepts Python literals and never executes code (unlike eval).
# Presumably the CSV stores each feature vector as the text of a Python list
# (the message below says "as lists") - TODO confirm against the producer.
# progress_apply is the tqdm-backed variant of apply enabled by tqdm.pandas().
print("⛏️ Reorganizing features as lists")
appsDF['features'] = appsDF['features'].progress_apply(ast.literal_eval)

### 2. Clustering

In [None]:
def getARIscore(clusteringLabels):
    """Print the Adjusted Rand Index of a clustering against the true classes.

    The reference labels are read from the 'classID' column of the
    module-level ``appsDF``, so ``clusteringLabels`` must follow the same row
    order as that DataFrame.

    Args:
        clusteringLabels: Cluster assignment for each row of ``appsDF``.

    Returns:
        None. The score is only printed, formatted with four decimals.
    """
    groundTruth = appsDF["classID"].values
    print(f"🎯 ARI: {adjusted_rand_score(groundTruth, clusteringLabels):.4f}")

In [None]:
# Number of clusters requested from KMeans (passed as n_clusters)
NUM_CLUSTERS = 50

In [None]:
# Stack the per-app feature lists into one array (one row per app)
featuresArray = np.array(appsDF['features'].tolist())

# Fit KMeans on the feature array, seeded for repeatable results
kmeans = KMeans(
    n_clusters=NUM_CLUSTERS,
    random_state=RANDOM_SEED,
    n_init='auto',
).fit(featuresArray)

# Cluster index assigned to each app, in the row order of appsDF
clusteringLabels = kmeans.labels_

print("\n⭐ DexRay")
getARIscore(clusteringLabels)

### 3. Save everything

In [None]:
# Keep only the app identifier and attach the cluster assigned to each app.
# Note that this rebinds appsDF: the other columns are no longer available
# after this cell has run.
appsDF = appsDF.loc[:, ['sha256']].assign(clusterID=clusteringLabels)

In [None]:
# Write the current contents of appsDF to OUTPUT_PATH, without the row index
appsDF.to_csv(OUTPUT_PATH, index=False)
# NOTE(review): Jupyter only renders a bare expression when it is the last
# statement of its cell; with the print below in the same cell this line
# displays nothing - confirm whether the print was meant to be its own cell.
appsDF

# Mark the end of the notebook run in the output
print("\n🔚 END \n")