## ⛏️ Drebin - Clustering

Notebook to cluster applications using KMeans and the features extracted using Drebin.

### Imports

In [None]:
# IMPORT
from    sklearn.metrics             import adjusted_rand_score
from    sklearn.feature_selection   import VarianceThreshold
from    sklearn.decomposition       import PCA
from    sklearn.decomposition       import SparsePCA
from    sklearn.cluster             import KMeans
from    tqdm                        import tqdm
import  pandas                      as pd
import  numpy                       as np
import  ast
import  os

In [None]:
# Initialize TQDM library for Pandas: this registers the `progress_*` methods
# (such as the `progress_apply` used further down) on pandas objects.
tqdm.pandas()

In [None]:
# Banner marking the beginning of the notebook run
print("⚡ START ")

#### Parameters

In [None]:
# Input CSV holding the ground-truth dataset with the Drebin features
INPUT_PATH  = "../TMP/3a_DrebinData.csv"

# CSV file where the clustering labels are written at the end
OUTPUT_PATH = "../3a_DrebinClusteringLabels.csv"

# Temporary folder: report whether it is already there, otherwise create it
TMP_PATH = "../TMP"
if os.path.exists(TMP_PATH):
    print("📁✅ Folder already exists:", TMP_PATH)
else:
    os.makedirs(TMP_PATH)
    print("📁🆕 Folder created       :", TMP_PATH)

In [None]:
# Fixed seed, passed as `random_state` to KMeans below
RANDOM_SEED = 151836

### 1. Load Data

In [None]:
# Load the input CSV (index_col=False: no column is used as the index)
appsDF = pd.read_csv(INPUT_PATH, index_col=False)

# Report how many apps (rows) were loaded
print("#️⃣ Apps: {}".format(len(appsDF)))

# Preview the first three rows
appsDF.head(3)

In [None]:
print("\n🔨 Reading data as lists")
# Each cell of 'drebinFeatures' is read from the CSV as text; ast.literal_eval turns it
# back into a Python literal (presumably a list of numbers — confirm against the CSV producer).
appsDF['drebinFeatures'] = appsDF['drebinFeatures'].progress_apply(ast.literal_eval)

### 2. Clustering - KMeans

In [None]:
def getARIscore(clusteringLabels):
    """Print and return the Adjusted Rand Index of a clustering.

    The given labels are compared with the ``classID`` column of the
    module-level ``appsDF`` DataFrame, so this must be called while
    ``appsDF`` still contains that column.

    Args:
        clusteringLabels: One cluster label per row of ``appsDF``, in row order.

    Returns:
        The ARI score. Returning it (instead of ``None``) lets callers store
        or post-process the value in addition to the printed message.
    """
    ariScore = adjusted_rand_score(appsDF["classID"].values, clusteringLabels)
    print("🎯 ARI: {:.4f}".format(ariScore))
    return ariScore

In [None]:
# # Parameters
# NUM_CLUSTERS = 50

# PARAMETERS (TEST)
# Number of clusters passed to KMeans as `n_clusters`; the commented-out value
# above is the alternative setting, this one is the reduced test setting.
NUM_CLUSTERS = 5

Baseline

In [None]:
print("\n⭐ KMeans baseline")

# Stack the per-app feature lists into a single array (one entry per app)
featuresArray = np.array(appsDF['drebinFeatures'].tolist())
print("📐 Len features: {}".format(len(featuresArray[0])))

# Cluster the raw features into NUM_CLUSTERS groups
kmeans = KMeans(
    n_clusters=NUM_CLUSTERS,
    init='k-means++',
    max_iter=300,
    n_init='auto',
    random_state=RANDOM_SEED,
)

# Fit the model and keep the cluster index assigned to each app
clusteringLabels = kmeans.fit_predict(featuresArray)

# Score the clustering against the ground-truth classes
getARIscore(clusteringLabels)

KMeans + PCA

In [None]:
# print("\n⭐ KMeans + PCA")

# pca = PCA(random_state = RANDOM_SEED)

# filteredFeaturesArray = pca.fit_transform(featuresArray)
# print("📐 Len features: {}".format(len(filteredFeaturesArray[0])))

# # Apply K-means clustering with clusters
# kmeansPCA = KMeans(n_clusters = NUM_CLUSTERS, init='k-means++', max_iter=300, n_init='auto', random_state=RANDOM_SEED)
# kmeansPCA.fit(filteredFeaturesArray)

# ariScore = getARIscore(kmeansPCA.labels_)
# print("ARI: {}".format(ariScore))

### 3. Save everything.

In [None]:
# Reduce the frame to the app hash and attach the cluster label of each app
appsDF = appsDF.loc[:, ['sha256']].assign(clusterID=clusteringLabels)

In [None]:
# Save the result: write the DataFrame to OUTPUT_PATH as CSV, without the index column
appsDF.to_csv(OUTPUT_PATH,index=False)
# Last expression of the cell: shows the saved DataFrame as the cell output
appsDF

In [None]:
# Banner marking the end of the notebook run
print("\n🔚 END \n")