## ⛏️ Drebin - Clustering

Notebook to cluster applications using KMeans and the features extracted using Drebin.

### Imports

In [1]:
# IMPORT
from    sklearn.metrics             import adjusted_rand_score
from    sklearn.feature_selection   import VarianceThreshold
from    sklearn.decomposition       import PCA
from    sklearn.decomposition       import SparsePCA
from    sklearn.cluster             import KMeans
from    tqdm                        import tqdm
import  pandas                      as pd
import  numpy                       as np
import  ast

In [2]:
# Initialize TQDM library for Pandas
# (registers the `progress_apply` method that is used on the 'drebinFeatures' column further down)
tqdm.pandas()

In [3]:
# Log a marker so the start of the run is visible in the output
print("⚡ START ")

⚡ START 


#### Parameters

In [4]:
# Where to temporarily store APK Files
# NOTE(review): APK_PATH is not referenced by any cell visible in this notebook — confirm it is still needed
APK_PATH    = "../../../0_Data/APKS/"

# Ground-Truth Dataset
# (CSV read with pd.read_csv; its 'drebinFeatures' and 'classID' columns are used for clustering and scoring)
INPUT_PATH  = "../TMP/3a_DrebinData.csv"

# Output Path
# (CSV written at the end of the notebook with the 'sha256' and 'clusterID' columns)
OUTPUT_PATH = "../3a_DrebinClusteringLabels.csv"

In [5]:
# Seed passed as `random_state` to KMeans so that clustering runs are repeatable
RANDOM_SEED = 151836

### 1. Load Data

In [6]:
# Read the Drebin dataset from disk (index_col=False: no column is used as the index)
appsDF = pd.read_csv(filepath_or_buffer=INPUT_PATH, index_col=False)

# Report how many applications (rows) were loaded
print("#️⃣ Apps: {}".format(len(appsDF.index)))

# Preview the first three rows as the cell output
appsDF.head(n=3)

#️⃣ Apps: 3


Unnamed: 0,sha256,pkgName,classID,googlePlayCategoryID,googlePlayDescription,drebinFeatures
0,9B30837BD2474AC3623A43D052F7ADC4C63E4AA9981F0F...,my.android.calc,Calculator,TOOLS,Handiness universal percentage calculator for ...,"[1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,686DE8D8A0D08992CB135BC7A0500D0109D9697A1140B8...,com.vpn.basiccalculator,Calculator,TOOLS,CITIZEN CALCULATOR by ANGEL NX is best Mobile ...,"[1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, ..."
2,A49864DCC90F6730569455BDFA39B4B7CF70AE0C34D656...,com.ba.fractioncalculator,Calculator,EDUCATION,"<b>Free offline fraction calculator</b> ✌, sup...","[1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, ..."


In [7]:
print("\n🔨 Reading data as lists")

# The CSV stores every feature vector as the text form of a Python list, so each
# value is parsed back into a real list with ast.literal_eval (which only
# evaluates literals, never arbitrary code).
# Values that are already lists are passed through untouched: this keeps the cell
# idempotent, because calling ast.literal_eval on a list raises ValueError when
# the cell is executed a second time on the same kernel. Anything that is neither
# a list nor a valid literal string still fails loudly, as before.
appsDF['drebinFeatures'] = appsDF['drebinFeatures'].progress_apply(
    lambda rawFeatures: rawFeatures if isinstance(rawFeatures, list) else ast.literal_eval(rawFeatures)
)


🔨 Reading data as lists


100%|██████████| 3/3 [00:00<00:00, 4746.48it/s]


### 2. Clustering - KMeans

In [8]:
def getARIscore(clusteringLabels, trueLabels=None):
    """
    Print and return the Adjusted Rand Index (ARI) of a clustering.

    Parameters
    ----------
    clusteringLabels : array-like
        Cluster label assigned to each application, in the same row order as
        the ground-truth labels.
    trueLabels : array-like, optional
        Ground-truth labels to compare against. When omitted, the 'classID'
        column of the global `appsDF` is used.

    Returns
    -------
    float
        The score computed by `adjusted_rand_score`. It is also printed with
        four decimals.
    """
    if trueLabels is None:
        # Default ground truth; this requires `appsDF` to still carry its
        # 'classID' column at call time.
        trueLabels = appsDF["classID"].values

    ariScore = adjusted_rand_score(trueLabels, clusteringLabels)
    print("🎯 ARI: {:.4f}".format(ariScore))

    # Hand the value back so callers can store or compare scores instead of
    # receiving None.
    return ariScore

In [9]:
# Parameters
# Number of clusters requested from KMeans (passed as `n_clusters`)
NUM_CLUSTERS = 50

Baseline

In [10]:
print("\n⭐ KMeans baseline")

# Stack the per-app feature lists into one array
# (one row per app, assuming every list has the same length — TODO confirm)
featuresArray = np.array(appsDF['drebinFeatures'].to_list())
print("📐 Len features: {}".format(len(featuresArray[0])))

# KMeans raises a ValueError when asked for more clusters than there are
# samples, so the requested number is capped at the number of apps.
numClusters = min(NUM_CLUSTERS, len(featuresArray))
if numClusters < NUM_CLUSTERS:
    print("⚠️ Only {} apps available: using {} clusters instead of {}".format(len(featuresArray), numClusters, NUM_CLUSTERS))

# Apply K-means clustering with (at most) NUM_CLUSTERS clusters
kmeans = KMeans(n_clusters = numClusters, init='k-means++', max_iter=300, n_init='auto', random_state=RANDOM_SEED)
kmeans.fit(featuresArray)

# Cluster index assigned to each app, in the same row order as appsDF
clusteringLabels = kmeans.labels_
getARIscore(clusteringLabels)


⭐ KMeans baseline
📐 Len features: 54
🎯 ARI: 0.0000


KMeans + PCA (experiment currently disabled: the cell below is commented out)

In [16]:
# NOTE: this experiment is currently disabled — every line of the cell is commented out.
# If it is re-enabled: getARIscore already prints the ARI itself, so the final
# print at the bottom of the cell is redundant.

# print("\n⭐ KMeans + PCA")

# pca = PCA(random_state = RANDOM_SEED)

# filteredFeaturesArray = pca.fit_transform(featuresArray)
# print("📐 Len features: {}".format(len(filteredFeaturesArray[0])))

# # Apply K-means clustering with clusters
# kmeansPCA = KMeans(n_clusters = NUM_CLUSTERS, init='k-means++', max_iter=300, n_init='auto', random_state=RANDOM_SEED)
# kmeansPCA.fit(filteredFeaturesArray)

# ariScore = getARIscore(kmeansPCA.labels_)
# print("ARI: {}".format(ariScore))

### 3. Save everything.

In [21]:
# Keep only the app identifier and attach the cluster assigned to each app.
# Both steps return a new frame, which then replaces `appsDF`.
appsDF = appsDF[['sha256']].assign(clusterID=clusteringLabels)

In [22]:
# Save the result
# (index=False keeps the DataFrame index out of the CSV, so only the frame's columns are written)
appsDF.to_csv(OUTPUT_PATH,index=False)
# Show the saved frame as the cell output
appsDF

Unnamed: 0,sha256,clusterID
0,9B30837BD2474AC3623A43D052F7ADC4C63E4AA9981F0F...,2
1,686DE8D8A0D08992CB135BC7A0500D0109D9697A1140B8...,1
2,A49864DCC90F6730569455BDFA39B4B7CF70AE0C34D656...,0


In [23]:
# Log a marker so the end of the run is visible in the output
print("\n🔚 END \n")


🔚 END 

