## ⛏️ App Strings - Clustering

Cluster applications using App Strings extracted from the APK files.

#### Imports

In [None]:
# IMPORT
from    sklearn.metrics             import adjusted_rand_score
from    sklearn.feature_selection   import VarianceThreshold
from    sklearn.decomposition       import PCA
from    sklearn.decomposition       import SparsePCA
from    sklearn.cluster             import KMeans
from    sklearn.metrics             import silhouette_score
from    tqdm                        import tqdm
import  pandas                      as pd
import  numpy                       as np
import  ast

In [None]:
# Initialize TQDM library for Pandas: this registers the `progress_apply`
# method that is used later to parse the features column with a progress bar.
tqdm.pandas()

In [None]:
# Banner marking the beginning of the notebook's output.
print("⚡ START ⚡\n")

#### Parameters

In [None]:
# NOTE(review): every constant in this cell is re-assigned by the "FIXED PATHS"
# cell further down, so these values are not the ones in effect when the
# notebook is executed top to bottom -- confirm which set is intended.

# Where to temporarily store APK Files
# NOTE(review): not referenced by any other cell visible in this notebook.
APK_PATH    = "../../../0_Data/APKS/"

# Ground-Truth Dataset
# (CSV loaded into `appsDF`; the notebook reads its `sha256`, `classID` and
# `appStringsFeatures` columns)
INPUT_PATH  = "../TMP/4d_AppStringsFeatures.csv"

# Output Path
# (CSV that receives the `sha256` / `clusterID` columns at the end of the run)
OUTPUT_PATH = "../4d_AppStringsClusteringLabels.csv"

In [None]:
# Seed passed as `random_state` to every PCA and KMeans instance in this notebook.
RANDOM_SEED = 151836

In [None]:
# FIXED PATHS
# These assignments replace the values given to the same three names in the
# "Parameters" cell above; when cells run in order, these are the paths used.
# NOTE(review): the two cells point at different files (4d_* vs 3b_*) -- confirm
# that the override is intentional.
APK_PATH    = "../../../0_Data/APKS/"
INPUT_PATH  = "../3b_AppStringsFeatures.csv"
OUTPUT_PATH = "../3b_AppStringsClusteringLabels.csv"

### 1. Load Data

In [None]:
# Load the features table; index_col=False stops pandas from using the first
# column as the index.
appsDF = pd.read_csv(INPUT_PATH, index_col=False)

# One row per app, so the row count is the number of apps.
print("#️⃣ Apps: {}".format(len(appsDF.index)))

# Preview the first rows (rendered as the cell output).
appsDF.head(3)

In [None]:
print("\n🔨 1. Reading data as lists")

# Each cell of the column holds the text of a Python literal; parse it back
# into a Python object, showing a tqdm progress bar while doing so.
appsDF['appStringsFeatures'] = (
    appsDF['appStringsFeatures']
    .progress_apply(ast.literal_eval)
)

### 2. Clustering

In [None]:
def getARIscore(clusteringLabels):
    """Print the Adjusted Rand Index of a clustering against the ground truth.

    The reference labels are read from the ``classID`` column of the global
    ``appsDF`` DataFrame.

    Args:
        clusteringLabels: Cluster labels to evaluate, one per row of ``appsDF``.

    Returns:
        None. The score is only printed, formatted with four decimals.
    """
    groundTruthLabels = appsDF["classID"].values
    score = adjusted_rand_score(groundTruthLabels, clusteringLabels)
    print("🎯 ARI: {:.4f}".format(score))

In [None]:
# Parameters
# Number of clusters (`n_clusters`) requested from both KMeans runs below.
NUM_CLUSTERS = 50

In [None]:
print("\n⭐ KMeans - Basline ")

# Stack the per-app feature lists into a single array (one row per app).
featuresArray = np.array(appsDF['appStringsFeatures'].to_list())
print("📐 Len features: {}".format(len(featuresArray[0])))

# Fit K-means with NUM_CLUSTERS clusters on the raw (unreduced) features.
kmeans = KMeans(
    n_clusters=NUM_CLUSTERS,
    init='k-means++',
    max_iter=300,
    n_init='auto',
    random_state=RANDOM_SEED,
).fit(featuresArray)

# Score the resulting assignment against the ground-truth classes.
clusteringLabels = kmeans.labels_
getARIscore(clusteringLabels)

With PCA

In [None]:
def getNumComponentsByVarianceThreshold(featuresArray, varianceThreshold):
    """Return how many leading principal components reach a variance target.

    A PCA with all components is fitted on ``featuresArray`` and its
    explained-variance ratios are accumulated in component order. The result
    is the smallest number of components whose cumulative ratio is greater
    than or equal to ``varianceThreshold``.

    Args:
        featuresArray: Feature matrix handed to ``PCA.fit`` (indexable, with
            one feature vector per row).
        varianceThreshold: Target cumulative explained-variance ratio,
            e.g. ``0.95``.

    Returns:
        int: The number of components. When the threshold is never reached
        (a threshold above 1, or 1.0 missed through floating-point rounding)
        the total number of fitted components is returned.
    """
    print("📐 Len features  : {}".format(len(featuresArray[0])))

    # Only the fitted variance ratios are needed, so fit() is enough; the
    # projected data would be computed and thrown away.
    pca = PCA(random_state = RANDOM_SEED)
    pca.fit(featuresArray)

    # Calculate the cumulative sum of explained variance ratios
    cumulativeVarianceRatio = np.cumsum(pca.explained_variance_ratio_)

    # Determine the number of components needed to reach the threshold
    thresholdReached = cumulativeVarianceRatio >= varianceThreshold
    if thresholdReached.any():
        # argmax on a boolean array gives the index of the first True.
        numComponents = int(np.argmax(thresholdReached)) + 1
    else:
        # argmax would return 0 on an all-False mask and wrongly report a
        # single component; keep every component instead.
        numComponents = len(cumulativeVarianceRatio)
    print("📐 Num components: {}".format(numComponents))

    return numComponents

Get the number of components needed to retain 95% of the variance

In [None]:
# Rebuild the feature matrix and ask how many principal components are needed
# to reach a cumulative explained-variance ratio of 0.95.
featuresArray = np.array(appsDF['appStringsFeatures'].to_list())
numComponents = getNumComponentsByVarianceThreshold(
    featuresArray=featuresArray,
    varianceThreshold=0.95,
)

In [None]:
print("⭐ KMeans + PCA ")

# Build the feature matrix and project it onto the first `numComponents`
# principal components.
featuresArray = np.array(appsDF['appStringsFeatures'].to_list())
pca = PCA(n_components=numComponents, random_state=RANDOM_SEED)
filteredFeaturesArray = pca.fit_transform(featuresArray)

# Cluster the reduced features with the same K-means settings as the baseline.
kmeansPCA = KMeans(
    n_clusters=NUM_CLUSTERS,
    init='k-means++',
    max_iter=300,
    n_init='auto',
    random_state=RANDOM_SEED,
).fit(filteredFeaturesArray)

# Score the resulting assignment against the ground-truth classes.
clusteringLabels = kmeansPCA.labels_
getARIscore(clusteringLabels)

### 3. Save Everything

In [None]:
# Keep only the app identifier and attach the labels currently held in
# `clusteringLabels` (the KMeans + PCA run when cells are executed in order).
appsDF = appsDF.loc[:, ['sha256']].assign(clusterID=clusteringLabels)

In [None]:
# Write the table to OUTPUT_PATH without the index column, then preview it.
appsDF.to_csv(path_or_buf=OUTPUT_PATH, index=False)
appsDF.head(n=3)

In [None]:
# Banner marking the end of the notebook's output.
print("\n🔚 END \n")