## ⛏️ SUN - Clustering

Use Affinity Propagation to cluster applications based on a list of keywords extracted from APK files.

#### Imports

In [None]:
# IMPORT
import ast
import os

from   sklearn.metrics                    import adjusted_rand_score
from   sklearn.feature_extraction.text    import TfidfVectorizer
from   sklearn.cluster                    import AffinityPropagation
from   tqdm                               import tqdm
import pandas                             as pd
import numpy                              as np

In [None]:
# Initialize TQDM library for Pandas
# (enables the `progress_apply` call used below when parsing the keywords column)
tqdm.pandas()

In [None]:
# Log marker: the notebook run begins here
print("⚡ START ⚡")

#### Parameters


In [None]:
# Ground-Truth Dataset (CSV read in the "Load Data" section)
INPUT_PATH  = "../TMP/1d_SunDataPreprocessed.csv"

# Output Path (CSV written in the "Save Everything" section)
OUTPUT_PATH = "../1d_SunClusteringLabels.csv"

# Temporary folder (INPUT_PATH lives inside it)
TMP_PATH = "../TMP"

# The existence check is only used to choose the message printed below.
# `exist_ok=True` keeps the creation safe if the folder appears between the
# check and the call, and it still raises if TMP_PATH exists but is not a
# directory instead of silently reporting "already exists".
tmpFolderExisted = os.path.exists(TMP_PATH)
os.makedirs(TMP_PATH, exist_ok=True)
if tmpFolderExisted:
    print("📁✅ Folder already exists:", TMP_PATH)
else:
    print("📁🆕 Folder created       :", TMP_PATH)

In [None]:
# Seed used for reproducibility (passed as `random_state` to AffinityPropagation)
RANDOM_SEED = 151836

### 1. Load Data

In [None]:
# Read the preprocessed apps table; `index_col=False` keeps every CSV column as data
appsDF = pd.read_csv(INPUT_PATH, index_col=False)

# Report how many apps (rows) were loaded
print("#️⃣ Apps: {}".format(len(appsDF)))

# Preview the first rows
appsDF.head(3)

In [None]:
print("\n🔨 Loading data as lists")
# Each cell of 'keywords' is turned from its literal text form into a Python object
# with `ast.literal_eval` (safe literal parsing, no arbitrary code execution).
# NOTE(review): presumably every cell is the string form of a list of strings — confirm against the preprocessing step.
appsDF['keywords'] = appsDF['keywords'].progress_apply(ast.literal_eval)

### 2. Vectorizing

In [None]:
# Create corpus for TF-IDF
# Distinct keywords over all apps (not referenced again in the visible cells)
allKeywords = {
    keyword
    for appKeywords in appsDF['keywords']
    for keyword in appKeywords
}

# One space-joined "document" per app
corpus = [' '.join(appKeywords) for appKeywords in appsDF['keywords']]

# Use TF-IDF (default TfidfVectorizer settings) to turn the corpus into a feature matrix
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus)

### 3. Clustering

In [None]:
def getARIscore(clusteringLabels):
    """Print the Adjusted Rand Index of a clustering against the ground truth.

    The ground truth is the `classID` column of the global `appsDF`.

    Args:
        clusteringLabels: Cluster labels to evaluate, one per row of `appsDF`.

    Returns:
        None. The score is only printed, formatted with four decimals.
    """
    groundTruth = appsDF["classID"].values
    score = adjusted_rand_score(groundTruth, clusteringLabels)
    print("🎯 ARI: {:.4f}".format(score))

In [None]:
# Target number of clusters: the preference search below stops once a run yields exactly this many
NUM_CLUSTERS = 50

In [None]:
# Function to randomly change preference parameter
def randomizePreference(low=-9, high=-7, rng=None):
    """Draw a random value for the AffinityPropagation `preference` parameter.

    Args:
        low: Lower bound of the sampling interval (default -9).
        high: Upper bound of the sampling interval (default -7).
        rng: Optional NumPy random generator exposing `uniform(low=, high=)`
            (e.g. `np.random.default_rng(seed)`). When omitted, NumPy's global
            random state is used, so existing zero-argument calls are unchanged.

    Returns:
        A float sampled uniformly between `low` and `high`.
    """
    uniform = np.random.uniform if rng is None else rng.uniform
    return uniform(low=low, high=high)

# Function to perform affinity propagation and return the number of clusters
def getNumClusters(X, preference):
    """Cluster `X` with Affinity Propagation for a given preference.

    Args:
        X: Feature matrix with one row per sample. Objects exposing
            `toarray()` (e.g. the sparse TF-IDF matrix) are densified;
            anything else is passed through `np.asarray`, so a caller may
            densify once and reuse the array across calls.
        preference: Value forwarded to `AffinityPropagation(preference=...)`.

    Returns:
        Tuple `(nClusters, clusteringLabels)`: the number of cluster centers
        found by the fitted model and its per-sample labels.
    """
    denseX = X.toarray() if hasattr(X, "toarray") else np.asarray(X)

    # Fixed damping / iteration budget; seeded so a given preference always gives the same result
    model = AffinityPropagation(preference=preference, damping=0.9, max_iter=1000, random_state=RANDOM_SEED)
    model.fit(denseX)

    # One center index per cluster
    # NOTE(review): how a non-converged fit is reported here depends on the scikit-learn version — confirm
    nClusters = len(model.cluster_centers_indices_)

    return nClusters, model.labels_

# Best attempt found so far: the one whose cluster count is closest to NUM_CLUSTERS
bestPreference   = None
bestNumClusters  = None
clusteringLabels = None

# Seed NumPy's global random state so the sampled preferences (and therefore
# the final labels) are the same on every run
np.random.seed(RANDOM_SEED)

for i in range(1000):  # Run at most 1000 iterations
    preference = randomizePreference()
    nClusters, candidateLabels = getNumClusters(X, preference)

    print("\n#️⃣ Iteration  : {}".format(i))
    print("⭐ Preference : {}".format(preference))
    print("⭐ nClusters  : {}".format(nClusters))

    # Keep the closest attempt, so that `clusteringLabels` is meaningful even
    # when no attempt hits NUM_CLUSTERS exactly (ties keep the earliest attempt)
    if bestNumClusters is None or abs(nClusters - NUM_CLUSTERS) < abs(bestNumClusters - NUM_CLUSTERS):
        bestPreference   = preference
        bestNumClusters  = nClusters
        clusteringLabels = candidateLabels

    if nClusters == NUM_CLUSTERS:
        break
else:
    # Only reached when the loop ran out of iterations without an exact match
    print("\n⚠️ No preference produced exactly {} clusters; keeping the closest attempt".format(NUM_CLUSTERS))

print("\n⭐ Best preference : {}".format(bestPreference))
print("⭐ Best nClusters  : {}".format(bestNumClusters))

In [None]:
# Report the ARI of the labels produced by the preference search above
print("\n⭐ Sun et al.")
getARIscore(clusteringLabels)

### 4. Save Everything

In [None]:
# Keep only the app identifier and attach the cluster label of each app as 'clusterID'
appsDF = appsDF.loc[:, ['sha256']].assign(clusterID=clusteringLabels)

In [None]:
# Save the result (written without the DataFrame index)
appsDF.to_csv(OUTPUT_PATH, index=False)
# Preview the first rows of what was written
appsDF.head(3)

In [None]:

# Log marker: the notebook run is complete
print("\n🔚 END \n")