## ⛏️ Rajasegaran - Clustering

Cluster Apps using their icon-based representation.

#### Imports

In [None]:
# IMPORT
from   sklearn.metrics              import adjusted_rand_score
from   sklearn.metrics.pairwise     import cosine_distances as cosineDistance
from   sklearn.cluster              import KMeans
from   PIL                          import Image
from   tqdm                         import tqdm
import pandas                       as pd
import numpy                        as np
import sys, os, ast

In [None]:
# Initialize TQDM library for Pandas: this registers the `progress_apply`
# method used later in this notebook to show a progress bar while the
# embedding columns are parsed.
tqdm.pandas()

In [None]:
# Banner marking the beginning of the notebook's output
print("⚡ START ⚡\n")

#### Data Paths

In [None]:
# Ground-truth dataset with features: the CSV from which this notebook reads
# the 'sha256', 'classID', 'contentEmbedding' and 'styleEmbedding' columns
INPUT_PATH  = "../TMP/2c_RajasegaranFeatures.csv"

# Output path: CSV that receives one cluster label per app
OUTPUT_PATH = "../2c_RajasegaranClusteringLabels.csv"

# Temporary working folder
TMP_PATH = "../TMP"
# Make sure the temporary folder is available and report whether it had to
# be created or was already there.
if os.path.exists(TMP_PATH):
    print("📁✅ Folder already exists:", TMP_PATH)
else:
    os.makedirs(TMP_PATH)
    print("📁🆕 Folder created       :", TMP_PATH)

In [None]:
# Fixed seed, handed to KMeans as `random_state` in the clustering cell
RANDOM_SEED = 151836

### 1. Load Data

In [None]:
# Load the features CSV and keep only the columns this notebook uses:
# the app identifier, the ground-truth class and the two embeddings.
# The embedding columns have to survive this selection because they are
# parsed and turned into matrices in the following cells; selecting only
# 'sha256' and 'classID' makes those cells fail with a KeyError.
appsDF = pd.read_csv(INPUT_PATH, index_col=False)
appsDF = appsDF.loc[:, ['sha256', 'classID', 'contentEmbedding', 'styleEmbedding']]
print("#️⃣ Apps: {}".format(appsDF.shape[0]))

# Preview the first rows (displayed as the cell output)
appsDF.head(3)

In [None]:
print("⛏️ Reorganizing features as lists")
# Each embedding cell holds a Python literal stored as text; turn it back
# into a Python object, one column at a time, with a progress bar.
for embeddingColumn in ('contentEmbedding', 'styleEmbedding'):
    appsDF[embeddingColumn] = appsDF[embeddingColumn].progress_apply(ast.literal_eval)

### 2. Compute Cosine Distance

In [None]:
# Stack the per-app embeddings into arrays (one row per app; assumes every
# embedding in a column has the same length).
contentMatrix = np.array(appsDF['contentEmbedding'].tolist())
styleMatrix = np.array(appsDF['styleEmbedding'].tolist())

# Pairwise cosine distances between all apps, separately for each embedding
contentCosineMatrix = cosineDistance(contentMatrix)
styleCosineMatrix = cosineDistance(styleMatrix)

# Combined distance: element-wise sum of the content and style distances
cosine = np.add(contentCosineMatrix, styleCosineMatrix)

### 3. Clustering

In [None]:
def getARIscore(clusteringLabels):
    """Print the Adjusted Rand Index of a clustering against the ground truth.

    The ground truth is the 'classID' column of the global `appsDF`.

    Args:
        clusteringLabels: cluster label for each row of `appsDF`, in row order.

    Returns:
        None. The score is only printed, formatted with four decimals.
    """
    groundTruth = appsDF["classID"].values
    score = adjusted_rand_score(groundTruth, clusteringLabels)
    message = "🎯 ARI: {:.4f}".format(score)
    print(message)
    return None

In [None]:
# PARAMETERS
# Number of clusters requested from K-Means.
# NOTE: this value is immediately superseded by the TEST assignment below,
# so the effective setting when the cell runs is 5, not 50.
NUM_CLUSTERS = 50

# PARAMETERS (TEST)
# Overrides the value above; remove or comment out to cluster with 50.
NUM_CLUSTERS = 5

In [None]:
# Configure K-Means with k-means++ seeding and a fixed random state
model = KMeans(
    n_clusters=NUM_CLUSTERS,
    init='k-means++',
    n_init='auto',
    random_state=RANDOM_SEED,
)

# Fit on the combined distance matrix (each row is used as one sample)
# and read back the cluster assigned to every row
clusteringLabels = model.fit(cosine).labels_

print("\n⭐ Rajasegaran")
getARIscore(clusteringLabels)

### 4. Save Everything

In [None]:
# Keep only the app identifier and attach the cluster assigned to each app
appsDF = appsDF[['sha256']].assign(clusterID=clusteringLabels)

# Write the (sha256, clusterID) table to disk, without the index column
appsDF.to_csv(OUTPUT_PATH, index=False)

# Preview the first rows (displayed as the cell output)
appsDF.head(3)

In [None]:
# Banner marking the end of the notebook's output
print("\n🔚 END \n")