## ⛏️ RQ2 New Approach - Clustering 

Use KMeans to cluster apps based on the embeddings generated by the OpenAI models.

#### Imports

In [None]:
# IMPORT
import os

from    sklearn.metrics     import adjusted_rand_score
from    sklearn.cluster     import KMeans
from    tqdm                import tqdm
from    sklearn.cluster     import KMeans
from    tqdm                import tqdm
from    joblib              import dump
import pandas               as pd
import numpy                as np

In [None]:
# Initialize TQDM library for Pandas
# (tqdm.pandas() registers the progress_* methods, e.g. progress_apply, on pandas objects)
tqdm.pandas()

In [None]:
# Visual marker for the beginning of the run in the notebook output
print("⚡ START ⚡")

#### Parameters

In [None]:
# Ground-Truth Dataset: CSV with one row per app and its feature vector
INPUT_PATH  = "../TMP/2a_GcataFeatures.csv"

# Output Path: CSV that receives the cluster label assigned to each app
OUTPUT_PATH = "../2a_GcataClusteringLabels.csv"

# Folder for temporary artifacts. Creating it directly and handling the
# "already there" case avoids the check-then-create race of os.path.exists().
# NOTE: relies on the `os` module being imported at the top of the file.
TMP_PATH = "../TMP"
try:
    os.makedirs(TMP_PATH)
except FileExistsError:
    print("📁✅ Folder already exists:", TMP_PATH)
else:
    print("📁🆕 Folder created       :", TMP_PATH)

In [None]:
# Seed passed as random_state to KMeans so the clustering can be reproduced across runs
RANDOM_SEED = 151836

### 1. Load Data

In [None]:
# Read the ground-truth feature table; index_col=False stops pandas from
# promoting the first column to the index.
appsDF = pd.read_csv(INPUT_PATH, index_col=False)

# Report how many apps (rows) were loaded
print("#️⃣ Apps: {}".format(len(appsDF)))

# Preview the first rows (rendered as the cell output)
appsDF.head(3)

In [None]:
# Convert the string column to lists
print("\n🔨 1. Reorganize features as list")
# NOTE(review): eval() executes whatever Python expression is stored in each CSV cell.
# This is only acceptable while the input file is produced by a trusted step of this
# pipeline; ast.literal_eval would be a safer parser — confirm the stored format
# (plain list literals?) before switching.
appsDF['features'] = appsDF['features'].progress_apply(eval)

### 2. Clustering - KMeans

In [None]:
def getARIscore(clusteringLabels):
    """Print the Adjusted Rand Index of a clustering against the ground truth.

    The reference partition is the ``classID`` column of the module-level
    ``appsDF`` DataFrame.

    Args:
        clusteringLabels: predicted cluster labels, one per row of ``appsDF``.

    Returns:
        None. The score is only printed, formatted with four decimals.
    """
    groundTruth = appsDF["classID"].values
    print("🎯 ARI: {:.4f}".format(adjusted_rand_score(groundTruth, clusteringLabels)))

In [None]:
# Parameters
# Number of clusters requested from KMeans (passed as n_clusters)
NUM_CLUSTERS = 50

In [None]:
# Stack the per-app feature lists into a single array for scikit-learn
featuresArray = np.array(appsDF['features'].to_list())
print("📐 Len features: {}".format(len(featuresArray[0])))

# Fit K-means with NUM_CLUSTERS clusters; the fixed random_state makes the
# k-means++ initialisation repeatable. fit() returns the estimator itself.
kmeans = KMeans(
    n_clusters=NUM_CLUSTERS,
    init='k-means++',
    max_iter=300,
    n_init='auto',
    random_state=RANDOM_SEED,
).fit(featuresArray)

# Cluster index assigned to each app, in the same order as featuresArray
clusteringLabels = kmeans.labels_

# Compare the clustering with the ground-truth classes
print("\n⭐ KMeans baseline")
getARIscore(clusteringLabels)

### 3. Save everything

In [None]:
# Save the model to a file
# (joblib's dump serializes the fitted estimator; the relative file name is
# resolved against the current working directory)
dump(kmeans, 'kmeansModelGcata.joblib')

In [None]:
# Keep only the app identifier and attach the cluster assigned to each app
appsDF = appsDF[['sha256']].assign(clusterID=clusteringLabels)

# Write the (sha256, clusterID) table to disk without the index column
appsDF.to_csv(OUTPUT_PATH, index=False)

# Preview the saved table (rendered as the cell output)
appsDF.head(3)

In [None]:
# Visual marker for the end of the run in the notebook output
print("\n🔚 END \n")