## ⛏️ CHABADA - Clustering

Notebook for performing KMeans clustering on apps using the topics inferred by LDA.

#### Imports

In [None]:
# IMPORT
from   sklearn.metrics    import adjusted_rand_score
from   sklearn.cluster    import KMeans
from   tqdm               import tqdm
from   joblib             import dump
import pandas             as pd
import numpy              as np
import os

In [None]:
# Register tqdm's pandas integration, which makes the `progress_apply`
# family of methods available on pandas objects.
tqdm.pandas()

In [None]:
# Visual marker for the beginning of the notebook run
print("⚡ START ⚡")

#### Parameters

In [None]:
# Folder for temporary artifacts
TMP_PATH = "../TMP"

# CSV read by this notebook
INPUT_PATH = "../TMP/1b_ChabadaTopics.csv"

# CSV written by this notebook
OUTPUT_PATH = "../1b_ChabadaClusteringLabels.csv"

# Make sure the temporary folder is there, reporting which case applied
if os.path.exists(TMP_PATH):
    print("📁✅ Folder already exists:", TMP_PATH)
else:
    os.makedirs(TMP_PATH)
    print("📁🆕 Folder created       :", TMP_PATH)

In [None]:
# Fixed seed, so that steps which accept a random state give repeatable results
RANDOM_SEED = 151836

### 1. Load Data

In [None]:
# Read the input CSV; index_col=False stops pandas from using the first
# column as the index, so the rows get a default 0..n-1 index.
appsDF = pd.read_csv(INPUT_PATH, index_col=False)
numApps = len(appsDF)
print("#️⃣ Apps: {}".format(numApps))

# Preview the first rows
appsDF.head(3)

### 2. Clustering - KMeans

In [None]:
def getARIscore(clusteringLabels):
    """Print the Adjusted Rand Index of a clustering against the reference classes.

    The reference labels are read from the "classID" column of the
    module-level `appsDF`.

    Args:
        clusteringLabels: one cluster label per row of `appsDF`, in row order.

    Returns:
        None. The score is only printed, with four decimals.
    """
    referenceLabels = appsDF["classID"].values
    print("🎯 ARI: {:.4f}".format(adjusted_rand_score(referenceLabels, clusteringLabels)))

In [None]:
# # PARAMETERS (alternative values, currently commented out)
# NUM_TOPICS   = 50
# NUM_CLUSTERS = 50
# PARAMETERS (TEST): the values below are the ones in effect.
# NUM_TOPICS is the number of columns of the probability matrix,
# NUM_CLUSTERS the number of clusters requested from KMeans.
NUM_TOPICS   = 5
NUM_CLUSTERS = 5

#### Create the probability Matrix

In [None]:
# Build the (apps x topics) probability matrix.
#
# The matrix is filled as a plain NumPy array and wrapped in a DataFrame at the
# end: one array assignment per topic slot replaces one pandas cell assignment
# per value, and rows are addressed by position, so the result does not depend
# on the labels of appsDF's index.
numApps = appsDF.shape[0]
topicProbabilities = np.zeros((numApps, NUM_TOPICS))
rowPositions = np.arange(numApps)

# Each app carries its topics in the column pairs
# topic1/probability1 ... topic4/probability4.
for i in range(1, 5):
    topicIDs = appsDF['topic{}'.format(i)].to_numpy()
    probabilities = appsDF['probability{}'.format(i)].to_numpy()

    # Slots whose topic is -1 are skipped (presumably "no topic in this slot")
    isAssigned = topicIDs != -1

    # The topic id is the column of the matrix. Slots are processed in
    # ascending order, so if an app repeats a topic the later slot wins.
    topicProbabilities[
        rowPositions[isAssigned],
        topicIDs[isAssigned].astype(np.intp),
    ] = probabilities[isAssigned]

probabilityMatrix = pd.DataFrame(topicProbabilities)

print("Shape: {}".format(probabilityMatrix.shape))
           

#### Training

In [None]:
# Cluster the rows of the probability matrix with KMeans, seeded with
# RANDOM_SEED so the run is repeatable
features = probabilityMatrix.values
model = KMeans(n_clusters=NUM_CLUSTERS, random_state=RANDOM_SEED, n_init='auto')
model.fit(features)

# Cluster label of every row, as predicted by the fitted model
clusteringLabels = model.predict(features)

# Report how well the clusters agree with the reference classes
print("\n⭐ Chabada")
getARIscore(clusteringLabels)

### 3. Save Everything

In [None]:
# Save the fitted KMeans model to a file with joblib.
# NOTE(review): the file name is relative to the current working directory,
# not to TMP_PATH or to the folder of OUTPUT_PATH — confirm this is intended.
dump(model, 'kmeansModelChabada.joblib')

In [None]:
# Reduce the table to the sha256 column and attach the cluster label of each row
appsDF = appsDF[['sha256']].assign(clusterID=clusteringLabels)

In [None]:
# Write the table to OUTPUT_PATH as CSV, leaving out the DataFrame index
appsDF.to_csv(OUTPUT_PATH,index=False)
# Preview the first rows of what was written
appsDF.head(5)

In [None]:
# Visual marker for the end of the notebook run
print("\n🔚 END")