## 🏷️ Sub-Phase 2B : Cluster Android Methods

This sub-phase applies a clustering algorithm (K-Means) to the method embeddings in order to group methods that exhibit similar semantic characteristics.

In [None]:
# Imports
from   dotenv      		import load_dotenv
from   collections 		import Counter
from   sklearn.cluster 	import KMeans
import pandas      as pd
import numpy       as np
import datetime
import json
import os

##### Parameters

In [None]:
# Folder that the initialization cell below creates when it does not exist yet.
# The path is relative, so it is resolved against the current working directory.
# NOTE(review): apart from that creation step, no visible cell reads or writes
# inside this folder — confirm it is still needed by this notebook.
TMP_PATH = "../../0_Data/TMP"

#### Initialization

In [None]:
# Capture the start timestamp exactly once and reuse it for the banner, so the
# time shown to the user and the baseline used for the elapsed-time report at
# the end of the notebook are the same instant.
initTime = datetime.datetime.now()
print("⚡ START: {} ⚡".format(initTime.strftime("%Y-%m-%d %H:%M:%S")))

In [None]:
# Make sure the TMP folder is available and report which of the two cases applied
if os.path.exists(TMP_PATH):
	# Nothing to do: the path is already present
	print("--- 📁✅ Folder already exists: {}\n".format(TMP_PATH))
else:
	# First run (or folder was removed): create it, including missing parents
	os.makedirs(TMP_PATH)
	print("--- 📁🆕 Folder created       : {}\n".format(TMP_PATH))

In [None]:
# Load .env Info
# Calls python-dotenv's load_dotenv() with its default arguments; the return
# value is not captured.
# NOTE(review): no visible cell reads an environment variable afterwards —
# confirm this call is still required for this sub-phase.
load_dotenv()

#### 📥 1) Load Data 

In [None]:
# Model used to generate embeddings
# The name is interpolated into both the input file name
# (2_methodsEmbedding_<MODEL>.csv) and the output file name
# (3_methodsClusters_<MODEL>.csv), so it selects which embeddings file is
# clustered and where the result is written.
MODEL = "text-embedding-3-small"

In [None]:
# Data Path
# Embeddings CSV for the selected MODEL; the path is relative, so it is
# resolved against the current working directory.
DATA_PATH = "./0_PipelineData/2_methodsEmbedding_{}.csv".format(MODEL)

# Load the CSV file into a DataFrame
# The 'methodEmbedding' column is still a plain string at this point; it is
# decoded into numpy arrays in the next cell.
methodsDF = pd.read_csv(DATA_PATH)

# Show the first few rows of the DataFrame
# (kept as the last expression of the cell so the notebook displays it)
methodsDF.head(3)

In [None]:
# Decode the serialized embeddings into numpy arrays
def _parseEmbedding(rawValue):
	"""Decode one JSON-encoded cell of 'methodEmbedding' and return it as a numpy array."""
	decoded = json.loads(rawValue)
	return np.array(decoded)

# Replace the column in place: each string cell becomes its numpy array
methodsDF['methodEmbedding'] = methodsDF['methodEmbedding'].apply(_parseEmbedding)

#### 🖥️ 2) Process Embeddings and Cluster using K-Means

In [None]:
# Number of clusters
# NOTE(review): this assignment is overwritten by the TEST assignment below
# before it is ever read, so 150 currently has no effect.
N_CLUSTERS = 150

# TEST
# NOTE(review): this looks like a leftover test override. The effective value
# is 2, which drives both the KMeans cluster count and the name of the label
# column ("clusterID_2") written to the output CSV. Confirm whether it should
# be removed before a full run.
N_CLUSTERS = 2

# Print info
print("--- 🔹 N_CLUSTERS : {}".format(N_CLUSTERS))

In [None]:
# Stack the per-row embedding arrays into a single 2-D matrix
# (one matrix row per DataFrame row)
X = np.vstack(methodsDF['methodEmbedding'])

print("--- 🔹 Size of X  : {}".format(X.shape))

# Fixed seed for the centroid initialization. Without it every run of the
# notebook produces a different cluster assignment, so the CSV written by the
# save step could not be reproduced or compared between runs.
RANDOM_SEED = 42

# Perform KMeans clustering
kmeans = KMeans(n_clusters=N_CLUSTERS, random_state=RANDOM_SEED)

# Fit the model and predict the cluster labels (one label per row of X)
labels = kmeans.fit_predict(X)

# Add the cluster labels to the DataFrame; the column name carries the number
# of clusters so that runs with different N_CLUSTERS are distinguishable
methodsDF["clusterID_{}".format(N_CLUSTERS)] = labels

# Calculate and print min, max, avg, and median of cluster sizes
clusterCounts = Counter(labels)
clusterSizes  = list(clusterCounts.values())
print("--- 🔹 Min cluster size    : {}".format(min(clusterSizes)))
print("--- 🔹 Max cluster size    : {}".format(max(clusterSizes)))
print("--- 🔹 Avg cluster size    : {:.2f}".format(np.mean(clusterSizes)))
print("--- 🔹 Median cluster size : {}".format(np.median(clusterSizes)))

# Kept as the last expression of the cell so the notebook displays a preview
methodsDF.head(3)

#### 💾 3) Save Results

In [None]:
# Where to save the results
RESULTS_PATH = "./0_PipelineData/"

# Save the labelled Methods
filePath  = RESULTS_PATH + "3_methodsClusters_{}.csv".format(MODEL)

# Drop the 'methodEmbedding' column before saving, so the output CSV holds the
# remaining columns plus the cluster labels but not the embedding vectors.
# errors="ignore" keeps this cell re-runnable: the drop is in place, so on a
# second execution the column is already gone and would otherwise raise KeyError.
methodsDF.drop(columns=["methodEmbedding"], inplace=True, errors="ignore")

# Save clusters (without the DataFrame index)
methodsDF.to_csv(filePath, index=False)
print("--- 💾 Saved Clusters: {}".format(filePath))

##### 🔚 End

In [None]:
# Stop the clock and announce when the notebook finished
endTime = datetime.datetime.now()
print("\n🔚 --- END:  {} --- 🔚".format(endTime.strftime("%Y-%m-%d %H:%M:%S")))

# Elapsed wall-clock time since initTime, split into hours / minutes / seconds
totalTime        = endTime - initTime
elapsedSeconds   = totalTime.total_seconds()
hours, leftover  = divmod(elapsedSeconds, 3600)
minutes, seconds = divmod(leftover, 60)

# The bracketed figure is the whole elapsed time expressed in seconds
print("⏱️ --- Time: {:02d} hours and {:02d} minutes [{:02d} seconds] --- ⏱️".format(int(hours), int(minutes), int(elapsedSeconds)))