## 🏷️ Sub-Phase 2C : Labelling Clusters

Once these clusters of semantically similar methods were formed, the final step involved assigning a meaningful and descriptive label to each.

In [None]:
# Imports
from   dotenv      import load_dotenv
import pandas      as pd
import datetime
import json
import os

# Custom Imports
import sys
sys.path.append('../')
import LLMUtils

##### Parameters

In [None]:
# Relative path of the TMP folder; the initialization step creates it when it does not exist yet.
TMP_PATH = "../../0_Data/TMP"

#### Initialization

In [None]:
# Announce the start of the run (timestamp rendered through the f-string format spec)
print(f"⚡ START: {datetime.datetime.now():%Y-%m-%d %H:%M:%S} ⚡")

# Reference timestamp; the final cell subtracts it from the end time to report the duration
initTime = datetime.datetime.now()

In [None]:
# Ensure the TMP folder is present, reporting whether it had to be created
if os.path.exists(TMP_PATH):
	print(f"--- 📁✅ Folder already exists: {TMP_PATH}\n")
else:
	os.makedirs(TMP_PATH)
	print(f"--- 📁🆕 Folder created       : {TMP_PATH}\n")

In [None]:
# Load .env Info
# Reads key/value pairs from a `.env` file into the process environment (python-dotenv).
# NOTE(review): presumably this supplies the API credentials used by LLMUtils.ChatGPTInterface — confirm.
load_dotenv()

#### 📥 1) Load Data 

In [None]:
# Embedding model name; it is interpolated into the input CSV filename in the next cell.
# NOTE: MODEL is reassigned to the chat model in a later cell, so re-running the
# data-loading cell after that point resolves a different (chat-model) file path.
MODEL = "text-embedding-3-small"

In [None]:
# Input CSV for this step; the filename carries the name currently stored in MODEL
DATA_PATH = f"./0_PipelineData/3_methodsClusters_{MODEL}.csv"

# Read the file into a DataFrame
methodsDF = pd.read_csv(DATA_PATH)

# Preview the first three rows
methodsDF.head(3)

#### 🖥️ 2) Label Clusters with the LLM

In [None]:
# Chat model used to label the clusters.
# NOTE: this overwrites the embedding model name previously stored in MODEL; the
# output filenames written in the save step are derived from this value.
MODEL = "gpt-4o-mini"

In [None]:
# Banner: which chat model is about to be initialised
print("\n--- ⭕ LLM Init & Check", f"--- ⭕ Model: {MODEL}", sep="\n")

# OpenAI (PAYING)
llmInterface = LLMUtils.ChatGPTInterface(model=MODEL, pricing=0.150)

# Smoke test: send a trivial request and display whatever comes back
print(llmInterface.sendRequest("Ping!"))

In [None]:
# Prompt used to label one cluster of API methods.
# The single `{}` placeholder receives the newline-separated method signatures.
# The instructions must agree with the "Output Format" section: the labelling loop
# rejects any reply that does not contain "LABEL: DESCRIPTION", so the model is asked
# for the label plus a one-sentence description (not the label alone).
PROMPT_PHASE4_TEMPLATE = """
You are an expert in Android security and privacy.

### Task
You will be provided with a list of Android API methods. 
The return values of all these methods can be exploited by an attacker—either individually or in combination with other data—for malicious purposes.

Your goal is to assign a single category label that best represents the type of sensitive information exposed by the API return values.

### Examples
"LOCATION": "Using location APIs, attackers can track a user’s real-time or historical movements, enabling stalking, targeted advertising, or physical security threats.",
"CONTACTS": "Access to contact data can result in privacy violations, such as unauthorized sharing of personal information or social engineering attacks.",

### Instructions
- Carefully analyze the group of Android API methods.
- Assign only **one** category label that best summarizes the exposure according to the return values of the API methods.
- Try to avoid vague or generic labels such as "SENSITIVE_DATA", "PRIVATE_INFO", or "SECURITY_RISK".
- Do **not** include any explanations, reasoning, or extra output — return only the label and its one-sentence attack description, on a single line, exactly in the output format below.

### Output Format
[LABEL_ID]: [ATTACK_DESCRIPTION]

### Input
{}
"""

In [None]:
# Number of clusters (selects which `clusterID_<N>` column is labelled)
N_CLUSTERS = 150

# TEST — temporary override that wins over the value above.
# NOTE(review): looks like a leftover quick-run setting; remove it to label the 150-cluster column — confirm.
N_CLUSTERS = 2

# Report the value actually in effect
print(f"--- 🔹 N_CLUSTERS: {N_CLUSTERS}")

In [None]:
# Column name for clustering
clusterCol = "clusterID_{}".format(N_CLUSTERS)


def _parseLabelResponse(response):
	"""Split an LLM reply of the form ``LABEL_ID: DESCRIPTION`` into its two parts.

	Parameters
	----------
	response : object
		Raw value returned by the LLM interface (expected to be a string).

	Returns
	-------
	tuple[str, str] | None
		``(labelID, description)`` on success; ``None`` when the reply is not a
		string, lacks the ``": "`` separator, or yields an empty label.
	"""
	if not isinstance(response, str):
		return None

	cleaned = response.strip()
	if ": " not in cleaned:
		return None

	rawLabel, rawDescription = cleaned.split(": ", 1)

	# The prompt's examples are quoted and its output format is bracketed, so the
	# model may wrap the label in quotes, brackets, or markdown emphasis.
	labelID = rawLabel.strip(" \t\r\n[]\"'*`")
	if not labelID:
		return None

	# Drop quotes, then collapse newlines/tabs into single spaces (deleting them
	# outright would glue neighbouring words together).
	description = " ".join(rawDescription.replace('"', '').split())
	# Remove a trailing comma (as in the prompt's examples) and enclosing brackets
	description = description.rstrip(",").strip("[] ")

	return labelID, description


# Ensure the column can hold string values
# NOTE: this cell overwrites `clusterCol` with the labels, so it is not idempotent —
# reload `methodsDF` before running it again.
methodsDF[clusterCol] = methodsDF[clusterCol].astype(str)

# Group the DataFrame by the clustering column
groupDF = methodsDF.groupby(clusterCol)

# Dictionary to store privacy labels for this combination (label ID -> description)
privacyLabels = {}

# Cluster ID -> label ID; applied to the DataFrame in one pass after the loop so that
# a label which happens to equal a not-yet-processed cluster ID cannot be relabelled twice
clusterToLabel = {}

# Cluster IDs for which no usable label was obtained
failedClusters = []

# Iterate through each cluster
for clusterValue, clusterDf in groupDF:
	print("------ 🔸 Cluster ID                :  {}".format(clusterValue))

	# Prepare input for the prompt (skip missing signatures so the join cannot fail)
	inputMethods = "\n".join(clusterDf['methodSignature'].dropna().astype(str).tolist())
	prompt = PROMPT_PHASE4_TEMPLATE.format(inputMethods)

	# Send prompt to the LLM and get response
	response = llmInterface.sendRequest(prompt)

	# Validate and split the response
	parsed = _parseLabelResponse(response)
	if parsed is None:
		print("--- ⚠️ Invalid response format: {}".format(response))
		failedClusters.append(clusterValue)
		continue

	privacyLabelID, privacyLabelDescription = parsed

	# Print the extracted values
	print("------ 🔸 Privacy Label ID          : {}".format(privacyLabelID))
	print("------ 🔸 Privacy Label Description : {}".format(privacyLabelDescription))
	print("---" * 20)

	# Warn if the label ID is already used (the newer description replaces the older one)
	if privacyLabelID in privacyLabels:
		print("--- ⚠️ Warning: '{}' already exists.".format(privacyLabelID))

	# Store label
	privacyLabels[privacyLabelID] = privacyLabelDescription
	clusterToLabel[clusterValue] = privacyLabelID

# Replace each cluster value with its privacyLabelID; clusters without a valid
# response keep their original cluster ID
methodsDF[clusterCol] = methodsDF[clusterCol].map(lambda clusterID: clusterToLabel.get(clusterID, clusterID))

# Summarise clusters that could not be labelled
if failedClusters:
	print("--- ⚠️ Clusters left unlabelled: {}".format(", ".join(map(str, failedClusters))))

#### 💾 3) Save Results

In [None]:
# Output folder for the artefacts of this step
RESULTS_PATH = "./0_PipelineData/"

# Persist the privacyLabels dictionary as indented JSON
jsonFilePath = f"{RESULTS_PATH}4_privacyLabels_{MODEL}.json"
with open(jsonFilePath, 'w') as jsonFile:
	json.dump(privacyLabels, jsonFile, indent=4)
print(f"--- 💾 Saved Privacy Labels  : {jsonFilePath}")

# Persist the labelled methods table as CSV (without the index column)
filePath = f"{RESULTS_PATH}4_methodsPrivacyLabels_{MODEL}.csv"
methodsDF.to_csv(filePath, index=False)
print(f"--- 💾 Saved Labelled Methods : {filePath}")

##### 🔚 End

In [None]:
# Timestamp the end of the run
endTime = datetime.datetime.now()
print("\n🔚 --- END:  {} --- 🔚".format(endTime.strftime("%Y-%m-%d %H:%M:%S")))

# Elapsed wall-clock time since `initTime` (set in the initialization cell),
# decomposed into whole hours, minutes and remaining seconds
totalTime = endTime - initTime
hours, remainder = divmod(int(totalTime.total_seconds()), 3600)
minutes, seconds = divmod(remainder, 60)

# The seconds slot shows the remainder (0-59) so the three fields add up to the
# elapsed time, instead of repeating the total number of seconds
print("⏱️ --- Time: {:02d} hours and {:02d} minutes [{:02d} seconds] --- ⏱️".format(hours, minutes, seconds))