## 🏷️ Sub-Phase 2A : Generate embeddings for Android Methods.

Generate a numerical vector representation (embedding) for each Android method labeled `ADVANTAGE_GAIN`. The text embedded for each method is its signature followed by its documentation.

In [None]:
# Imports
from   dotenv      import load_dotenv
import pandas      as pd
import datetime
import os

# Custom Imports
import sys
sys.path.append('../')
import LLMUtils 

##### Parameters

In [None]:
# Directory for temporary files, given relative to the notebook's working directory.
# The initialization step below creates it when it does not exist yet.
TMP_PATH = "../../0_Data/TMP"

#### Initialization

In [None]:
# Announce the start of the run, then record the reference instant used by the
# elapsed-time report at the end of the notebook.
startBanner = "⚡ START: {} ⚡".format(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
print(startBanner)
initTime = datetime.datetime.now()

In [None]:
# Make sure the temporary folder is available and report whether it had to be created
if os.path.exists(TMP_PATH):
	print("--- 📁✅ Folder already exists: {}\n".format(TMP_PATH))
else:
	os.makedirs(TMP_PATH)
	print("--- 📁🆕 Folder created       : {}\n".format(TMP_PATH))

In [None]:
# Load .env Info
# load_dotenv() reads key/value pairs from a .env file into the process environment.
# NOTE(review): presumably this supplies the API credentials needed by LLMUtils — confirm.
load_dotenv()

#### 📥 1) Load Data 

In [None]:
# Name of the LLM whose pre-filtered methods are loaded; it is interpolated into the
# input CSV file name in the next cell.
# NOTE: MODEL is reassigned in the embeddings section further down, so re-running the
# loading cell after that point would look for a different input file.
MODEL = "gpt-4o-mini"

In [None]:
# Location of the pre-filtered methods CSV for the model currently held in MODEL
DATA_PATH = "./0_PipelineData/1_llmPrefilteredMethods_{}.csv".format(MODEL)

# Read the CSV into a DataFrame
methodsDF = pd.read_csv(filepath_or_buffer=DATA_PATH)

# Preview the first three rows (rendered as the cell output)
methodsDF.head(n=3)

In [None]:
# Report how many methods carry each label before any filtering is applied
labelCounts = methodsDF['llmLabel_START'].value_counts()
print("📊 Label Counts:")
for label, count in labelCounts.items():
	print("🔹 {} : {}".format(label, count))

# Maximum number of methods to embed. The value 10 keeps the original test-sized
# run; set it to None to process every 'ADVANTAGE_GAIN' method.
MAX_METHODS = 10

# Keep only the rows labeled 'ADVANTAGE_GAIN', ordered alphabetically by 'methodSignature'
methodsDF = methodsDF[methodsDF['llmLabel_START'] == 'ADVANTAGE_GAIN']
methodsDF = methodsDF.sort_values(by='methodSignature')

# Optionally truncate the selection (test purposes)
if MAX_METHODS is not None:
	methodsDF = methodsDF.head(MAX_METHODS)

# Work on an independent copy so that columns added later are written to this
# frame itself and never to a slice of the loaded data.
methodsDF = methodsDF.copy()

#### 🖥️ 2) LLM Embeddings

In [None]:
# Embedding model ("small" variant)
# NOTE: this overwrites the MODEL value that was used to build the input CSV path;
# from here on MODEL names the embedding model, including in the results file name.
MODEL = "text-embedding-3-small"

# Interface
# Embeddings client from the project-local LLMUtils module, constructed with MODEL.
openAIEmbeddingsInterface = LLMUtils.openAiEmbeddingsInterface(MODEL)

In [None]:
# PARAMETERS
# Folder holding the "<sha256>.txt" documentation files that processRow reads.
# NOTE(review): TMP_PATH points into "../../0_Data" while this path uses "../0_Data" —
# confirm which relative depth is intended.
DOCUMENTATION_PATH = "../0_Data/methodsDocumentationFiles/"

In [None]:
print("\n--- ⭕ START\n")

# To store the results (None gives the column an object dtype, so a cell can hold a whole embedding)
methodsDF['methodEmbedding'] = None

# Process each row in the DataFrame
def processRow(row):
	"""Embed one method and store the result in ``methodsDF``.

	The text sent to the embeddings interface is the method signature, a
	newline, and the content of the documentation file
	``<DOCUMENTATION_PATH>/<sha256>.txt``. The embedding is written to the
	'methodEmbedding' column at the row's own index label.

	Args:
		row: A row of ``methodsDF`` providing the 'sha256' and
			'methodSignature' fields; ``row.name`` must be the row's index
			label in ``methodsDF``.

	Raises:
		OSError: If the documentation file cannot be opened (e.g. it is missing).
		UnicodeDecodeError: If the documentation file is not valid UTF-8.
	"""
	# Get information from the row
	sha256          = row['sha256']
	methodSignature = row['methodSignature']

	# Print the method signature
	print("--- 🔍 Android Method: {}".format(methodSignature))

	# Retrieve the documentation. The encoding is explicit so that decoding does
	# not depend on the platform's default locale.
	documentationPath = os.path.join(DOCUMENTATION_PATH, "{}.txt".format(sha256))
	with open(documentationPath, 'r', encoding='utf-8') as docFile:
		documentation = docFile.read()

	# Combine methodSignature and Documentation
	combinedText = methodSignature + "\n" + documentation

	# Get embedding for the combined text
	combinedEmbedding = openAIEmbeddingsInterface.getEmbedding(combinedText)
	print("--- 🔹 Combined Embedding : DONE")

	# Store the embedding into the DataFrame
	methodsDF.at[row.name, 'methodEmbedding'] = combinedEmbedding

	print("---"*20)

# Walk the rows with an explicit loop instead of DataFrame.apply: processRow writes
# into methodsDF, and pandas does not support functions that mutate the frame while
# apply is running over it.
for _, methodRow in methodsDF.iterrows():
	processRow(methodRow)

#### 💾 3) Save Results

In [None]:
# Output folder for this pipeline step
RESULTS_PATH = "./0_PipelineData/"

# The file name carries the value currently held in MODEL
filePath = os.path.join(RESULTS_PATH, "2_methodsEmbedding_{}.csv".format(MODEL))

# Write the DataFrame without its index column and report the destination
methodsDF.to_csv(path_or_buf=filePath, index=False)
print("--- 💾 Saved Embeddings: {}".format(filePath))

##### 🔚 End

In [None]:
endTime  = datetime.datetime.now()
endStamp = endTime.strftime("%Y-%m-%d %H:%M:%S")
print("\n🔚 --- END:  {} --- 🔚".format(endStamp))

# Elapsed wall-clock time since initTime, split into whole hours, whole minutes
# and the leftover seconds
totalTime      = endTime - initTime
elapsedSeconds = totalTime.total_seconds()
hours          = elapsedSeconds // 3600
minutes        = (elapsedSeconds % 3600) // 60
seconds        = elapsedSeconds % 60

# The bracketed figure is the total elapsed time in seconds; the remainder held in
# `seconds` is computed but not printed.
# NOTE(review): confirm whether the remainder was meant to be shown instead.
print("⏱️ --- Time: {:02d} hours and {:02d} minutes [{:02d} seconds] --- ⏱️".format(int(hours), int(minutes), int(elapsedSeconds)))