## 🏷️ Sub-Phase 1: Initial Filtering

Methods are assigned one of these labels: ADVANTAGE_GAIN if the return value could be exploited, or NO_INFORMATION otherwise.

In [None]:
# Imports
from   dotenv   import load_dotenv
import pandas   as pd
import datetime
import json
import os

# Custom Imports
import sys
sys.path.append('../')
import AnalysisUtils
import LLMUtils 

##### Parameters

In [None]:
# Folder that the initialization step below checks for and creates when missing.
# NOTE(review): the name suggests scratch space for temporary files — confirm which step writes there.
TMP_PATH = "../0_Data/TMP"

#### Initialization

In [None]:
# Announce the start of the run, then start the timer used by the final cell.
startStamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
print("⚡ START: {} ⚡".format(startStamp))
initTime = datetime.datetime.now()

In [None]:
# Make sure the TMP folder is available, reporting whether it had to be created
if os.path.exists(TMP_PATH):
	print("--- 📁✅ Folder already exists: {}\n".format(TMP_PATH))
else:
	os.makedirs(TMP_PATH)
	print("--- 📁🆕 Folder created       : {}\n".format(TMP_PATH))

In [None]:
# Load .env Info: copy the key/value pairs of the .env file into the process environment.
# NOTE(review): presumably this provides the API credentials consumed by LLMUtils — confirm.
load_dotenv()

#### 📥 1) Load Data 

In [None]:
# Starting set of methods: the full set (45k)
DATA_PATH = "../0_Data/androidMethods.csv"

# Load the CSV file into a DataFrame.
# Later cells read the 'sha256' and 'methodSignature' columns of every row.
methodsDF = pd.read_csv(DATA_PATH)

In [None]:
# Test purposes: restrict the run to the first few methods.
# Set TEST_SAMPLE_SIZE to None to process the whole data set.
TEST_SAMPLE_SIZE = 3

if TEST_SAMPLE_SIZE is not None:
	# .copy() detaches the sample from the full frame, so that adding the
	# label column later does not trigger pandas' SettingWithCopyWarning.
	methodsDF = methodsDF.head(TEST_SAMPLE_SIZE).copy()

# Preview the first rows that will be analysed
methodsDF.head(3)

#### 🖥️ 2) LLM Analysis

In [None]:
# MODEL to be used: handed to the LLM interface and embedded in the name of the results file
MODEL = "gpt-4o-mini"

In [None]:
print("\n--- ⭕ LLM Init & Check")
print("--- ⭕ Model: {}".format(MODEL))

# OpenAI (PAYING)
# NOTE(review): the unit of `pricing` is not visible here — confirm against LLMUtils.ChatGPTInterface.
llmInterface = LLMUtils.ChatGPTInterface(model=MODEL, pricing=0.150)

# Send one trivial request and show the reply, so a broken setup is noticed before the real run
pingReply = llmInterface.sendRequest("Ping!")
print(pingReply)

In [None]:
# Base prompt for Sub-Phase 1. The method-specific information is added to it
# through AndroidMethod.addAllToPrompt before it is sent to the LLM.
# The two labels named in the output format must match the label list
# passed to AnalysisUtils.labelAndroidMethod.
PROMPT_PHASE1 = """
You are an expert in Android App Security.  
Your task is to analyze the given Android API method, focusing only on its return value and assessing whether an attacker could exploit it.  

## **Task Instructions:**  
1. Examine the provided documentation to understand the function, context, and significance of the return value.  
2. Determine whether an attacker could exploit this return value for an advantage, either independently or in combination with other data.
3. Consider potential indirect advantages an attacker might gain from this return value. (Example: `<android.hardware.SensorManager: android.hardware.Sensor getDefaultSensor(int,boolean)>` can reveal the presence of a specific sensor, which can later be used to retrieve sensor data.)
4. Only analyze the return value. Do not consider how the method can be used in general.  

## Output Format:
Reply with only one of the following labels 
- ADVANTAGE_GAIN – if the return value could provide an attacker with useful information or an advantage. 
- NO_INFORMATION – if the return value provides no useful information to an attacker.

Provide only the label—no explanations or additional text.
"""

In [None]:
# Paths
# Folder holding one "<sha256>.txt" documentation file per method
DOCUMENTATION_PATH = "../0_Data/methodsDocumentationFiles/"

# Number of replies to get
# (forwarded to AnalysisUtils.labelAndroidMethod; presumably the number of valid replies collected per method — confirm)
NUM_REPLIES = 5

# Max requests to send to LLMs
# (forwarded to AnalysisUtils.labelAndroidMethod; presumably a per-method cap — confirm)
MAX_REQUESTS = 10

In [None]:
print("\n--- ⭕ START\n")

# Process the DataFrame
def processRow(row):
	"""Label a single Android method with the LLM and return the winning label.

	Parameters:
		row: one row of methodsDF; its 'sha256' and 'methodSignature' values are read.

	Returns:
		The label selected by AnalysisUtils.getMostFrequentLabel from the
		frequencies collected by AnalysisUtils.labelAndroidMethod.

	Raises:
		OSError: if the documentation file "<sha256>.txt" cannot be opened.
	"""
	# Get information from the row
	sha256          = row['sha256']
	methodSignature = row['methodSignature']

	print("==="*20)
	print("\n--- 🔍 Android Method: {}".format(methodSignature))

	# Retrieve the documentation of the method
	documentationPath = os.path.join(DOCUMENTATION_PATH, "{}.txt".format(sha256))
	with open(documentationPath, 'r') as DocFile:
		documentation = DocFile.read()

	# Create an object representing the Android Method
	androidMethod = AnalysisUtils.AndroidMethod(sha256, methodSignature, documentation)

	# Create the prompt
	prompt = androidMethod.addAllToPrompt(PROMPT_PHASE1)

	# Query the LLM and keep the label it returned most often
	labelFrequency = AnalysisUtils.labelAndroidMethod(llmInterface, prompt, ['ADVANTAGE_GAIN','NO_INFORMATION'], NUM_REPLIES, MAX_REQUESTS)
	llmLabel 	   = AnalysisUtils.getMostFrequentLabel(labelFrequency)
	print("--- 🔍 Label Frequency     :", labelFrequency)
	print("--- 🏷️ Most Frequent Label :", llmLabel)

	print("==="*20)

	# The label is returned instead of being written into methodsDF here:
	# pandas does not support functions that mutate the frame being iterated.
	return llmLabel

# Label every method, then store all labels in one assignment.
# Building the list first also creates the column when the frame is empty.
methodsDF['llmLabel_START'] = [processRow(row) for _, row in methodsDF.iterrows()]

#### 💾 3) Print Stats and Save Results

In [None]:
# Frequency of each label in the 'llmLabel_START' column
labelCounts = methodsDF['llmLabel_START'].value_counts()

# Number of labelled methods, used as the denominator of the percentages
total = labelCounts.sum()

print("==="*20)
print("\n--- 📊 Final Results:")

# One summary line per label: absolute count and share of the total
for summaryTemplate, label in (
	("--- ✅ ADVANTAGE_GAIN: {} [{:.2f}%]", 'ADVANTAGE_GAIN'),
	("--- ❌ NO_INFORMATION: {} [{:.2f}%]", 'NO_INFORMATION'),
):
	labelCount = labelCounts.get(label, 0)
	# Guard against a division by zero when nothing was labelled
	labelShare = (labelCount / total) * 100 if total > 0 else 0
	print(summaryTemplate.format(labelCount, labelShare))

print("==="*20)

In [None]:
RESULTS_PATH = "./0_PipelineData/"

# The results file is named after the model that produced the labels
resultsFileName = "1_llmPrefilteredMethods_{}.csv".format(MODEL)
resultsFilePath = os.path.join(RESULTS_PATH, resultsFileName)

# Make sure the results directory is there before writing into it
resultsFolderMissing = not os.path.exists(RESULTS_PATH)
if resultsFolderMissing:
	print("--- 📁🆕 Creating results folder: {}".format(RESULTS_PATH))
	os.makedirs(RESULTS_PATH)

# Write the labelled DataFrame as CSV, leaving out the index column
methodsDF.to_csv(resultsFilePath, index=False)

##### 🔚 End

In [None]:
endTime = datetime.datetime.now()
endStamp = endTime.strftime("%Y-%m-%d %H:%M:%S")
print("\n🔚 --- END:  {} --- 🔚".format(endStamp))

# Duration of the whole run, measured from the initTime taken at start-up
totalTime      = endTime - initTime
elapsedSeconds = totalTime.total_seconds()

# Break the duration down into hours / minutes / seconds
hours, withinHour = divmod(elapsedSeconds, 3600)
minutes           = withinHour // 60
seconds           = elapsedSeconds % 60

# The value in brackets is the total elapsed time in seconds, not the seconds component
print("⏱️ --- Time: {:02d} hours and {:02d} minutes [{:02d} seconds] --- ⏱️".format(int(hours), int(minutes), int(elapsedSeconds)))