## 🔢 Sub Phase 2: Embedding

In [None]:
# Imports
from   dotenv   import load_dotenv
import pandas   as pd
import datetime
import sys
import os

# Add the upper folder to sys.path
sys.path.insert(0, "../")
from   Embedding   import EmbeddingsManager
from   RedisClient import RedisClient
from   App         import App

#### Parameters

In [None]:
# TMP Folder: relative path of the TMP directory; the notebook creates it
# further down when it does not exist yet.
TMP_PATH = "../../../0_Data/TMP/"

#### Initialization

In [None]:
# Take the start timestamp once, so that the value printed here is exactly
# the one the elapsed-time computation at the end of the notebook uses.
startTime = datetime.datetime.now()
print("⚡ Start - {} ⚡\n".format(startTime))

In [None]:
# Ensure the TMP folder exists, reporting whether it had to be created
if os.path.exists(TMP_PATH):
    print("📁✅ Folder already exists:", TMP_PATH)
else:
    os.makedirs(TMP_PATH)
    print("📁🆕 Folder created       :", TMP_PATH)

#### 📥 1) Load Data 

In [None]:
# Prefix prepended to the Redis project keys built in this notebook
REDIS_PREFIX = "test"

In [None]:
# Dataset selection (switch by swapping the commented line)
#DATASET = "malicious"
DATASET = "androcatset"

# Remaining components of the project key
DIRECTION        = "backward"
SOURCES_APPROACH = "nosources"

# Project key layout: <prefix>.<dataset>.<direction>.<sources approach>
redisProjectKey = ".".join([REDIS_PREFIX, DATASET, DIRECTION, SOURCES_APPROACH])
print("--- 🔑 Redis Key: ", redisProjectKey)

📡 Redis Connection

In [None]:
# Load the .env file so the connection settings are available in the environment
load_dotenv()

# Redis client bound to the project key built above.
# NOTE(review): environment values are strings (None when unset) — confirm
# that RedisClient accepts them as-is for port and db.
redisClientExtraction = RedisClient(
    host=os.environ.get("REDIS_SERVER"),
    port=os.environ.get("REDIS_PORT"),
    db=os.environ.get("REDIS_DB"),
    password=os.environ.get("REDIS_PSW"),
    projectKey=redisProjectKey,
)

In [None]:
# Candidate input CSV files (keep exactly one assignment active)
#INPUT_PATH   = "../../../0_Data/0_AndroCatSet.csv"
INPUT_PATH   = "../../../0_Data/1_AndroCatSet_Mini.csv"
#INPUT_PATH   = "../../../0_Data/3_MaliciousApps.csv"

# Load the apps table and report how many rows it holds
appsDF = pd.read_csv(INPUT_PATH)
print(f"--- #️⃣ Apps: {len(appsDF)} ")

# TEST: uncomment to restrict the run to a single app
#appsDF = appsDF.head(1)

# Preview of the first rows (shown as the cell output)
appsDF.head(5)

#### 🔢 2) Load Apps and Generate Numerical Embeddings

In [None]:
# Where to store embeddings: a second Redis client whose project key is
# the prefix followed by ".embeddings"
redisClientEmbedding = RedisClient(
    host=os.environ.get("REDIS_SERVER"),
    port=os.environ.get("REDIS_PORT"),
    db=os.environ.get("REDIS_DB"),
    password=os.environ.get("REDIS_PSW"),
    projectKey=f"{REDIS_PREFIX}.embeddings",
)

In [None]:
# Embedding model to use; the commented lines are the alternative choices
# (keep exactly one assignment active).
EMBEDDING_MODEL = "gpt"
#EMBEDDING_MODEL = "codebert"
#EMBEDDING_MODEL = "sfr"

# Create an Embedding Manager from the embeddings Redis client and the
# selected model name
embeddingsManager = EmbeddingsManager(redisClientEmbedding, EMBEDDING_MODEL)
# Show the manager's printable form right after creation
print(embeddingsManager)

In [None]:
def processRow(row):
    """Load the data flows of one app into the Embeddings Manager.

    `row` is one row of the apps table; its 'sha256', 'pkgName' and
    'classID' entries are used to build the App instance. The data flows
    are downloaded from Redis through `redisClientExtraction` and handed
    to `embeddingsManager` only when they exist and are not empty.
    Nothing is returned.
    """
    apkHash = row['sha256']
    print(f"\n--- 🔑 Analyzing APK: {apkHash} 🔑 ---")

    # Build the App and fetch its data flows from Redis
    app = App(apkHash, row['pkgName'], row['classID'])
    app.downloadDataFlowsFromRedis(redisClientExtraction)

    # Nothing to load when no data flows were extracted or they are empty
    flows = app.dataFlows
    if flows is None or flows.isEmpty():
        return

    # Load DataFlows into Embeddings Manager
    embeddingsManager.loadDataFlowsFromApp(flows)

# Process every app row in turn. processRow is called only for its side
# effects (it returns nothing), so a plain loop is used instead of
# DataFrame.apply, which would build a result Series just to discard it.
for _, appRow in appsDF.iterrows():
    processRow(appRow)

In [None]:
# Print the manager before and after the generation step so the two
# printed forms can be compared
print(embeddingsManager)
# Generate the method embeddings (per the method name), passing the
# embeddings Redis client and the selected model name
embeddingsManager.generateMethodsEmbeddings(redisClientEmbedding, EMBEDDING_MODEL)
print(embeddingsManager)

##### 🔚 End

In [None]:
# Record and announce the end of the run
endTime = datetime.datetime.now()
print(f"\n🔚 --- End - {endTime} --- 🔚")

# startTime and endTime are datetime objects, so their difference is a
# timedelta; split its total seconds into whole minutes and leftover seconds.
totalTime = endTime - startTime
minutes, seconds = divmod(totalTime.total_seconds(), 60)
print(f"⏱️ --- Time: {int(minutes):02d} minutes and {int(seconds):02d} seconds --- ⏱️")