## 🔢 Data Flows Embedding

Generate embeddings for the Data Flows using one of the following models.

1. GPT Embedding Model by OpenAI
2. CodeBert
3. SfrMistral

In [None]:
# Imports
from   dotenv   import load_dotenv
import pandas   as pd
import itertools
import datetime
import json
import sys
import os

# Add the upper folder to sys.path
sys.path.insert(0, "../../")
from   RedisClient import RedisClient
from   App         import App
from   App         import DataFlows
from   Embedding   import EmbeddingsManager

#### Parameters

In [None]:
# Folder for temporary files. The path is relative, so it is resolved against
# the working directory the notebook is run from.
TMP_PATH = "../../../../0_Data/TMP/"

#### Initialization

In [None]:
# Take the start timestamp once, so the value that is logged is exactly the
# value later used to compute the elapsed time.
startTime = datetime.datetime.now()
print("⚡ Start - {} ⚡\n".format(startTime))

In [None]:
# Create TMP Folder.
# The creation is attempted directly rather than guarded by an existence check,
# so there is no window in which the folder can appear between the check and
# the makedirs() call.
try:
	os.makedirs(TMP_PATH)
except FileExistsError:
	print("📁✅ Folder already exists:", TMP_PATH)
else:
	print("📁🆕 Folder created       :", TMP_PATH)

#### 📥 1) Load Data 

In [None]:
# Dataset selection: keep exactly one DATASET line active.
# The value is used further down to choose the Redis project key and to build
# the path of the CSV file that lists the apps.
#DATASET = "0_AndroCatSet"
#DATASET = "1_AndroCatSetMini"
DATASET = "2_AndroCatSetTestSet"
#DATASET = "3_MalCatSet"
#DATASET = "4_Mudflow"

📡 Redis Connection

In [None]:
# Load the variables of a .env file into the process environment.
# NOTE(review): the REDIS_* settings read with os.getenv() in the following
# cells are presumably defined in that file — confirm.
load_dotenv()

In [None]:
# Redis project key used for each supported dataset (the three AndroCatSet
# variants share the same key).
REDIS_PROJECT_KEYS = {
	"0_AndroCatSet"       : "marco.dataflow.extraction.androcatset.backward.pairs",
	"1_AndroCatSetMini"   : "marco.dataflow.extraction.androcatset.backward.pairs",
	"2_AndroCatSetTestSet": "marco.dataflow.extraction.androcatset.backward.pairs",
	"3_MalCatSet"         : "marco.dataflow.extraction.malcatsetall.backward.pairs",
	"4_Mudflow"           : "marco.dataflow.extraction.mudflowall.backward.pairs",
}

# Fail with a clear message on an unsupported dataset name, instead of a
# NameError on REDIS_PROJECT_KEY when the client is created.
if DATASET not in REDIS_PROJECT_KEYS:
	raise ValueError("Unknown DATASET {!r}; expected one of: {}".format(DATASET, ", ".join(REDIS_PROJECT_KEYS)))

REDIS_PROJECT_KEY = REDIS_PROJECT_KEYS[DATASET]

# Client used to read the extracted data flows; connection settings come from
# the environment.
redisClientExtraction = RedisClient(
	host=os.getenv("REDIS_SERVER"),
	port=os.getenv("REDIS_PORT"),
	db=os.getenv("REDIS_DB"),
	password=os.getenv("REDIS_PSW"),
	projectKey=REDIS_PROJECT_KEY,
)

In [None]:
# CSV file listing the apps of the selected dataset
DATA_PATH = f"../../../../0_Data/{DATASET}.csv"

# Load the apps table and report how many apps it contains
appsDF = pd.read_csv(DATA_PATH)
print(f"--- #️⃣ Apps: {len(appsDF)} ")

# TEST
# NOTE(review): only the first 10 apps are kept from here on; drop this line
# to process the whole dataset.
appsDF = appsDF.head(10)

# Preview the first rows (shown as the cell output)
appsDF.head(5)

#### 🔢 2) Load Apps and Generate Numerical Embeddings

In [None]:
# Embedding model: keep exactly one EMBEDDING_MODEL line active.
# The value is passed to the Embeddings Manager further down.
EMBEDDING_MODEL = "gpt"
#EMBEDDING_MODEL = "codebert"
#EMBEDDING_MODEL = "sfr"

# Embedding schema: keep exactly one EMBEDDING_SCHEMA line active.
# It selects the Redis project key used for the embeddings; with
# "onlysignatures" the data flows are additionally reduced through
# keepOnlySignatures() before being loaded.
#EMBEDDING_SCHEMA = "onlysignatures"
EMBEDDING_SCHEMA = "fullstatements"

In [None]:
# Schemas for which an embedding Redis project key exists
EMBEDDING_SCHEMAS = ("onlysignatures", "fullstatements")

# Fail with a clear message on an unsupported schema name, instead of a
# NameError on redisClientEmbedding when it is first used.
if EMBEDDING_SCHEMA not in EMBEDDING_SCHEMAS:
	raise ValueError("Unknown EMBEDDING_SCHEMA {!r}; expected one of: {}".format(EMBEDDING_SCHEMA, ", ".join(EMBEDDING_SCHEMAS)))

# Each schema uses its own project key, "marco.dataflow.embedding.<schema>";
# connection settings come from the environment.
redisClientEmbedding = RedisClient(
	host=os.getenv("REDIS_SERVER"),
	port=os.getenv("REDIS_PORT"),
	db=os.getenv("REDIS_DB"),
	password=os.getenv("REDIS_PSW"),
	projectKey="marco.dataflow.embedding.{}".format(EMBEDDING_SCHEMA),
)

In [None]:
# Create an Embedding Manager from the embedding Redis client and the selected
# model name, then print it (the output depends on the manager's own string
# representation).
embeddingsManager = EmbeddingsManager(redisClientEmbedding, EMBEDDING_MODEL)
print(embeddingsManager)

In [None]:
def processRow(row):
	"""Load the data flows of one app into the module-level embeddings manager.

	`row` is one row of the apps table; its 'sha256', 'pkgName' and 'classID'
	fields are used to build the App. Nothing is loaded when, after the
	download from Redis, the app's data flows are None or empty.
	"""
	sha256 = row['sha256']
	print("\n--- 🔑 Analyzing APK: {} 🔑 ---".format(sha256))

	# Build the App and fetch its data flows through the extraction client
	app = App(sha256, row['pkgName'], row['classID'])
	app.downloadDataFlowsFromRedis(redisClientExtraction)

	# Nothing to do when the data flows are missing or empty
	if app.dataFlows is None or app.dataFlows.isEmpty():
		return

	# Reduce the data flows to signatures when that schema is selected
	if EMBEDDING_SCHEMA == "onlysignatures":
		app.dataFlows.keepOnlySignatures()

	# Hand the data flows over to the Embeddings Manager
	embeddingsManager.loadDataFlowsFromApp(app.dataFlows)

# Process every app of the table, in order.
# An explicit loop is used because processRow is called only for its side
# effects: DataFrame.apply would build a result that is thrown away and may
# call the function on a placeholder row when the table is empty.
for _, appRow in appsDF.iterrows():
	processRow(appRow)

In [None]:
# Print the manager before the generation step
print(embeddingsManager)

# Generate the methods embeddings, passing the embedding Redis client and the
# selected model name
embeddingsManager.generateMethodsEmbeddings(redisClientEmbedding, EMBEDDING_MODEL)

# Print the manager again after the generation step
print(embeddingsManager)
# NOTE(review): `shape` is compared with 0 to report that nothing new was
# generated — confirm it is a scalar count and not a tuple.
if embeddingsManager.shape == 0:
	print("--- ⏭️ No NEW EMBEDDINGS Generated")

##### 🔚 End

In [None]:
# Take the end timestamp and log it
endTime = datetime.datetime.now()
print("\n🔚 --- End - {} --- 🔚".format(endTime))

# endTime and startTime are datetime objects, so their difference is a
# timedelta; its whole seconds are split into minutes and remaining seconds.
totalTime = endTime - startTime
minutes, seconds = divmod(int(totalTime.total_seconds()), 60)
print("⏱️ --- Time: {:02d} minutes and {:02d} seconds --- ⏱️".format(minutes, seconds))