## 🦾 Sub-Phase 3: Category-Based Anomaly Detection Training 

In [53]:
# Imports
from   dotenv   import load_dotenv
import pandas   as pd
import datetime
import sys
import os

# Add the upper folder to sys.path
sys.path.insert(0, "../")
from   Training    import TrainingManager
from   RedisClient import RedisClient
from   App         import App

#### Parameters

In [54]:
# Model Folder --> Where to save models
MODELS_PATH = "../../../0_Data/MODELS/"

# TMP Folder
TMP_PATH    = "../../../0_Data/TMP/"

#### Initialization

In [55]:
print("⚡ Start - {} ⚡\n".format(datetime.datetime.now()))
startTime = datetime.datetime.now()

⚡ Start - 2024-08-02 17:18:05.608348 ⚡



In [56]:
# Create TMP Folder
if not os.path.exists(TMP_PATH):
	os.makedirs(TMP_PATH)
	print("📁🆕 Folder created       : {}\n".format(TMP_PATH))
else:
	print("📁✅ Folder already exists: {}\n".format(TMP_PATH))

# Create MODELS Folder
if not os.path.exists(MODELS_PATH):
	os.makedirs(MODELS_PATH)
	print("📁🆕 Folder created       : {}\n".format(MODELS_PATH))
else:
	print("📁✅ Folder already exists: {}\n".format(MODELS_PATH))

📁✅ Folder already exists: ../../../0_Data/TMP/

📁🆕 Folder created       : ../../../0_Data/MODELS/



#### 📥 1) Load Data 

In [57]:
REDIS_PREFIX = "test"

In [58]:
DATASET          = "androcatset"
DIRECTION        = "backward"
SOURCES_APPROACH = "nosources"

redisProjectKey = REDIS_PREFIX + ".{}.{}.{}".format(DATASET, DIRECTION, SOURCES_APPROACH)
print("--- 🔑 Redis Key: ", redisProjectKey)

--- 🔑 Redis Key:  test.androcatset.backward.nosources


📡 Redis Connection

In [59]:
# Load .env file
load_dotenv()
redisClientExtraction = RedisClient(host=os.getenv("REDIS_SERVER"), 
                                    port=os.getenv("REDIS_PORT"), 
                                    db=os.getenv("REDIS_DB"), 
                                    password=os.getenv("REDIS_PSW"), 
                                    projectKey = redisProjectKey)

In [60]:
# Paths
INPUT_PATH   = "../../../0_Data/2_AndroCatSet_TrainingSet.csv"
INPUT_PATH   = "../../../0_Data/1_AndroCatSet_Mini.csv"

# Read the data
appsDF = pd.read_csv(INPUT_PATH)
print("--- #️⃣ Apps: {} ".format(appsDF.shape[0]))

# TEST
#appsDF = appsDF.head(3)
appsDF.head(10)

--- #️⃣ Apps: 50 


Unnamed: 0,sha256,pkgName,classID,googlePlayCategoryID,googlePlayDescription
0,9B30837BD2474AC3623A43D052F7ADC4C63E4AA9981F0F...,my.android.calc,Calculator,TOOLS,Handiness universal percentage calculator for ...
1,686DE8D8A0D08992CB135BC7A0500D0109D9697A1140B8...,com.vpn.basiccalculator,Calculator,TOOLS,CITIZEN CALCULATOR by ANGEL NX is best Mobile ...
2,A49864DCC90F6730569455BDFA39B4B7CF70AE0C34D656...,com.ba.fractioncalculator,Calculator,EDUCATION,"<b>Free offline fraction calculator</b> ✌, sup..."
3,85C80B7ED3799C04CBD107DD3004F91ED4DC2BF8D75C35...,de.sfr.calctape,Calculator,FINANCE,What happens if you combine a standard calcula...
4,66442DEF269FF22FB4177759343D973DA1A6A3AFE2ECE3...,com.digitalchemy.calculator.freefraction,Calculator,TOOLS,Fraction Calculator Plus is your best way to d...
5,E62452B248DDDC34646D98BEC604C236B865115DA1CD4B...,com.digitalchemy.calculator.freedecimal,Calculator,TOOLS,<strong>Calculator Plus - a perfect calculator...
6,D35EDCE75773CB40CAC2BBD6AC294DE8103F3B7CA11DD6...,com.candl.athena,Calculator,TOOLS,Express your style with the free calculator ma...
7,0708505CD2893C15589BD3BCBEEA22F414B0EF23B7BC11...,com.dencreak.dlcalculator,Calculator,PRODUCTIVITY,This calculator allows you to easily handle al...
8,5482249D7EF2DCA96CDC791433F86EE0C962A1F71712C4...,calculator.innovit.com.calculatrice,Calculator,TOOLS,Calculator is the essential tool for your smar...
9,F1D6B18311B1C80936935BC6199A338E599AF4509E0DAB...,apps.r.calculator,Calculator,TOOLS,Calculator allows you to perform simple and co...


#### 🏷️ 2) Load Data Flows Embeddings and Train a Model.

In [61]:
# Where embeddingsa are stores
redisClientEmbedding  = RedisClient(host=os.getenv("REDIS_SERVER"), 
									port=os.getenv("REDIS_PORT"), 
									db=os.getenv("REDIS_DB"), 
									password=os.getenv("REDIS_PSW"), 
									projectKey = REDIS_PREFIX + ".embeddings")

In [62]:
EMBEDDING_MODEL = "gpt"
#EMBEDDING_MODEL = "codebert"
#EMBEDDING_MODEL = "sfr"

In [63]:
# Group by category
groupedDF = appsDF.groupby('classID')

In [64]:
for categoryID, categoryDF in groupedDF:
	print("\n🏷️ --- category ID: {} --- 🏷️".format(categoryID))
	print("--- #️⃣ Num. of apps: {}".format(categoryDF.shape[0]))

	# TRAINING MANAGER
	trainingManager = TrainingManager(EMBEDDING_MODEL)

	def processRow(row):
		# Print message 
		print("\n--- 🔑 Analyzing APK: {} 🔑 ---".format(row['sha256']))

		# Create App instance
		app = App(row['sha256'], row['pkgName'], row['classID'])

		# Get Data Flows From Redis
		app.downloadDataFlowsFromRedis(redisClient = redisClientExtraction)

		#Check if dataFlows have been extracted and are not empty.
		if(app.dataFlows is not None and not app.dataFlows.isEmpty()):
			
			app.downloadPairsEmbeddingsFromRedis(redisClientEmbedding, EMBEDDING_MODEL)

			# Load DataFlows into Embeddings Manager
			trainingManager.loadEmbeddingsFromApp(app.embeddings[EMBEDDING_MODEL])

		print("--- ⏱️ Loaded into Training Manager\n--- ⏱️ Time: {} \n--- ⏱️ Time Elapsed: {} seconds".format(datetime.datetime.now(), int((datetime.datetime.now() - startTime).total_seconds())))
		print("--- 📐 New Embedding shape: {}".format(trainingManager.embeddings.shape))

	# Apply the function to each row in the DataFrame
	_ = categoryDF.apply(processRow, axis=1)

	print("\n\n--- ✅ Training Manager Fully Loaded")
	print("--- ⏱️ Time: {} \n--- ⏱️ Time Elapsed: {} seconds".format(datetime.datetime.now(), int((datetime.datetime.now() - startTime).total_seconds())))

	# Print Loaded Manager
	print(trainingManager)

	# Train the model
	try:
		trainingManager.trainAnomalyDetectionModel(MODELS_PATH + "{}/{}.joblib".format(categoryID, EMBEDDING_MODEL))
	except:
		continue

	# Print Results
	print(trainingManager.trainingResults)


🏷️ --- category ID: Calculator --- 🏷️
--- #️⃣ Num. of apps: 10

--- 🔑 Analyzing APK: 9B30837BD2474AC3623A43D052F7ADC4C63E4AA9981F0F9DAA658F825EBDB224 🔑 ---
--- ❌ Data Flows Unavailaible on Redis
--- ⏱️ Loaded into Training Manager
--- ⏱️ Time: 2024-08-02 17:18:05.698642 
--- ⏱️ Time Elapsed: 0 seconds
--- 📐 New Embedding shape: (0,)

--- 🔑 Analyzing APK: 686DE8D8A0D08992CB135BC7A0500D0109D9697A1140B8EDADF19B3B3BD2220C 🔑 ---
--- ❌ Data Flows Unavailaible on Redis
--- ⏱️ Loaded into Training Manager
--- ⏱️ Time: 2024-08-02 17:18:05.703518 
--- ⏱️ Time Elapsed: 0 seconds
--- 📐 New Embedding shape: (0,)

--- 🔑 Analyzing APK: A49864DCC90F6730569455BDFA39B4B7CF70AE0C34D656D078790CBA3521C0A0 🔑 ---
--- ❌ Data Flows Unavailaible on Redis
--- ⏱️ Loaded into Training Manager
--- ⏱️ Time: 2024-08-02 17:18:05.705586 
--- ⏱️ Time Elapsed: 0 seconds
--- 📐 New Embedding shape: (0,)

--- 🔑 Analyzing APK: 85C80B7ED3799C04CBD107DD3004F91ED4DC2BF8D75C3589176EEC1D61F3D02E 🔑 ---
--- ❌ Data Flows Unavailaib

##### 🔚 End

In [65]:
endTime = datetime.datetime.now()
print("\n🔚 --- End - {} --- 🔚".format(endTime))

# Assuming endTime and startTime are in seconds
totalTime = endTime - startTime
minutes = totalTime.total_seconds() // 60
seconds = totalTime.total_seconds() % 60
print("⏱️ --- Time: {:02d} minutes and {:02d} seconds --- ⏱️".format(int(minutes), int(seconds)))


🔚 --- End - 2024-08-02 17:18:13.537546 --- 🔚
⏱️ --- Time: 00 minutes and 07 seconds --- ⏱️
