## ⛏️ Sub-Phase 1 : Backward Taint Analysis for Data Flow Pairs Extraction

In [None]:
# Imports
from   dotenv   import load_dotenv
import pandas   as pd
import datetime
import sys
import os

# Add the upper folder to sys.path
sys.path.insert(0, "../")
from   RedisClient import RedisClient
from   App         import App

#### Parameters

In [None]:
# TMP Folder: relative path of the temporary-files folder. It is created
# further down when missing and later passed to App.extractDataFlows.
TMP_PATH = "../../../../0_Data/TMP/"

#### Initialization

In [None]:
# Capture the start timestamp once, so that the banner printed here and the
# value stored in startTime refer to exactly the same instant.
startTime = datetime.datetime.now()
print("⚡ Start - {} ⚡\n".format(startTime))

In [None]:
# Create TMP Folder
# The existence check only selects which message is printed. The creation
# itself uses exist_ok=True so it does not raise FileExistsError when another
# process creates the folder between the check and the makedirs call.
if os.path.exists(TMP_PATH):
	print("📁✅ Folder already exists:", TMP_PATH)
else:
	os.makedirs(TMP_PATH, exist_ok=True)
	print("📁🆕 Folder created       :", TMP_PATH)

#### 📥 1) Read Data and Push to Redis

In [None]:
# First component of the Redis project key that is assembled further down
REDIS_PREFIX = "test"

In [None]:
# Dataset name (comment/uncomment to switch)
#DATASET = "malicious"
DATASET = "androcatset"

# Analysis direction, forwarded to App.extractDataFlows
DIRECTION = "forward"
#DIRECTION = "backward"

# Sources approach, forwarded to App.extractDataFlows
SOURCES_APPROACH = "docflow"
#SOURCES_APPROACH = "nosources"

# Redis project key layout: <prefix>.<dataset>.<direction>.<sources approach>
redisProjectKey = ".".join([REDIS_PREFIX, DATASET, DIRECTION, SOURCES_APPROACH])
print("--- 🔑 Redis Key: ", redisProjectKey)

#### 📡 Redis connection

You can use this cell to check the current status of the extraction. You may have to launch the extraction multiple times until the success list contains all the analyzed apps.

In [None]:
# Load variables from a .env file (if one is found) into the environment,
# so that the os.getenv() lookups below can see them.
load_dotenv()

# Redis client bound to the project key of the current configuration.
# Each connection setting comes from its own environment variable.
redisClientExtraction = RedisClient(
	host=os.getenv("REDIS_SERVER"),
	port=os.getenv("REDIS_PORT"),
	db=os.getenv("REDIS_DB"),
	password=os.getenv("REDIS_PSW"),
	projectKey=redisProjectKey,
)

# Report the current status of the extraction
redisClientExtraction.printStatus()

In [None]:
# Input CSV with the apps to analyze (alternatives kept for quick switching)
INPUT_PATH   = "../../../0_Data/0_AndroCatSet.csv"
#INPUT_PATH   = "../../../0_Data/1_AndroCatSet_Mini.csv"
#INPUT_PATH   = "../../../0_Data/3_MaliciousApps.csv"

# Load the apps table and report its number of rows
appsDF = pd.read_csv(INPUT_PATH)
print(f"--- #️⃣ Apps: {len(appsDF)} ")

# TEST: keep only the first 3 rows, then display them
appsDF = appsDF.iloc[:3]
appsDF

In [None]:
# Hand every value of the 'sha256' column to the Redis client's loadPopList
redisClientExtraction.loadPopList(appsDF["sha256"].tolist())

#### 🔁 Extraction Loop Execution: Pop from Redis and extract, then push results back.

In [None]:
# Path to Android Platforms, read from the ANDROID_PATH environment variable.
# load_dotenv() first loads variables from a .env file, if one is found;
# os.getenv() returns None when the variable is not set.
load_dotenv()
ANDROID_PATH = os.getenv("ANDROID_PATH")

In [None]:
# Path to the Java JAR (with dependencies) used for Data Flows Extraction
JAVA_EXTRACTOR_PATH = "../../1_Java/damflow_extractor/target/damflow_extractor-1.0-jar-with-dependencies.jar"

# Timeout for Data Flow Analysis, passed to App.extractDataFlows
# (presumably in seconds — TODO confirm against App.extractDataFlows)
TIMEOUT = 7200

In [None]:
# Keep popping hashes from the Redis pop list until rpop yields None
while (sha256 := redisClientExtraction.client.rpop(redisClientExtraction.popKey)) is not None:

	# The popped value is bytes: decode it into a str before using it
	sha256 = sha256.decode("utf-8")
	print("==" * 40 + "\n")
	print(f"🔑 Analyzing APK: {sha256}")

	# A stored result for this hash means it was handled before: move on
	if redisClientExtraction.client.hget(redisClientExtraction.resultsKey, sha256) is not None:
		print("\n⏭️  Already Processed --> Skip")
		continue

	# Run the data flow extraction for this app
	try:
		# Build the App instance for this hash
		app = App(sha256=sha256)

		# Extract the data flows with the configured paths and options
		app.extractDataFlows(TMP_PATH, JAVA_EXTRACTOR_PATH, ANDROID_PATH, DIRECTION, SOURCES_APPROACH, TIMEOUT)

		# Serialize the extracted data flows to a JSON string
		jsonString = app.dataFlows.toJsonString()

		# Save the JSON under this hash in the Redis results hash
		redisClientExtraction.client.hset(redisClientExtraction.resultsKey, sha256, jsonString)
		print(f"\n✅ Success for APK: {sha256}", flush=True)

	# On any failure: report it and record the hash in the Redis error list
	except Exception as e:
		print(f"\n❌  Failed with Exception {e}", flush=True)
		redisClientExtraction.client.lpush(redisClientExtraction.errorKey, sha256)
	print("==" * 40 + "\n")