## 🧪 RQ3: Analysis

Analyze apps that share the same package name (pkgName) but were downloaded from different locations.

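Apps should be placed in a folder named "0_Data", in one subfolder per location named after the location followed by "Apps", e.g. "LuxembourgApps", "SantiagoApps", ... Inside each location subfolder, every app lives in its own folder named after its package name and containing "<pkgName>.apk".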

In [None]:
# Imports
from   loguru     import logger
from   dotenv     import load_dotenv
import pandas     as pd
import datetime
import hashlib
import os
import gc

# Custom Imports
import sys
sys.path.append('../')
import AppUtils 	
import PairwiseAnalysisUtils

##### Parameters

In [None]:
# Scratch folder: created at start-up if it does not exist and passed to AppUtils.App for every analysed app
TMP_PATH = "../../0_Data/TMP/"

#### Initialization

In [None]:
# Announce the start of the run with a human-readable timestamp
startStamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
print("⚡ START: {} ⚡".format(startStamp))

# Reference instant used by the later cells to report elapsed time
initTime = datetime.datetime.now()

In [None]:
# Make sure the temporary working folder is available before any analysis starts
if os.path.exists(TMP_PATH):
	print("--- 📁✅ Folder already exists: {}\n".format(TMP_PATH))
else:
	os.makedirs(TMP_PATH)
	print("--- 📁🆕 Folder created       : {}\n".format(TMP_PATH))

In [None]:
# Load .env Info
# load_dotenv() is python-dotenv's loader (imported at the top of the notebook).
# NOTE(review): which variables the .env file must define is not visible here —
# presumably they are read inside AppUtils / PairwiseAnalysisUtils; confirm.
load_dotenv()

In [None]:
# Disable root logging for AndroGuard
# Called without a handler id, loguru's logger.remove() detaches every registered
# handler, so nothing logged through loguru is emitted from this point on.
# NOTE(review): assumes AndroGuard logs through loguru — confirm for the installed version.
logger.remove() 

#### 📥 1] Load Apps and check

In [None]:
# Root folder that holds one "<Location>Apps" subfolder per download location
DATA_PATH = "./0_Data/"

# Download locations; every unordered pair of them is compared
LOCATION_LIST = [
	'LosAngeles',
	'Santiago',
	'Tokyo',
	'Luxembourg',
	'TelAviv',
	'Sydney',
	'Johannesburg',
]

# Folder that receives the results CSV
RESULTS_PATH = "./1_Results/"

In [None]:
print("--- ⭕ [START] Hash Check for Apps")
print("--- ⏳ Start Time: {}\n".format(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")))

# SHA-256 digests computed during this run, keyed by APK path.
# Every location takes part in several pairs, so caching avoids re-reading the same APK once per pair.
apkHashCache = {}

def _getApkHash(apkPath, chunkSize=1024 * 1024):
	"""Return the SHA-256 hex digest of the file at apkPath.

	The file is read in chunks of chunkSize bytes so that a large APK is never
	held in memory in one piece. The digest is memoised in apkHashCache.

	Raises:
		FileNotFoundError: if apkPath does not exist (nothing is cached in that case).
	"""
	if apkPath not in apkHashCache:
		digest = hashlib.sha256()
		with open(apkPath, "rb") as apkFile:
			for chunk in iter(lambda: apkFile.read(chunkSize), b""):
				digest.update(chunk)
		apkHashCache[apkPath] = digest.hexdigest()
	return apkHashCache[apkPath]

def _listPkgFolders(locationFolder):
	"""Return the names of the sub-directories of locationFolder as a set.

	Plain files (e.g. OS metadata files) are skipped: only directories can hold an app.
	"""
	return {
		entry for entry in os.listdir(locationFolder)
		if os.path.isdir(os.path.join(locationFolder, entry))
	}

# Maps "<location1>_<location2>" to the list of app pairs whose APK hashes differ
allCombinationsDict = {}

# For each unordered pair of locations, collect the apps whose APKs differ
for i, location1 in enumerate(LOCATION_LIST):
	for location2 in LOCATION_LIST[i+1:]:

		folderLocation1 = os.path.join(DATA_PATH, location1 + "Apps/")
		folderLocation2 = os.path.join(DATA_PATH, location2 + "Apps/")

		print("--- 🌍 Location 1         : {}".format(location1))
		print("--- 📁 Folder Location 1  : {}".format(folderLocation1))
		print("--- 🌍 Location 2         : {}".format(location2))
		print("--- 📁 Folder Location 2  : {}".format(folderLocation2))

		# Package names available in both locations, in a stable (sorted) order.
		# A missing location folder yields an empty list instead of aborting the whole run.
		pkgNameList = []
		if os.path.isdir(folderLocation1) and os.path.isdir(folderLocation2):
			pkgNameList = sorted(_listPkgFolders(folderLocation1) & _listPkgFolders(folderLocation2))
		else:
			print("--- ❗ Location folder missing, no apps compared for this pair")
		print("\n--- #️⃣ Matching subfolders     : {}".format(len(pkgNameList)))

		# Compare the APK hashes and keep only the apps that differ
		appsDictList = []
		for pkgName in pkgNameList:
			apkPath1 = os.path.join(folderLocation1, pkgName, pkgName + ".apk")
			apkPath2 = os.path.join(folderLocation2, pkgName, pkgName + ".apk")

			try:
				hash1 = _getApkHash(apkPath1)
				hash2 = _getApkHash(apkPath2)
			except FileNotFoundError as e:
				print("--- ❗ File not found: {}".format(e))
				continue

			# Identical APKs need no pairwise analysis
			if hash1 == hash2:
				continue

			# Describe both versions of the app, keyed by the location they were downloaded from
			appsDictList.append({
				location1: {
					"id"		: location1 + "_" + pkgName,
					"path"		: apkPath1,
					"pkgName"	: pkgName
				},
				location2: {
					"id"		: location2 + "_" + pkgName,
					"path"		: apkPath2,
					"pkgName"	: pkgName
				}
			})

		print("--- #️⃣ Apps with hash mismatch : {}".format(len(appsDictList)))

		# Store the mismatching apps of this location pair
		dictKey = location1 + "_" + location2
		allCombinationsDict[dictKey] = appsDictList

		print("\n" + "---"*20 + "\n")

print("\n--- ⭕ [END] Hash Check for Apps")
print("--- ⏳ End Time   : {}".format(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")))
# Elapsed time is measured from initTime, i.e. from the start of the notebook run
print("--- ⏳ Total Time : {}".format(datetime.datetime.now() - initTime))
print("\n" + "==="*25 + "\n")

#### 🧪 2] Analysis

In [None]:
# print(allCombinationsDict)

In [None]:
print("--- ⭕ [START] Pairwise Analysis")
print("--- ⏳ Start Time: {}\n".format(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")))

# One row (dict) per successfully analysed app pair
results = []

# Each key is "<location1>_<location2>"; each value is the list of app pairs whose APK hashes differ
for key, value in allCombinationsDict.items():
	# NOTE: unpacking into exactly two names relies on location names not containing "_"
	location1, location2 = key.split("_")
	print("--- 🌍 Location 1  : {}".format(location1))
	print("--- 🌍 Location 2  : {}".format(location2))

	# App pairs to analyse for this location pair
	appsDictList = value
	print("--- #️⃣ Apps        : {}".format(len(appsDictList)))

	# Progress is reported 1-based so the last pair reads "N out of N"
	for idx, appPair in enumerate(appsDictList, start=1):
		print("\n--- 🔄 Analysis            :  {} out of {}\n".format(idx, len(appsDictList)))
	
		# Info recorded for each of the two versions of the app
		app1Info = appPair[location1]
		app2Info = appPair[location2]

		# Get the package name
		pkgName = app1Info["pkgName"]
		print("--- 📦 Package Name        : {}".format(pkgName))

		# Pre-bind the names so the cleanup in `finally` is safe even when construction fails
		location1App = location2App = analysis = None

		try:
			# Build the objects inside the try block: a single broken APK must not abort the whole run
			location1App = AppUtils.App(app1Info["id"], pkgName, TMP_PATH, app1Info["path"])
			location2App = AppUtils.App(app2Info["id"], pkgName, TMP_PATH, app2Info["path"])
			analysis = PairwiseAnalysisUtils.PairwiseAnalysis(location1App, location2App)

			# Analysis Phases
			analysis.runAnalysisSetup()
			analysis.runExtraction()
			analysis.runComparison(silentMode=False)
			analysis.runScoresComputation()

			# Get scores: every entry of analysis.scores becomes a "Score_<name>" column
			overallScore = analysis.scores["overallScore"]
			scoreColumns = {"Score_{}".format(k): v for k, v in analysis.scores.items()}

			# Store results
			results.append({
					"pkgName": pkgName,
					"location1": location1,
					"location2": location2,
					"overallScore": overallScore,
					**scoreColumns
			})
			print("--- ✅ [Success] Finished  : {}".format(pkgName))	

		except Exception as e:
			# Report the failure together with its cause, then move on to the next pair
			print("--- ❌ [Error]   Analyzing : {} ({})".format(pkgName, e))	

		finally:
			# Clean up exactly once, whether the analysis succeeded or failed.
			# NOTE(review): the original comment here mentioned "JSON Intermediate files" —
			# presumably runCleaning() removes them; confirm in PairwiseAnalysisUtils.
			if analysis is not None:
				analysis.runCleaning()

			# Memory cleanup
			del location1App, location2App, analysis
			gc.collect()

	print("\n" + "---"*20 + "\n")

print("\n--- ⭕ [END] Pairwise Analysis")
print("--- ⏳ End Time   : {}".format(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")))
# Elapsed time is measured from initTime, i.e. from the start of the notebook run
print("--- ⏳ Total Time : {}".format(datetime.datetime.now() - initTime))
print("\n" + "==="*20 + "\n")

#### 💾 3] Save Results

In [None]:
# Create a pandas DataFrame from the results list
resultsDF = pd.DataFrame(results)

# Reorder the data
resultsDF = resultsDF.sort_values(by=["pkgName", "location1", "location2"]).reset_index(drop=True)

# Show
resultsDF.head()

In [None]:
# Save the DataFrame to a CSV file in the RESULTS_PATH directory
csvPath = os.path.join(RESULTS_PATH, "RQ3_Results.csv")

# No visible cell creates RESULTS_PATH and to_csv does not create missing parent folders,
# so make sure it exists rather than losing the analysis output at the very last step
os.makedirs(RESULTS_PATH, exist_ok=True)

# Save
resultsDF.to_csv(csvPath, index=False)
print("--- 💾 Results saved to {}".format(csvPath))

##### 🔚 End

In [None]:
# Timestamp marking the end of the run
endTime = datetime.datetime.now()
print("\n🔚 --- END:  {} --- 🔚".format(endTime.strftime("%Y-%m-%d %H:%M:%S")))

# Elapsed time since initTime, broken down into hours / minutes / seconds
totalTime      = endTime - initTime
elapsedSeconds = totalTime.total_seconds()
hours, remainder = divmod(elapsedSeconds, 3600)
minutes        = remainder // 60
seconds        = elapsedSeconds % 60

# The value in brackets is the total elapsed time in whole seconds
print("⏱️ --- Time: {:02d} hours and {:02d} minutes [{:02d} seconds] --- ⏱️".format(int(hours), int(minutes), int(elapsedSeconds)))