## 🧪 RQ2: Sensitive Permissions Analysis

Analysis

In [None]:
# Imports
from   dotenv     			import load_dotenv
from   collections          import Counter
import matplotlib.pyplot 	as plt
import matplotlib.colors   	as mcolors
import seaborn 				as sns
import pandas    			as pd
import datetime
import os

##### Parameters

In [None]:
# Relative path of the scratch folder (resolved against the current working directory)
TMP_PATH = "../../0_Data/TMP/"

#### Initialization

In [None]:
# Print the start banner, then record the timestamp used for the elapsed-time report
startStamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
print("⚡ START: {} ⚡".format(startStamp))
initTime = datetime.datetime.now()

In [None]:
# Make sure the TMP folder is available, reporting whether it had to be created
tmpFolderPresent = os.path.exists(TMP_PATH)
if tmpFolderPresent:
    print("--- 📁✅ Folder already exists: {}\n".format(TMP_PATH))
else:
    os.makedirs(TMP_PATH)
    print("--- 📁🆕 Folder created       : {}\n".format(TMP_PATH))

In [None]:
# Load the variables declared in a .env file into the process environment (python-dotenv)
load_dotenv()

#### 🧪 Analysis

In [None]:
# Folder holding the per-location files, named "permissions_<location>.json"
DATA_PATH = './0_Data/3_Permissions'

# Fixed parts of the file name that surround the location
FILE_PREFIX = 'permissions_'
FILE_SUFFIX = '.json'

# Read every matching JSON file into a DataFrame keyed by its location.
# os.listdir() returns entries in arbitrary order, so the names are sorted to keep
# the order of locationDFs / locationsList (and of everything derived from them)
# reproducible from one run or machine to the next.
locationDFs   = {}
locationsList = []
for fileName in sorted(os.listdir(DATA_PATH)):
    if not (fileName.startswith(FILE_PREFIX) and fileName.endswith(FILE_SUFFIX)):
        continue

    # Strip only the fixed prefix and extension, so a location containing '_' or '.'
    # is kept whole instead of being truncated (truncation could make two files
    # collide on the same dictionary key and silently overwrite each other).
    location = fileName[len(FILE_PREFIX):-len(FILE_SUFFIX)]
    filePath = os.path.join(DATA_PATH, fileName)

    locationDFs[location] = pd.read_json(filePath)
    locationsList.append(location)

In [None]:
# Report the number of rows loaded for each location
for location, df in locationDFs.items():
    rowCount = df.shape[0]
    print("--- 📍 {:<14}: {:<5} ".format(location, rowCount))

Load the permission list with protection levels

In [None]:
# CSV listing every permission together with its protection level
PERMISSION_LIST_PATH = "./0_Data/androidPermissions.csv"

# Load the permission catalogue
permissionDF = pd.read_csv(PERMISSION_LIST_PATH)
# Preview; a notebook only renders it when it is the last expression of the cell
permissionDF.head(3)

# Keep the constant values whose protection level mentions 'dangerous'
# (missing protection levels are treated as an empty string, i.e. not dangerous)
protectionLevels = permissionDF['protectionLevel'].fillna('')
isDangerous = protectionLevels.str.contains('dangerous')
dangerousPermissionsList = permissionDF[isDangerous]['constantValue'].tolist()

dangerousTotal = len(dangerousPermissionsList)
print("--- #️⃣ Number of dangerous permissions:", dangerousTotal)
for perm in dangerousPermissionsList:
    print(perm)

Stats

In [None]:
# Per-location statistics on total and dangerous permissions.
# Adds four columns to every DataFrame in locationDFs (reused by the later cells):
#   permissionCount, dangerousPermission, dangerousCount, dangerousPercent

# Set built once for O(1) membership tests instead of scanning the list per permission
dangerousPermissionsSet = set(dangerousPermissionsList)

for location, df in locationDFs.items():
    print("\n--- 📍 Location: {:<14}".format(location))

    # Compute basic metrics
    df['permissionCount'] = df['permissions'].apply(len)
    avgPermLen = df['permissionCount'].mean()
    print("--- 🔹 Avg # permissions                                : {:.2f}".format(avgPermLen))

    # Filter for dangerous permissions
    df['dangerousPermission'] = df['permissions'].apply(
        lambda perms: [p for p in perms if p in dangerousPermissionsSet]
    )
    df['dangerousCount'] = df['dangerousPermission'].apply(len)

    # Compute % dangerous (vectorised). Rows without any permission are masked to NaN
    # before dividing and then reported as 0, which avoids a division by zero.
    nonZeroTotal = df['permissionCount'].where(df['permissionCount'] > 0)
    df['dangerousPercent'] = (df['dangerousCount'] / nonZeroTotal * 100).fillna(0.0).astype(float)

    # Average dangerous permissions
    avgDangerousPerm = df['dangerousCount'].mean()
    print("--- 🔸 Avg # dangerous permissions                      : {:.2f}".format(avgDangerousPerm))

    avgDangerousPercent = df['dangerousPercent'].mean()
    print("--- 🔸 Avg % dangerous permissions                      : {:.2f}%".format(avgDangerousPercent))

    # Median dangerous %
    medianDangerousPercent = df['dangerousPercent'].median()
    print("--- 🔸 Median % dangerous permissions                   : {:.2f}%".format(medianDangerousPercent))

    # Std deviation of dangerous %
    stdDangerousPercent = df['dangerousPercent'].std()
    print("--- 🔸 Std dev of % dangerous permissions               : {:.2f}%".format(stdDangerousPercent))

    # Apps with at least 1 dangerous permission
    numAppsWithDangerous = (df['dangerousCount'] > 0).sum()
    print("--- 🔸 # apps with at least one dangerous permission    : {}".format(numAppsWithDangerous))

    # Top 5 dangerous permissions.
    # Flatten with a comprehension: Series.sum() concatenates the lists one by one
    # (quadratic) and returns the integer 0 for an empty frame, which Counter rejects.
    allDangerousPerms = [perm for perms in df['dangerousPermission'] for perm in perms]
    commonDangerousPerms = Counter(allDangerousPerms).most_common(5)
    print("--- 🔸 Top dangerous permissions:")
    for perm, count in commonDangerousPerms:
        print("      - {} ({})".format(perm, count))

    # Outlier apps (dangerous% > 50)
    numOutliers = (df['dangerousPercent'] > 50).sum()
    print("--- ⚠️ Apps with >50% dangerous permissions             : {}".format(numOutliers))


#### Plots

In [None]:
# Output folder for the generated figures
PLOTS_PATH = './0_Data/Plots/'

# Seed for anything random
RANDOM_SEED = 777

# Categorical colors
COLORS = [
    "#89CFFD",
    '#FF8282',
    '#C084FC',
    '#FFE066',
    '#90C67C',
    '#FFB347',
    '#60B5FF',
]

# Font sizes
SMALL_SIZE, MEDIUM_SIZE, LARGE_SIZE = 16, 18, 20

# Scatter dot size (30 was the previous alternative)
DOT_SIZE = 60

# Custom 10-step color ramp, as a discrete colormap and its reverse
CUSTOM_CMAP = mcolors.ListedColormap(
    [
        "#006837", "#1a9850", "#66bd63", "#a6d96a", "#d9ef8b",
        "#fee08b", "#fdae61", "#f46d43", "#d73027", "#a50026",
    ]
)
CUSTOM_CMAP_R = mcolors.ListedColormap(CUSTOM_CMAP.colors[::-1])

# Continuous (interpolated) versions of the same two ramps
CUSTOM_CMAP_GRADIENT = mcolors.LinearSegmentedColormap.from_list(
    "CUSTOM_CMAP_GRADIENT", CUSTOM_CMAP.colors
)
CUSTOM_CMAP_GRADIENT_R = mcolors.LinearSegmentedColormap.from_list(
    "CUSTOM_CMAP_R_GRADIENT", CUSTOM_CMAP_R.colors
)

Dangerous Permissions Delta

In [None]:
# 1. Pool the apps of every location to obtain the overall baselines (counts)
allApps = pd.concat(locationDFs.values(), ignore_index=True)
overallAvgDangerous   = allApps['dangerousCount'].mean()
overallAvgPermissions = allApps['permissionCount'].mean()
print("Overall avg # dangerous permissions: {:.2f}".format(overallAvgDangerous))
print("Overall avg # total permissions: {:.2f}".format(overallAvgPermissions))

# 2. Average counts per location
locationAvgsDangerous = {
    location: df['dangerousCount'].mean() for location, df in locationDFs.items()
}
locationAvgsPermissions = {
    location: df['permissionCount'].mean() for location, df in locationDFs.items()
}

# 3. One table per metric: the location average plus its distance to the overall average
dfDangerous = pd.DataFrame.from_dict(
    locationAvgsDangerous, orient='index', columns=['avgCount']
).assign(
    location=lambda frame: frame.index,
    metric='Dangerous Permissions',
    diffFromOverall=lambda frame: frame['avgCount'] - overallAvgDangerous,
)

dfPermissions = pd.DataFrame.from_dict(
    locationAvgsPermissions, orient='index', columns=['avgCount']
).assign(
    location=lambda frame: frame.index,
    metric='Total Permissions',
    diffFromOverall=lambda frame: frame['avgCount'] - overallAvgPermissions,
)

# Stack both metrics into the long table used for plotting
dfPlot = pd.concat([dfDangerous, dfPermissions], ignore_index=True)

# 4. Locations ordered by their dangerous-permission difference, for consistent ordering
order = dfDangerous.sort_values(by='diffFromOverall')['location']

In [None]:
# 1. Map each row to a color category from its metric and the sign of its difference
def getColorGroup(row):
    """Return the legend label for a row of the delta table.

    The label combines the metric family ('Dangerous' when row['metric'] is
    'Dangerous Permissions', 'Total' for anything else) with the direction of
    row['diffFromOverall']: '(+)' when strictly positive, '(-)' otherwise
    (a difference of exactly zero is therefore labelled '(-)').
    """
    labels = {
        (True, True)   : 'Dangerous (+)',
        (True, False)  : 'Dangerous (-)',
        (False, True)  : 'Total (+)',
        (False, False) : 'Total (-)',
    }
    isDangerous = bool(row['metric'] == 'Dangerous Permissions')
    isAbove = bool(row['diffFromOverall'] > 0)
    return labels[(isDangerous, isAbove)]

# Tag every row with its metric/direction label; the label selects the bar color
dfPlot['colorGroup'] = dfPlot.apply(getColorGroup, axis=1)

# 2. Color palette: above-average values in red tones, below-average values in green tones
customPalette = {
    'Total (+)'     : '#FF8282',
    'Total (-)'     : '#90C67C',
    'Dangerous (+)' : '#AF3E3E',
    'Dangerous (-)' : '#537D5D'
}

# 3. Locations sorted by their dangerous-permission difference (plain list for seaborn)
order = dfDangerous.sort_values('diffFromOverall')['location'].tolist()

# 4. Plot: one horizontal bar per location and metric
plt.figure(figsize=(10, 8))
sns.barplot(
    data=dfPlot,
    x='diffFromOverall',
    y='location',
    hue='colorGroup',
    order=order,
    palette=customPalette
)

# Reference line at "no difference from the overall average"
plt.axvline(0, color='black', linewidth=1, linestyle='--')

plt.xlim(-5, 5)
plt.xticks(ticks=range(-5, 6, 1), fontsize = SMALL_SIZE)
plt.yticks(fontsize = SMALL_SIZE)

plt.grid(axis='x', linestyle='--', alpha=0.6)

plt.xlabel('Difference from Overall Average', fontsize=MEDIUM_SIZE)
plt.ylabel('Location', fontsize=MEDIUM_SIZE)

# Place legend outside the plot at the bottom center with 2 columns
plt.legend(
    title='Permission Type & Direction',
    fontsize=MEDIUM_SIZE,
    title_fontsize=MEDIUM_SIZE,
    loc='lower center',
    bbox_to_anchor=(0.5, -0.35),
    ncol=2,
)

# Settle the layout BEFORE saving so the files on disk match the figure that is shown
plt.tight_layout()

# Save the figure to PDF and PNG (savefig does not create missing folders)
os.makedirs(PLOTS_PATH, exist_ok=True)
plt.savefig(os.path.join(PLOTS_PATH, "rq2_permissionsDelta.pdf"), format="pdf", bbox_inches="tight")
plt.savefig(os.path.join(PLOTS_PATH, "rq2_permissionsDelta.png"), format="png", dpi=300, bbox_inches="tight")

plt.show()

Jaccard Index for Dangerous Permissions

In [None]:
# Set of distinct dangerous permissions seen in each location
location_perms = {
    location: set().union(*df['dangerousPermission'])
    for location, df in locationDFs.items()
}

# Pairwise Jaccard index |A ∩ B| / |A ∪ B| (0 when both sets are empty), built row by row
locations = list(location_perms)
jaccardRows = []
for rowLocation in locations:
    rowSet = location_perms[rowLocation]
    rowValues = []
    for colLocation in locations:
        colSet = location_perms[colLocation]
        unionSize = len(rowSet | colSet)
        rowValues.append(len(rowSet & colSet) / unionSize if unionSize else 0)
    jaccardRows.append(rowValues)

# Float matrix scaled to 0-100
similarityMatrix = pd.DataFrame(jaccardRows, index=locations, columns=locations, dtype=float) * 100

# Heatmap; the color scale is clipped to the 50-100 range
plt.figure(figsize=(8, 6))
sns.heatmap(
    similarityMatrix,
    annot=True,
    fmt=".2f",
    cmap=CUSTOM_CMAP_GRADIENT_R,
    vmin=50,
    vmax=100,
    square=True,
    cbar_kws={'label': 'Jaccard Similarity (%)'},
)
plt.title("Jaccard Similarity of Dangerous Permissions Between Locations")
plt.tight_layout()
plt.show()


##### 🔚 End

In [None]:
# Print the end banner
endTime = datetime.datetime.now()
print("\n🔚 --- END:  {} --- 🔚".format(endTime.strftime("%Y-%m-%d %H:%M:%S")))

# Split the elapsed time (endTime - initTime) into hours / minutes / seconds
totalTime = endTime - initTime
elapsedSeconds = totalTime.total_seconds()
hours, remainder = divmod(elapsedSeconds, 3600)
minutes, seconds = divmod(remainder, 60)

# NOTE(review): the bracketed figure is the TOTAL elapsed seconds, not the `seconds`
# remainder computed above (which is unused) — confirm this is the intended output.
print("⏱️ --- Time: {:02d} hours and {:02d} minutes [{:02d} seconds] --- ⏱️".format(int(hours), int(minutes), int(elapsedSeconds)))