## 🧪 RQ2: Privacy Policies Analysis

In [None]:
# Imports
from   dotenv     			import load_dotenv
from   sklearn.manifold 	import TSNE
from   ast 					import literal_eval
import matplotlib.pyplot 	as plt
import seaborn 				as sns
import pandas    			as pd
import datetime
import os

##### Parameters

In [None]:
# Folder for temporary files; it is created in the initialization cell if missing.
# NOTE(review): this path starts at ../../ while the other data paths in this
# notebook start at ./ — confirm the intended working directory.
TMP_PATH = "../../0_Data/TMP/"

#### Initialization

In [None]:
# Announce the start of the run with a human-readable timestamp
startBanner = "⚡ START: {} ⚡".format(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
print(startBanner)

# Reference instant from which the total run time is measured
initTime = datetime.datetime.now()

In [None]:
# Make sure the temporary folder exists, reporting whether it had to be created
if os.path.exists(TMP_PATH):
	print("--- 📁✅ Folder already exists: {}\n".format(TMP_PATH))
else:
	os.makedirs(TMP_PATH)
	print("--- 📁🆕 Folder created       : {}\n".format(TMP_PATH))

In [None]:
# Load variables from a .env file into the process environment (python-dotenv).
# NOTE(review): no environment variable is read in the cells shown here —
# confirm this call is still needed.
load_dotenv()

#### 📥 1) Load Data

In [None]:
# Folder holding one '<location>_Downloads.csv' file per location
DATA_PATH = './0_Data/1_AppsWithDownloads/'

# locationDFs  : location name -> DataFrame read from its CSV file
# locationsList: location names, in the same order as they were loaded
locationDFs   = {}
locationsList = []

# os.listdir returns entries in arbitrary order; sorting keeps the dictionary
# order (and everything derived from it further down, such as concatenation
# order and plot colors) identical across machines and runs.
for fileName in sorted(os.listdir(DATA_PATH)):
	# Ignore anything that is not a per-location downloads file
	if not fileName.endswith('_Downloads.csv'):
		continue

	# The location name is the part of the file name before the first underscore
	location = fileName.split('_')[0]
	filePath = os.path.join(DATA_PATH, fileName)

	# Save the DataFrame to the dictionary
	locationDFs[location] = pd.read_csv(filePath)
	locationsList.append(location)

In [None]:
# Report how many rows were loaded for each location
for location, df in locationDFs.items():
	numRows = len(df.index)
	print("--- 📍 {:<14}: {:<5} Unique Pkg Names".format(location, numRows))

In [None]:
# Folder holding one 'privacyPoliciesEmbeddings_<location>.csv' file per location
EMBEDDINGS_PATH = './0_Data/PrivacyPoliciesEmbeddings/'

# Attach the privacy-policy embedding to every app of every location.
# Iterate over a snapshot of the items because the dictionary values are
# replaced inside the loop.
for location, df in list(locationDFs.items()):
	print("--- 📍 {}: {:<5}".format(location, df.shape[0]))

	embeddingDF = pd.read_csv(os.path.join(EMBEDDINGS_PATH, 'privacyPoliciesEmbeddings_{}.csv'.format(location)))
	print("--- 🔢 Loaded embeddings   : {} ".format(embeddingDF.shape[0]))

	# Print the number of apps before merging
	print("--- #️⃣ Apps before merging : {}".format(df.shape[0]))

	# Merge embeddings into the original df based on 'sha256'
	# (left join: every app is kept, apps without an embedding get NaN)
	df = df.merge(
		embeddingDF[['sha256', 'embedding']],
		on='sha256',
		how='left',
	)

	# Remove rows with NaN in the 'embedding' column
	df = df.dropna(subset=['embedding'])

	# Convert the 'embedding' column from its string form to a Python object
	# using ast.literal_eval. assign() builds a new DataFrame instead of
	# writing into the result of dropna(), which avoids pandas'
	# chained-assignment (SettingWithCopy) warning.
	df = df.assign(embedding=df['embedding'].apply(literal_eval))

	# Sort by numDownloads, highest first
	df = df.sort_values(by='numDownloads', ascending=False)

	# Print the number of apps after merging
	print("--- #️⃣ Apps after merging  : {}".format(df.shape[0]))

	# Replace the location's DataFrame with the merged one (nothing is written to disk)
	locationDFs[location] = df

	print("\n" + "---"*20 + "\n")

#### 🧪 2) Analysis

In [None]:
# Folder where the generated plots are written
PLOTS_PATH = './0_Data/Plots/'

# Random seed shared by the sampling and t-SNE steps so that runs are reproducible
RANDOM_SEED = 777

# Color palette (hex codes) passed to seaborn for the per-location hue
COLORS  = ["#89CFFD", '#FF8282', '#C084FC', '#FFE066', '#90C67C','#FFB347', '#60B5FF']

# Font sizes.
# NOTE(review): these three constants are not referenced in the cells shown
# here — confirm whether they are still needed.
SMALL_SIZE  = 16
MEDIUM_SIZE = 18
LARGE_SIZE  = 20

# Marker size of the scatter-plot dots (alternative value kept below for reference)
DOT_SIZE = 60
#DOT_SIZE = 30

In [None]:
# Draw a seeded random sample of at most 100 rows per location
sampledDFs = {}
for loc, df_ in locationDFs.items():
	sampleSize = min(100, len(df_))
	sampledDFs[loc] = df_.sample(n=sampleSize, random_state=RANDOM_SEED)
locationDFs = sampledDFs

# Stack the per-location samples into one DataFrame, tagging every row with
# the location it came from in a 'location' column
taggedDFs = []
for loc, df_ in locationDFs.items():
	taggedDFs.append(df_.assign(location=loc))
dataDF = pd.concat(taggedDFs, ignore_index=True)

In [None]:
# Build the feature matrix: one row per app, one column per embedding dimension
X = pd.DataFrame(dataDF['embedding'].tolist())
y = dataDF['location']

# Reduce the embeddings to 2D with t-SNE (seeded so the layout is reproducible)
reducer = TSNE(n_components=2, random_state=RANDOM_SEED)
X_2d = reducer.fit_transform(X)

# Create a new DataFrame for plotting
plotDF = pd.DataFrame({
	'x': X_2d[:, 0],
	'y': X_2d[:, 1],
	'label': y
})

# Pass seaborn only as many colors as there are locations: a longer list makes
# seaborn warn and truncate it anyway, so the colors used are the same.
palette = COLORS[:plotDF['label'].nunique()]

# Plot using seaborn
plt.figure(figsize=(8, 8))
sns.scatterplot(data=plotDF, x='x', y='y', hue='label', palette=palette, s=DOT_SIZE, alpha=1)

# The t-SNE axes carry no interpretable unit, so the axis labels are left empty
plt.xlabel('')
plt.ylabel('')

plt.legend(title='Location', fontsize=11, title_fontsize=12)
plt.grid(True)

# savefig does not create missing folders, so make sure the target exists first
os.makedirs(PLOTS_PATH, exist_ok=True)

# Save both a vector (PDF) and a raster (PNG, 300 dpi) version of the figure
plt.savefig(os.path.join(PLOTS_PATH, "rq2_privacyPoliciesScatter") + ".pdf", bbox_inches='tight')
plt.savefig(os.path.join(PLOTS_PATH, "rq2_privacyPoliciesScatter") + ".png", bbox_inches='tight', dpi=300)

plt.show()

##### 🔚 End

In [None]:
# Announce the end of the run with a human-readable timestamp
endTime = datetime.datetime.now()
print("\n🔚 --- END:  {} --- 🔚".format(endTime.strftime("%Y-%m-%d %H:%M:%S")))

# Break the elapsed time (endTime - initTime) down into hours / minutes / seconds
totalTime      = endTime - initTime
elapsedSeconds = totalTime.total_seconds()
hours, remainder = divmod(elapsedSeconds, 3600)
minutes = remainder // 60
seconds = elapsedSeconds % 60

# NOTE(review): the bracketed figure is the TOTAL elapsed time in seconds, not
# the leftover `seconds` computed above (which is not used in the message) —
# confirm this is the intended output.
print("⏱️ --- Time: {:02d} hours and {:02d} minutes [{:02d} seconds] --- ⏱️".format(int(hours), int(minutes), int(elapsedSeconds)))