## 🧪 RQ2: Third-Party Libs Analysis

In [None]:
# Imports
from   dotenv     			import load_dotenv
import matplotlib.pyplot 	as plt
import matplotlib.colors   	as mcolors
import seaborn 				as sns
import pandas    			as pd
import datetime
import json
import os

##### Parameters

In [None]:
# Folder for temporary files; the "Create TMP Folder" cell creates it when it is missing.
# NOTE(review): this path is relative to "../../", whereas DATA_PATH and PLOTS_PATH are
# relative to "./" - confirm which working directory the notebook is meant to run from.
TMP_PATH = "../../0_Data/TMP/"

#### Initialization

In [None]:
# Take ONE timestamp and use it for both the banner and initTime, so the START time
# that is printed is exactly the instant the elapsed-time report at the end measures from.
initTime = datetime.datetime.now()
print("⚡ START: {} ⚡".format(initTime.strftime("%Y-%m-%d %H:%M:%S")))

In [None]:
# Make sure the TMP folder is available and report whether it had to be created
if os.path.exists(TMP_PATH):
	print("--- 📁✅ Folder already exists: {}\n".format(TMP_PATH))
else:
	os.makedirs(TMP_PATH)
	print("--- 📁🆕 Folder created       : {}\n".format(TMP_PATH))

In [None]:
# Load .env Info (calls load_dotenv, imported from the dotenv package at the top)
# NOTE(review): the return value is ignored and no cell visible here reads an
# environment variable afterwards - confirm this call is still needed.
load_dotenv()

#### 📥 1) Load Data

In [None]:
# Specify the data path
DATA_PATH = "./0_Data/2_Libs"

# Read every 'libs_<location>...' JSON file in DATA_PATH into one DataFrame per location.
#   locationDFs   : location name -> DataFrame built from that file's JSON content
#   locationsList : location names in load order (later cells add one column per entry, in this order)
locationDFs   = {}
locationsList = []

# os.listdir() returns entries in arbitrary order: sort so every run loads (and later plots)
# the locations in the same order.
for fileName in sorted(os.listdir(DATA_PATH)):
	filePath = os.path.join(DATA_PATH, fileName)

	# Only regular files following the 'libs_' naming scheme are data files
	if not fileName.startswith('libs_') or not os.path.isfile(filePath):
		continue

	# The location is the token after the first underscore; drop a trailing '.json'
	# so that the key is the bare location name (e.g. 'libs_XX.json' -> 'XX').
	location = fileName.split('_')[1]
	if location.endswith('.json'):
		location = location[:-len('.json')]

	# Load JSON data (explicit encoding: do not depend on the platform default)
	with open(filePath, 'r', encoding='utf-8') as f:
		data = json.load(f)

	# Create DataFrame and ensure 'libs' column is present: every later cell indexes
	# df['libs'], so fail here with the offending file name instead of further down.
	df = pd.DataFrame(data)
	if 'libs' not in df.columns:
		raise KeyError("'libs' column not found in {}".format(filePath))

	locationDFs[location] = df
	locationsList.append(location)

In [None]:
# One summary line per location: its name and the number of rows in its DataFrame
summaryRow = "--- 📍 {:<18}: {:<5} Unique Pkg Names"
for location, df in locationDFs.items():
	print(summaryRow.format(location, len(df)))

#### 2) Create the [Lib] x [Location] matrix with normalized usage

In [None]:
# Alphabetically sorted list of every library seen in any location
allLibs = sorted({lib for dfLoc in locationDFs.values() for libs in dfLoc['libs'] for lib in libs})

# Per location: number of apps (rows) that use each library.
# map(set) collapses repeated entries inside one app's list, so a library is counted at
# most once per app and the normalized usage below can never exceed 1.
# Computed once here and reused for both the per-location and the overall figures.
appCountsByLoc = {
	loc: dfLoc['libs'].map(set).explode().value_counts()
	for loc, dfLoc in locationDFs.items()
}

# Initialize the result DataFrame: one row per library
dataDF = pd.DataFrame({'library': allLibs})

# For each location, calculate normalized usage for each library
# Normalized usage: number of apps using the lib / total number of apps in location
for loc in locationsList:
	totalApps = len(locationDFs[loc])
	# Libraries never seen in this location have no entry in the counts -> 0 apps
	appCounts = dataDF['library'].map(appCountsByLoc[loc]).fillna(0)
	# Column name is the location without a '.json' suffix, in case the key still carries one.
	# An empty location yields 0.0 everywhere instead of dividing by zero.
	dataDF[loc.replace('.json', '')] = (appCounts / totalApps) if totalApps else 0.0

# Calculate normalized usage for each library across all locations (overall)
overallLibCounts = pd.Series(0, index=allLibs)
for appCounts in appCountsByLoc.values():
	overallLibCounts = overallLibCounts.add(appCounts, fill_value=0)
totalAppsOverall = sum(len(dfLoc) for dfLoc in locationDFs.values())

# Keep overallUsage at full precision: it drives the ranking below and the usage deltas
# computed from it, so rounding here would distort both. Round only when displaying.
overallCounts = dataDF['library'].map(overallLibCounts)
overallUsage  = (overallCounts / totalAppsOverall) if totalAppsOverall else 0.0

# Place 'overallUsage' as the second column, right after 'library'
dataDF.insert(1, 'overallUsage', overallUsage)

# Sort by overallUsage (highest first); the stable sort keeps tied libraries in alphabetical order
dataDF = dataDF.sort_values(by='overallUsage', ascending=False, kind='stable').reset_index(drop=True)

dataDF.head()

### 3) Plots

In [None]:
# Output folder for the generated figures
PLOTS_PATH = './0_Data/Plots/'

# Random seed
RANDOM_SEED = 777

# Palette of seven hex colors
COLORS = [
	"#89CFFD",
	"#FF8282",
	"#C084FC",
	"#FFE066",
	"#90C67C",
	"#FFB347",
	"#60B5FF",
]

# Font sizes, smallest to largest
SMALL_SIZE, MEDIUM_SIZE, LARGE_SIZE = 16, 18, 20

# Dot size (30 was kept around as a commented-out alternative)
DOT_SIZE = 60

In [None]:
# NEW CUSTOM CMAP
# Ten hex colors running from dark green (#006837) to dark red (#a50026)
paletteHex = [
	"#006837", "#1a9850", "#66bd63", "#a6d96a", "#d9ef8b",
	"#fee08b", "#fdae61", "#f46d43", "#d73027", "#a50026",
]

# Discrete colormaps: the palette as given, and the same colors in reverse order
CUSTOM_CMAP   = mcolors.ListedColormap(colors=paletteHex)
CUSTOM_CMAP_R = mcolors.ListedColormap(colors=CUSTOM_CMAP.colors[::-1])

# Gradient colormaps built from the colors of the two discrete maps above
CUSTOM_CMAP_GRADIENT = mcolors.LinearSegmentedColormap.from_list(
	name="CUSTOM_CMAP_GRADIENT",
	colors=CUSTOM_CMAP.colors,
)
CUSTOM_CMAP_GRADIENT_R = mcolors.LinearSegmentedColormap.from_list(
	name="CUSTOM_CMAP_R_GRADIENT",
	colors=CUSTOM_CMAP_R.colors,
)

In [None]:
# Heatmap of (location usage - overall usage) for the 30 most used libraries.

# Select top 30 libraries by overallUsage (takes the first 30 rows of dataDF)
top30 = dataDF.head(30).copy()

# Extract only location columns (excluding 'library' and 'overallUsage')
locationCols = [col for col in top30.columns if col not in ('library', 'overallUsage')]

# Compute difference matrix: location usage - overall usage
diffMatrix = top30[locationCols].subtract(top30['overallUsage'], axis=0)

# Format y-axis labels to include overallUsage (2 decimals, display only)
top30['label_with_usage'] = [
	f"{lib} [{usage:.2f}]" for lib, usage in zip(top30['library'], top30['overallUsage'])
]
diffMatrix.index = top30['label_with_usage']

# Symmetric color limits around 0. They are passed to heatmap() itself: with center=0 and
# no vmin/vmax, seaborn crops the colormap to the data range, and changing the limits
# afterwards stretches that cropped map, so a delta of 0 may not get the middle color.
DELTA_LIMIT = 0.3

plt.figure(figsize=(8, max(6, len(diffMatrix) * 0.25)))
ax = sns.heatmap(
	diffMatrix,
	cmap=CUSTOM_CMAP_GRADIENT_R,
	vmin=-DELTA_LIMIT,
	vmax=DELTA_LIMIT,
	center=0,
	annot=False,
	linewidths=0.5,
	cbar_kws={'label': 'Usage Delta'},
)

plt.xlabel('Location', fontsize=SMALL_SIZE)
plt.ylabel('Library [Overall Usage]', fontsize=SMALL_SIZE)

plt.xticks(fontsize=13, rotation=90)
plt.yticks(fontsize=13)

# Colorbar label and tick font sizes
cbar = ax.collections[0].colorbar
cbar.set_label('Usage Delta', fontsize=SMALL_SIZE)
cbar.ax.tick_params(labelsize=SMALL_SIZE)

# Finalize the layout BEFORE saving so the files match the figure that is shown
plt.tight_layout()

# Save the figure to PDF and PNG (create the output folder on first use)
# NOTE(review): "Hetamap" in the file names looks like a typo for "Heatmap"; the names are
# left unchanged because other material may reference these files - confirm before renaming.
os.makedirs(PLOTS_PATH, exist_ok=True)
plt.savefig(os.path.join(PLOTS_PATH, "rq2_libsHetamap.pdf"), format="pdf", bbox_inches="tight")
plt.savefig(os.path.join(PLOTS_PATH, "rq2_libsHetamap.png"), format="png", dpi=300, bbox_inches="tight")

plt.show()

##### 🔚 End

In [None]:
endTime = datetime.datetime.now()
print("\n🔚 --- END:  {} --- 🔚".format(endTime.strftime("%Y-%m-%d %H:%M:%S")))

# Elapsed time since initTime, truncated to whole seconds (computed once and reused)
totalTime      = endTime - initTime
totalSeconds   = int(totalTime.total_seconds())
hours, rest    = divmod(totalSeconds, 3600)
minutes        = rest // 60

# The bracketed value is the TOTAL elapsed time in seconds, not the 0-59 remainder.
# NOTE(review): the previous code also computed that remainder but never printed it;
# confirm that showing the total inside the brackets is the intended report format.
print("⏱️ --- Time: {:02d} hours and {:02d} minutes [{:02d} seconds] --- ⏱️".format(hours, minutes, totalSeconds))