## 🔢 App Classification

In [None]:
from sklearn.metrics               import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
from sklearn.model_selection       import train_test_split
from sklearn.preprocessing         import LabelEncoder
from sklearn.neural_network        import MLPClassifier
from sklearn.linear_model          import LogisticRegression
from sklearn                       import svm
from dotenv                        import load_dotenv
from joblib                        import dump, load
import pandas                      as pd
import numpy                       as np
import datetime
import ast


import warnings
warnings.filterwarnings("ignore")

#### Initialization

In [None]:
# Fixed seed for the randomized steps below (currently the train/test split),
# so that repeated runs of the notebook produce the same partition
RANDOM_SEED = 151836

In [None]:
# Capture the start time once, then print it, so the banner shows exactly the
# instant that the duration at the end of the notebook is measured from
startTime = datetime.datetime.now()
print("⚡ Start - {} ⚡\n".format(startTime))

#### 📥 1) Load Data 

In [None]:
# CSV produced by the embedding step; read below into one DataFrame
DATA_PATH = "../TmpData/0_AndroCatEmbeddings.csv"

# Load the whole file
appsDF = pd.read_csv(DATA_PATH)

# One row per app, so the row count is the number of apps
print("--- #️⃣ Apps: {} ".format(len(appsDF.index)))

In [None]:
# Preview the first rows to sanity-check the loaded columns
appsDF.head(5)

#### 2) Reorganize the data a bit

In [None]:
# Move "classID" to the last position; every other column keeps its order
otherColumns = [name for name in appsDF.columns if name != "classID"]
appsDF = appsDF[otherColumns + ["classID"]]

# From here on the class column is called "trueLabel"
appsDF = appsDF.rename(columns={"classID": "trueLabel"})

In [None]:
def convertToNumpyArray(arrayStr):
	"""Parse the textual form of a (possibly nested) list into a numpy array.

	The string is evaluated with ``ast.literal_eval``, so only Python
	literals are accepted; the resulting object is handed to ``np.array``.
	"""
	return np.array(ast.literal_eval(arrayStr))

# Parse every serialized embedding, then store the arrays back in the column
parsedEmbeddings = appsDF['embedding'].apply(convertToNumpyArray)
appsDF['embedding'] = parsedEmbeddings

In [None]:
# Preview again: "trueLabel" is now the last column and "embedding" holds numpy arrays
appsDF.head(5)

#### 3) Train

Get X (data) and Y (labels), then split them into training and test sets.

In [None]:
# Stack the per-app embedding arrays row-wise into one feature matrix
embeddingRows = list(appsDF['embedding'])
X = np.vstack(embeddingRows)
print("--- 📐 X Shape : {}".format(X.shape))

trueLabels = appsDF['trueLabel'].values
print("--- 📐 Y Shape : {}".format(len(trueLabels)))

# Fit the encoder on the real labels plus an extra "NO_CLASS" entry, so that
# the placeholder used for rejected predictions also receives an integer code
labelEncoder = LabelEncoder().fit(np.append(trueLabels, "NO_CLASS"))

# Persist the fitted encoder next to the other intermediate artifacts
dump(labelEncoder, '../TmpData/labelEncoder.joblib')

In [None]:
# Encode the true labels as integers
Y_True = labelEncoder.transform(trueLabels)

# Integer code the encoder assigned to the "NO_CLASS" placeholder
(NO_CLASS_VALUE,) = labelEncoder.transform(["NO_CLASS"])
print("NO_CLASS Special Value:", NO_CLASS_VALUE)

# Hold out 20% of the samples for testing; the seed keeps the split repeatable
X_train, X_test, Y_train, Y_test = train_test_split(
	X,
	Y_True,
	test_size=0.2,
	random_state=RANDOM_SEED,
)

In [None]:
modelName = 'svm'
# Alternative configuration kept for reference:
#model     = svm.SVC(kernel='linear', C=1, probability=True)
# probability=True makes SVC run an internal cross-validation whose data
# shuffling is random; seeding it with RANDOM_SEED keeps predict_proba (and
# therefore every thresholded prediction below) reproducible across runs.
model     = svm.SVC(kernel='sigmoid', C=1, gamma=1, probability=True, random_state=RANDOM_SEED)

print("--- 🦾 Training model: {}".format(modelName))
model.fit(X_train, Y_train)

# Save the model
dump(model, '../TmpData/svmModel.joblib')

In [None]:
# # Define the neural network model
# modelName = 'NN'
# model = MLPClassifier(solver='adam', alpha=1e-5, hidden_layer_sizes=(64,), random_state=RANDOM_SEED, max_iter=500)

# print("--- 🦾 Training model: {}".format(modelName))
# model.fit(X_train, Y_train)

In [None]:
# Helpers to evaluate the model and to apply the confidence threshold
def evaluateModel(modelName, Y_test, Y_pred, threshold):
	"""Print accuracy and weighted precision/recall/F1 for the kept predictions.

	Samples whose prediction or true label equals ``NO_CLASS_VALUE`` are
	excluded, so the metrics describe only the samples that were actually
	assigned a class. The number of evaluated samples is printed as well,
	because the metrics alone do not show how many samples were rejected.

	Args:
		modelName: Name shown in the report.
		Y_test: Encoded true labels.
		Y_pred: Encoded predictions aligned with ``Y_test``; may contain
			``NO_CLASS_VALUE``.
		threshold: Confidence threshold that produced ``Y_pred`` (only
			reported, not applied here).

	Returns:
		None. The report is written to standard output.
	"""
	# Boolean-mask indexing below needs arrays, so accept plain lists too
	Y_test = np.asarray(Y_test)
	Y_pred = np.asarray(Y_pred)

	mask = (Y_pred != NO_CLASS_VALUE) & (Y_test != NO_CLASS_VALUE)
	Y_test_filtered = Y_test[mask]
	Y_pred_filtered = Y_pred[mask]

	print("--- 📊 Model Evaluation Metrics 📊 ---")
	print("--- Model                    : {}".format(modelName))
	print("--- Confidence Threshold     : {:.2f}".format(threshold))
	print("--- Evaluated Samples        : {} / {}".format(Y_test_filtered.size, Y_test.size))

	# With nothing left to score, the metrics are undefined (NaN or an error,
	# depending on the scikit-learn version), so report that and stop
	if Y_test_filtered.size == 0:
		print("--- Metrics: not available (no sample left after removing NO_CLASS)")
		print("--"*20 + "\n")
		return

	# Compute metrics only on filtered labels
	accuracy  = accuracy_score(Y_test_filtered, Y_pred_filtered)
	precision = precision_score(Y_test_filtered, Y_pred_filtered, average='weighted')
	recall    = recall_score(Y_test_filtered, Y_pred_filtered, average='weighted')
	f1        = f1_score(Y_test_filtered, Y_pred_filtered, average='weighted')

	print("--- Metrics:")
	print("------ Accuracy              : {:.4f}".format(accuracy))
	print("------ Precision (weighted)  : {:.4f}".format(precision))
	print("------ Recall (weighted)     : {:.4f}".format(recall))
	print("------ F1 Score (weighted)   : {:.4f}".format(f1))
	print("--"*20 + "\n")

def applyThreshold(probabilities, threshold, noClassValue=None):
	"""Turn per-class probabilities into encoded labels, rejecting unsure rows.

	For every row the most probable column is selected. Rows whose highest
	probability is below ``threshold`` receive ``noClassValue`` instead.

	Column indices are converted to encoder codes by skipping
	``noClassValue``: columns at or after that position are shifted up by one.
	NOTE(review): this assumes the classifier was trained on every encoder
	code except ``noClassValue`` (so its probability columns are those codes
	in ascending order) - confirm that no class is missing from the
	training split.

	Args:
		probabilities: 2-D array-like, one row per sample and one column per
			trained class.
		threshold: Minimum top probability required to keep a prediction.
		noClassValue: Encoded value of the "no class" placeholder. Defaults
			to the notebook-level ``NO_CLASS_VALUE``.

	Returns:
		1-D integer numpy array with one encoded label per row.
	"""
	if noClassValue is None:
		noClassValue = NO_CLASS_VALUE

	probabilities = np.asarray(probabilities)
	if probabilities.size == 0:
		return np.array([], dtype=int)

	# Column index of the most probable class for every row
	Y_pred = probabilities.argmax(axis=1)

	# Map column indices to encoder codes. The comparison must be `>=`: the
	# column sitting exactly at noClassValue belongs to the NEXT code, and
	# with `>` it would be reported as "no class" by mistake.
	Y_pred[Y_pred >= noClassValue] += 1

	# Rows that are not confident enough get the placeholder
	Y_pred[probabilities.max(axis=1) < threshold] = noClassValue

	return Y_pred

In [None]:
# Confidence thresholds to try: 0.75, 0.80, ..., 0.95
thresholds = [percent / 100 for percent in range(75, 100, 5)]

# Class probabilities for the held-out set, computed once and reused below
probabilities = model.predict_proba(X_test)

for threshold in thresholds:
	# Threshold the probabilities into final predictions, then report metrics
	Y_pred = applyThreshold(probabilities, threshold)
	evaluateModel(modelName, Y_test, Y_pred, threshold)


# Label the entire dataset (rows used for training included) at every threshold
allProbablities = model.predict_proba(X)

for threshold in thresholds:
	# Threshold the probabilities, then map the integer codes back to labels
	predictedLabels = labelEncoder.inverse_transform(
		applyThreshold(allProbablities, threshold)
	)

	# One column per threshold, named <modelName><threshold>
	appsDF["{}{}".format(modelName, threshold)] = predictedLabels

#### 4) Save Results

In [None]:
# Preview the rows together with the per-threshold prediction columns added above
appsDF.head(5)

In [None]:
# Drop the "description" and "embedding" columns before exporting
appsDF = appsDF.drop(['description', 'embedding'], axis=1)

# Write the remaining columns, without the index, to the output CSV
appsDF.to_csv("../TmpData/0_AndroCatSetNewLabels.csv", index=False)

##### 🔚 End

In [None]:
endTime = datetime.datetime.now()
print("\n🔚 --- End - {} --- 🔚".format(endTime))

# endTime - startTime is a timedelta; split its total seconds into whole
# minutes and the leftover seconds
totalTime = endTime - startTime
minutes, seconds = divmod(totalTime.total_seconds(), 60)
print("⏱️ --- Time: {:02d} minutes and {:02d} seconds --- ⏱️".format(int(minutes), int(seconds)))