## 🔢 App Classification

In [1]:
from sklearn.metrics               import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
from sklearn.model_selection       import train_test_split
from sklearn.preprocessing         import LabelEncoder
from sklearn.neural_network        import MLPClassifier
from sklearn.linear_model          import LogisticRegression
from sklearn                       import svm
from dotenv                        import load_dotenv
import pandas                      as pd
import numpy                       as np
import datetime
import joblib
import ast


import warnings
warnings.filterwarnings("ignore")

#### Initialization

In [2]:
# Fixed seed value.
# NOTE(review): no cell visible in this notebook reads RANDOM_SEED — presumably kept
# for parity with sibling notebooks; confirm it is still needed.
RANDOM_SEED = 151836

In [3]:
# Read the clock once, so the start time shown to the user and the value used
# later for the elapsed-time computation are the very same instant.
startTime = datetime.datetime.now()
print("⚡ Start - {} ⚡\n".format(startTime))

⚡ Start - 2024-07-20 18:58:10.323285 ⚡



#### 📥 1) Load Data 

In [4]:
# CSV with one row per app; the embedding column is still stored as text at this point
DATA_PATH = "../TmpData/3_MalCatSet_Embeddings.csv"

# Load every row into a DataFrame
appsDF = pd.read_csv(filepath_or_buffer=DATA_PATH)

# The number of rows is reported as the number of apps
print("--- #️⃣ Apps: {} ".format(len(appsDF)))

--- #️⃣ Apps: 512 


In [5]:
appsDF.head(5)

Unnamed: 0,sha256,pkgName,classID,description,embedding
0,24D3490CF23842A791CBB5B10F1427808F4B163F9C4927...,com.rytong.airchina,Airlines,air china national flag carrier offer commerci...,"[0.016700733453035355, -0.028782714158296585, ..."
1,2D3D869A1DF82ACDCABBB08277ADECB8E5B64DB4DB516C...,com.rytong.hnair,Airlines,airline company profile airline hold co ltd he...,"[0.004994473420083523, -0.0420411080121994, 0...."
2,238396FF004CF5505CC1FB783C2671D6E89867235D2846...,com.fsecure.eicar.antivirus.test,Antivirus,test safely antivirus app work note antivirus ...,"[-0.015152397565543652, -0.0017729764804244041..."
3,9937BC3FADCD8A177D78CD14217C6B81F7AAE687557751...,com.kbzbank.kpaycustomer,Banking,mobile wallet power bank safer simpler conveni...,"[0.014227830804884434, -0.009698555804789066, ..."
4,7B18A91FEBBCAF7AE470DAA12942C54E1DFB62B010FAB2...,com.banofinancial.app,Banking,innovative australia spend request save invest...,"[-0.008306749165058136, -0.0017330991104245186..."


#### 2) Reorganize the data a bit

In [6]:
# Rename the ground-truth column. The guard keeps this cell re-runnable: on a
# second execution "classID" no longer exists, so the rename is skipped instead
# of the cell failing with a KeyError.
if "classID" in appsDF.columns:
	appsDF = appsDF.rename(columns={"classID": "trueLabel"})

# Move the label column to the last position, keeping the other columns in their order
appsDF = appsDF[[col for col in appsDF.columns if col != "trueLabel"] + ["trueLabel"]]

In [7]:
def convertToNumpyArray(arrayStr):
	"""
	Turn one cell of the 'embedding' column into a NumPy array.

	Parameters:
		arrayStr : the textual form of a Python list of numbers (e.g. "[0.1, -0.2]"),
		           or a value that was already converted (list, tuple or np.ndarray).

	Returns:
		np.ndarray holding the parsed values.

	Raises:
		ValueError / SyntaxError : propagated from ast.literal_eval when the input
		is neither an already-converted sequence nor a valid Python literal.
	"""
	# Already converted, e.g. because the cell applying this function was run twice:
	# literal_eval rejects non-string input, so hand the value straight to NumPy.
	if isinstance(arrayStr, (list, tuple, np.ndarray)):
		return np.asarray(arrayStr)

	# literal_eval only evaluates Python literals, never arbitrary expressions
	arrayList = ast.literal_eval(arrayStr)
	return np.array(arrayList)

# Apply the conversion function to every cell of the 'embedding' column.
# The column is overwritten: after this line it holds NumPy arrays instead of text.
appsDF['embedding'] = appsDF['embedding'].apply(convertToNumpyArray)

In [8]:
appsDF.head(5)

Unnamed: 0,sha256,pkgName,description,embedding,trueLabel
0,24D3490CF23842A791CBB5B10F1427808F4B163F9C4927...,com.rytong.airchina,air china national flag carrier offer commerci...,"[0.016700733453035355, -0.028782714158296585, ...",Airlines
1,2D3D869A1DF82ACDCABBB08277ADECB8E5B64DB4DB516C...,com.rytong.hnair,airline company profile airline hold co ltd he...,"[0.004994473420083523, -0.0420411080121994, 0....",Airlines
2,238396FF004CF5505CC1FB783C2671D6E89867235D2846...,com.fsecure.eicar.antivirus.test,test safely antivirus app work note antivirus ...,"[-0.015152397565543652, -0.0017729764804244041...",Antivirus
3,9937BC3FADCD8A177D78CD14217C6B81F7AAE687557751...,com.kbzbank.kpaycustomer,mobile wallet power bank safer simpler conveni...,"[0.014227830804884434, -0.009698555804789066, ...",Banking
4,7B18A91FEBBCAF7AE470DAA12942C54E1DFB62B010FAB2...,com.banofinancial.app,innovative australia spend request save invest...,"[-0.008306749165058136, -0.0017330991104245186...",Banking


#### 3) Classify

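Get X (data) and load the fitted label encoder. The model is loaded already trained and only applied, so no train/test split is done here.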

In [9]:
# Stack the per-app embedding arrays row-wise into a single 2-D NumPy matrix
X = np.vstack(appsDF['embedding'].tolist())
print("--- 📐 X Shape : {}".format(X.shape))

# Load the label encoder from disk; it is used as-is (nothing is fitted in this notebook).
# NOTE(review): joblib.load unpickles the file and can execute arbitrary code — load trusted artifacts only.
labelEncoder = joblib.load("../TmpData/labelEncoder.joblib")

# Encoded integer standing for the special "NO_CLASS" label; applyThreshold uses it
# as the prediction when the top probability is below the confidence threshold
NO_CLASS_VALUE = labelEncoder.transform(["NO_CLASS"])[0]
print("NO_CLASS Special Value:", NO_CLASS_VALUE)

--- 📐 X Shape : (512, 1536)
NO_CLASS Special Value: 30


In [10]:
# Evaluate the predictions of a model
def evaluateModel(modelName, Y_test, Y_pred, threshold, noClassValue=None):
	"""
	Print accuracy and weighted precision / recall / F1 for a set of predictions.

	Samples whose true label or predicted label equals the NO_CLASS value are
	excluded before scoring.

	Parameters:
		modelName    : name printed in the report header.
		Y_test       : encoded true labels (any array-like).
		Y_pred       : encoded predicted labels (array-like, same length as Y_test).
		threshold    : confidence threshold; only printed, not used for filtering.
		noClassValue : encoded NO_CLASS label. Defaults to the notebook-level NO_CLASS_VALUE.

	Returns:
		None. The report is written to stdout.
	"""
	if noClassValue is None:
		noClassValue = NO_CLASS_VALUE

	# Lists would break the element-wise comparison and the boolean-mask indexing
	# below, so work on NumPy arrays whatever the caller passed in.
	Y_test = np.asarray(Y_test)
	Y_pred = np.asarray(Y_pred)

	mask = (Y_pred != noClassValue) & (Y_test != noClassValue)
	Y_test_filtered = Y_test[mask]
	Y_pred_filtered = Y_pred[mask]

	print("--- 📊 Model Evaluation Metrics 📊 ---")
	print("--- Model                    : {}".format(modelName))
	print("--- Confidence Threshold     : {:.2f}".format(threshold))

	# Every sample was NO_CLASS on one side or the other: there is nothing to score
	if Y_test_filtered.size == 0:
		print("--- Metrics: not available (no sample left after removing NO_CLASS)")
		print("--"*20 + "\n")
		return

	# Compute metrics only on filtered labels
	accuracy  = accuracy_score(Y_test_filtered, Y_pred_filtered)
	precision = precision_score(Y_test_filtered, Y_pred_filtered, average='weighted')
	recall    = recall_score(Y_test_filtered, Y_pred_filtered, average='weighted')
	f1        = f1_score(Y_test_filtered, Y_pred_filtered, average='weighted')

	# Print metrics in a well-formatted way
	print("--- Metrics:")
	print("------ Accuracy              : {:.4f}".format(accuracy))
	print("------ Precision (weighted)  : {:.4f}".format(precision))
	print("------ Recall (weighted)     : {:.4f}".format(recall))
	print("------ F1 Score (weighted)   : {:.4f}".format(f1))
	print("--"*20 + "\n")

def applyThreshold(probabilities, threshold, noClassValue=None):
	"""
	Turn per-class probabilities into encoded labels, using NO_CLASS when the model is unsure.

	Parameters:
		probabilities : 2-D array-like, one row per sample and one column per class
		                of the classifier.
		threshold     : minimum top probability required to keep the prediction.
		noClassValue  : encoded NO_CLASS label. Defaults to the notebook-level NO_CLASS_VALUE.

	Returns:
		np.ndarray with one encoded label per row of `probabilities`.

	NOTE(review): the column -> label mapping assumes the classifier was trained on
	every encoded label except NO_CLASS, i.e. its classes are 0..noClassValue-1
	followed by noClassValue+1.. — confirm against model.classes_.
	"""
	if noClassValue is None:
		noClassValue = NO_CLASS_VALUE

	probs = np.asarray(probabilities)
	if probs.size == 0:
		return np.array([], dtype=int)

	# Column of the most likely class and its probability, per row
	topColumn = probs.argmax(axis=1)
	topProbability = probs.max(axis=1)

	# The classifier has no column for NO_CLASS, so every column at or after that
	# slot stands for the label one position further. The comparison has to be >=:
	# with a strict > the column sitting exactly at noClassValue would be returned
	# as NO_CLASS itself instead of the label that follows it.
	labels = np.where(topColumn >= noClassValue, topColumn + 1, topColumn)

	# Low-confidence rows are reported as NO_CLASS
	return np.where(topProbability < threshold, noClassValue, labels)

In [11]:
# Load the classifier from disk; it is used as-is below (no fitting happens in this notebook).
# NOTE(review): joblib.load unpickles the file and can execute arbitrary code — load trusted artifacts only.
model = joblib.load("../TmpData/svmModel.joblib")
print(model)

SVC(C=1, gamma=1, kernel='sigmoid', probability=True)


In [12]:
# Single confidence threshold: a prediction whose top probability is below it becomes NO_CLASS.
# NOTE(review): `confidfenceThreshold` and `allProbablities` are misspelled; the names are
# left untouched here so the notebook namespace does not change.
confidfenceThreshold = 0.5

# Class probabilities for the entire dataset
allProbablities = model.predict_proba(X)

# Use threshold to turn the probabilities into encoded labels
Y_pred = applyThreshold(allProbablities, confidfenceThreshold)

# Convert numeric predictions back to labels
predictedLabels = labelEncoder.inverse_transform(Y_pred)

#### 4) Save Results

In [13]:
# Add the predictions to the DataFrame as a new 'newClassID' column
# (decoded labels; low-confidence rows carry 'NO_CLASS')
appsDF["newClassID"] = predictedLabels

In [14]:
appsDF.head(5)

Unnamed: 0,sha256,pkgName,description,embedding,trueLabel,newClassID
0,24D3490CF23842A791CBB5B10F1427808F4B163F9C4927...,com.rytong.airchina,air china national flag carrier offer commerci...,"[0.016700733453035355, -0.028782714158296585, ...",Airlines,Airlines
1,2D3D869A1DF82ACDCABBB08277ADECB8E5B64DB4DB516C...,com.rytong.hnair,airline company profile airline hold co ltd he...,"[0.004994473420083523, -0.0420411080121994, 0....",Airlines,Airlines
2,238396FF004CF5505CC1FB783C2671D6E89867235D2846...,com.fsecure.eicar.antivirus.test,test safely antivirus app work note antivirus ...,"[-0.015152397565543652, -0.0017729764804244041...",Antivirus,Antivirus
3,9937BC3FADCD8A177D78CD14217C6B81F7AAE687557751...,com.kbzbank.kpaycustomer,mobile wallet power bank safer simpler conveni...,"[0.014227830804884434, -0.009698555804789066, ...",Banking,Banking
4,7B18A91FEBBCAF7AE470DAA12942C54E1DFB62B010FAB2...,com.banofinancial.app,innovative australia spend request save invest...,"[-0.008306749165058136, -0.0017330991104245186...",Banking,Banking


In [15]:
# How many apps were left unclassified ('NO_CLASS') and how many received a real class
noClassCount = appsDF['newClassID'].eq('NO_CLASS').sum()
remainingCount = len(appsDF) - noClassCount

# The same two groups expressed as a share of the whole dataset
noClassPercentage = (noClassCount / len(appsDF)) * 100
remainingPercentage = 100 - noClassPercentage

# Report the unclassified apps first
print(f"Number of occurrences of 'NO_CLASS'            : {noClassCount}")
print(f"Percentage of occurrences of 'NO_CLASS'        : {noClassPercentage:.2f}%")

# Then report everything else
print(f"Number of occurrences of remaining classes     : {remainingCount}")
print(f"Percentage of occurrences of remaining classes : {remainingPercentage:.2f}%")

Number of occurrences of 'NO_CLASS'            : 303
Percentage of occurrences of 'NO_CLASS'        : 59.18%
Number of occurrences of remaining classes     : 209
Percentage of occurrences of remaining classes : 40.82%


In [16]:
# Drop the bulky text / vector columns before saving. errors="ignore" keeps the cell
# re-runnable: appsDF is overwritten here, so on a second execution the columns are
# already gone and a plain drop would raise a KeyError.
appsDF = appsDF.drop(columns=['description','embedding'], errors="ignore")

# Write the remaining columns to CSV, without the DataFrame index
appsDF.to_csv("../TmpData/3_MalCatSet_NewLabels.csv", index=False)

##### 🔚 End

In [17]:
# Stamp the end of the run
endTime = datetime.datetime.now()
print("\n🔚 --- End - {} --- 🔚".format(endTime))

# endTime and startTime are datetime objects, so their difference is a timedelta;
# its length in seconds is split into whole minutes and leftover seconds
totalTime = endTime - startTime
minutes, seconds = divmod(totalTime.total_seconds(), 60)
print("⏱️ --- Time: {:02d} minutes and {:02d} seconds --- ⏱️".format(int(minutes), int(seconds)))


🔚 --- End - 2024-07-20 18:58:12.885079 --- 🔚
⏱️ --- Time: 00 minutes and 02 seconds --- ⏱️
