## 🔢 App Classification

In [55]:
from sklearn.metrics               import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
from sklearn.model_selection       import train_test_split
from sklearn.preprocessing         import LabelEncoder
from sklearn.neural_network        import MLPClassifier
from sklearn.linear_model          import LogisticRegression
from sklearn                       import svm
from dotenv                        import load_dotenv
import pandas                      as pd
import numpy                       as np
import datetime
import joblib
import ast


import warnings
warnings.filterwarnings("ignore")

#### Initialization

In [56]:
# Fixed seed constant.
# NOTE(review): no cell visible in this notebook reads it — confirm whether it is still needed.
RANDOM_SEED = 151836

In [57]:
# Take the timestamp once so the printed start time and the timer origin
# used by the final cell are exactly the same instant.
startTime = datetime.datetime.now()
print("⚡ Start - {} ⚡\n".format(startTime))

⚡ Start - 2024-07-20 23:54:03.370808 ⚡



#### 📥 1) Load Data 

In [58]:
# Input dataset: exactly one DATA_PATH assignment should be active at a time.
#DATA_PATH = "../TmpData/3_MalCatSet_Embeddings.csv"
#DATA_PATH = "../TmpData/4_MudFlow_Embeddings.csv"
DATA_PATH = "../TmpData/5_NewMal_Embeddings.csv"

# Read the CSV into a DataFrame (one row per app)
appsDF = pd.read_csv(DATA_PATH)

# Report how many rows were loaded
print("--- #️⃣ Apps: {} ".format(appsDF.shape[0]))

--- #️⃣ Apps: 128 


In [59]:
# Preview the first five rows as loaded from the CSV
appsDF.head(5)

Unnamed: 0,sha256,pkgName,description,embedding
0,093EBD1938A3FD17C91FBB34AED59E53F927EDBCE36865...,com.zoner.android.security,app work mobile device tablet switch layout se...,"[-0.00041017646435648203, 0.008432715199887753..."
1,4156C9E0A3F5469E0A4B2BC39D6D58EEAD89B8C5AC951A...,com.crescentmoongames.monkeyboxing,enter monkey box league monkey box step inside...,"[0.008457562886178493, 0.010708541609346867, -..."
2,701B6E746DFB5521C8FD70662945A1AA92980F371662C3...,com.roamingsoft.manager,wifi connection manager scanner manager connec...,"[-0.029483338817954063, 0.0008390073198825121,..."
3,9739C87F05E3AAE3B1B600F4CC4B01DBBCE16B92AAC414...,com.zeptolab.ctrexperiments.google.paid,experiment feed om candy cut rope like never e...,"[0.03246341645717621, 0.009713522158563137, -0..."
4,13100B7D640C539372B744C80C117600182D7F40109B34...,org.cohortor.gstrings,chromatic tuner application measure sound pitc...,"[-0.007262952160090208, 0.015478202141821384, ..."


#### 2) Reorganize the data a bit

In [60]:
def convertToNumpyArray(arrayStr):
	"""
	Convert the string representation of a list (e.g. "[0.1, 0.2]") into a NumPy array.

	Values that are already a NumPy array, list or tuple are returned as an
	array without parsing, so the conversion cell can be re-run on a column
	that has already been converted.

	Parameters:
		arrayStr : string holding a Python list literal, or an already parsed
		           array / list / tuple

	Returns:
		numpy.ndarray built from the parsed values

	Raises:
		ValueError / SyntaxError : propagated from ast.literal_eval when the
		                           value is not a valid Python literal
	"""
	# Already parsed: ast.literal_eval would reject a non-string, so pass it through
	if isinstance(arrayStr, (np.ndarray, list, tuple)):
		return np.asarray(arrayStr)

	# literal_eval only evaluates Python literals, never arbitrary expressions
	return np.array(ast.literal_eval(arrayStr))

# Apply the conversion function to every value of the 'embedding' column.
# The column is overwritten, so afterwards it holds NumPy arrays instead of strings.
appsDF['embedding'] = appsDF['embedding'].apply(convertToNumpyArray)

In [61]:
# Preview the first five rows after the embedding column has been converted
appsDF.head(5)

Unnamed: 0,sha256,pkgName,description,embedding
0,093EBD1938A3FD17C91FBB34AED59E53F927EDBCE36865...,com.zoner.android.security,app work mobile device tablet switch layout se...,"[-0.00041017646435648203, 0.008432715199887753..."
1,4156C9E0A3F5469E0A4B2BC39D6D58EEAD89B8C5AC951A...,com.crescentmoongames.monkeyboxing,enter monkey box league monkey box step inside...,"[0.008457562886178493, 0.010708541609346867, -..."
2,701B6E746DFB5521C8FD70662945A1AA92980F371662C3...,com.roamingsoft.manager,wifi connection manager scanner manager connec...,"[-0.029483338817954063, 0.0008390073198825121,..."
3,9739C87F05E3AAE3B1B600F4CC4B01DBBCE16B92AAC414...,com.zeptolab.ctrexperiments.google.paid,experiment feed om candy cut rope like never e...,"[0.03246341645717621, 0.009713522158563137, -0..."
4,13100B7D640C539372B744C80C117600182D7F40109B34...,org.cohortor.gstrings,chromatic tuner application measure sound pitc...,"[-0.007262952160090208, 0.015478202141821384, ..."


#### 3) Predict with the Trained Model

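Build X (the embedding matrix) and load the fitted label encoder. No labels (Y) or train/test split are needed here, because the model is loaded already trained.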

In [62]:
# Stack the per-app embedding arrays into one NumPy matrix (one row per app)
X = np.vstack(appsDF['embedding'].tolist())
print("--- 📐 X Shape : {}".format(X.shape))

# Load the previously saved label encoder from disk (it is not created or fitted here)
labelEncoder = joblib.load("../TmpData/labelEncoder.joblib")

# Encoded value that the loaded encoder assigns to the special "NO_CLASS" label
NO_CLASS_VALUE = labelEncoder.transform(["NO_CLASS"])[0]
print("NO_CLASS Special Value:", NO_CLASS_VALUE)

--- 📐 X Shape : (128, 1536)
NO_CLASS Special Value: 30


In [63]:
def evaluateModel(modelName, Y_test, Y_pred, threshold):
	"""
	Print accuracy and weighted precision / recall / F1 for a set of predictions.

	Samples whose true label or predicted label equals NO_CLASS_VALUE are
	excluded before the metrics are computed.

	Parameters:
		modelName : name shown in the report
		Y_test    : true encoded labels (array-like)
		Y_pred    : predicted encoded labels (array-like, same length as Y_test)
		threshold : confidence threshold shown in the report (display only,
		            it is not applied here)

	Returns:
		None. The report is printed.
	"""
	# Coerce to arrays: with plain lists `!=` is not element-wise and the mask breaks
	Y_test = np.asarray(Y_test)
	Y_pred = np.asarray(Y_pred)

	# Keep only the samples where neither side is NO_CLASS
	keep = (Y_pred != NO_CLASS_VALUE) & (Y_test != NO_CLASS_VALUE)
	Y_test_filtered = Y_test[keep]
	Y_pred_filtered = Y_pred[keep]

	# Metrics are undefined when the filter removed every sample: report it instead of failing
	if Y_test_filtered.size == 0:
		print("--- 📊 Model Evaluation Metrics 📊 ---")
		print("--- Model                    : {}".format(modelName))
		print("--- Confidence Threshold     : {:.2f}".format(threshold))
		print("--- Metrics: not available (no sample left after removing NO_CLASS)")
		print("--"*20 + "\n")
		return

	# Compute metrics only on filtered labels.
	# zero_division=0 makes explicit the score used for classes without predictions.
	accuracy  = accuracy_score(Y_test_filtered, Y_pred_filtered)
	precision = precision_score(Y_test_filtered, Y_pred_filtered, average='weighted', zero_division=0)
	recall    = recall_score(Y_test_filtered, Y_pred_filtered, average='weighted', zero_division=0)
	f1        = f1_score(Y_test_filtered, Y_pred_filtered, average='weighted', zero_division=0)

	# Print metrics in a well-formatted way
	print("--- 📊 Model Evaluation Metrics 📊 ---")
	print("--- Model                    : {}".format(modelName))
	print("--- Confidence Threshold     : {:.2f}".format(threshold))
	print("--- Metrics:")
	print("------ Accuracy              : {:.4f}".format(accuracy))
	print("------ Precision (weighted)  : {:.4f}".format(precision))
	print("------ Recall (weighted)     : {:.4f}".format(recall))
	print("------ F1 Score (weighted)   : {:.4f}".format(f1))
	print("--"*20 + "\n")

def applyThreshold(probabilities, threshold, classes=None, noClassValue=None):
	"""
	Turn per-class probabilities into encoded labels, using the NO_CLASS value
	for rows whose highest probability is below `threshold`.

	Parameters:
		probabilities : 2-D array-like, one row per sample and one column per
		                class known to the model
		threshold     : minimum top probability required to accept a prediction
		classes       : optional sequence giving the encoded label of each
		                probability column (e.g. the model's `classes_`). When
		                given, it is used to map columns to labels exactly.
		noClassValue  : encoded value of the NO_CLASS label; defaults to the
		                notebook-level NO_CLASS_VALUE

	Returns:
		numpy.ndarray with one encoded label per row

	Notes:
		Without `classes`, column indices are shifted to skip the NO_CLASS slot.
		This assumes the model was trained on every encoder class except
		NO_CLASS — TODO confirm against the model's `classes_`.
		The shift must start AT the NO_CLASS index (>=): the column at that
		index already belongs to the next class. The former `>` comparison
		left that column mapped to NO_CLASS itself.
	"""
	if noClassValue is None:
		noClassValue = NO_CLASS_VALUE

	probs = np.asarray(probabilities, dtype=float)
	# Nothing to classify: return an empty label array instead of failing on the axis reduction
	if probs.size == 0:
		return np.array([], dtype=int)

	topIndex = probs.argmax(axis=1)
	topProb  = probs.max(axis=1)

	if classes is not None:
		# Exact mapping from probability column to encoded label
		labels = np.asarray(classes)[topIndex]
	else:
		# Skip the slot reserved for NO_CLASS in the encoder's numbering
		labels = np.where(topIndex >= noClassValue, topIndex + 1, topIndex)

	# Low-confidence rows fall back to NO_CLASS
	return np.where(topProb < threshold, noClassValue, labels)

In [64]:
# Load the previously saved classifier from disk and show its configuration.
# NOTE: joblib.load unpickles the file, so only load artifacts from a trusted source.
model = joblib.load("../TmpData/svmModel.joblib")
print(model)

SVC(C=1, gamma=1, kernel='sigmoid', probability=True)


In [65]:
# Single confidence threshold: rows whose highest probability is below it become NO_CLASS
confidfenceThreshold = 0.5

# Class probabilities for the entire dataset
allProbablities = model.predict_proba(X)

# Apply the threshold to obtain one encoded label per app
Y_pred = applyThreshold(allProbablities, confidfenceThreshold)

# Convert numeric predictions back to labels
predictedLabels = labelEncoder.inverse_transform(Y_pred)

#### 4) Save Results

In [66]:
# Add the decoded predictions to the DataFrame as a new "classID" column
appsDF["classID"] = predictedLabels

In [67]:
# Preview the first five rows, now including the classID column
appsDF.head(5)

Unnamed: 0,sha256,pkgName,description,embedding,classID
0,093EBD1938A3FD17C91FBB34AED59E53F927EDBCE36865...,com.zoner.android.security,app work mobile device tablet switch layout se...,"[-0.00041017646435648203, 0.008432715199887753...",Antivirus
1,4156C9E0A3F5469E0A4B2BC39D6D58EEAD89B8C5AC951A...,com.crescentmoongames.monkeyboxing,enter monkey box league monkey box step inside...,"[0.008457562886178493, 0.010708541609346867, -...",NO_CLASS
2,701B6E746DFB5521C8FD70662945A1AA92980F371662C3...,com.roamingsoft.manager,wifi connection manager scanner manager connec...,"[-0.029483338817954063, 0.0008390073198825121,...",NO_CLASS
3,9739C87F05E3AAE3B1B600F4CC4B01DBBCE16B92AAC414...,com.zeptolab.ctrexperiments.google.paid,experiment feed om candy cut rope like never e...,"[0.03246341645717621, 0.009713522158563137, -0...",NO_CLASS
4,13100B7D640C539372B744C80C117600182D7F40109B34...,org.cohortor.gstrings,chromatic tuner application measure sound pitc...,"[-0.007262952160090208, 0.015478202141821384, ...",NO_CLASS


In [68]:
# Count how many apps were assigned to each value of 'classID'
counts = appsDF['classID'].value_counts()

# Display the counts
print(counts)

classID
NO_CLASS                   109
PhotoEditor                  3
Antivirus                    2
BarcodeAndQRcodeScanner      2
Dialer                       2
Wallpaper                    1
Keyboard                     1
ExpenseTracker               1
TravelGuide                  1
FileManager                  1
News                         1
VideoPlayer                  1
Banking                      1
Calculator                   1
Alarm                        1
Name: count, dtype: int64


In [69]:
# How many apps ended up without a class, and what share of the dataset that is
totalApps = len(appsDF)
noClassCount = appsDF['classID'].eq('NO_CLASS').sum()
noClassPercentage = (noClassCount / totalApps) * 100

# Everything else was assigned to a real class
remainingCount = totalApps - noClassCount
remainingPercentage = 100 - noClassPercentage

# Report: unclassified apps first, then classified apps
print(f"Number of occurrences of 'NO_CLASS'            : {noClassCount}")
print(f"Percentage of occurrences of 'NO_CLASS'        : {noClassPercentage:.2f}%")

print(f"Number of occurrences of remaining classes     : {remainingCount}")
print(f"Percentage of occurrences of remaining classes : {remainingPercentage:.2f}%")

Number of occurrences of 'NO_CLASS'            : 109
Percentage of occurrences of 'NO_CLASS'        : 85.16%
Number of occurrences of remaining classes     : 19
Percentage of occurrences of remaining classes : 14.84%


In [70]:
# Keep only the apps that received a real class, ordered by class name
isClassified = appsDF['classID'].ne('NO_CLASS')
appsDF = appsDF.loc[isClassified].sort_values('classID')

In [71]:
# Derive the output file from DATA_PATH so that switching datasets cannot write
# the labels to the wrong file:
#   "<n>_<Name>_Embeddings.csv"  ->  "<n>_<Name>_NewLabels.csv"
OUTPUT_PATH = DATA_PATH.replace("_Embeddings.csv", "_NewLabels.csv")
assert OUTPUT_PATH != DATA_PATH, "DATA_PATH does not contain '_Embeddings.csv'; refusing to overwrite the input file"

# errors='ignore' keeps this cell re-runnable once the columns have already been dropped
appsDF = appsDF.drop(columns=['description','embedding'], errors='ignore')
appsDF.to_csv(OUTPUT_PATH, index=False)

##### 🔚 End

In [72]:
endTime = datetime.datetime.now()
print("\n🔚 --- End - {} --- 🔚".format(endTime))

# endTime and startTime are datetime objects; their difference is a timedelta,
# which is split here into whole minutes and leftover seconds
totalTime = endTime - startTime
minutes, seconds = divmod(totalTime.total_seconds(), 60)
print("⏱️ --- Time: {:02d} minutes and {:02d} seconds --- ⏱️".format(int(minutes), int(seconds)))


🔚 --- End - 2024-07-20 23:54:04.196988 --- 🔚
⏱️ --- Time: 00 minutes and 00 seconds --- ⏱️
