## 🔢 App Classification

In [75]:
from sklearn.metrics               import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
from sklearn.model_selection       import train_test_split
from sklearn.preprocessing         import LabelEncoder
from sklearn.neural_network        import MLPClassifier
from sklearn.linear_model          import LogisticRegression
from sklearn                       import svm
from dotenv                        import load_dotenv
import pandas                      as pd
import numpy                       as np
import datetime
import joblib
import ast


import warnings
warnings.filterwarnings("ignore")

#### Initialization

In [76]:
RANDOM_SEED = 151836

In [77]:
print("⚡ Start - {} ⚡\n".format(datetime.datetime.now()))
startTime = datetime.datetime.now()

⚡ Start - 2024-07-25 20:09:18.356527 ⚡



#### 📥 1) Load Data 

In [78]:
DATA_PATH = "../TmpData/5_NewMalCatSetEmbeddings.csv"

# Read the data
appsDF = pd.read_csv(DATA_PATH)
print("--- #️⃣ Apps: {} ".format(appsDF.shape[0]))

appsDF.head(5)

--- #️⃣ Apps: 2893 


Unnamed: 0,sha256,pkgName,source,BenjaminClass,description,embedding
0,001E53DDA101AF19670FBD7FB5D348B21641036240895F...,es.aroundpixels.hsk3lite,AZ,,new chinese level allow easily interactively d...,"[0.002189100254327059, -0.0016080982750281692,..."
1,002557D37607615083739ABEB8D6FDC05CD27EC93C1104...,com.onesports.score,AZ,,expert information statistic completely new ex...,"[-0.013837181963026524, 0.009949913248419762, ..."
2,00280B2E4FA921E570467396A380FA4248AFA10372B97E...,com.perception.soc.en,AZ,,play best classic empire online truly massivel...,"[0.029726574197411537, 0.03841918334364891, 0...."
3,09B598D9B39A3ADDF0EA884D95B29013F917A8E2419F62...,com.perception.soc.en,AZ,,play best classic empire online truly massivel...,"[0.02981792762875557, 0.038398627191782, 0.015..."
4,0E5FD3106C026C100956935842519D88039FCAA9A0BEB8...,com.perception.soc.en,AZ,,play best classic empire online truly massivel...,"[0.02981792762875557, 0.038398627191782, 0.015..."


In [79]:
# Load classes from Benjamin manual labellinh
benjaminDF = pd.read_csv("../../0_Data/3_MalCatSetBenjamin.csv")

# Merge dfA and dfB on the 'sha256' column
mergedDF = pd.merge(appsDF, benjaminDF[['sha256', 'classID']], on='sha256', how='left')

# The resulting DataFrame will have the 'classID' column from dfB
appsDF['BenjaminClass'] = mergedDF['classID']

#### 2) Reorganize a bit the data

In [80]:
def convertToNumpyArray(arrayStr):
	# Convert the string representation to a list using ast.literal_eval
	arrayList = ast.literal_eval(arrayStr)
	# Convert the list to a numpy array
	return np.array(arrayList)

# Apply the conversion function to the column
appsDF['embedding'] = appsDF['embedding'].apply(convertToNumpyArray)

In [81]:
appsDF.head(5)

Unnamed: 0,sha256,pkgName,source,BenjaminClass,description,embedding
0,001E53DDA101AF19670FBD7FB5D348B21641036240895F...,es.aroundpixels.hsk3lite,AZ,,new chinese level allow easily interactively d...,"[0.002189100254327059, -0.0016080982750281692,..."
1,002557D37607615083739ABEB8D6FDC05CD27EC93C1104...,com.onesports.score,AZ,,expert information statistic completely new ex...,"[-0.013837181963026524, 0.009949913248419762, ..."
2,00280B2E4FA921E570467396A380FA4248AFA10372B97E...,com.perception.soc.en,AZ,,play best classic empire online truly massivel...,"[0.029726574197411537, 0.03841918334364891, 0...."
3,09B598D9B39A3ADDF0EA884D95B29013F917A8E2419F62...,com.perception.soc.en,AZ,,play best classic empire online truly massivel...,"[0.02981792762875557, 0.038398627191782, 0.015..."
4,0E5FD3106C026C100956935842519D88039FCAA9A0BEB8...,com.perception.soc.en,AZ,,play best classic empire online truly massivel...,"[0.02981792762875557, 0.038398627191782, 0.015..."


#### 3) Train

Get X (Data), Y (Labels) And split.

In [82]:
# Convert the list of arrays into a NumPy matrix
X = np.vstack(appsDF['embedding'].tolist())
print("--- 📐 X Shape : {}".format(X.shape))

# Create an instance of LabelEncoder using a special value for NO CLASS
labelEncoder = joblib.load("../TmpData/labelEncoder.joblib")

NO_CLASS_VALUE = labelEncoder.transform(["NO_CLASS"])[0]
print("NO_CLASS Special Value:", NO_CLASS_VALUE)

--- 📐 X Shape : (2893, 1536)
NO_CLASS Special Value: 30


In [83]:
# TO evaluate the model
def evaluateModel(modelName, Y_test, Y_pred, threshold):

	mask = (Y_pred != NO_CLASS_VALUE) & (Y_test != NO_CLASS_VALUE)
	Y_test_filtered = Y_test[mask]
	Y_pred_filtered = Y_pred[mask]

	# print(Y_test)
	# print(Y_test_filtered)
	# print(Y_pred)
	# print(Y_pred_filtered)

	# Compute metrics only on filtered labels
	accuracy  = accuracy_score(Y_test_filtered, Y_pred_filtered)
	precision = precision_score(Y_test_filtered, Y_pred_filtered, average='weighted')
	recall    = recall_score(Y_test_filtered, Y_pred_filtered, average='weighted')
	f1        = f1_score(Y_test_filtered, Y_pred_filtered, average='weighted')

	# Print metrics in a well-formatted way
	print("--- 📊 Model Evaluation Metrics 📊 ---")
	print("--- Model                    : {}".format(modelName))
	print("--- Confidence Threshold     : {:.2f}".format(threshold))
	print("--- Metrics:")
	print("------ Accuracy              : {:.4f}".format(accuracy))
	print("------ Precision (weighted)  : {:.4f}".format(precision))
	print("------ Recall (weighted)     : {:.4f}".format(recall))
	print("------ F1 Score (weighted)   : {:.4f}".format(f1))
	print("--"*20 + "\n")

def applyThreshold(probabilities, threshold):
	Y_pred = []

	for prob in probabilities:
		if np.max(prob) < threshold:
			Y_pred.append(NO_CLASS_VALUE)
		else:
			# Get the index of the maximum probability
			predicted_class = np.argmax(prob)
			# To fix index mismatch
			if(predicted_class) > NO_CLASS_VALUE:
				predicted_class += 1
			Y_pred.append(predicted_class)
	
	return np.array(Y_pred)

In [84]:
model = joblib.load("../TmpData/svmModel.joblib")
print(model)

SVC(C=1, gamma=1, kernel='sigmoid', probability=True)


In [85]:
# Define a range of confidence thresholds
confidfenceThreshold = 0.65

# For entire Dataset
allProbablities = model.predict_proba(X)

# Use threshold
Y_pred = applyThreshold(allProbablities, confidfenceThreshold)

# Convert numeric predictions back to labels
predictedLabels = labelEncoder.inverse_transform(Y_pred)

### 4) Save Results

In [86]:
# Add the predictions to the DataFrame
appsDF["classID"] = predictedLabels

In [87]:
appsDF.head(5)

Unnamed: 0,sha256,pkgName,source,BenjaminClass,description,embedding,classID
0,001E53DDA101AF19670FBD7FB5D348B21641036240895F...,es.aroundpixels.hsk3lite,AZ,,new chinese level allow easily interactively d...,"[0.002189100254327059, -0.0016080982750281692,...",LanguageLearning
1,002557D37607615083739ABEB8D6FDC05CD27EC93C1104...,com.onesports.score,AZ,,expert information statistic completely new ex...,"[-0.013837181963026524, 0.009949913248419762, ...",NO_CLASS
2,00280B2E4FA921E570467396A380FA4248AFA10372B97E...,com.perception.soc.en,AZ,,play best classic empire online truly massivel...,"[0.029726574197411537, 0.03841918334364891, 0....",NO_CLASS
3,09B598D9B39A3ADDF0EA884D95B29013F917A8E2419F62...,com.perception.soc.en,AZ,,play best classic empire online truly massivel...,"[0.02981792762875557, 0.038398627191782, 0.015...",NO_CLASS
4,0E5FD3106C026C100956935842519D88039FCAA9A0BEB8...,com.perception.soc.en,AZ,,play best classic empire online truly massivel...,"[0.02981792762875557, 0.038398627191782, 0.015...",NO_CLASS


In [88]:
# Filtering rows where 'benjaminClass' is not NaN
filtered_appsDF = appsDF[appsDF['BenjaminClass'].notna()]

filtered_appsDF

Unnamed: 0,sha256,pkgName,source,BenjaminClass,description,embedding,classID
27,1A21267F408BE18CC6E927FBEC49AB6535DD6736137713...,im.token.app,AZ,Investment,feature rich digital wallet securely manage at...,"[0.015967801213264465, -0.005556922405958176, ...",NO_CLASS
37,009AACC5A4276BE4E16C6E5890784E2A230AF16476D27E...,com.Siyou.HD,AZ,RemoteControl,download application free pilot r hd drone use...,"[0.012928946875035763, 0.030400829389691353, -...",NO_CLASS
38,00ADEDC8E79A8DFD10589019DEAFE6FA04712AE22A6AB9...,dev.ovpn3jx.magnetic,AZ,Vpn,magnetic vpn pro free high quality core base a...,"[-0.022940730676054955, 0.008177200332283974, ...",NO_CLASS
51,00F223C81F0AEA7D61FA00EDC2746808F6898375CAC186...,net.sdsingle.tunnel,AZ,Vpn,data security online privacy provide clear voi...,"[0.025399068370461464, -0.006695758551359177, ...",NO_CLASS
52,4482A87D6ECFDA5FE6A8357B9685F027D39F7E265ACC11...,net.sdsingle.tunnel,AZ,Vpn,data security online privacy provide clear voi...,"[0.025441458448767662, -0.006706394720822573, ...",NO_CLASS
...,...,...,...,...,...,...,...
2218,B4F876917B530B88D649F8C89C59C313BA0E200A4B36C5...,app.vpn.prouaenet,AZ,Launcher,enjoy high speed vpn service lite website,"[-0.005240029189735651, -0.022593196481466293,...",Vpn
2219,B55A1B50CB94BC126E4EFF03CE1381AA9061CACFEE215D...,com.sayuran.edutika,AZ,Vpn,get complete family cook various recipe idea t...,"[0.009473497048020363, -0.01938089355826378, -...",Recipes
2231,D07F8AE471B726EC5CA26BCDBCFBFBC2BB6050B89F017A...,com.infinitevpn.org,AZ,Vpn,infinite vpn free vpn everyone server come ran...,"[0.01042257435619831, -0.03634800761938095, 0....",Vpn
2235,B78D97D2ABB7735D46FB3760C1D3AF5208805A07B22024...,com.pocketrewards.gamequiz,AZ,Vpn,game quiz trivia guess bring amazing memory ba...,"[0.005612443201243877, -0.0023694115225225687,...",NO_CLASS


In [89]:
# Count values in column 'A'
counts = appsDF['classID'].value_counts()

# Display the counts
print(counts)

classID
NO_CLASS                      2172
Vpn                            294
PhotoEditor                     45
Translator                      42
Wallpaper                       32
Antivirus                       31
Astrology                       20
Investment                      18
Dialer                          17
SmartHome                       16
Messenger                       15
Launcher                        15
Banking                         13
LanguageLearning                13
Dating                          12
Shopping                        11
Radio                           10
Weather                          9
News                             8
Streaming                        8
BarcodeAndQRcodeScanner          8
Calculator                       7
RemoteControl                    7
Recipes                          6
Keyboard                         5
BooksReader                      5
VideoPlayer                      5
Browser                          5
PublicTransi

In [90]:
# Count occurrences of 'NO_CLASS' and calculate percentage
noClassCount = (appsDF['classID'] == 'NO_CLASS').sum()
noClassPercentage = (noClassCount / len(appsDF)) * 100

# Calculate counts and percentages for remaining classes
remainingCount = len(appsDF) - noClassCount
remainingPercentage = 100 - noClassPercentage

# Print results for 'NO_CLASS'
print(f"Number of occurrences of 'NO_CLASS'            : {noClassCount}")
print(f"Percentage of occurrences of 'NO_CLASS'        : {noClassPercentage:.2f}%")

# Print results for remaining classes
print(f"Number of occurrences of remaining classes     : {remainingCount}")
print(f"Percentage of occurrences of remaining classes : {remainingPercentage:.2f}%")

Number of occurrences of 'NO_CLASS'            : 2172
Percentage of occurrences of 'NO_CLASS'        : 75.08%
Number of occurrences of remaining classes     : 721
Percentage of occurrences of remaining classes : 24.92%


In [91]:
# Drop rows where classID is 'NO_CLASS'
appsDF = appsDF[appsDF['classID'] != 'NO_CLASS']

# Order the DataFrame by classID
appsDF = appsDF.sort_values(by='classID')

In [92]:
appsDF = appsDF.drop(columns=['description','embedding','benjaminClass','source'])
appsDF.to_csv("../TmpData/5_NewMalCatSetNewLabels.csv", index=False)

##### 🔚 End

In [93]:
endTime = datetime.datetime.now()
print("\n🔚 --- End - {} --- 🔚".format(endTime))

# Assuming endTime and startTime are in seconds
totalTime = endTime - startTime
minutes = totalTime.total_seconds() // 60
seconds = totalTime.total_seconds() % 60
print("⏱️ --- Time: {:02d} minutes and {:02d} seconds --- ⏱️".format(int(minutes), int(seconds)))


🔚 --- End - 2024-07-25 20:09:31.968268 --- 🔚
⏱️ --- Time: 00 minutes and 13 seconds --- ⏱️
