## 🔢 App Classification

In [19]:
from sklearn.metrics               import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
from sklearn.model_selection       import train_test_split
from sklearn.preprocessing         import LabelEncoder
from sklearn.neural_network        import MLPClassifier
from sklearn.linear_model          import LogisticRegression
from sklearn                       import svm
from dotenv                        import load_dotenv
import pandas                      as pd
import numpy                       as np
import datetime
import joblib
import ast


import warnings
warnings.filterwarnings("ignore")

#### Initialization

In [20]:
RANDOM_SEED = 151836

In [21]:
print("⚡ Start - {} ⚡\n".format(datetime.datetime.now()))
startTime = datetime.datetime.now()

⚡ Start - 2024-07-20 22:26:53.615906 ⚡



#### 📥 1) Load Data 

In [22]:
#DATA_PATH = "../TmpData/3_MalCatSet_Embeddings.csv"
DATA_PATH = "../TmpData/5_NewMal_Embeddings.csv"

# Read the data
appsDF = pd.read_csv(DATA_PATH)

print("--- #️⃣ Apps: {} ".format(appsDF.shape[0]))

--- #️⃣ Apps: 942 


In [23]:
appsDF.head(5)

Unnamed: 0,pkgName,description,embedding
0,es.aroundpixels.hsk3lite,worry know study chinese complicate bingo teac...,"[0.005222779233008623, -0.01788198947906494, -..."
1,com.onesports.score,expert information statistic completely new ex...,"[-0.01127683650702238, 0.0010928395204246044, ..."
2,com.perception.soc.en,play best classic empire online truly massivel...,"[0.02538859285414219, 0.04251297935843468, 0.0..."
3,im.token.app,reliable web digital wallet trust ten million ...,"[0.012788532301783562, 0.004980458877980709, -..."
4,com.Siyou.HD,download application free pilot r hd drone use...,"[0.012929912656545639, 0.030403099954128265, -..."


#### 2) Reorganize a bit the data

In [24]:
def convertToNumpyArray(arrayStr):
	# Convert the string representation to a list using ast.literal_eval
	arrayList = ast.literal_eval(arrayStr)
	# Convert the list to a numpy array
	return np.array(arrayList)

# Apply the conversion function to the column
appsDF['embedding'] = appsDF['embedding'].apply(convertToNumpyArray)

In [25]:
appsDF.head(5)

Unnamed: 0,pkgName,description,embedding
0,es.aroundpixels.hsk3lite,worry know study chinese complicate bingo teac...,"[0.005222779233008623, -0.01788198947906494, -..."
1,com.onesports.score,expert information statistic completely new ex...,"[-0.01127683650702238, 0.0010928395204246044, ..."
2,com.perception.soc.en,play best classic empire online truly massivel...,"[0.02538859285414219, 0.04251297935843468, 0.0..."
3,im.token.app,reliable web digital wallet trust ten million ...,"[0.012788532301783562, 0.004980458877980709, -..."
4,com.Siyou.HD,download application free pilot r hd drone use...,"[0.012929912656545639, 0.030403099954128265, -..."


#### 3) Train

Get X (Data), Y (Labels) And split.

In [26]:
# Convert the list of arrays into a NumPy matrix
X = np.vstack(appsDF['embedding'].tolist())
print("--- 📐 X Shape : {}".format(X.shape))

# Create an instance of LabelEncoder using a special value for NO CLASS
labelEncoder = joblib.load("../TmpData/labelEncoder.joblib")

NO_CLASS_VALUE = labelEncoder.transform(["NO_CLASS"])[0]
print("NO_CLASS Special Value:", NO_CLASS_VALUE)

--- 📐 X Shape : (942, 1536)
NO_CLASS Special Value: 30


In [27]:
# TO evaluate the model
def evaluateModel(modelName, Y_test, Y_pred, threshold):

	mask = (Y_pred != NO_CLASS_VALUE) & (Y_test != NO_CLASS_VALUE)
	Y_test_filtered = Y_test[mask]
	Y_pred_filtered = Y_pred[mask]

	# print(Y_test)
	# print(Y_test_filtered)
	# print(Y_pred)
	# print(Y_pred_filtered)

	# Compute metrics only on filtered labels
	accuracy  = accuracy_score(Y_test_filtered, Y_pred_filtered)
	precision = precision_score(Y_test_filtered, Y_pred_filtered, average='weighted')
	recall    = recall_score(Y_test_filtered, Y_pred_filtered, average='weighted')
	f1        = f1_score(Y_test_filtered, Y_pred_filtered, average='weighted')

	# Print metrics in a well-formatted way
	print("--- 📊 Model Evaluation Metrics 📊 ---")
	print("--- Model                    : {}".format(modelName))
	print("--- Confidence Threshold     : {:.2f}".format(threshold))
	print("--- Metrics:")
	print("------ Accuracy              : {:.4f}".format(accuracy))
	print("------ Precision (weighted)  : {:.4f}".format(precision))
	print("------ Recall (weighted)     : {:.4f}".format(recall))
	print("------ F1 Score (weighted)   : {:.4f}".format(f1))
	print("--"*20 + "\n")

def applyThreshold(probabilities, threshold):
	Y_pred = []

	for prob in probabilities:
		if np.max(prob) < threshold:
			Y_pred.append(NO_CLASS_VALUE)
		else:
			# Get the index of the maximum probability
			predicted_class = np.argmax(prob)
			# To fix index mismatch
			if(predicted_class) > NO_CLASS_VALUE:
				predicted_class += 1
			Y_pred.append(predicted_class)
	
	return np.array(Y_pred)

In [28]:
model = joblib.load("../TmpData/svmModel.joblib")
print(model)

SVC(C=1, gamma=1, kernel='sigmoid', probability=True)


In [29]:
# Define a range of confidence thresholds
confidfenceThreshold = 0.5

# For entire Dataset
allProbablities = model.predict_proba(X)

# Use threshold
Y_pred = applyThreshold(allProbablities, confidfenceThreshold)

# Convert numeric predictions back to labels
predictedLabels = labelEncoder.inverse_transform(Y_pred)

### 4) Save Results

In [30]:
# Add the predictions to the DataFrame
appsDF["classID"] = predictedLabels

In [31]:
appsDF.head(5)

Unnamed: 0,pkgName,description,embedding,classID
0,es.aroundpixels.hsk3lite,worry know study chinese complicate bingo teac...,"[0.005222779233008623, -0.01788198947906494, -...",LanguageLearning
1,com.onesports.score,expert information statistic completely new ex...,"[-0.01127683650702238, 0.0010928395204246044, ...",NO_CLASS
2,com.perception.soc.en,play best classic empire online truly massivel...,"[0.02538859285414219, 0.04251297935843468, 0.0...",NO_CLASS
3,im.token.app,reliable web digital wallet trust ten million ...,"[0.012788532301783562, 0.004980458877980709, -...",NO_CLASS
4,com.Siyou.HD,download application free pilot r hd drone use...,"[0.012929912656545639, 0.030403099954128265, -...",NO_CLASS


In [32]:
# Count values in column 'A'
counts = appsDF['classID'].value_counts()

# Display the counts
print(counts)

classID
NO_CLASS                   679
Vpn                         54
Translator                  43
Investment                  20
SmartHome                   17
LanguageLearning            16
Banking                     14
News                        12
Antivirus                   11
BooksReader                 10
Weather                      6
PhotoEditor                  5
Shopping                     5
RemoteControl                5
Browser                      4
Wallpaper                    4
Calendar                     4
Launcher                     4
Airlines                     3
Radio                        3
Astrology                    3
Dating                       3
Messenger                    2
BarcodeAndQRcodeScanner      2
Drawing                      2
Notepad                      2
Recipes                      2
FoodDelivery                 2
PublicTransit                2
Dialer                       1
JobSearch                    1
Streaming                    1


In [33]:
# Count occurrences of 'NO_CLASS' and calculate percentage
noClassCount = (appsDF['classID'] == 'NO_CLASS').sum()
noClassPercentage = (noClassCount / len(appsDF)) * 100

# Calculate counts and percentages for remaining classes
remainingCount = len(appsDF) - noClassCount
remainingPercentage = 100 - noClassPercentage

# Print results for 'NO_CLASS'
print(f"Number of occurrences of 'NO_CLASS'            : {noClassCount}")
print(f"Percentage of occurrences of 'NO_CLASS'        : {noClassPercentage:.2f}%")

# Print results for remaining classes
print(f"Number of occurrences of remaining classes     : {remainingCount}")
print(f"Percentage of occurrences of remaining classes : {remainingPercentage:.2f}%")

Number of occurrences of 'NO_CLASS'            : 679
Percentage of occurrences of 'NO_CLASS'        : 72.08%
Number of occurrences of remaining classes     : 263
Percentage of occurrences of remaining classes : 27.92%


In [34]:
# Drop rows where classID is 'NO_CLASS'
appsDF = appsDF[appsDF['classID'] != 'NO_CLASS']

# Order the DataFrame by classID
appsDF = appsDF.sort_values(by='classID')

In [35]:
# appsDF = appsDF.drop(columns=['description','embedding'])
# appsDF.to_csv("../TmpData/3_MalCatSet_NewLabels.csv", index=False)

appsDF = appsDF.drop(columns=['description','embedding'])
appsDF.to_csv("../TmpData/5_NewMal_NewLabels.csv", index=False)

##### 🔚 End

In [36]:
endTime = datetime.datetime.now()
print("\n🔚 --- End - {} --- 🔚".format(endTime))

# Assuming endTime and startTime are in seconds
totalTime = endTime - startTime
minutes = totalTime.total_seconds() // 60
seconds = totalTime.total_seconds() % 60
print("⏱️ --- Time: {:02d} minutes and {:02d} seconds --- ⏱️".format(int(minutes), int(seconds)))


🔚 --- End - 2024-07-20 22:26:58.457485 --- 🔚
⏱️ --- Time: 00 minutes and 04 seconds --- ⏱️
