## 🔢 App Classification

In [1]:
from sklearn.metrics               import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
from sklearn.model_selection       import train_test_split
from sklearn.preprocessing         import LabelEncoder
from sklearn.neural_network        import MLPClassifier
from sklearn.linear_model          import LogisticRegression
from sklearn                       import svm
from dotenv                        import load_dotenv
import xgboost                     as xgb
import pandas                      as pd
import numpy                       as np
import datetime
import ast
import os
import sys

#### Initialization

In [2]:
RANDOM_SEED = 151836

In [3]:
print("⚡ Start - {} ⚡\n".format(datetime.datetime.now()))
startTime = datetime.datetime.now()

⚡ Start - 2024-07-20 15:05:50.276816 ⚡



#### 📥 1) Load Data 

In [4]:
DATA_PATH = "../TmpData/0_AndroCatEmbeddings.csv"

# Read the data
appsDF = pd.read_csv(DATA_PATH)

print("--- #️⃣ Apps: {} ".format(appsDF.shape[0]))

--- #️⃣ Apps: 50 


In [5]:
appsDF.head(5)

Unnamed: 0,sha256,pkgName,classID,description,embedding
0,9B30837BD2474AC3623A43D052F7ADC4C63E4AA9981F0F...,my.android.calc,Calculator,handiness universal percentage calculator simp...,"[0.00032980614923872054, -0.00386972539126873,..."
1,686DE8D8A0D08992CB135BC7A0500D0109D9697A1140B8...,com.vpn.basiccalculator,Calculator,citizen calculator angel best mobile app world...,"[-0.004879472311586142, -0.005030232481658459,..."
2,A49864DCC90F6730569455BDFA39B4B7CF70AE0C34D656...,com.ba.fractioncalculator,Calculator,free offline fraction calculator support also ...,"[-0.03689534589648247, 0.0034220372326672077, ..."
3,85C80B7ED3799C04CBD107DD3004F91ED4DC2BF8D75C35...,de.sfr.calctape,Calculator,happens combine standard calculator app classi...,"[0.01341024786233902, -0.0020422476809471846, ..."
4,66442DEF269FF22FB4177759343D973DA1A6A3AFE2ECE3...,com.digitalchemy.calculator.freefraction,Calculator,fraction calculator plus best way deal everyda...,"[-0.014099321328103542, 0.0014035949716344476,..."


#### 2) Reorganize a bit the data

In [6]:
# Reorder columns
appsDF = appsDF[[col for col in appsDF.columns if col != "classID"] + ["classID"]]

# Rename the column
appsDF.rename(columns={"classID": "trueLabel"}, inplace=True)

In [7]:
def convertToNumpyArray(arrayStr):
	# Convert the string representation to a list using ast.literal_eval
	arrayList = ast.literal_eval(arrayStr)
	# Convert the list to a numpy array
	return np.array(arrayList)

# Apply the conversion function to the column
appsDF['embedding'] = appsDF['embedding'].apply(convertToNumpyArray)

In [8]:
appsDF.head(5)

Unnamed: 0,sha256,pkgName,description,embedding,trueLabel
0,9B30837BD2474AC3623A43D052F7ADC4C63E4AA9981F0F...,my.android.calc,handiness universal percentage calculator simp...,"[0.00032980614923872054, -0.00386972539126873,...",Calculator
1,686DE8D8A0D08992CB135BC7A0500D0109D9697A1140B8...,com.vpn.basiccalculator,citizen calculator angel best mobile app world...,"[-0.004879472311586142, -0.005030232481658459,...",Calculator
2,A49864DCC90F6730569455BDFA39B4B7CF70AE0C34D656...,com.ba.fractioncalculator,free offline fraction calculator support also ...,"[-0.03689534589648247, 0.0034220372326672077, ...",Calculator
3,85C80B7ED3799C04CBD107DD3004F91ED4DC2BF8D75C35...,de.sfr.calctape,happens combine standard calculator app classi...,"[0.01341024786233902, -0.0020422476809471846, ...",Calculator
4,66442DEF269FF22FB4177759343D973DA1A6A3AFE2ECE3...,com.digitalchemy.calculator.freefraction,fraction calculator plus best way deal everyda...,"[-0.014099321328103542, 0.0014035949716344476,...",Calculator


#### 3) Train

Get X (Data), Y (Labels) And split.

In [9]:
# Convert the list of arrays into a NumPy matrix
X = np.vstack(appsDF['embedding'].tolist())
print("--- 📐 X Shape : {}".format(X.shape))

trueLabels = appsDF['trueLabel'].values 
print("--- 📐 Y Shape : {}".format(len(trueLabels)))

--- 📐 X Shape : (50, 1536)
--- 📐 Y Shape : 50


In [10]:
# Create an instance of LabelEncoder using a special value for NO CLASS
labelEncoder = LabelEncoder()
labelEncoder.fit(np.append(trueLabels, "NO_CLASS"))

In [11]:
# Get Encoded True Lalebsl
Y_True = labelEncoder.transform(trueLabels)
print(Y_True)

[0 0 0 0 0 0 0 0 0 0 3 3 3 3 3 3 3 3 3 3 5 5 5 5 5 5 5 5 5 5 4 4 4 4 4 4 4
 4 4 4 1 1 1 1 1 1 1 1 1 1]


In [12]:
NO_CLASS_VALUE = labelEncoder.transform(["NO_CLASS"])[0]
print("No Class Value:", NO_CLASS_VALUE)

No Class Value: 2


In [13]:
# Split the data into training and testing sets
X_train, X_test, Y_train, Y_test = train_test_split(X, Y_True, test_size=0.2, random_state = RANDOM_SEED)

In [14]:
import warnings
warnings.filterwarnings("ignore")

modelName = 'svm'
model     = svm.SVC(kernel='linear', C=1, probability=True)

print("--- 🦾Training model: {}".format(modelName))
model.fit(X_train, Y_train)

--- 🦾Training model: svm


In [15]:
# Define a confidence threshold
CONFIDENCE_THRESHOLD = 0.75

def applyThreshold(probabilities, threshold):
    Y_pred = []
    for prob in probabilities:
        if np.max(prob) < threshold:
            Y_pred.append(NO_CLASS_VALUE)  
        else:
            Y_pred.append(np.argmax(prob)) 
    return np.array(Y_pred)

# Get prediction probabilities
probabilities = model.predict_proba(X_test)

# Apply threshold to get final predictions
Y_pred = applyThreshold(probabilities, CONFIDENCE_THRESHOLD)

print(Y_pred)

[2 2 1 4 0 2 2 2 0 3]


In [16]:
def evaluateModel(modelName, Y_test, Y_pred, threshold):

    mask = (Y_pred != NO_CLASS_VALUE) & (Y_test != NO_CLASS_VALUE)
    Y_test_filtered = Y_test[mask]
    Y_pred_filtered = Y_pred[mask]

    # print(Y_test)
    # print(Y_test_filtered)
    # print(Y_pred)
    # print(Y_pred_filtered)

    # Compute metrics only on filtered labels
    accuracy = accuracy_score(Y_test_filtered, Y_pred_filtered)
    precision = precision_score(Y_test_filtered, Y_pred_filtered, average='weighted')
    recall = recall_score(Y_test_filtered, Y_pred_filtered, average='weighted')
    f1 = f1_score(Y_test_filtered, Y_pred_filtered, average='weighted')

    # Print metrics in a well-formatted way
    print("--- 📊 Model Evaluation Metrics 📊 ---")
    print("--- Model                    : {}".format(modelName))
    print("--- Confidence Threshold     : {:.2f}".format(threshold))
    print("--- Metrics:")
    print("------ Accuracy              : {:.4f}".format(accuracy))
    print("------ Precision (weighted)  : {:.4f}".format(precision))
    print("------ Recall (weighted)     : {:.4f}".format(recall))
    print("------ F1 Score (weighted)   : {:.4f}".format(f1))
    print("--"*20 + "\n")

evaluateModel(modelName,Y_test, Y_pred, CONFIDENCE_THRESHOLD)

--- 📊 Model Evaluation Metrics 📊 ---
--- Model                    : svm
--- Confidence Threshold     : 0.75
--- Metrics:
------ Accuracy              : 0.6000
------ Precision (weighted)  : 0.6000
------ Recall (weighted)     : 0.6000
------ F1 Score (weighted)   : 0.6000
----------------------------------------



In [17]:
predictedLabels = applyThreshold(model.predict_proba(X), CONFIDENCE_THRESHOLD)

# Convert numeric predictions back to labels
predictedLabels = labelEncoder.inverse_transform(predictedLabels)

# Add the predictions to the DataFrame
appsDF[modelName] = predictedLabels

In [18]:
# 'NN': MLPClassifier(solver='adam', alpha=1e-5, hidden_layer_sizes=(64,), random_state=RANDOM_SEED, max_iter=500)

### 4) Save Results

In [19]:
appsDF

Unnamed: 0,sha256,pkgName,description,embedding,trueLabel,svm
0,9B30837BD2474AC3623A43D052F7ADC4C63E4AA9981F0F...,my.android.calc,handiness universal percentage calculator simp...,"[0.00032980614923872054, -0.00386972539126873,...",Calculator,Calculator
1,686DE8D8A0D08992CB135BC7A0500D0109D9697A1140B8...,com.vpn.basiccalculator,citizen calculator angel best mobile app world...,"[-0.004879472311586142, -0.005030232481658459,...",Calculator,NO_CLASS
2,A49864DCC90F6730569455BDFA39B4B7CF70AE0C34D656...,com.ba.fractioncalculator,free offline fraction calculator support also ...,"[-0.03689534589648247, 0.0034220372326672077, ...",Calculator,Calculator
3,85C80B7ED3799C04CBD107DD3004F91ED4DC2BF8D75C35...,de.sfr.calctape,happens combine standard calculator app classi...,"[0.01341024786233902, -0.0020422476809471846, ...",Calculator,Calculator
4,66442DEF269FF22FB4177759343D973DA1A6A3AFE2ECE3...,com.digitalchemy.calculator.freefraction,fraction calculator plus best way deal everyda...,"[-0.014099321328103542, 0.0014035949716344476,...",Calculator,Calculator
5,E62452B248DDDC34646D98BEC604C236B865115DA1CD4B...,com.digitalchemy.calculator.freedecimal,calculator plus perfect calculator android eas...,"[0.0033893745858222246, 0.009801872074604034, ...",Calculator,Calculator
6,D35EDCE75773CB40CAC2BBD6AC294DE8103F3B7CA11DD6...,com.candl.athena,express style free calculator make choose colo...,"[0.012968630529940128, 0.0028018050361424685, ...",Calculator,Calculator
7,0708505CD2893C15589BD3BCBEEA22F414B0EF23B7BC11...,com.dencreak.dlcalculator,calculator allow easily handle calculation nec...,"[0.0070110284723341465, -0.009924965910613537,...",Calculator,Calculator
8,5482249D7EF2DCA96CDC791433F86EE0C962A1F71712C4...,calculator.innovit.com.calculatrice,calculator essential tool smartphone simple ea...,"[0.016785969957709312, 0.02142198197543621, 0....",Calculator,Calculator
9,F1D6B18311B1C80936935BC6199A338E599AF4509E0DAB...,apps.r.calculator,calculator allow perform simple complex math t...,"[-0.010264242067933083, 0.012525494210422039, ...",Calculator,Calculator


##### 🔚 End

In [20]:
endTime = datetime.datetime.now()
print("\n🔚 --- End - {} --- 🔚".format(endTime))

# Assuming endTime and startTime are in seconds
totalTime = endTime - startTime
minutes = totalTime.total_seconds() // 60
seconds = totalTime.total_seconds() % 60
print("⏱️ --- Time: {:02d} minutes and {:02d} seconds --- ⏱️".format(int(minutes), int(seconds)))


🔚 --- End - 2024-07-20 15:05:50.681257 --- 🔚
⏱️ --- Time: 00 minutes and 00 seconds --- ⏱️
