## 🔢 App Classification

In [1]:
from sklearn.metrics               import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
from sklearn.model_selection       import train_test_split
from sklearn.preprocessing         import LabelEncoder
from sklearn.neural_network        import MLPClassifier
from sklearn.linear_model          import LogisticRegression
from sklearn                       import svm
from dotenv                        import load_dotenv
import xgboost                     as xgb
import pandas                      as pd
import numpy                       as np
import datetime
import ast
import os
import sys

import warnings
warnings.filterwarnings("ignore")

#### Initialization

In [2]:
RANDOM_SEED = 151836

In [3]:
print("⚡ Start - {} ⚡\n".format(datetime.datetime.now()))
startTime = datetime.datetime.now()

⚡ Start - 2024-07-20 15:53:39.103271 ⚡



#### 📥 1) Load Data 

In [4]:
DATA_PATH = "../TmpData/0_AndroCatEmbeddings.csv"

# Read the data
appsDF = pd.read_csv(DATA_PATH)

print("--- #️⃣ Apps: {} ".format(appsDF.shape[0]))

--- #️⃣ Apps: 5000 


In [5]:
appsDF.head(5)

Unnamed: 0,sha256,pkgName,classID,description,embedding
0,9B30837BD2474AC3623A43D052F7ADC4C63E4AA9981F0F...,my.android.calc,Calculator,handiness universal percentage calculator simp...,"[0.00032980614923872054, -0.00386972539126873,..."
1,686DE8D8A0D08992CB135BC7A0500D0109D9697A1140B8...,com.vpn.basiccalculator,Calculator,citizen calculator angel best mobile app world...,"[-0.0048510185442864895, -0.005108269397169352..."
2,A49864DCC90F6730569455BDFA39B4B7CF70AE0C34D656...,com.ba.fractioncalculator,Calculator,free offline fraction calculator support also ...,"[-0.03689534589648247, 0.0034220372326672077, ..."
3,85C80B7ED3799C04CBD107DD3004F91ED4DC2BF8D75C35...,de.sfr.calctape,Calculator,happens combine standard calculator app classi...,"[0.01341024786233902, -0.0020422476809471846, ..."
4,66442DEF269FF22FB4177759343D973DA1A6A3AFE2ECE3...,com.digitalchemy.calculator.freefraction,Calculator,fraction calculator plus best way deal everyda...,"[-0.014099321328103542, 0.0014035949716344476,..."


#### 2) Reorganize a bit the data

In [6]:
# Reorder columns
appsDF = appsDF[[col for col in appsDF.columns if col != "classID"] + ["classID"]]

# Rename the column
appsDF.rename(columns={"classID": "trueLabel"}, inplace=True)

In [7]:
def convertToNumpyArray(arrayStr):
	# Convert the string representation to a list using ast.literal_eval
	arrayList = ast.literal_eval(arrayStr)
	# Convert the list to a numpy array
	return np.array(arrayList)

# Apply the conversion function to the column
appsDF['embedding'] = appsDF['embedding'].apply(convertToNumpyArray)

In [8]:
appsDF.head(5)

Unnamed: 0,sha256,pkgName,description,embedding,trueLabel
0,9B30837BD2474AC3623A43D052F7ADC4C63E4AA9981F0F...,my.android.calc,handiness universal percentage calculator simp...,"[0.00032980614923872054, -0.00386972539126873,...",Calculator
1,686DE8D8A0D08992CB135BC7A0500D0109D9697A1140B8...,com.vpn.basiccalculator,citizen calculator angel best mobile app world...,"[-0.0048510185442864895, -0.005108269397169352...",Calculator
2,A49864DCC90F6730569455BDFA39B4B7CF70AE0C34D656...,com.ba.fractioncalculator,free offline fraction calculator support also ...,"[-0.03689534589648247, 0.0034220372326672077, ...",Calculator
3,85C80B7ED3799C04CBD107DD3004F91ED4DC2BF8D75C35...,de.sfr.calctape,happens combine standard calculator app classi...,"[0.01341024786233902, -0.0020422476809471846, ...",Calculator
4,66442DEF269FF22FB4177759343D973DA1A6A3AFE2ECE3...,com.digitalchemy.calculator.freefraction,fraction calculator plus best way deal everyda...,"[-0.014099321328103542, 0.0014035949716344476,...",Calculator


#### 3) Train

Get X (Data), Y (Labels) And split.

In [9]:
# Convert the list of arrays into a NumPy matrix
X = np.vstack(appsDF['embedding'].tolist())
print("--- 📐 X Shape : {}".format(X.shape))

trueLabels = appsDF['trueLabel'].values 
print("--- 📐 Y Shape : {}".format(len(trueLabels)))

# Create an instance of LabelEncoder using a special value for NO CLASS
labelEncoder = LabelEncoder()
labelEncoder.fit(np.append(trueLabels, "NO_CLASS"))

# Get Encoded True Lalebsl
Y_True = labelEncoder.transform(trueLabels)
print(Y_True)

NO_CLASS_VALUE = labelEncoder.transform(["NO_CLASS"])[0]
print("No Class Value:", NO_CLASS_VALUE)

# Split the data into training and testing sets
X_train, X_test, Y_train, Y_test = train_test_split(X, Y_True, test_size=0.1, random_state = RANDOM_SEED)

--- 📐 X Shape : (5000, 1536)
--- 📐 Y Shape : 5000
[10 10 10 ... 41 41 41]
No Class Value: 30


In [10]:
modelName = 'svm'
model     = svm.SVC(kernel='linear', C=1, probability=True)

print("--- 🦾 Training model: {}".format(modelName))
model.fit(X_train, Y_train)

--- 🦾 Training model: svm


In [11]:
# TO evaluate the model
def evaluateModel(modelName, Y_test, Y_pred, threshold):

    mask = (Y_pred != NO_CLASS_VALUE) & (Y_test != NO_CLASS_VALUE)
    Y_test_filtered = Y_test[mask]
    Y_pred_filtered = Y_pred[mask]

    # print(Y_test)
    # print(Y_test_filtered)
    # print(Y_pred)
    # print(Y_pred_filtered)

    # Compute metrics only on filtered labels
    accuracy = accuracy_score(Y_test_filtered, Y_pred_filtered)
    precision = precision_score(Y_test_filtered, Y_pred_filtered, average='weighted')
    recall = recall_score(Y_test_filtered, Y_pred_filtered, average='weighted')
    f1 = f1_score(Y_test_filtered, Y_pred_filtered, average='weighted')

    # Print metrics in a well-formatted way
    print("--- 📊 Model Evaluation Metrics 📊 ---")
    print("--- Model                    : {}".format(modelName))
    print("--- Confidence Threshold     : {:.2f}".format(threshold))
    print("--- Metrics:")
    print("------ Accuracy              : {:.4f}".format(accuracy))
    print("------ Precision (weighted)  : {:.4f}".format(precision))
    print("------ Recall (weighted)     : {:.4f}".format(recall))
    print("------ F1 Score (weighted)   : {:.4f}".format(f1))
    print("--"*20 + "\n")

# To apply a Threshold to the result confidence level
def applyThreshold(probabilities, threshold):
    Y_pred = []
    for prob in probabilities:
        if np.max(prob) < threshold:
            Y_pred.append(NO_CLASS_VALUE)  
        else:
            Y_pred.append(np.argmax(prob)) 
    return np.array(Y_pred)

In [12]:
# # Define a confidence threshold
# CONFIDENCE_THRESHOLD = 0.75

# # Get prediction probabilities
# probabilities = model.predict_proba(X_test)

# # Apply threshold to get final predictions
# Y_pred = applyThreshold(probabilities, CONFIDENCE_THRESHOLD)

# evaluateModel(modelName,Y_test, Y_pred, CONFIDENCE_THRESHOLD)

# predictedLabels = applyThreshold(model.predict_proba(X), CONFIDENCE_THRESHOLD)

# # Convert numeric predictions back to labels
# predictedLabels = labelEncoder.inverse_transform(predictedLabels)

# # Add the predictions to the DataFrame
# appsDF[modelName+ str(CONFIDENCE_THRESHOLD)] = predictedLabels

In [18]:
# Define a range of confidence thresholds
thresholds = [i/100 for i in range(50, 100, 5)]

# Get prediction probabilities
probabilities = model.predict_proba(X_test)

# Iterate through each threshold
for threshold in thresholds:
   
    # Apply threshold to get final predictions
    Y_pred = applyThreshold(probabilities, threshold)

    evaluateModel(modelName,Y_test, Y_pred, threshold)

    predictedLabels = applyThreshold(model.predict_proba(X), threshold)

    # Convert numeric predictions back to labels
    predictedLabels = labelEncoder.inverse_transform(predictedLabels)

    # Add the predictions to the DataFrame
    appsDF[modelName+ str(threshold)] = predictedLabels

--- 📊 Model Evaluation Metrics 📊 ---
--- Model                    : svm
--- Confidence Threshold     : 0.50
--- Metrics:
------ Accuracy              : 0.6217
------ Precision (weighted)  : 0.6217
------ Recall (weighted)     : 0.6217
------ F1 Score (weighted)   : 0.6217
----------------------------------------

--- 📊 Model Evaluation Metrics 📊 ---
--- Model                    : svm
--- Confidence Threshold     : 0.55
--- Metrics:
------ Accuracy              : 0.6227
------ Precision (weighted)  : 0.6207
------ Recall (weighted)     : 0.6227
------ F1 Score (weighted)   : 0.6217
----------------------------------------

--- 📊 Model Evaluation Metrics 📊 ---
--- Model                    : svm
--- Confidence Threshold     : 0.60
--- Metrics:
------ Accuracy              : 0.6152
------ Precision (weighted)  : 0.6132
------ Recall (weighted)     : 0.6152
------ F1 Score (weighted)   : 0.6141
----------------------------------------

--- 📊 Model Evaluation Metrics 📊 ---
--- Model         

In [14]:
# 'NN': MLPClassifier(solver='adam', alpha=1e-5, hidden_layer_sizes=(64,), random_state=RANDOM_SEED, max_iter=500)

### 4) Save Results

In [15]:
appsDF.head(5)

Unnamed: 0,sha256,pkgName,description,embedding,trueLabel,svm0.5,svm0.55,svm0.6,svm0.65,svm0.7,svm0.75,svm0.8,svm0.85,svm0.9,svm0.95
0,9B30837BD2474AC3623A43D052F7ADC4C63E4AA9981F0F...,my.android.calc,handiness universal percentage calculator simp...,"[0.00032980614923872054, -0.00386972539126873,...",Calculator,Calculator,Calculator,Calculator,Calculator,Calculator,NO_CLASS,NO_CLASS,NO_CLASS,NO_CLASS,NO_CLASS
1,686DE8D8A0D08992CB135BC7A0500D0109D9697A1140B8...,com.vpn.basiccalculator,citizen calculator angel best mobile app world...,"[-0.0048510185442864895, -0.005108269397169352...",Calculator,NO_CLASS,NO_CLASS,NO_CLASS,NO_CLASS,NO_CLASS,NO_CLASS,NO_CLASS,NO_CLASS,NO_CLASS,NO_CLASS
2,A49864DCC90F6730569455BDFA39B4B7CF70AE0C34D656...,com.ba.fractioncalculator,free offline fraction calculator support also ...,"[-0.03689534589648247, 0.0034220372326672077, ...",Calculator,Calculator,Calculator,Calculator,Calculator,Calculator,Calculator,Calculator,NO_CLASS,NO_CLASS,NO_CLASS
3,85C80B7ED3799C04CBD107DD3004F91ED4DC2BF8D75C35...,de.sfr.calctape,happens combine standard calculator app classi...,"[0.01341024786233902, -0.0020422476809471846, ...",Calculator,Calculator,Calculator,Calculator,Calculator,Calculator,Calculator,Calculator,NO_CLASS,NO_CLASS,NO_CLASS
4,66442DEF269FF22FB4177759343D973DA1A6A3AFE2ECE3...,com.digitalchemy.calculator.freefraction,fraction calculator plus best way deal everyda...,"[-0.014099321328103542, 0.0014035949716344476,...",Calculator,Calculator,Calculator,Calculator,Calculator,Calculator,Calculator,NO_CLASS,NO_CLASS,NO_CLASS,NO_CLASS


In [16]:
appsDF = appsDF.drop(columns=['description','embedding'])
appsDF.to_csv("../TmpData/0_AndroCatSetNewLabels.csv", index=False)

##### 🔚 End

In [17]:
endTime = datetime.datetime.now()
print("\n🔚 --- End - {} --- 🔚".format(endTime))

# Assuming endTime and startTime are in seconds
totalTime = endTime - startTime
minutes = totalTime.total_seconds() // 60
seconds = totalTime.total_seconds() % 60
print("⏱️ --- Time: {:02d} minutes and {:02d} seconds --- ⏱️".format(int(minutes), int(seconds)))


🔚 --- End - 2024-07-20 15:55:27.295846 --- 🔚
⏱️ --- Time: 01 minutes and 48 seconds --- ⏱️
