## ⛏️ RQ5 - Malicious Dataset - Restricted APIs - Testing (500 Malicious + 500 Benign)

#### Imports

In [None]:
#IMPORT
from    sklearn.feature_extraction.text     import CountVectorizer
from    tqdm                                import tqdm
from    sklearn.svm                         import OneClassSVM
from    joblib                              import dump, load
import  pandas                              as pd
import  numpy                               as np
import  redis
import  itertools
import  ast
import  os
import  shutil

In [None]:
# Register tqdm with pandas so that Series/DataFrame objects expose
# `progress_apply`, which the data-preparation cells use to show a progress
# bar while parsing and sorting the API lists.
tqdm.pandas()

In [None]:
# Banner marking the beginning of the notebook output
print("⚡ START ⚡\n")

#### Data Paths & Constant Values

In [None]:
# FIXED PATHS
# CSV read into appsDF in the "Load Data" section
INPUT_PATH  = "../../0_Data/CSV/RQ5/4c_AppRestrictedApiData.csv"

# Approaches
# NOTE: this list is reassigned to ["Chabada", "Gcata"] in the testing section
# before its first use, so these prefixed names are never read in this notebook.
APPROACHES = ["1b_Chabada", "2a_Gcata"]

In [None]:
# Random seed for reproducibility: passed as `random_state` when sampling the
# 500 malicious apps, so the same subset is drawn on every run.
RANDOM_SEED = 151836

### 1. Load Data

In [None]:
# Load the app dataset from the fixed input CSV; each row is reported as one app
appsDF = pd.read_csv(INPUT_PATH, index_col=False)

numLoadedApps = len(appsDF)
print("#️⃣ Apps: {}".format(numLoadedApps))

# Preview of the first rows (rendered as the cell output)
appsDF.head(3)

In [None]:
# The 'apisList' column is read from the CSV as text; evaluate each cell as a
# Python literal to recover the list object.
print("\n🔨 Reading data as lists")
parsedApis = appsDF['apisList'].progress_apply(ast.literal_eval)
appsDF['apisList'] = parsedApis

# Sort every list so that its content is in a fixed order
print("\n🔨 Order the lists")
appsDF['apisList'] = appsDF['apisList'].progress_apply(sorted)

### 2. Add CHABADA and New Approach Clustering Labels

In [None]:
# (labels CSV, name given to its 'clusterID' column inside appsDF)
clusterLabelSources = (
    ("../../0_Data/CSV/RQ1/1b_ChabadaClusteringLabels.csv", "Chabada"),
    ("../../0_Data/CSV/RQ2/2a_GcataClusteringLabels.csv", "Gcata"),
)

for labelsPath, labelColumn in clusterLabelSources:
    inputDF = pd.read_csv(labelsPath, index_col=False)

    # The column-wise concat pairs rows through the index only.
    # NOTE(review): assumes each labels file lists the apps in the same row
    # order as appsDF - TODO confirm (no join on 'sha256' is performed).
    withLabels = pd.concat([appsDF, inputDF['clusterID']], axis=1)
    appsDF = withLabels.rename(columns={'clusterID': labelColumn})

In [None]:
# Every app loaded so far is flagged as non-malicious
appsDF['isMalicious'] = False

# Keep the listed columns in this order and expose 'classID' under the name
# 'groundTruth'
orderedColumns = ['sha256', 'isMalicious', 'classID', 'Chabada', 'Gcata', 'apisList']
appsDF = appsDF[orderedColumns].rename(columns={'classID': 'groundTruth'})


In [None]:
# Preview the first rows after adding the label columns and reordering
appsDF.head(3)

### 3. Add malicious apps

In [None]:
# Load the malicious dataset; each row is reported as one malicious app
maliciousDF = pd.read_csv(
    "../../0_Data/CSV/RQ5/5_MaliciousDatasetClusteringLabels.csv",
    index_col=False,
)
print("#️⃣ Malicious Apps: {}".format(len(maliciousDF)))

# Evaluate the textual 'apisList' cells as Python literals to recover the lists
print("\n🔨 1. Reading data as lists")
parsedMaliciousApis = maliciousDF['apisList'].progress_apply(ast.literal_eval)
maliciousDF['apisList'] = parsedMaliciousApis

# Sort every list so that its content is in a fixed order
print("\n🔨 2. Order the lists")
maliciousDF['apisList'] = maliciousDF['apisList'].progress_apply(sorted)

maliciousDF.head(3)

In [None]:
# Keep a random subset of 500 malicious apps; seeding with RANDOM_SEED makes
# the draw repeatable across runs
maliciousSample = maliciousDF.sample(n=500, random_state=RANDOM_SEED)
maliciousDF = maliciousSample
print("#️⃣ Malicious Apps: {}".format(len(maliciousDF)))

In [None]:
# Concatenating vertically: the malicious rows are appended below the rows
# already in appsDF, and ignore_index=True renumbers the row index from 0.
appsDF = pd.concat([appsDF, maliciousDF], ignore_index=True)
appsDF.head(3)

### 4. Embedding Restricted API as Binary Feature Vectors

In [None]:
# Separator used to serialise each API list into one string for the vectorizer
API_SEPARATOR = '&&&'

# CountVectorizer configured to emit binary (presence/absence) features, with
# tokens obtained by splitting on the separator instead of a regex pattern.
# NOTE(review): the vocabulary is fitted on the rows of this notebook; the
# saved OCSVM models presumably expect the same feature layout - TODO confirm.
vectorizer = CountVectorizer(
    binary=True,
    tokenizer=lambda text: text.split(API_SEPARATOR),
    token_pattern=None,
)

# Serialise every list, then encode all apps in one pass as a dense matrix
joinedApis = appsDF['apisList'].apply(API_SEPARATOR.join)
featureMatrix = vectorizer.fit_transform(joinedApis).toarray()

# Store data in the DF as numpy arrays (one row vector per app)
appsDF['features'] = list(featureMatrix)
print("📐 Len features: {}".format(len(appsDF['features'][0])))

appsDF.head(3)

### 5. Split into Training Set and Test Set

In [None]:
# Benign subset: rows whose 'isMalicious' flag compares equal to False
benignMask = appsDF['isMalicious'].eq(False)
goodDF = appsDF.loc[benignMask]
print("#️⃣ Apps: {}".format(len(goodDF)))

In [None]:
# Walk goodDF in consecutive windows of 100 rows: the first 90 rows of each
# window go to the training set, the remaining 10 to the test set.
# NOTE(review): assumes goodDF holds 5000 rows laid out as contiguous groups
# of 100 apps per class - TODO confirm against the input CSV.
trainingChunks = []
testChunks     = []

for i in range(0, 5000, 100):
    trainingRows = goodDF.iloc[i:i+90]
    testRows     = goodDF.iloc[i+90:i+100]

    trainingChunks.append(trainingRows)
    testChunks.append(testRows)

# Stack the chunks vertically in a single call. This avoids re-copying the
# accumulated frame on every iteration and avoids concatenating with an empty
# DataFrame, which pandas deprecates because it affects dtype inference.
trainingDF = pd.concat(trainingChunks, ignore_index=True)
testDF     = pd.concat(testChunks, ignore_index=True)

### 6. Add Malicious TestSet

In [None]:
# Malicious test set: rows whose 'isMalicious' flag compares equal to True
maliciousMask = appsDF['isMalicious'].eq(True)
maliciousTestDF = appsDF.loc[maliciousMask]
print("#️⃣ Malicious Apps: {}".format(len(maliciousTestDF)))

### 7. Testing OCSVM - 500 Good apps (TN/FP)

In [None]:
# Root folder of the saved models; the testing functions load
# "<MODEL_PATH><approach>/OCSVM_<clusterID>.joblib" from it.
MODEL_PATH = "../../0_Data/CSV/RQ5/Models/"
# Cluster-label columns to evaluate; each name doubles as the model sub-folder.
APPROACHES = ["Chabada", "Gcata"]

In [None]:
def testingGroupedByCluster(testDF, columnName):

    print("\n⭐ RESULTS - {}".format(columnName))

    totalOutliers = 0

    # For each category train a model
    for classID, typeDF in testDF.groupby(columnName):

        #print("\n🏷️ Testing : {}".format(classID))
        
        # Get the features as a list
        X = np.stack(typeDF['features'].values)
        #print("#️⃣ Num apps: {}".format(len(X)))

        # Load the model
        model = load(MODEL_PATH + '{}/OCSVM_{}.joblib'.format(columnName, int(classID)))

        # Get outliers
        Y = model.predict(X)
        numOutliers = np.count_nonzero(Y == -1)

        # Update the total number of outliers
        totalOutliers += numOutliers

    # Print the total number of outliers
    print("#️⃣ Outliers: {}".format(totalOutliers))

    # Get statistiscs
    numApps = testDF.shape[0]
    FP      = totalOutliers
    TN      = numApps - FP
    fpRate  = FP / numApps
    tnRate  = TN / numApps

    print("- FP : {}".format(FP))
    print("- TN : {}".format(TN))

    print("- TN Rate: {:.2%}".format(tnRate))
    print("- FP Rate: {:.2%}".format(fpRate))

In [None]:
# Run the benign evaluation once per clustering approach; testDF holds the
# held-out rows taken from goodDF in the train/test split.
print("⭐ TESTING ⭐")

for approach in APPROACHES:
    testingGroupedByCluster(testDF, approach)

### 8. Testing OCSVM - 500 Malicious apps (TP/FN)

In [None]:
# Same values as assigned in the benign testing section; repeated here,
# presumably so this section can be run on its own.
MODEL_PATH = "../../0_Data/CSV/RQ5/Models/"
APPROACHES = ["Chabada", "Gcata"]

In [None]:
def testingMaliciousGroupedByCluster(testDF, columnName):

    print("\n⭐ RESULTS - {}".format(columnName))

    totalOutliers = 0

    # For each category train a model
    for classID, typeDF in testDF.groupby(columnName):

        #print("\n🏷️ Testing : {}".format(classID))
        
        # Get the features as a list
        X = np.stack(typeDF['features'].values)

        # Load the model
        model = load(MODEL_PATH + '{}/OCSVM_{}.joblib'.format(columnName, int(classID)))

        # Get outliers
        Y = model.predict(X)
        numOutliers = np.count_nonzero(Y == -1)

        # Update the total number of outliers
        totalOutliers += numOutliers

    # Print the total number of outliers
    print("#️⃣ Outliers: {}".format(totalOutliers))

    # Get statistiscs
    numApps = testDF.shape[0]
    TP      = totalOutliers
    FN      = numApps - TP
    tpRate  = TP / numApps
    fnRate  = FN / numApps

    print("- FN : {}".format(FN))
    print("- TP : {}".format(TP))

    print("- FN Rate: {:.2%}".format(fnRate))
    print("- TP Rate: {:.2%}".format(tpRate))

In [None]:
# Run the malicious evaluation once per clustering approach on the rows of
# appsDF flagged as malicious (maliciousTestDF).
print("⭐ TESTING ⭐")

for approach in APPROACHES:
    testingMaliciousGroupedByCluster(maliciousTestDF, approach)

In [None]:
# Banner marking the end of the notebook output
print("\n🔚 END \n")