## ⛏️ RQ5 - Malicious Dataset - Restricted APIs - Training Benign Apps Only (4500)

#### Imports

In [None]:
#IMPORT
from    sklearn.feature_extraction.text     import TfidfVectorizer
from    sklearn.feature_extraction.text     import CountVectorizer
from    tqdm                                import tqdm
from    sklearn.svm                         import OneClassSVM
from    joblib                              import dump, load
import  pandas                              as pd
import  numpy                               as np
import  ast
import  os
import  shutil

In [None]:
# Register tqdm with pandas so that Series/DataFrame objects gain the
# progress_apply() method used below (apply() with a progress bar).
tqdm.pandas()

In [None]:
# Banner marking the beginning of the run in the output log
print("⚡ START ⚡\n")

#### Parameters

In [None]:
# FIXED PATHS
# CSV that is loaded into appsDF in the "Load Data" step; it is expected to
# provide at least the 'apisList' column parsed further down.
INPUT_PATH  = "../../0_Data/CSV/RQ5/5_AndroCatSetClusteringLabels.csv"

In [None]:
# Random seed for reproducibility: passed as random_state when the malicious
# apps are randomly sampled, so the same subset is drawn on every run.
RANDOM_SEED = 151836

### 1. Load Data

In [None]:
# Read the app table from INPUT_PATH. index_col=False tells pandas not to
# use the first CSV column as the index.
appsDF = pd.read_csv(INPUT_PATH, index_col=False)
print("#️⃣ Apps: {}".format(len(appsDF.index)))

# Preview of the first rows (rendered as the cell output in the notebook)
appsDF.head(3)

In [None]:
# Step 1: turn every 'apisList' cell from its textual form back into a Python
# object. ast.literal_eval only evaluates literals, never arbitrary code.
print("\n🔨 1. Reading data as lists")
appsDF['apisList'] = appsDF['apisList'].progress_apply(ast.literal_eval)

# Step 2: replace every list with a sorted copy so the API order is
# deterministic for each app.
print("\n🔨 2. Order the lists")
appsDF['apisList'] = appsDF['apisList'].progress_apply(sorted)

In [None]:
# Display the DataFrame after parsing/sorting (notebook cell output)
appsDF

### 2. Add malicious apps

In [None]:
# Load the malicious apps table; index_col=False tells pandas not to use the
# first CSV column as the index.
maliciousDF = pd.read_csv(
    "../../0_Data/CSV/RQ5/5_MaliciousDatasetClusteringLabels.csv",
    index_col=False,
)
print("#️⃣ Malicious Apps: {}".format(len(maliciousDF.index)))

# Same preprocessing as for appsDF: parse the textual 'apisList' cells into
# Python objects (literals only, no code execution) ...
print("\n🔨 1. Reading data as lists")
maliciousDF['apisList'] = maliciousDF['apisList'].progress_apply(ast.literal_eval)

# ... and replace each list with a sorted copy.
print("\n🔨 2. Order the lists")
maliciousDF['apisList'] = maliciousDF['apisList'].progress_apply(sorted)

# Preview of the first rows (notebook cell output)
maliciousDF.head(3)

In [None]:
# Keep a random subset of 500 malicious apps. Seeding the sampler with
# RANDOM_SEED makes the draw reproducible. Sampling is without replacement,
# so this raises if fewer than 500 rows are available.
maliciousDF = maliciousDF.sample(500, random_state=RANDOM_SEED)
print("#️⃣ Malicious Apps: {}".format(len(maliciousDF.index)))

In [None]:
# Stack the sampled malicious rows below the rows already in appsDF
# (vertical concatenation) and renumber the index from 0.
appsDF = pd.concat([appsDF, maliciousDF], axis=0, ignore_index=True)

# Preview of the first rows (notebook cell output)
appsDF.head(3)

### 3. Embedding Restricted APIs as Binary Feature Vectors

In [None]:
# Binary presence/absence encoder for the API lists. Each app's list is
# joined into a single '&&&'-separated string and the custom tokenizer splits
# it back on the same separator, so every API name becomes one token.
# token_pattern=None because the pattern is not used when a tokenizer is given.
# NOTE(review): CountVectorizer's other defaults (e.g. lowercasing) still
# apply to the joined string — confirm this is intended.
vectorizer = CountVectorizer(
    binary=True,
    tokenizer=lambda joined: joined.split('&&&'),
    token_pattern=None,
)

# Fit on all rows currently in appsDF and store the data in the DF as numpy
# arrays: one dense row vector per app.
joinedApis = appsDF['apisList'].apply('&&&'.join)
appsDF['features'] = list(vectorizer.fit_transform(joinedApis).toarray())

# Length of a feature vector (taken from the row labelled 0)
print("📐 Len features: {}".format(len(appsDF['features'][0])))

# Preview of the first rows (notebook cell output)
appsDF.head(3)

### 4. Split into Training Set and Test Set

In [None]:
# Keep only the rows whose 'isMalicious' flag equals False. The index is not
# reset here; the split below addresses rows by position (iloc), not by label.
appsDF = appsDF.loc[appsDF['isMalicious'] == False]
print("#️⃣   APPS: {}".format(len(appsDF.index)))

In [None]:
# Split the rows into a training set and a test set, block by block: appsDF
# is walked in consecutive blocks of 100 rows (positions 0..4999); within each
# block the first 90 rows go to the training set and the last 10 rows to the
# test set.
# NOTE(review): presumably every 100-row block holds the apps of one category,
# which would make this a per-category 90/10 split — confirm against the
# ordering of the input CSV.
blockStarts = range(0, 5000, 100)

# Gather all slices first and concatenate them vertically ONCE per set.
# Growing a DataFrame with pd.concat inside the loop re-copies the accumulated
# rows on every iteration and starts from an empty DataFrame(); a single
# concat over the list of slices avoids both.
trainingDF = pd.concat(
    [appsDF.iloc[start:start + 90] for start in blockStarts],
    ignore_index=True,
)
testDF = pd.concat(
    [appsDF.iloc[start + 90:start + 100] for start in blockStarts],
    ignore_index=True,
)

### 5. Training OCSVM

In [None]:
def createFolder(folderPath):
    """Create the directory folderPath, including missing parents.

    Calling it on a directory that already exists is a no-op.

    Args:
        folderPath: Path of the directory to create.

    Raises:
        FileExistsError: If folderPath exists but is not a directory.
    """
    # exist_ok=True makes the call idempotent without a separate
    # os.path.exists() check, which could race with another process creating
    # the folder between the check and the makedirs() call.
    os.makedirs(folderPath, exist_ok=True)

def deleteFolder(folderPath):
    """Recursively remove the directory folderPath and all of its contents.

    Errors are not suppressed: whatever shutil.rmtree raises (for example
    FileNotFoundError when the folder does not exist) propagates to the caller.
    """
    shutil.rmtree(folderPath, ignore_errors=False)

In [None]:
# Root folder under which the trained models are saved
MODEL_PATH = "../TMP/Models/"

# Clustering approaches to train for. Each name is used both as the
# sub-folder of MODEL_PATH and as the column name passed to the training
# function.
APPROACHES = ["Chabada", "Gcata"]

# Make sure there is one output sub-folder per approach
for appr in APPROACHES:
    createFolder("{}{}/".format(MODEL_PATH, appr))


In [None]:
def trainingGroupedByCluster(trainingDF, columnName):
    """Train and save one One-Class SVM per cluster of a clustering approach.

    The rows of trainingDF are grouped by the values of columnName. For each
    group an RBF-kernel OneClassSVM is fitted on the group's feature vectors
    and dumped to
    MODEL_PATH + '<columnName>/OCSVM_<cluster id>.joblib'. The model is then
    applied to its own training data to count how many training apps it
    flags as outliers (prediction -1). The total across all groups is
    printed at the end.

    Args:
        trainingDF: DataFrame with a 'features' column holding one numeric
            vector per row, and a column named columnName holding the
            cluster label of each row. Labels must be convertible with int().
        columnName: Name of the cluster-label column; also used as the
            sub-folder of MODEL_PATH (the folder must already exist).

    Returns:
        None. Results are reported via print() and the saved model files.
    """
    print("\n🏷️ Approach: {}".format(columnName))

    # Running total of training rows predicted as outliers by their own model
    totalOutliers = 0

    # One model per cluster label
    for classID, typeDF in trainingDF.groupby(columnName):

        # Stack the per-row feature vectors into a 2-D (rows x features) matrix
        X = np.stack(typeDF['features'].values)

        # Create and fit the model on this cluster's apps only
        model = OneClassSVM(
            kernel='rbf',
            gamma=0.0005,
            cache_size=100,
            tol=0.0001,
            nu=0.01,
            shrinking=True,
        ).fit(X)

        # Persist the model; int() turns labels such as 3.0 into "3" so the
        # file name stays clean.
        dump(model, MODEL_PATH + '{}/OCSVM_{}.joblib'.format(columnName, int(classID)))

        # predict() returns -1 for outliers and +1 for inliers
        totalOutliers += int(np.count_nonzero(model.predict(X) == -1))

    # Guard against an empty training set: there is nothing to divide by, so
    # report a 0% outlier rate instead of raising ZeroDivisionError.
    totalApps = trainingDF.shape[0]
    outlierRate = totalOutliers / totalApps if totalApps else 0.0

    # Print the total number of outliers
    print("#️⃣ Outliers: {} ({:.2%})".format(totalOutliers, outlierRate))

In [None]:
print("🦾 TRAINING")

# Train and save the per-cluster models once for every clustering approach.
# The approach name is passed as the trainingDF column to group by.
for approach in APPROACHES:
    trainingGroupedByCluster(trainingDF, approach)

In [None]:
# Banner marking the end of the run in the output log
print("\n🔚 END \n")