In [1]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import os
from scipy.sparse import csr_matrix
import csv
import shutil
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from joblib import dump, load
import random
from sklearn import metrics
import pickle
import lzma
import lightgbm
import numpy

# Model: load the dataset and evaluate the pre-trained LightGBM classifier


In [2]:
# Load the train/test split and the feature vocabulary from the
# LZMA-compressed pickle.
# NOTE: unpickling can execute arbitrary code, so dataset/data.xz must come
# from a trusted source.
with lzma.open('dataset/data.xz', 'rb') as dataset_file:
    raw_data = dataset_file.read()

# The decompressed payload unpickles to a 5-tuple.
x_train, x_test, y_train, y_test, feature_names = pickle.loads(raw_data)

In [3]:
# Vectorizer that reads feature files by path and maps every non-empty line
# (one token per line) onto the fixed vocabulary loaded with the dataset.
# NOTE(review): lines are split on os.linesep, which is platform-specific;
# the feature files are expected to use this platform's line separator.
feature_vectorizer = CountVectorizer(
    input='filename',
    vocabulary=feature_names,
    binary=True,        # presence/absence instead of occurrence counts
    lowercase=False,    # tokens are matched case-sensitively
    token_pattern=None,  # unused, because a custom tokenizer is supplied
    tokenizer=lambda text: list(filter(None, text.split(os.linesep))),
)

In [4]:
# Restore the LightGBM booster from its saved model file.
LGBM_MODEL_PATH = './dataset/drebin_lgbm.txt'
lgbm_model = lightgbm.Booster(model_file=LGBM_MODEL_PATH)

In [5]:
# Evaluate the loaded booster on the held-out test split.
# The test matrix is cast to float first (safe casting, always copying).
# NOTE(review): the keyword form assumes x_test is a SciPy sparse matrix,
# whose astype signature is (dtype, casting, copy) - confirm.
x_test_new = x_test.astype(float, casting="safe", copy=True)
y_pred = lgbm_model.predict(x_test_new)

# Turn raw scores into hard labels: strictly above 0.5 -> 1, otherwise 0.
y_pred_nom = [1 if score > 0.5 else 0 for score in y_pred]

accuracy = accuracy_score(y_test, y_pred_nom)
print("Test Set Accuracy = {}".format(accuracy))

# Per-class report; label 1 is displayed as 'Malware', label 0 as 'Goodware'.
report = metrics.classification_report(
    y_test,
    y_pred_nom,
    labels=[1, 0],
    target_names=['Malware', 'Goodware'],
)
print(report)

Test Set Accuracy = 0.963669391462307
              precision    recall  f1-score   support

     Malware       0.97      0.95      0.96      1585
    Goodware       0.96      0.97      0.97      1718

    accuracy                           0.96      3303
   macro avg       0.96      0.96      0.96      3303
weighted avg       0.96      0.96      0.96      3303



# Calculate AMM-based features

In [6]:
from amm_generator import AmmGenerator
import json

In [7]:
# Load the sampled training rows, their labels and the SHAP values from the
# LZMA-compressed pickle.
# NOTE: unpickling can execute arbitrary code, so
# dataset/shap_samples_train.xz must come from a trusted source.
with lzma.open('dataset/shap_samples_train.xz', 'rb') as shap_file:
    raw_data = shap_file.read()

# The decompressed payload unpickles to a 3-tuple.
x_samples, y_samples, lgbm_shap_values = pickle.loads(raw_data)

In [8]:
# Keep only element 1 of the loaded SHAP values.
# NOTE(review): presumably this is the SHAP array for class 1 (the label
# reported as 'Malware' in the evaluation cell) - confirm against how
# shap_samples_train.xz was produced.
shap_values = lgbm_shap_values[1]

In [9]:
# Value forwarded as trigger_size to the AMM feature selection.
# NOTE(review): presumably the number of features in the trigger - confirm
# in amm_generator.
TRIGGER_SIZE = 75

generator = AmmGenerator()
# Select features from the sampled rows and their SHAP values.
result_lgbm = generator.AMM_feature_selection(
    x_samples,
    shap_values,
    feature_names,
    trigger_size=TRIGGER_SIZE,
)

In [10]:
# Convert every selected value to a built-in int so the mapping can be
# serialised with the json module.
# NOTE(review): assumes result_lgbm is a dict-like mapping - confirm.
results = {name: int(value) for name, value in result_lgbm.items()}

In [11]:
# Persist the selected features as JSON; the file is consumed by the
# patching step further down.
with open('amm_patch.json', 'w') as patch_file:
    patch_file.write(json.dumps(results))

# Statistics-based feature selection


In [12]:
# Run the statistics-based feature selection on the training split, reusing
# the AmmGenerator instance created for the AMM-based selection.
# NOTE(review): judging by the result names, the two return values are the
# features associated with benign and with malicious samples - confirm in
# amm_generator.
ft_benign, ft_malicious = generator.statistics_feature_selection(x_train, y_train, feature_names)

# Problem-space patching: apply the AMM patch to a sample APK

In [13]:
import apk_generator as patcher

In [14]:
# File name of the sample APK inside dataset/; it is also reused for the
# name of the patched output file.
sample_name = 'be837406e861488e43d0a374982066517c3c08cf549b2f9a1ffe252ccdf3a29b'

# Patch the sample with the features stored in amm_patch.json.
# Per the recorded log output, '/tmp' is used as the apktool frame path and
# decode directory, and the last argument is the rebuilt APK's output path.
patcher.perform_patching(
    'dataset/' + sample_name,
    'amm_patch.json',
    '/tmp',
    './' + sample_name + '.pack.apk',
)

15/01/2022 14:02:45> [INFO][tool.Apktool][decode()] Running decode command "/usr/bin/apktool --frame-path /tmp d --force dataset/be837406e861488e43d0a374982066517c3c08cf549b2f9a1ffe252ccdf3a29b -o /tmp/be837406e861488e43d0a374982066517c3c08cf549b2f9a1ffe252ccdf3a29b"
15/01/2022 14:02:47> [INFO][patchers.string_packer.patcher.StringPatcher][patch()] Running "StringPatcher" patcher
15/01/2022 14:02:47> [INFO][patchers.rebuild.rebuild.Rebuild][patch()] Running "Rebuild" patcher
15/01/2022 14:02:47> [INFO][tool.Apktool][build()] Running build command "/usr/bin/apktool --frame-path /tmp b --force-all /tmp/be837406e861488e43d0a374982066517c3c08cf549b2f9a1ffe252ccdf3a29b -o ./be837406e861488e43d0a374982066517c3c08cf549b2f9a1ffe252ccdf3a29b.pack.apk"
