### Prepare Data

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# Names of the non-feature CSV columns. Every constant that is active here is
# dropped from the loaded tables before the feature matrix is built.
DB_FILE_NAME_ENTRY = "_FileName"
DB_APK_ENTRY = "_APK"
DB_VERSION_CODE_ENTRY = "_VersionCode"
DB_FILE_SIZE_ENTRY = "_FileSize"
# DB_DEX_SIZE_ENTRY = "_DexSize"  # disabled: a "_DexSize" column, if present, is kept as a feature
DB_MD5_ENTRY = "_MD5"
DB_TAG_ENTRY = "_AnalysisTag"

In [None]:
# Read the benign and riskware sample tables from their CSV exports.
benign, risk = (
    pd.read_csv(csv_path)
    for csv_path in ('Data/benign.csv', 'Data/riskware.csv')
)

In [None]:
# Build the training set: benign rows are class 0, riskware rows are class 1.
# Columns named by the DB_* constants are excluded from the feature matrix.
non_feature_columns = [
    DB_MD5_ENTRY,
    DB_TAG_ENTRY,
    DB_FILE_NAME_ENTRY,
    DB_APK_ENTRY,
    DB_VERSION_CODE_ENTRY,
    DB_FILE_SIZE_ENTRY,
]
Xb = benign.drop(columns=non_feature_columns)
Xr = risk.drop(columns=non_feature_columns)

# np.concatenate stacks rows by position and ignores column labels, so make
# sure both tables expose the same features in the same order before stacking.
if not Xr.columns.equals(Xb.columns):
    if set(Xr.columns) != set(Xb.columns):
        raise ValueError(
            "benign and riskware tables do not contain the same feature columns"
        )
    Xr = Xr[Xb.columns]

# Labels: 0 for every benign row, 1 for every riskware row.
yb = np.zeros(Xb.shape[0])
yr = np.ones(Xr.shape[0])
X = np.concatenate((Xb, Xr), axis=0)
y = np.concatenate((yb, yr), axis=0)

In [None]:
# Sanity check of the assembled arrays; y.sum() equals the number of riskware
# rows because those are the rows labelled 1.
print(f"Shapes of X={X.shape} y={y.shape}, #Risk Cases={y.sum()}")

### Hyperparameter tuning

In [None]:
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import QuantileTransformer
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, make_scorer

# Quantile-scale the features, then classify with an RBF-kernel SVM that can
# emit class probabilities.
steps = [
    ("scale", QuantileTransformer(n_quantiles=1000)),
    ("model", SVC(kernel="rbf", probability=True)),
]
pipe = Pipeline(steps)

# Candidate values for the SVM's C and gamma, one per decade.
search_space = {
    'model__C': np.logspace(-2, 3, 6),
    'model__gamma': np.logspace(-4, 2, 7),
}
# Both metrics are recorded; the final refit is selected by AUC.
scorers = {
    'accuracy': make_scorer(accuracy_score),
    'AUC': 'roc_auc',
}
grid = GridSearchCV(
    estimator=pipe,
    param_grid=search_space,
    scoring=scorers,
    refit='AUC',
    cv=10,
)
grid.fit(X, y)

summary = "The best parameters are %s with a score of %0.2f"
print(summary % (grid.best_params_, grid.best_score_))

In [None]:
# Tabulate the grid-search results (grid.cv_results_) as a DataFrame.
df = pd.DataFrame(grid.cv_results_)

In [None]:
import os

# Create the output folder on first use so the export does not fail with a
# missing-directory error after the long grid search.
os.makedirs('Output', exist_ok=True)
df.to_csv('Output/cv_results.csv')

In [None]:
# save _cv_results
import os
import joblib

# Make sure the output folder exists before writing into it.
os.makedirs('Output', exist_ok=True)
# Persist the raw cv_results_ dict with joblib.
joblib.dump(grid.cv_results_, 'Output/cross_validation_results')

### Train and Save model

In [None]:
# The search was built with refit='AUC', so best_estimator_ has already been
# retrained on the full X, y with the winning parameters. Fitting it again
# would only repeat the expensive probability-enabled SVC training.
clf = grid.best_estimator_
clf

In [None]:
import os
import joblib

# Make sure the output folder exists before writing into it.
os.makedirs('Output', exist_ok=True)
# Persist the fitted pipeline.
joblib.dump(clf, 'Output/blink_maldroid2020.joblib')
# Store the feature column names next to the model, in the order of Xb's columns.
keys = Xb.keys().to_list()
joblib.dump(keys, 'Output/blink_maldroid2020_keys.joblib')

### Plot Validation AUC

In [None]:
import numpy as np
from matplotlib.colors import Normalize

# Read the searched values back from the search object instead of repeating
# the literals, so the heat map always matches the grid that was actually run.
C_range = grid.param_grid['model__C']
gamma_range = grid.param_grid['model__gamma']
# cv_results_ holds one entry per parameter combination in a flat array.
# The grid is enumerated with keys sorted by name and the last key varying
# fastest, so 'model__C' indexes the rows and 'model__gamma' the columns.
scores = grid.cv_results_["mean_test_AUC"].reshape(len(C_range), len(gamma_range))

class MidpointNormalize(Normalize):
    """Piecewise-linear colour normalisation pinned at a chosen midpoint.

    Values are mapped so that ``vmin`` -> 0, ``midpoint`` -> 0.5 and
    ``vmax`` -> 1, with linear interpolation between those anchors.
    """

    def __init__(self, vmin=None, vmax=None, midpoint=None, clip=False):
        self.midpoint = midpoint
        super().__init__(vmin, vmax, clip)

    def __call__(self, value, clip=None):
        # ``clip`` is accepted to match the base-class signature but is unused.
        anchors = [self.vmin, self.midpoint, self.vmax]
        levels = [0, 0.5, 1]
        return np.ma.masked_array(np.interp(value, anchors, levels))
    
# Heat map of the mean validation AUC: one row per C value, one column per
# gamma value.
plt.figure(figsize=(8, 6))
plt.subplots_adjust(left=0.2, right=0.95, bottom=0.15, top=0.95)

# Colour scale anchored at 0.2 (bottom) and 0.92 (middle of the colour map).
colour_norm = MidpointNormalize(vmin=0.2, midpoint=0.92)
plt.imshow(scores, interpolation="nearest", cmap=plt.cm.hot, norm=colour_norm)
plt.xlabel("gamma")
plt.ylabel("C")
plt.colorbar()

# Label each cell position with the parameter value it represents.
gamma_positions = np.arange(len(gamma_range))
c_positions = np.arange(len(C_range))
plt.xticks(gamma_positions, gamma_range, rotation=45)
plt.yticks(c_positions, C_range)
plt.title("Validation AUC")
plt.show()

#### Testing on all data sets

In [None]:
# Reload the persisted pipeline from disk; this rebinds clf to the saved copy.
clf=joblib.load('Output/blink_maldroid2020.joblib')

In [None]:
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn import metrics

# Column 1 of predict_proba is the probability of class 1 (riskware).
# It is computed once and reused for both the AUC and the curve, instead of
# running the model over X twice.
# NOTE: X, y are the data the model was fitted on, so this is a training-set
# (optimistic) ROC, not a held-out estimate.
risk_proba = clf.predict_proba(X)[:, 1]
score = roc_auc_score(y, risk_proba)
# Optional export of the curve points:
# curve = metrics.roc_curve(y, risk_proba)
# df = pd.DataFrame(np.column_stack(curve),columns=['fpr','tpr','threshold'])
# df.to_csv('Output/roc_curve.csv')
fpr, tpr, _ = metrics.roc_curve(y, risk_proba)

#create ROC curve
plt.plot(fpr,tpr)
plt.legend([f'AUC {score}'],loc ="lower right") 
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()

# discrete
# y_pred = np.argmax(clf.predict_proba(X), axis=1)
# confusion_matrix(y, y_pred)
# roc_auc_score(y, clf.predict_proba(X)[:, 1]) #[chance 0, chance 1]
# y_pred = np.argmax(clf2.predict_proba(X), axis=1)
# confusion_matrix(y, y_pred)

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import joblib
# Names of the non-feature CSV columns, repeated here so this part of the
# notebook can run without the training cells. Every active constant is
# dropped from the loaded tables before prediction.
DB_FILE_NAME_ENTRY = "_FileName"
DB_APK_ENTRY = "_APK"
DB_VERSION_CODE_ENTRY = "_VersionCode"
DB_FILE_SIZE_ENTRY = "_FileSize"
# DB_DEX_SIZE_ENTRY = "_DexSize"  # disabled: a "_DexSize" column, if present, is kept as a feature
DB_MD5_ENTRY = "_MD5"
DB_TAG_ENTRY = "_AnalysisTag"
# Load the previously saved pipeline from disk.
clf=joblib.load('Output/blink_maldroid2020.joblib')
#clf.predict_proba(X)[:, 1]

In [None]:
# Read one table per sample category from its CSV export.
benign, risk, sms, banking, adware = (
    pd.read_csv(csv_path)
    for csv_path in (
        'Data/benign.csv',
        'Data/riskware.csv',
        'Data/sms.csv',
        'Data/banking.csv',
        'Data/adware.csv',
    )
)

In [None]:
# Strip the non-feature columns from every table and convert what is left to
# a plain NumPy array.
# NOTE(review): features are matched to the model by column position only;
# presumably all CSVs share the training column order — confirm.
metadata_columns = [
    DB_MD5_ENTRY,
    DB_TAG_ENTRY,
    DB_FILE_NAME_ENTRY,
    DB_APK_ENTRY,
    DB_VERSION_CODE_ENTRY,
    DB_FILE_SIZE_ENTRY,
]
Xb, Xr, Xs, Xbk, Xa = (
    table.drop(columns=metadata_columns).to_numpy()
    for table in (benign, risk, sms, banking, adware)
)

In [None]:
# Second column of predict_proba (class 1) for every sample of each data set.
res_b, res_r, res_s, res_bk, res_a = (
    clf.predict_proba(features)[:, 1]
    for features in (Xb, Xr, Xs, Xbk, Xa)
)

In [None]:
# Mean predicted class-1 probability over the riskware samples.
np.average(res_r)