In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('data.csv')
features = df.columns

In [14]:
native_features = [f for f in features if f.startswith('android') or f.startswith('com.android')]

In [15]:
custom_features = [f for f in features if not f in native_features and not f=='Result']

In [6]:
native_df = df[native_features]
custom_df = df[custom_features]
label = df['Result']

# Training only with native permissions

In [7]:
from sklearn.model_selection import train_test_split

X_train_native, X_test_native, y_train_native, y_test_native = train_test_split(native_df, label, test_size=0.3, random_state=69)
X_test_native, X_val_native, y_test_native, y_val_native = train_test_split(X_test_native, y_test_native, test_size=0.5, random_state=69)

In [10]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from xgboost import XGBClassifier

In [11]:
model_objs = [LogisticRegression(), KNeighborsClassifier(), SVC(kernel='linear'), SVC(kernel='rbf'), SVC(kernel='poly'), DecisionTreeClassifier(), AdaBoostClassifier(), RandomForestClassifier(), XGBClassifier()]
model_names = ['LogisticRegression', 'KNeighborsClassifier', 'Linear SVC', 'Gaussian SVC', 'Polynomial SVC', 'DecisionTreeClassifier', 'AdaBoostClassifier', 'RandomForestClassifier', 'XGBClassifier']

In [12]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

In [13]:
for m_obj, m_name in zip(model_objs, model_names):
    m_obj.fit(X_train_native, y_train_native),
    y_pred = m_obj.predict(X_test_native)
    acc = accuracy_score(y_pred, y_test_native)
    precision = precision_score(y_pred, y_test_native)
    recall = recall_score(y_pred, y_test_native)
    f1 = f1_score(y_pred, y_test_native)
    roc_auc = roc_auc_score(y_pred, y_test_native)
    print('MODEL: {}'.format(m_name))
    print('Accuracy: {}'.format(acc))
    print('Precision: {}'.format(precision))
    print('Recall: {}'.format(recall))
    print('F1: {}'.format(f1))
    print('ROC AUC: {}'.format(roc_auc))
    print()
    print()

MODEL: LogisticRegression
Accuracy: 0.9440909090909091
Precision: 0.9599631845375057
Recall: 0.9291759465478842
F1: 0.9443186962426438
ROC AUC: 0.9444023584247541


MODEL: KNeighborsClassifier
Accuracy: 0.955
Precision: 0.9622641509433962
Recall: 0.9474399637516991
F1: 0.9547945205479452
ROC AUC: 0.955024131442653


MODEL: Linear SVC
Accuracy: 0.9377272727272727
Precision: 0.9599631845375057
Recall: 0.9177298724153101
F1: 0.9383715699505173
ROC AUC: 0.9384135962922814


MODEL: Gaussian SVC
Accuracy: 0.9534090909090909
Precision: 0.9613437643810401
Recall: 0.9452488687782805
F1: 0.9532283823864932
ROC AUC: 0.9534463521973594


MODEL: Polynomial SVC
Accuracy: 0.9488636363636364
Precision: 0.9507593189139438
Recall: 0.9459706959706959
F1: 0.9483589625889374
ROC AUC: 0.9488427487073696


MODEL: DecisionTreeClassifier
Accuracy: 0.9577272727272728
Precision: 0.9572020248504371
Recall: 0.9572020248504371
F1: 0.9572020248504371
ROC AUC: 0.9577209046569204


MODEL: AdaBoostClassifier
Accuracy: 

# Training on custom features

In [17]:
from sklearn.model_selection import train_test_split

X_train_custom, X_test_custom, y_train_custom, y_test_custom = train_test_split(custom_df, label, test_size=0.3, random_state=69)
X_test_custom, X_val_custom, y_test_custom, y_val_custom = train_test_split(X_test_custom, y_test_custom, test_size=0.5, random_state=69)

In [18]:
for m_obj, m_name in zip(model_objs, model_names):
    m_obj.fit(X_train_custom, y_train_custom),
    y_pred = m_obj.predict(X_test_custom)
    acc = accuracy_score(y_pred, y_test_custom)
    precision = precision_score(y_pred, y_test_custom)
    recall = recall_score(y_pred, y_test_custom)
    f1 = f1_score(y_pred, y_test_custom)
    roc_auc = roc_auc_score(y_pred, y_test_custom)
    print('MODEL: {}'.format(m_name))
    print('Accuracy: {}'.format(acc))
    print('Precision: {}'.format(precision))
    print('Recall: {}'.format(recall))
    print('F1: {}'.format(f1))
    print('ROC AUC: {}'.format(roc_auc))
    print()
    print()

MODEL: LogisticRegression
Accuracy: 0.72
Precision: 0.9843534284399448
Recall: 0.6409949056038358
F1: 0.7764065335753176
ROC AUC: 0.8045049786720966


MODEL: KNeighborsClassifier
Accuracy: 0.5136363636363637
Precision: 0.016106764841233318
Recall: 0.9459459459459459
F1: 0.03167420814479638
ROC AUC: 0.7279580749670138


MODEL: Linear SVC
Accuracy: 0.7202272727272727
Precision: 0.9843534284399448
Recall: 0.6411870503597122
F1: 0.7765474677799964
ROC AUC: 0.8046160815708336


MODEL: Gaussian SVC
Accuracy: 0.7204545454545455
Precision: 0.9843534284399448
Recall: 0.6413793103448275
F1: 0.7766884531590413
ROC AUC: 0.8047272138578598


MODEL: Polynomial SVC
Accuracy: 0.7204545454545455
Precision: 0.9843534284399448
Recall: 0.6413793103448275
F1: 0.7766884531590413
ROC AUC: 0.8047272138578598


MODEL: DecisionTreeClassifier
Accuracy: 0.7202272727272727
Precision: 0.9843534284399448
Recall: 0.6411870503597122
F1: 0.7765474677799964
ROC AUC: 0.8046160815708336


MODEL: AdaBoostClassifier
Accurac