In [1]:
!pip install imblearn



In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import plot_confusion_matrix, roc_auc_score, plot_roc_curve
from imblearn.under_sampling import RandomUnderSampler

In [3]:
pd.options.display.max_columns = None
pd.options.display.max_rows = None

In [4]:
drebin = pd.read_csv("../Datasets Intersecção/Interseccao_Drebin.csv")
androcrawl = pd.read_csv("../Datasets Intersecção/Interseccao_Androcrawl.csv")
android = pd.read_csv("../Datasets Intersecção/Interseccao_AndroidMalwareNormal.csv")
droid = pd.read_csv("../Datasets Intersecção/Interseccao_DefenseDroid.csv")

In [5]:
drebin.drop("Unnamed: 0", axis=1, inplace=True)
androcrawl.drop("Unnamed: 0", axis=1, inplace=True)
android.drop("Unnamed: 0", axis=1, inplace=True)
droid.drop("Unnamed: 0", axis=1, inplace=True)

In [6]:
sorted(drebin.columns)

['CHANGE_COMPONENT_ENABLED_STATE',
 'CONTROL_LOCATION_UPDATES',
 'DEVICE_POWER',
 'DISABLE_KEYGUARD',
 'EXPAND_STATUS_BAR',
 'FLASHLIGHT',
 'KILL_BACKGROUND_PROCESSES',
 'READ_USER_DICTIONARY',
 'SET_ORIENTATION',
 'SET_TIME',
 'STATUS_BAR',
 'WRITE_HISTORY_BOOKMARKS',
 'WRITE_USER_DICTIONARY',
 'class']

# Separação dos dados

### Separação de conjuntos de treino, teste e validação

In [7]:
train, validation = train_test_split(drebin,test_size = 0.2,stratify=drebin["class"],random_state=42)

In [8]:
X_TRAIN = train.drop("class", axis=1)
Y_TRAIN = train["class"]
x_train, x_test, y_train, y_test = train_test_split(X_TRAIN, Y_TRAIN, test_size = 0.375, stratify = Y_TRAIN, random_state=42)

In [9]:
# Balanceamento de dados de treino
rus = RandomUnderSampler()
x_train_balanced, y_train_balanced = rus.fit_resample(x_train, y_train)
x_train_balanced = pd.DataFrame(x_train_balanced)

In [10]:
x_validation = validation.drop("class", axis=1)
y_validation = validation["class"]

In [11]:
x_androcrawl = androcrawl.drop("class", axis=1)
y_androcrawl = androcrawl["class"]

In [12]:
x_android = android.drop("class", axis=1)
y_android = android["class"]

In [13]:
x_droid = droid.drop("class", axis=1)
y_droid = droid["class"]

In [14]:
D_RandomForest = RandomForestClassifier()
B_RandomForest = RandomForestClassifier()

# HIPERPARÂMETROS PADRÕES

### Treino Desbalanceado

In [15]:
D_RandomForest.fit(x_train,y_train)

RandomForestClassifier()

In [16]:
print("Treino desbalanceado")
print("Curva ROC >> ", (cross_val_score(D_RandomForest, x_test, y_test, scoring="roc_auc", cv = 5).mean())*100,"%")
print("Acurácia >> ", (cross_val_score(D_RandomForest, x_test, y_test, scoring="accuracy", cv = 5).mean())*100,"%")
print("Precisão >> ", (cross_val_score(D_RandomForest, x_test, y_test, scoring="precision", cv = 5).mean())*100,"%")
print("F1-Score >> ", (cross_val_score(D_RandomForest, x_test, y_test, scoring="f1", cv = 5).mean())*100,"%")
print("Recall >> ", (cross_val_score(D_RandomForest, x_test, y_test, scoring="recall", cv = 5).mean())*100,"%")

Treino desbalanceado
Curva ROC >>  65.3846114497214 %
Acurácia >>  81.2671095644838 %
Precisão >>  81.44909178544111 %
F1-Score >>  40.8380367072202 %
Recall >>  27.326362957430916 %


### Treino Balanceado

In [17]:
B_RandomForest.fit(x_train_balanced,y_train_balanced)

RandomForestClassifier()

In [18]:
print("Treino balanceado")
print("Curva ROC >> ", (cross_val_score(B_RandomForest, x_test, y_test, scoring="roc_auc", cv = 5).mean())*100,"%")
print("Acurácia >> ", (cross_val_score(B_RandomForest, x_test, y_test, scoring="accuracy", cv = 5).mean())*100,"%")
print("Precisão >> ", (cross_val_score(B_RandomForest, x_test, y_test, scoring="precision", cv = 5).mean())*100,"%")
print("F1-Score >> ", (cross_val_score(B_RandomForest, x_test, y_test, scoring="f1", cv = 5).mean())*100,"%")
print("Recall >> ", (cross_val_score(B_RandomForest, x_test, y_test, scoring="recall", cv = 5).mean())*100,"%")

Treino balanceado
Curva ROC >>  65.38761107175542 %
Acurácia >>  81.22123800485078 %
Precisão >>  80.91878875513808 %
F1-Score >>  40.777925071324965 %
Recall >>  27.326362957430916 %


# TESTANDO HIPERPARÂMETROS

In [None]:
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
max_features = ['auto', 'sqrt']
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]
bootstrap = [True, False]
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

D_RS = RandomizedSearchCV(D_RandomForest, random_grid, random_state=42, scoring="roc_auc",cv = 5, n_iter=100, n_jobs=-1, verbose=2).fit(x_train,y_train)
B_RS = RandomizedSearchCV(B_RandomForest, random_grid, random_state=42, scoring="roc_auc",cv = 5, n_iter = 100, n_jobs=-1, verbose=2).fit(x_train_balanced,y_train_balanced)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


In [None]:
D_RS.best_params_

In [None]:
B_RS.best_params_

# HIPERPARÂMETROS OTIMIZADOS

In [None]:
OD_RandomForest = RandomForestClassifier(**D_RS.best_params_)
OB_RandomForest = RandomForestClassifier(**B_RS.best_params_)

In [None]:
OD_RandomForest.fit(x_train,y_train)

In [None]:
print("Treino desbalanceado")
print("Curva ROC >> ", (cross_val_score(OD_RandomForest, x_test, y_test, scoring="roc_auc", cv = 5).mean())*100,"%")
print("Acurácia >> ", (cross_val_score(OD_RandomForest, x_test, y_test, scoring="accuracy", cv = 5).mean())*100,"%")
print("Precisão >> ", (cross_val_score(OD_RandomForest, x_test, y_test, scoring="precision", cv = 5).mean())*100,"%")
print("F1-Score >> ", (cross_val_score(OD_RandomForest, x_test, y_test, scoring="f1", cv = 5).mean())*100,"%")
print("Recall >> ", (cross_val_score(OD_RandomForest, x_test, y_test, scoring="recall", cv = 5).mean())*100,"%")

In [None]:
OB_RandomForest.fit(x_train_balanced, y_train_balanced)

In [None]:
print("Treino balanceado")
print("Curva ROC >> ", (cross_val_score(OB_RandomForest, x_test, y_test, scoring="roc_auc", cv = 5).mean())*100,"%")
print("Acurácia >> ", (cross_val_score(OB_RandomForest, x_test, y_test, scoring="accuracy", cv = 5).mean())*100,"%")
print("Precisão >> ", (cross_val_score(OB_RandomForest, x_test, y_test, scoring="precision", cv = 5).mean())*100,"%")
print("F1-Score >> ", (cross_val_score(OB_RandomForest, x_test, y_test, scoring="f1", cv = 5).mean())*100,"%")
print("Recall >> ", (cross_val_score(OB_RandomForest, x_test, y_test, scoring="recall", cv = 5).mean())*100,"%")

# HIPERPARÂMETROS PADRÕES X HIPERPARÂMETROS OTIMIZADOS

### Desbalanceado

In [None]:
fig, ((ax1, ax2), (ax3, ax4), (ax5, ax6)) = plt.subplots(3, 2, figsize=(12,8))
plt.tight_layout()
ax1.set(title="Padrão")
ax2.set(title="Otimizado")
plot_confusion_matrix(D_RandomForest, x_validation, y_validation,ax=ax1, cmap = "Blues", values_format="d")
plot_confusion_matrix(OD_RandomForest, x_validation, y_validation, ax=ax2, cmap= "Purples", values_format="d")
plot_confusion_matrix(D_RandomForest, x_validation, y_validation,ax=ax3, cmap = "Blues", normalize = "true")
plot_confusion_matrix(OD_RandomForest, x_validation, y_validation, ax=ax4, cmap= "Purples", normalize = "true")
plot_roc_curve(D_RandomForest,x_validation, y_validation, ax=ax5)
plot_roc_curve(OD_RandomForest,x_validation, y_validation, ax=ax6)

In [None]:
print("PADRÃO - ROC_AUC>>",roc_auc_score(y_validation, D_RandomForest.predict(x_validation))*100, "%")

In [None]:
print("OTIMIZADO - ROC_AUC>>",roc_auc_score(y_validation, OD_RandomForest.predict(x_validation))*100, "%")

### Balanceado

In [None]:
fig, ((ax1, ax2), (ax3, ax4), (ax5, ax6)) = plt.subplots(3, 2, figsize=(12,8))
plt.tight_layout()
ax1.set(title="Padrão")
ax2.set(title="Otimizado")
plot_confusion_matrix(B_RandomForest, x_validation, y_validation,ax=ax1, cmap = "Blues", values_format="d")
plot_confusion_matrix(OB_RandomForest, x_validation, y_validation, ax=ax2, cmap= "Purples", values_format="d")
plot_confusion_matrix(B_RandomForest, x_validation, y_validation,ax=ax3, cmap = "Blues", normalize = "true")
plot_confusion_matrix(OB_RandomForest, x_validation, y_validation, ax=ax4, cmap= "Purples", normalize = "true")
plot_roc_curve(B_RandomForest,x_validation, y_validation, ax=ax5)
plot_roc_curve(OB_RandomForest,x_validation, y_validation, ax=ax6)

In [None]:
print("PADRÃO - ROC_AUC >>",roc_auc_score(y_validation, B_RandomForest.predict(x_validation))*100, "%")

In [None]:
print("OTIMIZADO - ROC_AUC >>",roc_auc_score(y_validation, OB_RandomForest.predict(x_validation))*100, "%")

# VALIDANDO NOS SUBSETS DE INTERSECÇÃO

#### ANDROCRAWL

In [None]:
fig, ((ax1, ax2), (ax3, ax4), (ax5, ax6)) = plt.subplots(3, 2, figsize=(12,8))
plt.tight_layout()
ax1.set(title="Desbalanceado")
ax2.set(title="Balanceado")
plot_confusion_matrix(OD_RandomForest, x_androcrawl, y_androcrawl,ax=ax1, cmap = "Blues", values_format="d")
plot_confusion_matrix(OB_RandomForest, x_androcrawl, y_androcrawl, ax=ax2, cmap= "Purples", values_format="d")
plot_confusion_matrix(OD_RandomForest, x_androcrawl, y_androcrawl,ax=ax3, cmap = "Blues", normalize = "true")
plot_confusion_matrix(OB_RandomForest, x_androcrawl, y_androcrawl, ax=ax4, cmap= "Purples", normalize = "true")
plot_roc_curve(OD_RandomForest,x_androcrawl, y_androcrawl, ax=ax5)
plot_roc_curve(OB_RandomForest,x_androcrawl, y_androcrawl, ax=ax6)

In [None]:
print("ANDROCRAWL - ROC_AUC >>",roc_auc_score(y_androcrawl, OB_RandomForest.predict(x_androcrawl))*100, "%")

#### ANDROID

In [None]:
fig, ((ax1, ax2), (ax3, ax4), (ax5, ax6)) = plt.subplots(3, 2, figsize=(12,8))
plt.tight_layout()
ax1.set(title="Desbalanceado")
ax2.set(title="Balanceado")
plot_confusion_matrix(OD_RandomForest, x_android, y_android,ax=ax1, cmap = "Blues", values_format="d")
plot_confusion_matrix(OB_RandomForest, x_android, y_android, ax=ax2, cmap= "Purples", values_format="d")
plot_confusion_matrix(OD_RandomForest, x_android, y_android,ax=ax3, cmap = "Blues", normalize = "true")
plot_confusion_matrix(OB_RandomForest, x_android, y_android, ax=ax4, cmap= "Purples", normalize = "true")
plot_roc_curve(OD_RandomForest,x_android, y_android, ax=ax5)
plot_roc_curve(OB_RandomForest,x_android, y_android, ax=ax6)

In [None]:
print("ANDROID - ROC_AUC >>",roc_auc_score(y_android, OB_RandomForest.predict(x_android))*100, "%")

#### DEFENSE DROID

In [None]:
fig, ((ax1, ax2), (ax3, ax4), (ax5, ax6)) = plt.subplots(3, 2, figsize=(12,8))
plt.tight_layout()
ax1.set(title="Desbalanceado")
ax2.set(title="Balanceado")
plot_confusion_matrix(OD_RandomForest, x_droid, y_droid,ax=ax1, cmap = "Blues", values_format="d")
plot_confusion_matrix(OB_RandomForest, x_droid, y_droid, ax=ax2, cmap= "Purples", values_format="d")
plot_confusion_matrix(OD_RandomForest, x_droid, y_droid,ax=ax3, cmap = "Blues", normalize = "true")
plot_confusion_matrix(OB_RandomForest, x_droid, y_droid, ax=ax4, cmap= "Purples", normalize = "true")
plot_roc_curve(OD_RandomForest,x_droid, y_droid, ax=ax5)
plot_roc_curve(OB_RandomForest,x_droid, y_droid, ax=ax6)

In [None]:
print("DEFENSE DROID - ROC_AUC >>",roc_auc_score(y_droid, OB_RandomForest.predict(x_droid))*100, "%")

### Resultados

In [None]:
print("DREBIN - ROC_AUC >>",roc_auc_score(y_validation, OB_RandomForest.predict(x_validation))*100, "%")
print("ANDROCRAWL - ROC_AUC >>",roc_auc_score(y_androcrawl, OB_RandomForest.predict(x_androcrawl))*100, "%")
print("ANDROID - ROC_AUC >>",roc_auc_score(y_android, OB_RandomForest.predict(x_android))*100, "%")
print("DEFENSE DROID - ROC_AUC >>",roc_auc_score(y_droid, OB_RandomForest.predict(x_droid))*100, "%")

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=be34dc1b-561d-4c90-bae1-66eed919bb55' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>