In [2]:
import re
import numpy as np
import pandas as pd
import seaborn as sns
import os
import time
import pickle
import matplotlib.pyplot as plt
import warnings
from collections import Counter
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, precision_recall_curve, recall_score, roc_curve, precision_score
from sklearn.exceptions import ConvergenceWarning
from androguard.core.bytecodes.apk import APK
from androguard.core.bytecodes.dvm import DalvikVMFormat
from androguard.misc import AnalyzeAPK
from mlxtend.plotting import plot_confusion_matrix
import requests
import json

ModuleNotFoundError: No module named 'androguard'

In [None]:
# Silence the warning categories that would otherwise clutter the cell outputs.
# Filters are registered in the same order as before (each call prepends to the
# filter list).
for noisy_category in (UserWarning, FutureWarning, ConvergenceWarning):
    warnings.filterwarnings("ignore", category=noisy_category)

In [None]:
# Read the labelled feature matrix and the feature -> category lookup table.
dataset_csv = "/dataset/malware-dataset.csv"
categories_csv = "/dataset/features-categories.csv"

# "?" cells are parsed as NaN so that they can be dropped later on.
data = pd.read_csv(
    dataset_csv,
    encoding="utf-8",
    low_memory=False,
    na_values="?",
)

# The categories file has no header row; name its two columns explicitly.
feature_df = pd.read_csv(
    categories_csv,
    header=None,
    names=["X", "Category"],
)

In [None]:
# Map target class: "B" -> 0 and "S" -> 1 (0 is reported as benign and 1 as
# malware further down in this notebook).
CLASS_CODES = {"B": 0, "S": 1}

# Encode only while the column still holds the raw letters. Re-running this
# cell on an already-encoded column would otherwise map every 0/1 to NaN, and
# the following dropna() would then silently empty the dataset.
if data["class"].isin(list(CLASS_CODES)).any():
    data["class"] = data["class"].map(CLASS_CODES)

In [None]:
# Discard every row that contains at least one missing value, then show how
# many samples remain in each class.
data = data.dropna()

fig, ax = plt.subplots()
sns.countplot(data=data, x="class", ax=ax)

# Annotate each bar with its count.
for bars in ax.containers:
    ax.bar_label(bars)

plt.show()

In [None]:
# Separate the label column from the feature columns, then hold out 20% of the
# rows for testing (seeded so the split is repeatable).
target_column = "class"
y = data[target_column]
X = data.drop(columns=[target_column])

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Train a random forest classifier and record how long fitting takes.
# The forest is seeded (same seed as the train/test split) so that the metrics
# reported below are reproducible across "Restart & Run All".
rf = RandomForestClassifier(random_state=42)

# perf_counter() is monotonic and high-resolution, unlike the wall clock
# returned by time.time(), so the measured duration cannot be skewed by
# system clock adjustments.
start = time.perf_counter()
rf.fit(X_train, y_train)
end = time.perf_counter()

rf_time = end - start  # seconds
print("Random Forest Train Time:", rf_time)

In [None]:
# Predict labels for the training split first, then for the held-out split.
rf_pred_train, rf_pred_test = (
    rf.predict(feature_split) for feature_split in (X_train, X_test)
)

In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [None]:
# Accuracy on the training split versus the held-out split.
# Arguments are passed as (y_true, y_pred), matching the scikit-learn signature
# and the other metric calls in this notebook. Accuracy is symmetric, so the
# reported numbers are unchanged.
rf_train_score = accuracy_score(y_train, rf_pred_train)
rf_test_score = accuracy_score(y_test, rf_pred_test)
print("Random Forest Train Score:", rf_train_score)
print("Random Forest Test Score:", rf_test_score)

In [None]:
# Metrics computed on the held-out split (true labels first, predictions second).
rf_precision_score = precision_score(y_test, rf_pred_test)
rf_f1_score = f1_score(y_test, rf_pred_test)
rf_recall_score = recall_score(y_test, rf_pred_test)
rf_accuracy_score = accuracy_score(y_test, rf_pred_test)

# Report them in a fixed order: precision, F1, recall, accuracy.
for metric_label, metric_value in (
    ("Random Forest Precision Score:", rf_precision_score),
    ("Random Forest F1 Score:", rf_f1_score),
    ("Random Forest Recall Score:", rf_recall_score),
    ("Random Forest Accuracy Score:", rf_accuracy_score),
):
    print(metric_label, metric_value)

In [None]:
# Text report of the per-class metrics on the held-out split.
rf_report = classification_report(y_test, rf_pred_test)
print(rf_report)

In [None]:
# Confusion matrix of the held-out predictions, drawn with both absolute
# counts and normalised values in each cell.
rf_cm = confusion_matrix(y_test, rf_pred_test)

cm_plot_options = dict(
    show_absolute=True,
    show_normed=True,
    colorbar=True,
    class_names=["benign", "malware"],
)
fig, ax = plot_confusion_matrix(conf_mat=rf_cm, **cm_plot_options)
plt.show()

In [None]:
# Pair each importance score with its feature name and chart the 30 largest
# as horizontal bars.
feat_importances = pd.Series(rf.feature_importances_, index=X.columns)
top_features = feat_importances.nlargest(30)
top_features.plot.barh()
plt.show()

In [None]:
# Feature names to look for in the APK, grouped by their category in feature_df.
def _features_in(category):
    """Return the unique feature names that feature_df lists under `category`."""
    return feature_df.loc[feature_df["Category"] == category, "X"].unique()


permissions_list = _features_in("Manifest Permission")
api_call_signatures = _features_in("API call signature")
intents = _features_in("Intent")
keywords = _features_in("Commands signature")

# Single-row frame for the APK under test: a "filename" column followed by
# every column of the training data, in the same order.
columns = ["filename", *data.columns]
test_df = pd.DataFrame(columns=columns)

# Set APK file path
apk_file_path = "/content/8b4e3a277e317328b9bc4bc40a9d3a66b79ffde193717cd11b1d4d20ba451825.apk"
test_df.loc[0, "filename"] = apk_file_path

In [None]:
# Static feature extraction: open the APK, collect the text each feature
# category is matched against, and write 1/0 flags into the single row of
# test_df. The result (`dropped`) is the feature frame handed to the model.
a = APK(apk_file_path)
d = DalvikVMFormat(a.get_dex())

permissions = a.get_permissions()
manifest = a.get_android_manifest_xml()
intent_filters = manifest.findall(".//intent-filter")

# Fully-qualified name of the android:name attribute in the manifest XML.
ANDROID_NAME_ATTR = "{http://schemas.android.com/apk/res/android}name"


def _as_text(value):
    """Return `value` as str, decoding it from UTF-8 when it is bytes."""
    if isinstance(value, bytes):
        return value.decode("utf-8", errors="replace")
    return value


def _compile_signature(signature):
    """Compile `signature` as a regex; fall back to a literal match if invalid.

    Signatures have always been used as regular expressions here, so valid
    patterns keep their exact meaning. Only a signature that is not a valid
    regex (which previously raised re.error) is matched literally instead.
    """
    try:
        return re.compile(signature)
    except re.error:
        return re.compile(re.escape(signature))


def _signatures_found(signatures, texts):
    """Return the signatures (in input order, no duplicates) matching any text.

    Empty and None entries in `texts` are ignored, so a manifest element
    without the expected attribute cannot crash the search.
    """
    candidates = [text for text in texts if text]
    found = []
    for signature in signatures:
        pattern = _compile_signature(signature)
        if any(pattern.search(text) for text in candidates):
            found.append(signature)
    return found


def _set_features(frame, features, found):
    """Set each feature column of `frame` to 1 if it is in `found`, else 0.

    Features that are not already columns of `frame` are skipped: the frame
    mirrors the training columns, and adding unknown columns would make it
    incompatible with the trained model.
    """
    hits = set(found)
    for feature in features:
        if feature in frame.columns:
            frame[feature] = int(feature in hits)


# Check for permissions: compare the last dotted component of each requested
# permission against the known permission names.
known_permissions = set(permissions_list)
found_permissions = [
    short_name
    for short_name in (permission.split(".")[-1] for permission in permissions)
    if short_name in known_permissions
]
_set_features(test_df, permissions_list, found_permissions)

# Check for API calls: signatures are searched in the method descriptors.
# Descriptors repeat heavily, so they are de-duplicated before matching.
# NOTE(review): descriptors are method type signatures; confirm that this is
# the text the training features were derived from.
method_descriptors = {_as_text(method.get_descriptor()) for method in d.get_methods()}
found_api_signatures = _signatures_found(api_call_signatures, method_descriptors)
_set_features(test_df, api_call_signatures, found_api_signatures)

# Check for intents: match against the android:name of every <action> inside
# an <intent-filter>.
intent_actions = [
    action_element.get(ANDROID_NAME_ATTR)
    for intent_filter in intent_filters
    for action_element in intent_filter.findall(".//action")
]
found_intents = _signatures_found(intents, intent_actions)
_set_features(test_df, intents, found_intents)

# Check for command keywords: search the string pool of the DEX file.
# Previously this section re-ran the API-call search and never recorded any
# keyword, so every command feature was always 0.
# NOTE(review): assumes command signatures appear as string constants in the
# DEX; confirm against how the dataset was built.
dex_strings = {_as_text(dex_string) for dex_string in d.get_strings()}
found_keywords = _signatures_found(keywords, dex_strings)
_set_features(test_df, keywords, found_keywords)

# Build the model input: drop the non-feature columns, treat every feature that
# was never set as absent (0), and hand over plain integer columns.
dropped = (
    test_df
    .drop(columns=["filename", "class"], errors="ignore")
    .fillna(0)
    .astype(int)
)

In [None]:
# Classify the extracted feature row; label 0 is reported as benign, any other
# label as malware.
prediction = rf.predict(dropped)

is_benign = prediction[0] == 0
verdict = (
    "The APK file is predicted to be benign."
    if is_benign
    else "The APK file is predicted to be malware."
)
print(verdict)