# Imports

In [24]:
import numpy as np
import pandas as pd
from pandas import DataFrame as dataframe
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap, LinearSegmentedColormap
import seaborn as sns
import os
from os.path import join
# from tqdm import tqdm
import random

from sklearn.feature_selection import SelectKBest, chi2

## Functions

In [3]:
from collections import Counter

def find_least_common_class(y_labels):
    """
    Finds the label that occurs the fewest times.

    Parameters:
    - y_labels: Iterable of hashable class labels.

    Returns:
    - A (label, count) tuple for the rarest label. When several labels share the
      smallest count, the one listed last by Counter.most_common() is returned.

    Raises:
    - IndexError: if y_labels is empty.
    """
    # most_common() with no argument lists every label, most frequent first,
    # so the rarest label sits at the end of the list.
    ranked = Counter(y_labels).most_common()
    rarest_label, rarest_count = ranked[-1]
    return rarest_label, rarest_count
    
def balance_classes(data, MAX_INSTANCES):
    """
    Balances classes by under-sampling every class to the same number of rows.

    Parameters:
    - data: pandas DataFrame containing a 'class' column with the label of each row.
    - MAX_INSTANCES: The maximum number of instances allowed per class. If the rarest
      class has fewer rows than this, its size is used for every class instead
      (a message is printed when that happens).

    Returns:
    - A new DataFrame with the sampled rows of each class stacked in the sorted order
      of the class values, with a fresh 0..n-1 index. The input frame is not modified.

    Raises:
    - KeyError: if data has no 'class' column.
    - IndexError: if data has no rows (raised by find_least_common_class).
    """
    min_class, min_value = find_least_common_class(y_labels=data['class'])

    # Never ask for more rows than the rarest class can provide, otherwise
    # sampling without replacement would fail.
    per_class = MAX_INSTANCES
    if min_value < per_class:
        print(f'Class {min_class} with only {min_value} samples. Updating MAX_INSTANCES to {min_value}.')
        per_class = min_value

    # random_state=0 keeps the selection reproducible between runs.
    sampled_groups = [
        data[data['class'] == class_id].sample(per_class, random_state=0, replace=False)
        for class_id in np.unique(data['class'])
    ]

    # Concatenate once: growing a frame inside the loop copies the accumulated rows
    # on every iteration and involves an empty placeholder frame in the concat.
    return pd.concat(sampled_groups).reset_index(drop=True)

def process_column(column_names, delimiter='.'):
    """
    Normalises raw column headers into short lower-case names.

    Each name is stripped of surrounding whitespace; ';', ',', ':' and '/' are turned
    into '.', the arrow '->' is removed, spaces become '_', and the result is
    lower-cased. Only the part after the last occurrence of delimiter is kept.

    Parameters:
    - column_names: Iterable of header strings.
    - delimiter: Separator used to split the cleaned name; the last piece is returned.

    Returns:
    - A list with one processed name per input name, in the same order.
    """
    # Applied in this exact order to every name.
    substitutions = (
        (';', '.'),
        (',', '.'),
        (':', '.'),
        ('->', ''),
        ('/', '.'),
        (' ', '_'),
    )

    short_names = []
    for raw_name in column_names:
        cleaned = raw_name.strip()
        for old, new in substitutions:
            cleaned = cleaned.replace(old, new)
        short_names.append(cleaned.lower().split(delimiter)[-1])
    return short_names


## Directories

In [4]:
os.getcwd()

'D:\\github'

In [5]:
# Every project directory below is resolved relative to the current working
# directory at the time this cell runs.
print(os.getcwd())
base_dir = os.getcwd()

# Central lookup of the folders used by the notebook. Note that the 'datasets',
# 'binaries', 'continuous' and 'limpos' entries all point inside the 'datasets1'
# folder, and 'images' maps to the 'imgs' folder.
paths={
    'data': join(base_dir, 'data'),
    'processed': join(base_dir, 'processed'),
    'datasets': join(base_dir, 'datasets1'),
    'binaries': join(base_dir,'datasets1', 'binaries'),
    'continuous': join(base_dir,'datasets1', 'continuous'),
    'limpos': join(base_dir,'datasets1', 'limpos'),
    'models': join(base_dir,'models'),
    'images': join(base_dir,'imgs'),
    'shap': join(base_dir,'shap'),
    
}

D:\github


In [6]:
os.listdir(paths['datasets'])

['.git',
 '8K',
 'amexplorer',
 'binaries',
 'continuous',
 'limpos',
 'malmem2022',
 'motodroid',
 'motodroid-v2',
 'originais',
 'originals',
 'preprocessed',
 'README.md',
 'src',
 'VT_metadata.rar']

In [7]:
files = dataframe(os.listdir(paths['binaries']), columns=['filename'])
files

Unnamed: 0,filename
0,adroit.csv
1,androcrawl.csv
2,android_permissions.csv
3,defensedroid_prs.csv
4,drebin215.csv
5,kronodroid_emulador.csv
6,kronodroid_real_device.csv


# Random Undersampling

In [107]:
import checker
from checker import DataFrameChecker

def randomUndersampling(path_datasets, MAX_INSTANCES=5000, MAX_FEATURES=100, path_save=None):
    """
    Builds a feature-reduced, class-balanced version of every dataset file in a directory.

    For each file: the CSV is loaded, the features are reduced to the MAX_FEATURES
    columns with the highest chi-square statistic (only when the file has more feature
    columns than MAX_FEATURES), and each class is under-sampled with balance_classes.

    Parameters:
    - path_datasets: Directory whose files are read with pandas.read_csv. Every file
      must contain a 'class' column with the labels. Sub-directories are skipped.
    - MAX_INSTANCES: Maximum number of rows kept per class (forwarded to balance_classes).
    - MAX_FEATURES: Maximum number of feature columns kept; the 'class' column is added
      on top of these.
    - path_save: Optional output directory. When given, it is created if it does not
      exist and each result is written there as '<file name>-balanced.csv' without
      the row index.

    Returns:
    - None. Progress is reported through print().
    """
    if path_save:
        # to_csv does not create missing folders, so make sure the target exists.
        os.makedirs(path_save, exist_ok=True)

    # os.listdir returns entries in arbitrary order; sort for a stable processing order.
    for dataset_name in sorted(os.listdir(path_datasets)):
        dataset_path = join(path_datasets, dataset_name)
        if not os.path.isfile(dataset_path):
            # A folder cannot be parsed as a CSV file.
            continue

        print(dataset_name)
        data = pd.read_csv(dataset_path)
        print(f'\tdata shape: {data.shape}')

        labels = data['class']
        features = data.drop(columns=['class'])
        column_names = features.columns.values

        if len(column_names) > MAX_FEATURES:
            chi2_features = feature_selection(features.values, labels.values, column_names, MAX_FEATURES=MAX_FEATURES)
            # Copy so that adding the label column below writes to an independent
            # frame rather than to a slice of the full feature table.
            features = features[chi2_features['names'].values].copy()

        features['class'] = labels.values
        balanced_data = balance_classes(data=features, MAX_INSTANCES=MAX_INSTANCES)
        print(f'\tBalanced shape: {balanced_data.shape}')

        has_nan = balanced_data.isnull().values.any()
        if has_nan:
            print(f'\nHas NaN: {has_nan}')

        if path_save:
            # NOTE: dataset_name still carries its original extension, so the output is
            # e.g. 'adroit.csv-balanced.csv'. Left unchanged in case other code relies
            # on these file names.
            balanced_data.to_csv(
                join(path_save, f'{dataset_name}-balanced.csv'),
                index=False
            )

    return

def feature_selection(data, labels, col_names, MAX_FEATURES=0):
    """
    Ranks features by their chi-square statistic against the labels.

    Parameters:
    - data: Feature matrix passed to chi2 as its first argument.
    - labels: Target labels passed to chi2 as its second argument.
    - col_names: Feature names, one per column of data.
    - MAX_FEATURES: Number of top-ranked features to return. A value of 0 or less
      returns the complete ranking.

    Returns:
    - A DataFrame with the columns 'names', 'stats' and 'p_values', sorted by 'stats'
      in descending order. Rows containing NaN are dropped. No p-value threshold is
      applied.
    """
    scores = chi2(data, labels)
    ranking = dataframe({
        'names': col_names,
        'stats': scores[0],
        'p_values': scores[1],
    })

    # Sort first, then discard rows with NaN in any column.
    ranking = ranking.sort_values(by='stats', ascending=False)
    ranking = ranking.dropna()

    if MAX_FEATURES <= 0:
        return ranking
    return ranking.head(MAX_FEATURES)


In [108]:
randomUndersampling(
    path_datasets=paths['binaries'],  
    MAX_INSTANCES=5000, 
    MAX_FEATURES=200, 
    path_save=join(base_dir, 'balanced', '10k')
)

adroit.csv
	data shape: (11476, 167)
Class 1 with only 3418 samples. Updating MAX_INSTANCES to 3418.
	Balanced shape: (6836, 167)
androcrawl.csv
	data shape: (96744, 142)
	Balanced shape: (10000, 142)
android_permissions.csv
	data shape: (26864, 152)
	Balanced shape: (10000, 152)
defensedroid_apicalls_closeness.csv
	data shape: (10476, 4275)
	Balanced shape: (10000, 201)
defensedroid_apicalls_degree.csv
	data shape: (10476, 6003)
	Balanced shape: (10000, 201)
defensedroid_apicalls_katz.csv
	data shape: (10476, 6003)
	Balanced shape: (10000, 201)
defensedroid_prs.csv
	data shape: (11975, 2878)
	Balanced shape: (10000, 201)
drebin215.csv
	data shape: (15031, 216)
	Balanced shape: (10000, 201)
kronodroid_emulador.csv
	data shape: (63991, 277)
	Balanced shape: (10000, 201)
kronodroid_real_device.csv
	data shape: (78137, 287)
	Balanced shape: (10000, 201)


# Drebin dataset

In [7]:
# NOTE(review): the file is chosen by its position in the `files` directory listing.
# The listing displayed in this notebook maps index 4 to 'drebin215.csv', but the
# position shifts whenever files are added to or removed from the folder - confirm,
# or reference the file name directly.
data = pd.read_csv(join(paths['binaries'], files['filename'][4]))

# Shorten the headers: process_column lower-cases each name and keeps only the part
# after the last '.'.
column_names = process_column(column_names=data.columns.values, delimiter='.')
data.columns = column_names

data.head()

Unnamed: 0,transact,onserviceconnected,bindservice,attachinterface,serviceconnection,binder,send_sms,getcanonicalname,getmethods,cast,...,read_contacts,device_power,hardware_test,access_wifi_state,write_external_storage,access_fine_location,set_wallpaper_hints,set_preferred_applications,write_secure_settings,class
0,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,1
1,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,1
2,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,0,0,0,0,0,0,0,0,0,1,...,0,0,0,1,1,1,0,0,0,1
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,1,0,0,0,1


In [8]:
print(data.shape)
print(data.info())

class_names=['authentic', 'malware']
dataset_name = 'DREBIN'
print(data['class'].value_counts())

(15031, 216)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15031 entries, 0 to 15030
Columns: 216 entries, transact to class
dtypes: int64(216)
memory usage: 24.8 MB
None
class
0    9476
1    5555
Name: count, dtype: int64


In [9]:
balanced_data = balance_classes(data=data, MAX_INSTANCES=10000)
balanced_data

Class 1 with only 5555 samples. Updating MAX_INSTANCES to 5555.


Unnamed: 0,transact,onserviceconnected,bindservice,attachinterface,serviceconnection,binder,send_sms,getcanonicalname,getmethods,cast,...,read_contacts,device_power,hardware_test,access_wifi_state,write_external_storage,access_fine_location,set_wallpaper_hints,set_preferred_applications,write_secure_settings,class
0,1,0,0,1,0,1,0,0,0,0,...,0,0,0,1,1,0,1,0,0,0
1,1,1,1,1,1,1,0,1,1,1,...,0,0,0,0,1,0,0,0,0,0
2,0,1,1,0,1,0,0,0,1,0,...,0,0,0,0,0,1,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,1,1,1,1,1,0,1,1,1,...,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11105,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
11106,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
11107,1,0,0,1,0,1,1,0,1,0,...,1,0,0,0,1,0,0,0,0,1
11108,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,1


In [10]:
balanced_data['class'].value_counts()

class
0    5555
1    5555
Name: count, dtype: int64

In [11]:
# NOTE(review): to_csv is called with its default index=True, so the row index is
# written as an extra unnamed first column, whereas the exports under 'balanced/10k'
# pass index=False - confirm which layout downstream readers expect.
balanced_data.to_csv(join(base_dir, 'balanced', f'{dataset_name}-balanced.csv'))

# AndroCrawl dataset

In [51]:
files

Unnamed: 0,filename
0,adroit.csv
1,androcrawl.csv
2,android_permissions.csv
3,defensedroid_prs.csv
4,drebin215.csv
5,kronodroid_emulador.csv
6,kronodroid_real_device.csv


In [52]:
# NOTE(review): the file is chosen by its position in the `files` directory listing.
# The listing displayed in this notebook maps index 1 to 'androcrawl.csv', but the
# position shifts whenever files are added to or removed from the folder - confirm,
# or reference the file name directly.
data = pd.read_csv(join(paths['binaries'], files['filename'][1]))

# Shorten the headers: process_column lower-cases each name and keeps only the part
# after the last '.'.
column_names = process_column(column_names=data.columns.values, delimiter='.')
data.columns = column_names

data.head()

Unnamed: 0,_access_superuser,_change_component_enabled_state,_clear_app_user_data,_delete_cache_files,_delete_packages,_disable_keyguard,_factory_test,_install_packages,_inject_events,_internal_system_window,...,_user_present,hidden_apk,sends_sms_to_suspicious_number(s),package_domain_exists,reads_phone_data_at_startup,sends_sms_at_startup,starts_service_at_startup,sends_sms_when_receiving_sms,accesses_a_database_when_receiving_sms,class
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,1
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0


In [53]:
class_names=['authentic', 'malware']
dataset_name = 'ANDROCRAWL'
print(data['class'].value_counts())

class
0    86574
1    10170
Name: count, dtype: int64


In [54]:
balanced_data = balance_classes(data=data, MAX_INSTANCES=10000)
balanced_data

Unnamed: 0,_access_superuser,_change_component_enabled_state,_clear_app_user_data,_delete_cache_files,_delete_packages,_disable_keyguard,_factory_test,_install_packages,_inject_events,_internal_system_window,...,_user_present,hidden_apk,sends_sms_to_suspicious_number(s),package_domain_exists,reads_phone_data_at_startup,sends_sms_at_startup,starts_service_at_startup,sends_sms_when_receiving_sms,accesses_a_database_when_receiving_sms,class
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19995,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,1
19996,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,1
19997,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,1,0,0,0,0,1
19998,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,1,0,0,0,0,1


In [55]:
balanced_data['class'].value_counts()

class
0    10000
1    10000
Name: count, dtype: int64

In [56]:
# NOTE(review): to_csv is called with its default index=True, so the row index is
# written as an extra unnamed first column, whereas the exports under 'balanced/10k'
# pass index=False - confirm which layout downstream readers expect.
balanced_data.to_csv(join(base_dir, 'balanced', f'{dataset_name}-balanced.csv'))

# KronoDroid dataset

In [57]:
files

Unnamed: 0,filename
0,adroit.csv
1,androcrawl.csv
2,android_permissions.csv
3,defensedroid_prs.csv
4,drebin215.csv
5,kronodroid_emulador.csv
6,kronodroid_real_device.csv


In [58]:
# NOTE(review): the file is chosen by its position in the `files` directory listing.
# The listing displayed in this notebook maps index 6 to 'kronodroid_real_device.csv',
# but the position shifts whenever files are added to or removed from the folder -
# confirm, or reference the file name directly.
data = pd.read_csv(join(paths['binaries'], files['filename'][6]))
# Shorten the headers: process_column lower-cases each name and keeps only the part
# after the last '.'.
column_names = process_column(column_names=data.columns.values, delimiter='.')
data.columns = column_names
data.head()

Unnamed: 0,getuid32,getgid32,geteuid32,getegid32,getgroups32,getppid,brk,kill,tgkill,ptrace,...,filesinsideapk,activities,nrintservices,nrintservicesactions,nrintactivities,nrintactivitiesactions,nrintreceivers,nrintreceiversactions,nrservices,class
0,1,0,0,0,0,0,0,0,0,0,...,1,1,0,0,1,1,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,1,1,0,1,0
2,1,0,0,0,0,0,0,0,0,0,...,1,1,0,0,1,1,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,...,1,1,0,0,1,1,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0,...,1,1,0,0,1,1,0,0,0,0


In [62]:
class_names=['authentic', 'malware']
dataset_name = 'KRONODROID'

print(data['class'].value_counts())

class
1    41382
0    36755
Name: count, dtype: int64


In [59]:
balanced_data = balance_classes(data=data, MAX_INSTANCES=10000)
balanced_data

Unnamed: 0,getuid32,getgid32,geteuid32,getegid32,getgroups32,getppid,brk,kill,tgkill,ptrace,...,filesinsideapk,activities,nrintservices,nrintservicesactions,nrintactivities,nrintactivitiesactions,nrintreceivers,nrintreceiversactions,nrservices,class
0,1,0,1,0,0,0,0,0,0,0,...,1,1,1,1,1,1,1,1,1,0
1,1,0,0,0,0,0,0,0,0,0,...,1,1,0,0,1,1,0,1,0,0
2,1,0,0,0,0,0,0,0,0,0,...,1,1,0,0,1,1,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,...,1,1,0,0,1,1,0,0,0,0
4,1,0,1,0,0,0,0,0,0,0,...,1,1,0,0,1,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19995,1,0,0,0,0,0,0,0,0,0,...,1,1,0,0,1,1,0,1,1,1
19996,1,0,1,0,0,0,0,0,0,0,...,1,1,1,1,1,1,1,1,1,1
19997,1,0,0,0,0,0,0,0,0,0,...,1,1,0,0,1,1,0,1,1,1
19998,1,0,0,0,0,0,0,0,0,0,...,1,1,0,0,1,1,0,1,1,1


In [60]:
balanced_data['class'].value_counts()

class
0    10000
1    10000
Name: count, dtype: int64

In [61]:
# NOTE(review): to_csv is called with its default index=True, so the row index is
# written as an extra unnamed first column, whereas the exports under 'balanced/10k'
# pass index=False - confirm which layout downstream readers expect.
balanced_data.to_csv(join(base_dir, 'balanced', f'{dataset_name}-balanced.csv'))

# Android Permissions

In [62]:
files

Unnamed: 0,filename
0,adroit.csv
1,androcrawl.csv
2,android_permissions.csv
3,defensedroid_prs.csv
4,drebin215.csv
5,kronodroid_emulador.csv
6,kronodroid_real_device.csv


In [63]:
# NOTE(review): the file is chosen by its position in the `files` directory listing.
# The listing displayed in this notebook maps index 2 to 'android_permissions.csv',
# but the position shifts whenever files are added to or removed from the folder -
# confirm, or reference the file name directly.
data = pd.read_csv(join(paths['binaries'], files['filename'][2]))

# Keep only the text after the last ':' of each header, trimmed and lower-cased.
# process_column is not used for this dataset.
column_names = [col.split(':')[-1].strip().lower() for col in data.columns.values]
#  process_col = [col.strip().replace(';','.').replace(',','.').replace(':','.').replace('->','').replace('/','.').replace(' ', '_').lower() for col in column_names]
data.columns = column_names
data.head()

Unnamed: 0,access drm content. (s),access email provider data (s),access download manager. (s),advanced download manager functions. (s),audio file access (s),install drm content. (s),modify google settings (s),move application resources (s),read google settings (s),send download notifications. (s),...,read calendar events (d),read contact data (d),read sensitive log data (d),read user defined dictionary (d),retrieve system internal state (s),set alarm in alarm clock (s),write browser's history and bookmarks (d),write contact data (d),write to user defined dictionary (s),class
0,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,1,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [64]:
class_names=['authentic', 'malware']
dataset_name = 'ANDROID-PERMISSIONS'
print(data['class'].value_counts())


class
1    17787
0     9077
Name: count, dtype: int64


In [65]:
balanced_data = balance_classes(data=data, MAX_INSTANCES=10000)
balanced_data

Class 0 with only 9077 samples. Updating MAX_INSTANCES to 9077.


Unnamed: 0,access drm content. (s),access email provider data (s),access download manager. (s),advanced download manager functions. (s),audio file access (s),install drm content. (s),modify google settings (s),move application resources (s),read google settings (s),send download notifications. (s),...,read calendar events (d),read contact data (d),read sensitive log data (d),read user defined dictionary (d),retrieve system internal state (s),set alarm in alarm clock (s),write browser's history and bookmarks (d),write contact data (d),write to user defined dictionary (s),class
0,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18149,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
18150,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
18151,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
18152,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [66]:
balanced_data['class'].value_counts()

class
0    9077
1    9077
Name: count, dtype: int64

In [67]:
# NOTE(review): to_csv is called with its default index=True, so the row index is
# written as an extra unnamed first column, whereas the exports under 'balanced/10k'
# pass index=False - confirm which layout downstream readers expect.
balanced_data.to_csv(join(base_dir, 'balanced', f'{dataset_name}-balanced.csv'))

# Adroit

In [68]:
files

Unnamed: 0,filename
0,adroit.csv
1,androcrawl.csv
2,android_permissions.csv
3,defensedroid_prs.csv
4,drebin215.csv
5,kronodroid_emulador.csv
6,kronodroid_real_device.csv


In [69]:
# NOTE(review): the file is chosen by its position in the `files` directory listing.
# The listing displayed in this notebook maps index 0 to 'adroit.csv', but the
# position shifts whenever files are added to or removed from the folder - confirm,
# or reference the file name directly.
data = pd.read_csv(join(paths['binaries'], files['filename'][0]))

# Shorten the headers: process_column lower-cases each name and keeps only the part
# after the last '.'.
column_names = process_column(column_names=data.columns.values, delimiter='.')
data.columns = column_names

data.head()

Unnamed: 0,bind_wallpaper,force_back,read_calendar,body_sensors,read_social_stream,read_sync_stats,internet,change_configuration,bind_dream_service,hardware_test,...,read_sms,battery_stats,global_search,bind_nfc_service,package_usage_stats,set_always_finish,access_drm,broadcast_sticky,mount_unmount_filesystems,class
0,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [70]:
class_names=['authentic', 'malware']
dataset_name = 'ADROIT'
print(data['class'].value_counts())

class
0    8058
1    3418
Name: count, dtype: int64


In [71]:
balanced_data = balance_classes(data=data, MAX_INSTANCES=10000)
balanced_data

Class 1 with only 3418 samples. Updating MAX_INSTANCES to 3418.


Unnamed: 0,bind_wallpaper,force_back,read_calendar,body_sensors,read_social_stream,read_sync_stats,internet,change_configuration,bind_dream_service,hardware_test,...,read_sms,battery_stats,global_search,bind_nfc_service,package_usage_stats,set_always_finish,access_drm,broadcast_sticky,mount_unmount_filesystems,class
0,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6831,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
6832,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
6833,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
6834,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [72]:
balanced_data['class'].value_counts()

class
0    3418
1    3418
Name: count, dtype: int64

In [73]:
# NOTE(review): to_csv is called with its default index=True, so the row index is
# written as an extra unnamed first column, whereas the exports under 'balanced/10k'
# pass index=False - confirm which layout downstream readers expect.
balanced_data.to_csv(join(base_dir, 'balanced', f'{dataset_name}-balanced.csv'))

# DefenseDroid

In [74]:
files

Unnamed: 0,filename
0,adroit.csv
1,androcrawl.csv
2,android_permissions.csv
3,defensedroid_prs.csv
4,drebin215.csv
5,kronodroid_emulador.csv
6,kronodroid_real_device.csv


In [75]:
# NOTE(review): the file is chosen by its position in the `files` directory listing.
# The listing displayed in this notebook maps index 3 to 'defensedroid_prs.csv', but
# the position shifts whenever files are added to or removed from the folder -
# confirm, or reference the file name directly.
data = pd.read_csv(join(paths['binaries'], files['filename'][3]))

## column_names = process_column(column_names=data.columns.values, delimiter='.')

# Keep only the text after the last ':' of each header, trimmed and lower-cased.
# Unlike process_column, this leaves '.' inside the names untouched.
column_names = [col.split(':')[-1].strip().lower() for col in data.columns.values]
data.columns = column_names

data.head()

Unnamed: 0,getuiservice.com.glodon.ynjtapp,collect_metrics,sec.mdm_phone_restriction,media_mounted,usage_access_settings,voip_broadcast_voip_intents,getuiservice.com.huamaitel.client.yun,client,getuiservice.pailiefive.main,restart_pxckages,...,service07,pluginjobservice,getuipushservice,sandboxedprocessservice3,actionservice,themelivewallpaperservice,sandboxedprocessservice2,channelservice$kernelservice,sampledownloaderservice,class
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,1
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [76]:
class_names=['authentic', 'malware']
dataset_name = 'DEFENSEDROID'
print(data['class'].value_counts())


class
1    6000
0    5975
Name: count, dtype: int64


In [77]:
balanced_data = balance_classes(data=data, MAX_INSTANCES=10000)
balanced_data

Class 0 with only 5975 samples. Updating MAX_INSTANCES to 5975.


Unnamed: 0,getuiservice.com.glodon.ynjtapp,collect_metrics,sec.mdm_phone_restriction,media_mounted,usage_access_settings,voip_broadcast_voip_intents,getuiservice.com.huamaitel.client.yun,client,getuiservice.pailiefive.main,restart_pxckages,...,service07,pluginjobservice,getuipushservice,sandboxedprocessservice3,actionservice,themelivewallpaperservice,sandboxedprocessservice2,channelservice$kernelservice,sampledownloaderservice,class
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11945,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
11946,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
11947,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
11948,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [78]:
balanced_data['class'].value_counts()

class
0    5975
1    5975
Name: count, dtype: int64

In [79]:
# NOTE(review): to_csv is called with its default index=True, so the row index is
# written as an extra unnamed first column, whereas the exports under 'balanced/10k'
# pass index=False - confirm which layout downstream readers expect.
balanced_data.to_csv(join(base_dir, 'balanced', f'{dataset_name}-balanced.csv'))

# MALMEM

In [81]:
def process_label(array, separator='-', index=0):
    """
    Extracts one piece of every string in array.

    Each string is split on separator, the piece at position index is taken and
    surrounding whitespace is removed.

    Parameters:
    - array: Iterable of strings.
    - separator: Substring to split on.
    - index: Position of the piece to keep (negative values count from the end).

    Returns:
    - A numpy array with one extracted piece per input string.

    Raises:
    - IndexError: if a string has no piece at the requested position.
    """
    pieces = []
    for text in array:
        selected = text.split(separator)[index]
        pieces.append(selected.strip())
    return np.asarray(pieces)

In [82]:
files = dataframe(os.listdir(join(paths['datasets'], 'malmem2022')), columns=['filename'])
files

Unnamed: 0,filename
0,Obfuscated-MalMem2022.csv


In [83]:
# Load the first file listed in the 'malmem2022' folder (`files` was rebuilt from
# that folder just before this cell).
data = pd.read_csv(join(paths['datasets'], 'malmem2022', files['filename'][0]))

# Shorten the headers: process_column lower-cases each name and keeps only the part
# after the last '.'.
column_names = process_column(column_names=data.columns.values, delimiter='.')
data.columns = column_names
# Reduce 'category' to the text before its first '-'. The column is dropped two
# statements below, so this value is not used further in this cell.
data['category'] = process_label(data['category'].values)
# Remove the row at position 33867; the author marks it as an outlier.
data.drop(index=data.iloc[33867].name, inplace=True) ##outlier
data.drop(columns=['category'], inplace=True)
# Encode the labels as integers; any value other than 'Benign' or 'Malware' becomes NaN.
data['class'] = data['class'].map({'Benign': 0, 'Malware':1})
data

Unnamed: 0,nproc,nppid,avg_threads,nprocs64bit,avg_handlers,ndlls,avg_dlls_per_proc,nhandles,avg_handles_per_proc,nport,...,kernel_drivers,fs_drivers,process_services,shared_process_services,interactive_process_services,nactive,ncallbacks,nanonymous,ngeneric,class
0,45,17,10.555556,0,202.844444,1694,38.500000,9129,212.302326,0,...,221,26,24,116,0,121,87,0,8,0
1,47,19,11.531915,0,242.234043,2074,44.127660,11385,242.234043,0,...,222,26,24,118,0,122,87,0,8,0
2,40,14,14.725000,0,288.225000,1932,48.300000,11529,288.225000,0,...,222,26,27,118,0,120,88,0,8,0
3,32,13,13.500000,0,264.281250,1445,45.156250,8457,264.281250,0,...,222,26,27,118,0,120,88,0,8,0
4,42,16,11.452381,0,281.333333,2067,49.214286,11816,281.333333,0,...,222,26,24,118,0,124,87,0,8,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
58591,37,15,10.108108,0,215.486487,1453,39.270270,7973,215.486487,0,...,221,26,24,116,0,120,86,0,8,1
58592,37,14,9.945946,0,190.216216,1347,36.405405,7038,190.216216,0,...,221,26,24,116,0,116,88,0,8,1
58593,38,15,9.842105,0,210.026316,1448,38.105263,7982,215.729730,0,...,221,26,24,116,0,120,88,0,8,1
58594,37,15,10.243243,0,215.513513,1452,39.243243,7974,215.513513,0,...,221,26,24,116,0,120,87,0,8,1


In [84]:
# X = data.drop(columns=['category', 'class'])

# column_names = [col.split(':')[-1].strip().lower() for col in X.columns.values]

# # y = data['class']

# # # Binary
# # y = data['class'].map({'Benign': 0, 'Malware':1})
# # class_names=['Benign', 'Malware']

# # Multiclass
# y = data['category'].map({'Benign': 0, 'Spyware':1, 'Ransomware': 2, 'Trojan': 3})
# class_names=['Benign', 'Spyware', 'Ransomware', 'Trojan']
print(data.shape)
print(data.info())

(58595, 56)
<class 'pandas.core.frame.DataFrame'>
Index: 58595 entries, 0 to 58595
Data columns (total 56 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   nproc                           58595 non-null  int64  
 1   nppid                           58595 non-null  int64  
 2   avg_threads                     58595 non-null  float64
 3   nprocs64bit                     58595 non-null  int64  
 4   avg_handlers                    58595 non-null  float64
 5   ndlls                           58595 non-null  int64  
 6   avg_dlls_per_proc               58595 non-null  float64
 7   nhandles                        58595 non-null  int64  
 8   avg_handles_per_proc            58595 non-null  float64
 9   nport                           58595 non-null  int64  
 10  nfile                           58595 non-null  int64  
 11  nevent                          58595 non-null  int64  
 12  ndesktop                 

In [85]:
dataset_name = 'MALMEM'
print('class: ', data['class'].value_counts())
# print('Families: ', data['category'].value_counts())

class:  class
0    29298
1    29297
Name: count, dtype: int64


In [86]:
balanced_data = balance_classes(data=data, MAX_INSTANCES=10000)
balanced_data

Unnamed: 0,nproc,nppid,avg_threads,nprocs64bit,avg_handlers,ndlls,avg_dlls_per_proc,nhandles,avg_handles_per_proc,nport,...,kernel_drivers,fs_drivers,process_services,shared_process_services,interactive_process_services,nactive,ncallbacks,nanonymous,ngeneric,class
0,46,12,13.145947,0,303.106227,2374,51.448760,14020,303.106227,0,...,222,26,26,118,0,124,87,0,8,0
1,43,17,10.790698,0,249.842950,1839,42.767442,10744,255.815401,0,...,222,26,24,118,0,126,87,0,8,0
2,39,12,12.986170,0,291.369167,2012,50.465426,11616,291.369167,0,...,222,26,27,118,0,123,88,0,8,0
3,40,13,12.921970,0,292.946970,2018,50.450000,11717,292.946970,0,...,222,26,27,118,0,123,88,0,8,0
4,42,12,13.643900,0,312.315219,2189,51.434271,13292,312.315219,0,...,222,26,27,118,0,124,88,0,8,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19995,42,16,9.404762,0,199.333333,1574,37.476190,8372,199.333333,0,...,221,26,24,116,0,121,87,0,8,1
19996,41,16,10.121951,0,210.560976,1614,39.365854,8633,210.560976,0,...,221,26,24,116,0,121,86,0,8,1
19997,42,16,10.785714,0,210.000000,1635,38.928571,8820,210.000000,0,...,221,26,24,116,0,122,87,0,8,1
19998,41,16,9.804878,0,208.219512,1606,39.170732,8537,208.219512,0,...,221,26,24,116,0,122,86,0,8,1


In [87]:
balanced_data['class'].value_counts()

class
0    10000
1    10000
Name: count, dtype: int64

In [88]:
# NOTE(review): to_csv is called with its default index=True, so the row index is
# written as an extra unnamed first column, whereas the exports under 'balanced/10k'
# pass index=False - confirm which layout downstream readers expect.
balanced_data.to_csv(join(base_dir, 'balanced', f'{dataset_name}-balanced.csv'))

# Motodroid-100k-filtered

In [109]:
path_mh100 = join(base_dir, 'data', 'mh100', 'all')

In [112]:
files = dataframe(os.listdir(path_mh100), columns=['filename'])
files

Unnamed: 0,filename
0,.ipynb_checkpoints
1,desktop.ini
2,mh100-features-all.csv
3,mh100-features-classes.csv
4,mh100.npy
5,mh100_labels.csv
6,mh100_vt-labels.csv


In [113]:
import gc
def load_motodroid(path, dataset_name):
    """
    Assembles one MotoDroid dataset into a single DataFrame.

    Three files are read from path, all prefixed with dataset_name:
    - '<dataset_name>.npy': the feature matrix, loaded with numpy.load.
    - '<dataset_name>_vt-labels.csv': its '4-class' column becomes the 'class' column.
    - '<dataset_name>-features-all.csv': its 'features' column supplies the column names.

    Parameters:
    - path: Directory that contains the three files.
    - dataset_name: Common file-name prefix (e.g. 'mh100').

    Returns:
    - A DataFrame with one column per feature followed by the 'class' column.
    """
    def _file(suffix):
        # Full path of the dataset file that ends with the given suffix.
        return join(path, dataset_name + suffix)

    matrix = np.load(_file('.npy'))

    vt_table = pd.read_csv(_file('_vt-labels.csv'))
    target = dataframe(vt_table['4-class'].values, columns=['class'])

    feature_names = pd.read_csv(_file('-features-all.csv'))['features']

    frame = dataframe(matrix, columns=feature_names.values)
    # Append the label column to the right of the features.
    frame = pd.concat([frame, target], axis=1)
    gc.collect()
    return frame

In [114]:
dataset = load_motodroid(
    path=path_mh100, 
    dataset_name='mh100'
)

In [115]:
dataset.head()

Unnamed: 0,Permission::WAKE_LOCK,Permission::WRITE_EXTERNAL_STORAGE,Permission::ACCESS_NETWORK_STATE,Permission::WRITE_SETTINGS,Permission::INTERNET,Intent::AUDIO_BECOMING_NOISY,APICall::Landroid/content/Intent.toUri(),APICall::Landroid/view/View.setTag(),APICall::Landroid/util/Xml.newSerializer(),APICall::Landroid/content/pm/PackageManager.queryIntentServices(),...,APICall::Landroid/widget/ListView.getScaleX(),APICall::Landroid/widget/ScrollView.setOnApplyWindowInsetsListener(),APICall::Landroid/widget/ExpandableListView.setNextFocusUpId(),APICall::Landroid/widget/ScrollView.setSoundEffectsEnabled(),APICall::Landroid/widget/TableRow.getLeft(),APICall::Landroid/widget/HorizontalScrollView.onKeyUp(),APICall::Landroid/widget/RatingBar.isFocusable(),APICall::Landroid/preference/ListPreference.getView(),APICall::Landroid/widget/LinearLayout.computeHorizontalScrollExtent(),class
0,1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
1,0,0,1,0,1,0,0,1,1,1,...,0,0,0,0,0,0,0,0,0,0
2,1,1,1,0,1,0,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
3,1,1,1,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,0,1,1,0,1,0,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0


In [117]:
dataset_name = 'mh100'
print(dataset['class'].value_counts())

0    92134
1     9800
Name: class, dtype: int64


In [118]:
balanced_data = balance_classes(data=dataset, MAX_INSTANCES=5000)
balanced_data

Unnamed: 0,Permission::WAKE_LOCK,Permission::WRITE_EXTERNAL_STORAGE,Permission::ACCESS_NETWORK_STATE,Permission::WRITE_SETTINGS,Permission::INTERNET,Intent::AUDIO_BECOMING_NOISY,APICall::Landroid/content/Intent.toUri(),APICall::Landroid/view/View.setTag(),APICall::Landroid/util/Xml.newSerializer(),APICall::Landroid/content/pm/PackageManager.queryIntentServices(),...,APICall::Landroid/widget/ListView.getScaleX(),APICall::Landroid/widget/ScrollView.setOnApplyWindowInsetsListener(),APICall::Landroid/widget/ExpandableListView.setNextFocusUpId(),APICall::Landroid/widget/ScrollView.setSoundEffectsEnabled(),APICall::Landroid/widget/TableRow.getLeft(),APICall::Landroid/widget/HorizontalScrollView.onKeyUp(),APICall::Landroid/widget/RatingBar.isFocusable(),APICall::Landroid/preference/ListPreference.getView(),APICall::Landroid/widget/LinearLayout.computeHorizontalScrollExtent(),class
0,1,1,1,0,1,0,1,1,0,1,...,0,0,0,0,0,0,0,0,0,0
1,1,1,1,0,1,0,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
2,1,1,1,0,1,0,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
3,1,0,1,0,1,0,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
4,0,0,1,0,1,0,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,0,0,1,0,1,0,1,0,0,1,...,0,0,0,0,0,0,0,0,0,1
9996,0,1,1,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
9997,1,1,1,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
9998,0,1,1,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [119]:
balanced_data['class'].value_counts()

0    5000
1    5000
Name: class, dtype: int64

In [16]:
# NOTE(review): to_csv is called with its default index=True, so the row index is
# written as an extra unnamed first column, whereas the exports under 'balanced/10k'
# pass index=False - confirm which layout downstream readers expect.
balanced_data.to_csv(join(base_dir, 'balanced', f'{dataset_name}-balanced.csv'))

## Filtering

In [116]:
## CHI-SQUARE FILTER

chi_feats = pd.read_csv(join(base_dir, 'features', 'motodroid-filtered-vt-4_chi2.csv'))
chi_feats

Unnamed: 0,index,names,stats,p_values
0,3242,Permission::MOUNT_UNMOUNT_FILESYSTEMS,22321.011249,0.000000
1,3240,Permission::CHANGE_WIFI_STATE,17168.829776,0.000000
2,3249,Permission::GET_TASKS,16600.018340,0.000000
3,3950,Intent::PACKAGE_REMOVED,16365.160445,0.000000
4,3430,Permission::CHANGE_NETWORK_STATE,15538.667462,0.000000
...,...,...,...,...
12530,10256,APICall::Landroid/widget/HorizontalScrollView....,3.846674,0.049845
12531,19414,APICall::Landroid/widget/MultiAutoCompleteText...,3.846674,0.049845
12532,10620,APICall::Landroid/nfc/cardemulation/CardEmulat...,3.844322,0.049915
12533,11051,APICall::Landroid/widget/ListView.computeVerti...,3.843878,0.049928


### 10k

In [120]:
# Keep only the first 200 feature names listed in chi_feats plus the 'class' column.
# DataFrame.filter keeps the requested labels that exist in the frame.
# NOTE(review): chi_feats is read from a precomputed CSV that appears to be sorted by
# descending chi-square statistic, which would make these the 200 top-ranked
# features - confirm.
# balanced = balanced_data.filter(np.concatenate((chi_feats['names'].values, ['class'])))
balanced = balanced_data.filter(np.concatenate((chi_feats['names'].values[0:200], ['class'])))
balanced

Unnamed: 0,Permission::MOUNT_UNMOUNT_FILESYSTEMS,Permission::CHANGE_WIFI_STATE,Permission::GET_TASKS,Intent::PACKAGE_REMOVED,Permission::CHANGE_NETWORK_STATE,Permission::READ_LOGS,Permission::REQUEST_INSTALL_PACKAGES,Intent::USER_PRESENT,APICall::Landroid/view/Window.setType(),Permission::SYSTEM_ALERT_WINDOW,...,APICall::Landroid/view/View.setImportantForAccessibility(),APICall::Landroid/view/Window.getDecorView(),APICall::Landroid/view/View.getImportantForAccessibility(),APICall::Landroid/view/Menu.findItem(),APICall::Landroid/content/res/Resources.getText(),APICall::Landroid/widget/TextView.setTextAppearance(),APICall::Landroid/widget/OverScroller.isFinished(),APICall::Landroid/content/res/TypedArray.getDrawable(),APICall::Landroid/view/ViewConfiguration.getLongPressTimeout(),class
0,0,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,1,1,1,1,0
1,0,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,1,1,1,1,0
2,0,0,0,1,0,0,0,1,0,0,...,1,1,1,1,1,1,1,1,1,0
3,0,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,1,1,1,1,0
4,0,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,1,1,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,1
9996,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
9997,0,1,1,1,1,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,1
9998,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1


In [122]:
# Base name used for the exported CSV of this subset.
dataset_name = 'mh100-10k-instances'
# Print the per-class row counts of the filtered frame.
print(balanced.loc[:, 'class'].value_counts())

0    5000
1    5000
Name: class, dtype: int64


In [124]:
# Export the filtered subset to <base_dir>/balanced/10k/ without the index column.
balanced.to_csv(
    os.path.join(base_dir, 'balanced', '10k', f'{dataset_name}-balanced.csv'),
    index=False,
)

### 5k

In [34]:
# Keep the first 5000 feature names listed in chi_feats plus the label column.
# DataFrame.filter silently skips names that are not columns of balanced_data.
balanced = balanced_data.filter(items=[*chi_feats['names'].iloc[:5000], 'class'])
balanced

Unnamed: 0,Permission::MOUNT_UNMOUNT_FILESYSTEMS,Permission::CHANGE_WIFI_STATE,Permission::GET_TASKS,Intent::PACKAGE_REMOVED,Permission::CHANGE_NETWORK_STATE,Permission::READ_LOGS,Permission::REQUEST_INSTALL_PACKAGES,Intent::USER_PRESENT,APICall::Landroid/view/Window.setType(),Permission::SYSTEM_ALERT_WINDOW,...,APICall::Landroid/widget/AdapterView.getChildCount(),APICall::Landroid/widget/ImageButton.clearAnimation(),APICall::Landroid/preference/PreferenceScreen.saveHierarchyState(),APICall::Landroid/preference/PreferenceScreen.restoreHierarchyState(),APICall::Landroid/widget/LinearLayout.getBaseline(),APICall::Landroid/widget/ImageView.setTranslationY(),APICall::Landroid/widget/RadioGroup.addView(),APICall::Landroid/view/ViewGroup.setTranslationY(),APICall::Landroid/widget/ListView.setSelectionAfterHeaderView(),class
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19595,1,1,1,0,1,1,0,0,1,1,...,0,0,0,0,0,0,0,0,0,1
19596,1,0,0,0,0,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,1
19597,0,1,1,0,0,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
19598,1,1,1,1,1,0,1,1,1,1,...,0,0,0,0,0,0,0,0,0,1


In [35]:
# Base name used for the exported CSV of the 5000-feature subset.
dataset_name = 'mh100-5k'
# Print the per-class row counts of the filtered frame.
print(balanced.loc[:, 'class'].value_counts())

class
0    9800
1    9800
Name: count, dtype: int64


In [36]:
# Export the 5000-feature subset under <base_dir>/balanced/.
# index=False matches the 10k export and keeps the positional row index out of
# the file: rows are grouped by class, so a saved index column would correlate
# with the label and come back as an "Unnamed: 0" column on reload.
balanced.to_csv(
    join(base_dir, 'balanced', f'{dataset_name}-balanced.csv'),
    index=False,
)

### 3k

In [37]:
# Keep the first 3000 feature names listed in chi_feats plus the label column.
# DataFrame.filter silently skips names that are not columns of balanced_data.
balanced = balanced_data.filter(items=[*chi_feats['names'].iloc[:3000], 'class'])
balanced

Unnamed: 0,Permission::MOUNT_UNMOUNT_FILESYSTEMS,Permission::CHANGE_WIFI_STATE,Permission::GET_TASKS,Intent::PACKAGE_REMOVED,Permission::CHANGE_NETWORK_STATE,Permission::READ_LOGS,Permission::REQUEST_INSTALL_PACKAGES,Intent::USER_PRESENT,APICall::Landroid/view/Window.setType(),Permission::SYSTEM_ALERT_WINDOW,...,APICall::Landroid/webkit/WebSettings.setRenderPriority(),APICall::Landroid/os/ParcelFileDescriptor.getStatSize(),APICall::Landroid/util/JsonReader.nextDouble(),APICall::Landroid/widget/EditText.getRight(),APICall::Landroid/os/Looper.prepare(),APICall::Landroid/os/Debug.getNativeHeapAllocatedSize(),APICall::Landroid/view/accessibility/CaptioningManager.getFontScale(),APICall::Landroid/view/ViewGroup.setMinimumHeight(),APICall::Landroid/view/accessibility/CaptioningManager.getUserStyle(),class
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,1,0,1,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
2,0,0,0,1,0,0,0,1,0,0,...,0,1,0,0,1,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19595,1,1,1,0,1,1,0,0,1,1,...,0,0,0,0,1,0,0,0,0,1
19596,1,0,0,0,0,0,1,0,1,0,...,0,0,0,0,1,0,0,0,0,1
19597,0,1,1,0,0,1,0,0,1,0,...,0,0,0,0,1,0,0,0,0,1
19598,1,1,1,1,1,0,1,1,1,1,...,0,0,0,0,1,0,0,0,0,1


In [38]:
# Base name used for the exported CSV of the 3000-feature subset.
dataset_name = 'mh100-3k'
# Print the per-class row counts of the filtered frame.
print(balanced.loc[:, 'class'].value_counts())

class
0    9800
1    9800
Name: count, dtype: int64


In [39]:
# Export the 3000-feature subset under <base_dir>/balanced/.
# index=False matches the 10k export and keeps the positional row index out of
# the file: rows are grouped by class, so a saved index column would correlate
# with the label and come back as an "Unnamed: 0" column on reload.
balanced.to_csv(
    join(base_dir, 'balanced', f'{dataset_name}-balanced.csv'),
    index=False,
)

In [11]:
from pyspark.sql import SparkSession

# os.environ['SPARK_HOME'] = r'C:\spark\spark-3.3.0-bin-hadoop3'
# Point this process at the local JDK install (machine-specific path).
# Raw string: the Windows backslashes must be taken literally; in a normal
# literal, "\P", "\J" and "\j" are invalid escape sequences that trigger a
# warning on current Python versions and are slated to become errors.
os.environ["JAVA_HOME"] = r'C:\Program Files\Java\jdk-20'

In [None]:
# Obtain the Spark session for this notebook via the builder's getOrCreate().
spark = (
    SparkSession.builder
    .appName('Spark-csv.com')
    .getOrCreate()
)