In [2]:
import sys
import pickle
import gzip
import re
import subprocess
import numpy as np
import pandas as pd

import warnings
warnings.filterwarnings('ignore')

import time 

## Parsing

In [3]:
def Inst2ID(instruction):
    """Return the vocabulary ID of the class named in ``instruction``.

    The class name is the text before the first '->', truncated at the
    first '$'. It is looked up in the module-level ``VocabDict``.

    Returns:
        ``VocabDict[className]`` when the class name is one of the labels in
        ``VocabDict.index``, otherwise ``None``.
    """
    owner, _, _ = instruction.partition('->')
    className, _, _ = owner.partition('$')
    return VocabDict[className] if className in VocabDict.index else None

def parseMethodCallArguments(instruction : str) -> np.ndarray:
    """Map the method reference on one disassembly line to vocabulary IDs.

    The line is expected to end with something like
    ``... {v1}, Lpkg/Cls;.name:(args)ret // method@0005``. The called method
    and every ``L``-prefixed piece of its signature are passed to
    ``Inst2ID``; names that ``Inst2ID`` does not know are skipped.

    Args:
        instruction: One text line of disassembler output.

    Returns:
        A ``uint32`` array of IDs (possibly empty): the called method first,
        followed by the kept signature pieces in order.
    """
    # Keep what follows the last ': ' and drop the register-list punctuation.
    operand = instruction.split(': ')[-1]
    for token in ('{', '}', '['):
        operand = operand.replace(token, '')
    # The method reference is the last space-separated token before ' // '.
    method = operand.strip().split(' // ')[0].split(' ')[-1]

    # 'Lpkg/Cls;.name:(args)ret' -> ['Lpkg/Cls->name', '(args)ret'].
    methodParts = method.replace(';.', '->').split(':')
    methodCall = methodParts[0]
    if len(methodCall.split('/')[0]) == 2:
        # A two-character first path component (e.g. 'La') is collapsed into
        # one placeholder token; presumably this targets short obfuscated
        # package names -- TODO confirm.
        methodCall = 'class.local####'

    methodArgs = []
    if len(methodParts) > 1:
        signature = methodParts[1].replace('(', '').replace(')', '')
        # Only ';'-separated pieces that start with 'L' are kept.
        # NOTE(review): a piece such as 'ILjava/lang/String' (a primitive
        # followed by a class) does not start with 'L' and is dropped --
        # confirm this is intended.
        methodArgs = [item for item in signature.split(';') if item.startswith('L')]

    # Strip '$...' suffixes from both the class and the method name.
    mtdSplit = methodCall.split('->')
    if len(mtdSplit) > 1:
        methodCall = mtdSplit[0].split('$')[0] + '->' + mtdSplit[1].split('$')[0]

    ids = (Inst2ID(name) for name in [methodCall] + methodArgs)
    return np.array([item for item in ids if item is not None], dtype='uint32')

def parseDex(FileName : str) -> None:
    """Disassemble one sample with ``dexdump`` and store its methods as parquet.

    Runs ``dexdump -d`` on ``inputRoot + FileName``, splits the output into
    classes and methods, and collects for every method its class name, the
    last token of its first '|' line, the vocabulary IDs of its method
    calls and its remaining '|' lines. When more than one method was
    collected, the table is written to ``outputRoot + FileName``.

    Reads the module-level ``inputRoot``, ``outputRoot`` and ``VocabDict``.

    Args:
        FileName: Name of the sample file, relative to ``inputRoot``.

    Returns:
        None. The result is the parquet file written as a side effect.
    """
    outputFilePath = outputRoot + FileName
    inputFilePath  = inputRoot  + FileName

    # The exit status of dexdump is not checked; a failed run yields no
    # classes and therefore no output file.
    dex = subprocess.run(['dexdump', '-d', inputFilePath], stdout=subprocess.PIPE).stdout.decode(encoding="ISO-8859-1")

    condidatClasses = re.split("Class descriptor  : '", dex)[1:]

    collectedMethods = []

    for currentClass in condidatClasses:
        # NOTE(review): currentClass is the whole class dump, so this tests
        # everything before the first '$' in the dump, not just the class
        # name -- confirm that this is the intended vocabulary filter.
        if currentClass.split('$')[0] in VocabDict.index:
            continue

        # First line is "<descriptor>'"; drop its first and last two characters.
        className = currentClass.split('\n')[0][1:-2]

        # Raw string: '\d' is an invalid escape in a normal string literal.
        for method in re.split(r'    #\d', currentClass):
            Instructions = [item for item in method.split('\n') if '|' in item]
            if len(Instructions) == 0:
                continue

            methodMeta = Instructions[0].split()[-1]
            Instructions = Instructions[1:]

            # The trailing empty array keeps np.concatenate valid (and uint32)
            # when the method contains no 'method@' line.
            functionMethodCallsArgs = np.concatenate(
                [parseMethodCallArguments(line) for line in Instructions if ('method@' in line)]
                + [np.array([], dtype='uint32')]
            )

            collectedMethods.append((className, methodMeta, functionMethodCallsArgs, Instructions))

    # NOTE(review): a sample with exactly one collected method is not written
    # -- confirm '> 1' rather than '> 0' is intended.
    if len(collectedMethods) > 1:
        colNames = ['className', 'methodMeta', 'functionMethodCallsArgs', 'Instructions']
        df = pd.DataFrame(collectedMethods, columns=colNames)
        df.to_parquet(outputFilePath)

### Drebin Details

In [6]:
import glob
from multiprocessing import Pool

# NOTE: pd.read_msgpack was removed in pandas 1.0; this cell needs an older
# pandas or a re-exported metadata file.
df = pd.read_msgpack('/ws/mnt/habouch/datasets/android_dataset/meta/drebin_meta.msg')
fileNames  = df.sha256.to_list()

outputRoot = '/ws/papers/active/petadroid/code/output/drebinDetails/'
inputRoot  = '/ws/mnt/habouch/datasets/android_dataset/samples/drebin_dataset/'

VocabDict = pd.read_csv('dataset/InstructionDict.csv', index_col=0, names=['ID'])
VocabDict = VocabDict.ID

# Skip samples that already have a file of the same name in outputRoot.
doneFileNames = [item.split('/')[-1] for item in glob.glob(outputRoot + '*')]
todoFileNames = np.array(list(set(fileNames).difference(set(doneFileNames))))
print(len(todoFileNames))

# Create the pool only after outputRoot, inputRoot and VocabDict are assigned:
# with the fork start method the workers are started inside Pool() and see the
# globals as they were at that moment, so an earlier Pool() leaves parseDex
# with undefined or stale values.
mypool = Pool(processes=40)

5560


In [7]:
# Process every pending sample; always shut the pool down afterwards so the
# worker processes are not leaked when a task raises.
try:
    mypool.map(parseDex, todoFileNames)
finally:
    mypool.close()
    mypool.join()

### MiniZoo

### Drebin

### MalGenome

### Maldozer

### AMD

### Benign

In [5]:
# Keep the rows with vt_detection == 0 (sha256 lower-cased), then draw a
# fixed-seed sample of 50000 hashes to use as file names.
df = (
    pd.read_csv('/ws/mnt/habouch/datasets/zoows/history/latest.csv')
    .assign(sha256=lambda frame: frame.sha256.str.lower())
    .loc[lambda frame: frame.vt_detection == 0]
)
fileNames = df['sha256'].sample(50000, random_state=54).tolist()

In [9]:
import glob
from multiprocessing import Pool

outputRoot = '/ws/papers/active/petadroid/code/output/benign/'
inputRoot  = '/ws/mnt/habouch/datasets/zoo_dataset/'

VocabDict = pd.read_csv('dataset/InstructionDict.csv', index_col=0, names=['ID'])
VocabDict = VocabDict.ID

# Skip samples that already have a file of the same name in outputRoot.
doneFileNames = [item.split('/')[-1] for item in glob.glob(outputRoot + '*')]
todoFileNames = np.array(list(set(fileNames).difference(set(doneFileNames))))
print(len(todoFileNames))

# Create the pool only after outputRoot, inputRoot and VocabDict are assigned:
# with the fork start method the workers are started inside Pool() and see the
# globals as they were at that moment, so an earlier Pool() leaves parseDex
# with undefined or stale values.
mypool = Pool(processes=40)

1263


In [10]:
# Process every pending sample; always shut the pool down afterwards so the
# worker processes are not leaked when a task raises.
try:
    mypool.map(parseDex, todoFileNames)
finally:
    mypool.close()
    mypool.join()

### Obfuscated Malgenome (praguard)

In [13]:
# The md5 column of the metadata table supplies the sample file names.
df = pd.read_msgpack(
    '/ws/mnt/habouch/datasets/android_dataset/android_meta/praguard_meta_df.msg'
)
fileNames = df['md5'].tolist()

In [14]:
import glob
from multiprocessing import Pool

outputRoot = '/ws/papers/active/petadroid/code/output/praguard/'
inputRoot  = '/ws/mnt/habouch/datasets/android_dataset/samples/praguard_dataset/files/'

VocabDict = pd.read_csv('dataset/InstructionDict.csv', index_col=0, names=['ID'])
VocabDict = VocabDict.ID

# Skip samples that already have a file of the same name in outputRoot.
doneFileNames = [item.split('/')[-1] for item in glob.glob(outputRoot + '*')]
todoFileNames = np.array(list(set(fileNames).difference(set(doneFileNames))))
print(len(todoFileNames))

# Create the pool only after outputRoot, inputRoot and VocabDict are assigned:
# with the fork start method the workers are started inside Pool() and see the
# globals as they were at that moment, so an earlier Pool() leaves parseDex
# with undefined or stale values.
mypool = Pool(processes=40)

10370


In [15]:
# Process every pending sample; always shut the pool down afterwards so the
# worker processes are not leaked when a task raises.
try:
    mypool.map(parseDex, todoFileNames)
finally:
    mypool.close()
    mypool.join()

### Obfuscated Drebin (DroidChameleon)

In [4]:
# The md5 column of the metadata table supplies the sample file names.
df = pd.read_msgpack(
    '/ws/mnt/habouch/datasets/android_dataset/meta/drebin_obfus_meta.msg'
)
fileNames = df['md5'].tolist()
# Preview the first five names.
fileNames[:5]

['000146786111132694d5f0d8f89e17c6',
 '000379d1a9201758fdae17ccfad374f5',
 '0006d3fc1fcd3277451fe676fe13b1c9',
 '0008ac38b8c073e5bab4a7c8b4120fcd',
 '000909e08eeee76eb4a00a922333da62']

In [6]:
import glob
from multiprocessing import Pool

outputRoot = '/ws/papers/active/petadroid/code/output/charmi/'
inputRoot  = '/ws/mnt/habouch/datasets/android_dataset/samples/drebin_obfus_dataset/files/'

VocabDict = pd.read_csv('dataset/InstructionDict.csv', index_col=0, names=['ID'])
VocabDict = VocabDict.ID

# Skip samples that already have a file of the same name in outputRoot.
doneFileNames = [item.split('/')[-1] for item in glob.glob(outputRoot + '*')]
todoFileNames = np.array(list(set(fileNames).difference(set(doneFileNames))))
print(len(todoFileNames))

# Create the pool only after outputRoot, inputRoot and VocabDict are assigned:
# with the fork start method the workers are started inside Pool() and see the
# globals as they were at that moment, so an earlier Pool() leaves parseDex
# with undefined or stale values.
mypool = Pool(processes=40)

49625


In [7]:
# Process every pending sample; always shut the pool down afterwards so the
# worker processes are not leaked when a task raises.
try:
    mypool.map(parseDex, todoFileNames)
finally:
    mypool.close()
    mypool.join()

### Obfuscated benign (DroidChameleon)

In [7]:
# The md5 column of the metadata table supplies the sample file names.
df = pd.read_msgpack(
    '/ws/mnt/habouch/datasets/android_dataset/meta/benign_obfus_meta_df.msg'
)
fileNames = df['md5'].tolist()
# Preview the first five names.
fileNames[:5]

['0001928ff53c44ca2af68152af336a25',
 '0002461ffb54834a257c9d4eecd2ebb2',
 '000315d021e6ba52e1bc1d30ffe592b5',
 '000497a806c495c4b7b385b155e72ce6',
 '000520fe345eae9567fdb283161bbe28']

In [8]:
import glob
from multiprocessing import Pool

outputRoot = '/ws/papers/active/petadroid/code/output/benign_obfus/'
inputRoot  = '/ws/mnt/habouch/datasets/android_dataset/samples/benign_obfus/files/'

VocabDict = pd.read_csv('dataset/InstructionDict.csv', index_col=0, names=['ID'])
VocabDict = VocabDict.ID

# Skip samples that already have a file of the same name in outputRoot.
doneFileNames = [item.split('/')[-1] for item in glob.glob(outputRoot + '*')]
todoFileNames = np.array(list(set(fileNames).difference(set(doneFileNames))))
print(len(todoFileNames))

# Create the pool only after outputRoot, inputRoot and VocabDict are assigned:
# with the fork start method the workers are started inside Pool() and see the
# globals as they were at that moment, so an earlier Pool() leaves parseDex
# with undefined or stale values.
mypool = Pool(processes=40)

41067


In [9]:
# Process every pending sample; always shut the pool down afterwards so the
# worker processes are not leaked when a task raises.
try:
    mypool.map(parseDex, todoFileNames)
finally:
    mypool.close()
    mypool.join()

### Not Obfuscated benign

In [13]:
# The lower-cased sha256 column supplies the sample file names.
df = pd.read_msgpack(
    '/ws/mnt/habouch/datasets/android_dataset/meta/benign_notobfus_meta_df.msg'
)
fileNames = df['sha256'].str.lower().tolist()
# Preview the first five names.
fileNames[:5]

['cf23b28b6e396b70baafa11fb27d3c9792590b6e9dcb0bbf9d0f70b0a2742ec6',
 '0866ea2f7b8ea45fe9947ab9c44d45b2424cfca0ba2a9f21686f67cc95ff6df8',
 '8822d55bb6a75c8745d426653ecd8041e4a5e84b062062f53b9ae542bb8afbc7',
 '273979aadfda6f72ee3c6d02aed768a18787afdc4dd3a77c2df6e473d3909d66',
 '98a634d1ff7cc79b675f2406a1dfea96a2617d65d8ee4cc418b86368d274cb38']

In [14]:
import glob
from multiprocessing import Pool

outputRoot = '/ws/papers/active/petadroid/code/output/benign_notobfus/'
inputRoot  = '/ws/mnt/habouch/datasets/zoo_dataset/'

VocabDict = pd.read_csv('dataset/InstructionDict.csv', index_col=0, names=['ID'])
VocabDict = VocabDict.ID

# Skip samples that already have a file of the same name in outputRoot.
doneFileNames = [item.split('/')[-1] for item in glob.glob(outputRoot + '*')]
todoFileNames = np.array(list(set(fileNames).difference(set(doneFileNames))))
print(len(todoFileNames))

# Create the pool only after outputRoot, inputRoot and VocabDict are assigned:
# with the fork start method the workers are started inside Pool() and see the
# globals as they were at that moment, so an earlier Pool() leaves parseDex
# with undefined or stale values.
mypool = Pool(processes=40)

5500


In [15]:
# Process every pending sample; always shut the pool down afterwards so the
# worker processes are not leaked when a task raises.
try:
    mypool.map(parseDex, todoFileNames)
finally:
    mypool.close()
    mypool.join()

### VirusShare

In [3]:
# The md5 column of the metadata table supplies the sample file names.
df = pd.read_msgpack(
    '/ws/mnt/habouch/datasets/android_dataset/meta/vshare_meta.msg'
)
fileNames = df['md5'].tolist()
# Preview the first five names.
fileNames[:5]

['18750d3a30a52e508aa4a03fdada630e',
 '1874ed85ba7160d4b8002f682d8b3102',
 '1873ebb0538fc2656ea67aa93815dba6',
 '1873ea2a9e071290ff5a2d1b90efbc4e',
 '187347fd95d5675ac183f86f8409789b']

In [6]:
import glob
from multiprocessing import Pool

outputRoot = '/ws/papers/active/petadroid/code/output/vshare/'
inputRoot  = '/ws/mnt/habouch/datasets/android_dataset/samples/vshare_dataset/'

VocabDict = pd.read_csv('dataset/InstructionDict.csv', index_col=0, names=['ID'])
VocabDict = VocabDict.ID

# Skip samples that already have a file of the same name in outputRoot.
doneFileNames = [item.split('/')[-1] for item in glob.glob(outputRoot + '*')]
todoFileNames = np.array(list(set(fileNames).difference(set(doneFileNames))))
print(len(todoFileNames))

# Create the pool only after outputRoot, inputRoot and VocabDict are assigned:
# with the fork start method the workers are started inside Pool() and see the
# globals as they were at that moment, so an earlier Pool() leaves parseDex
# with undefined or stale values.
mypool = Pool(processes=40)

30340


In [7]:
# Process every pending sample; always shut the pool down afterwards so the
# worker processes are not leaked when a task raises.
try:
    mypool.map(parseDex, todoFileNames)
finally:
    mypool.close()
    mypool.join()

### Zoo Malware

In [8]:
# Keep the rows with vt_detection != 0 (sha256 lower-cased), then draw a
# fixed-seed sample of 50000 hashes to use as file names.
df = (
    pd.read_csv('/ws/mnt/habouch/datasets/zoows/history/latest.csv')
    .assign(sha256=lambda frame: frame.sha256.str.lower())
    .loc[lambda frame: frame.vt_detection != 0]
)
fileNames = df['sha256'].sample(50000, random_state=54).tolist()

In [10]:
import glob
from multiprocessing import Pool

outputRoot = '/ws/papers/active/petadroid/code/output/zoomalware/'
inputRoot  = '/ws/mnt/habouch/datasets/zoo_dataset/'

VocabDict = pd.read_csv('dataset/InstructionDict.csv', index_col=0, names=['ID'])
VocabDict = VocabDict.ID

# Skip samples that already have a file of the same name in outputRoot.
doneFileNames = [item.split('/')[-1] for item in glob.glob(outputRoot + '*')]
todoFileNames = np.array(list(set(fileNames).difference(set(doneFileNames))))
print(len(todoFileNames))

# Create the pool only after outputRoot, inputRoot and VocabDict are assigned:
# with the fork start method the workers are started inside Pool() and see the
# globals as they were at that moment, so an earlier Pool() leaves parseDex
# with undefined or stale values.
mypool = Pool(processes=40)

50000


In [11]:
# Process every pending sample; always shut the pool down afterwards so the
# worker processes are not leaked when a task raises.
try:
    mypool.map(parseDex, todoFileNames)
finally:
    mypool.close()
    mypool.join()