In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from torch.optim.lr_scheduler import ReduceLROnPlateau
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
from torch.utils.tensorboard import SummaryWriter

import numpy as np
import pandas as pd
import hashlib
import shutil
import glob
import time
import re
import os

from tqdm import tqdm
from datetime import datetime
from sklearn.metrics import f1_score, recall_score, precision_score, accuracy_score
    
class Net(nn.Module):
    def __init__(self, sequenceSize=20000, embeddingDim=128, vocabularySize=2**16, filterWidth=5, filterNumber=1024):
        super(Net, self).__init__()
        self.sequenceSize   = sequenceSize
        self.embeddingDim   = embeddingDim
        self.vocabularySize = vocabularySize
        self.filterWidth    = filterWidth
        self.filterNumber   = filterNumber 
        
        self.embedding = nn.Embedding(self.vocabularySize, self.embeddingDim)
        self.conv = nn.Sequential(
                            nn.Conv2d(1, self.filterNumber, (self.filterWidth, self.embeddingDim)),
                            nn.BatchNorm2d(self.filterNumber),
                            nn.ReLU()
                        )
        
        self.fc = nn.Sequential(
                        nn.Linear(self.filterNumber , 512),
                        nn.BatchNorm1d(512),
                        nn.ReLU(),
            
                        nn.Linear(512, 256),
                        nn.BatchNorm1d(256),
                        nn.ReLU(),
                        
                        nn.Linear(256, 1),
                        nn.Sigmoid()
                    )

    def forward(self, x):
        x = self.embedding(x)
        #print(x.size())
        
        x = self.conv(x)
        #print(x.size())
        
        x = x.max(dim=2)[0]
        #print(x.size())

        x = x.view(-1,  self.filterNumber)
        x = self.fc(x)
        return x

class SampleDataset(Dataset):
    def __init__(self, filePathList, labels, sequenceSize=20000, featureName='functionMethodCallsArgs'):
        self.filePathList = filePathList
        self.labels = labels
        self.sequenceSize = sequenceSize
        self.featureName = featureName
        
    def __len__(self):
        return len(self.filePathList)

    def __getitem__(self, idx):
        df = pd.read_parquet(self.filePathList[idx])
        seed = int(round(time.time()%1, 6) * 1000000)
        x = np.concatenate(df.iloc[np.random.RandomState(seed).permutation(len(df))][self.featureName].values)

        if len(x) > self.sequenceSize:
            x = x[:self.sequenceSize]
        else:
            x = np.concatenate((x, np.zeros([self.sequenceSize - len(x)])))
            
        sample = torch.from_numpy(x)
        return (sample.long(), self.labels[idx], self.filePathList[idx])

def train(model, optimizer, dataLoader, device):
    running_loss  = 0.0  
    label_lst     = list()
    predicted_lst = list()

    model.train()
    for inputs, labels, _ in dataLoader:
        
        #
        inputs = inputs.unsqueeze(1).to(device)
        labels = labels.to(device)

        #
        optimizer.zero_grad()

        #
        outputs = model(inputs)
        predicted = (outputs > 0.5).squeeze().long()
        loss = F.binary_cross_entropy(outputs.squeeze(), labels.float())

        #
        loss.backward()
        optimizer.step()

        #
        label_lst.append(labels.cpu().numpy())
        predicted_lst.append(predicted.cpu().numpy())        
        running_loss += loss.item() 

    labels    = np.concatenate(label_lst)
    predicted = np.concatenate(predicted_lst)
    loss      = running_loss / len(predicted)
    
    return labels, predicted, loss

def assess(model, dataLoader, device):
    running_loss  = 0.0  
    label_lst     = list()
    predicted_lst = list()
    proba_lst     = list()
    path_lst      = list()

    with torch.no_grad():
        model.eval()
        for inputs, labels, paths in dataLoader:
            #
            inputs = inputs.unsqueeze(1).to(device)
            labels = labels.to(device)

            #
            outputs = model(inputs)
            predicted = (outputs > 0.5).squeeze().long()
            loss = F.binary_cross_entropy(outputs.squeeze(), labels.float())

            #
            if len(inputs) > 1:
                label_lst.append(labels.cpu().numpy())
                predicted_lst.append(predicted.cpu().numpy())
                proba_lst.append(outputs.squeeze().cpu().numpy())
                path_lst.append(paths)
                running_loss += loss.item() 
    
    labels    = np.concatenate(label_lst)
    predicted = np.concatenate(predicted_lst)
    proba     = np.concatenate(proba_lst)
    paths     = np.concatenate(path_lst)
    loss      = running_loss / len(predicted)
    
    return labels, predicted, loss, proba, paths

def trainModel(ws, modelTag, epochNum, trainLoader, validLoader, device, lr=3e-4, weightDecay=9e-5):
    #
    model  = Net()
    model  = model.to(device)
    optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=weightDecay)
    scheduler = ReduceLROnPlateau(optimizer, 'min', verbose=True, patience=5, factor=0.8)

    outputlogFilePath = f'./traces/{ws}/logs'
    outputtracesPath  = f'./traces/{ws}'
    #shutil.rmtree(outputtracesPath)
    #os.mkdir(outputtracesPath)

    result_lst = list()

    message = '----------'
    with open(outputlogFilePath, 'a') as writer:
        writer.write(message + '\n')
    print(message)
    
    for epoch in range(epochNum):

        tlabel, tpredicted, tloss = train(model, optimizer, trainLoader, device)
        vlabel, vpredicted, vloss, vproba, vproba = assess(model, validLoader, device)

        message  = f'Train: {modelTag} '
        message += '[{:04d}] '.format(epoch)

        tf1score  = f1_score(tlabel, tpredicted)
        message  += 'TF1: {:2.4f}, '.format(tf1score*100)
        message  += 'Tloss: {:2.8f}, '.format(tloss)

        vf1score  = f1_score(vlabel, vpredicted)
        message  += 'VF1: {:2.4f}, '.format(vf1score*100)
        message  += 'VLoss: {:2.8f},'.format(vloss)  
    
        with open(outputlogFilePath, 'a') as writer:
            writer.write(message + '\n')
        print(message)

        modelOutputPath = f'{outputtracesPath}/model_{modelTag}_{epoch:03d}.pth'
        torch.save(model.state_dict(), modelOutputPath)
        result_lst.append((epoch, modelOutputPath, vlabel, vpredicted, vproba, vf1score, vloss, tf1score, tloss))

        scheduler.step(tloss)

    df = pd.DataFrame(result_lst, 
                      columns=['epoch', 'path', 'labels', 'predicted', 'proba', 'vf1score', 'vloss', 'tf1score', 'tloss'])
    df.to_parquet(f'{outputtracesPath}/{modelTag}.parquet')

    message = '----------'
    with open(outputlogFilePath, 'a') as writer:
        writer.write(message + '\n')
    print(message)

    return df

def evaluate(ws, modelPathList, dataloader, device, numberFragments=1):
    modelResultList = []
    outputlogFilePath = f'./traces/{ws}/logs'
    
    for modelPath in modelPathList:
        for fragment in range(numberFragments):
            mdl = Net().to(device)
            mdl.load_state_dict(torch.load(modelPath))
            mdl.eval()
            modelResult = assess(mdl, dataloader, device)
            modelF1Score = f1_score(modelResult[0], modelResult[1])
            modelResultList.append((modelPath, modelF1Score,) + modelResult)
            message  = f'Evaluate: '
            message += f'ModelPath={modelPath} Fragment={fragment:02d} '
            message += f'score={modelF1Score}'
            print(message)
            with open(outputlogFilePath, 'a') as writer:
                writer.write(message + '\n')
    return pd.DataFrame(modelResultList, columns=['name', 'f1score', 'Truth', 'Predicted', 'loss', 'Proba', 'Path'])

def getDataloaders(dataset_df, batchSize=32, numWorkers=16, trainPercentage=0.7, validPercentage=0.8):
    rand_idx = np.random.permutation(len(dataset_df))
    train_df = dataset_df.iloc[rand_idx[:int(trainPercentage * len(dataset_df))]]
    valid_df = dataset_df.iloc[rand_idx[int(trainPercentage * len(dataset_df)):int(validPercentage * len(dataset_df))]]
    test_df  = dataset_df.iloc[rand_idx[int(validPercentage * len(dataset_df)):]]

    print(len(train_df))
    print(train_df.label.value_counts())
    print(len(valid_df))
    print(valid_df.label.value_counts())
    print(len(test_df))
    print(test_df.label.value_counts())
    
    trainDataset = SampleDataset(train_df.filePath.values, train_df.label.values)
    trainLoader  = DataLoader(trainDataset, batch_size=batchSize, shuffle=True, num_workers=numWorkers)

    validDataset = SampleDataset(valid_df.filePath.values, valid_df.label.values)
    validLoader  = DataLoader(validDataset, batch_size=2*batchSize, shuffle=False, num_workers=numWorkers)

    testDataset = SampleDataset(test_df.filePath.values, test_df.label.values)
    testLoader  = DataLoader(testDataset,  batch_size=2*batchSize, shuffle=False, num_workers=numWorkers)
    
    return trainLoader, validLoader, testLoader

def evalDataset(ws, result_df, probaUpperBorn = 0.9,  probaLowerBorn = 0.1):
    outputlogFilePath = f'./traces/{ws}/logs'
    results   = np.vstack(result_df.Proba.values)

    truth       = result_df.Truth.iloc[0]
    paths       = result_df.Path.iloc[0]
    result_mean = results.mean(axis=0)
    predicted   = (result_mean > 0.5).astype('int')
    f1score     = f1_score(truth, predicted)

    vtruth        = truth[(result_mean >= probaUpperBorn) | (result_mean <= probaLowerBorn)]
    vpaths        = paths[(result_mean >= probaUpperBorn) | (result_mean <= probaLowerBorn)]
    vresult_prob  = result_mean[(result_mean >= probaUpperBorn) | (result_mean <= probaLowerBorn)]
    vpredicted    = (vresult_prob > 0.5).astype('int')
    vcoverage     = (len(vtruth)/len(truth))
    vextendSize   = len(vtruth)
    vf1score      = f1_score(vtruth, vpredicted)

    etruth       = truth[(result_mean < probaUpperBorn) & (result_mean > probaLowerBorn)]
    epaths       = paths[(result_mean < probaUpperBorn) & (result_mean > probaLowerBorn)]
    eresult_prob = result_mean[(result_mean < probaUpperBorn) & (result_mean > probaLowerBorn)]
    epredicted    = (eresult_prob > 0.5).astype('int')
    ecoverage     = (len(etruth)/len(truth))
    erestSize     = len(etruth)
    ef1score      = f1_score(etruth, epredicted)

    message  = f'Extend: '
    message += f'f1score={f1score*100:2.4f}, '
    message += f'vcoverage={vcoverage*100:2.4f}, vf1score={vf1score*100:2.4f}, vexentdSize={vextendSize}, '
    message += f'ecoverage={ecoverage*100:2.4f}, ef1score={ef1score*100:2.4f}, erestSize={erestSize}'

    print(message)
    with open(outputlogFilePath, 'a') as writer:
        writer.write(message + '\n')

In [11]:
# 
ws               = 'comparativeWS02'
epochNum         = 50
device           = torch.device('cuda:1')
ensembleSize     = 10

trainPercentageParam = 0.7
validPercentageParam = 0.8

outputlogFilePath = f'./traces/{ws}/logs'
outputtracesPath  = f'./traces/{ws}'
os.mkdir(outputtracesPath)

In [12]:
mamadroid_meta_df = pd.read_parquet('dataset/mamadroid_meta.parquet')
mamadroid_meta_df = mamadroid_meta_df[['sha256', 'year', 'tag']]

drebin_df = pd.read_msgpack('/ws/mnt/habouch/datasets/android_dataset/meta/drebin_meta.msg')
drebin_df = drebin_df[['sha256']]
drebin_df['tag'] = 'malware'
drebin_df['year'] = 'drebin'
drebin_df.reset_index(drop=True, inplace=True)

mamadroid_df = pd.concat([mamadroid_meta_df, drebin_df], sort=False)
mamadroid_df.drop_duplicates(subset='sha256', inplace=True)

doneList = [item.split('/')[-1] for item in glob.glob('/ws/mnt/local/data/output/mamadroid/*')]
mamadroid_df = mamadroid_df.loc[mamadroid_df.sha256.isin(doneList)]

mamadroid_df['label'] = (mamadroid_df.tag == 'malware').apply(int)
mamadroid_df['filePath'] = '/ws/mnt/local/data/output/mamadroid/' + mamadroid_df.sha256
print(len(mamadroid_df))

41712


In [13]:
#androzoo_df = pd.read_parquet('dataset/androzoo_meta.parquet')
df = androzoo_df.loc[androzoo_df.sha256.isin(mamadroid_df.loc[mamadroid_df.label == 0, 'sha256'])]
df.dropna(subset=['vt_detection'], inplace=True)
df = df.loc[df.vt_detection > 0]
mamadroid_df = mamadroid_df.loc[~mamadroid_df.sha256.isin(df.sha256)]
print(len(mamadroid_df))

40706


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [14]:
df = androzoo_df.loc[androzoo_df.sha256.isin(mamadroid_df.loc[mamadroid_df.label == 1, 'sha256'])]
df.dropna(subset=['vt_detection'], inplace=True)
df = df.loc[df.vt_detection < 4]
mamadroid_df = mamadroid_df.loc[~mamadroid_df.sha256.isin(df.sha256)]
print(len(mamadroid_df))

37758


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [15]:
mamadroid_df.groupby(['year', 'label']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,sha256,tag,filePath
year,label,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2013,1,6928,6928,6928
2014,1,13654,13654,13654
2015,1,3732,3732,3732
2016,1,2184,2184,2184
drebin,1,4636,4636,4636
new,0,1638,1638,1638
old,0,4986,4986,4986


In [16]:
tags = [
        ['drebin', 'old'],
        ['2013',   'old'],
        ['2014',   'old'],
        ['2014',   'new'],
        ['2015',   'new'],
        ['2016',   'new'],
       ]

In [17]:
for tag in tags:
    currentTag = tag[0] + '_' + tag[1]

    message  = '######## '
    message += currentTag

    with open(outputlogFilePath, 'a') as writer:
        writer.write(message + '\n')
    print(message)

    #
    dataset_df = mamadroid_df.loc[(mamadroid_df.year == tag[0]) | (mamadroid_df.year == tag[1])]

    #
    trainLoader, validLoader, testLoader = getDataloaders(dataset_df, trainPercentage=trainPercentageParam, 
                                                                      validPercentage=validPercentageParam)

    #
    models_df = trainModel(ws, f'train_{currentTag}', epochNum, trainLoader, validLoader, device)
    models_df.sort_values(by=['vloss', 'tloss'], inplace=True)
    selectedModelPaths = models_df.path.iloc[:ensembleSize].tolist()

    #
    evalresult_df = evaluate(ws, selectedModelPaths, testLoader, device)

    #
    evalDataset(ws, evalresult_df, probaUpperBorn=0.8,  probaLowerBorn=0.2)

    #
    outputPath = f'traces/{ws}/{currentTag}.pickle'
    currentResults = pd.DataFrame([(currentTag, models_df, evalresult_df)], columns=['TimeTag', 'models', 'evalResuls'])
    currentResults.to_pickle(outputPath)

    #
    message = '########'
    with open(outputlogFilePath, 'a') as writer:
        writer.write(message + '\n')
    print(message)

######## drebin_old
6735
0    3503
1    3232
Name: label, dtype: int64
962
0    513
1    449
Name: label, dtype: int64
1925
0    970
1    955
Name: label, dtype: int64
----------
Train: train_drebin_old [0000] TF1: 95.3392, Tloss: 0.00402343, VF1: 93.7698, VLoss: 0.00329025,
Train: train_drebin_old [0001] TF1: 98.0034, Tloss: 0.00179486, VF1: 95.8525, VLoss: 0.00190081,
Train: train_drebin_old [0002] TF1: 98.6051, Tloss: 0.00134162, VF1: 98.6577, VLoss: 0.00119896,
Train: train_drebin_old [0003] TF1: 98.7937, Tloss: 0.00122235, VF1: 97.2665, VLoss: 0.00219971,
Train: train_drebin_old [0004] TF1: 99.0096, Tloss: 0.00086410, VF1: 91.9255, VLoss: 0.00349096,
Train: train_drebin_old [0005] TF1: 99.0565, Tloss: 0.00087516, VF1: 94.8827, VLoss: 0.00245284,
Train: train_drebin_old [0006] TF1: 99.0877, Tloss: 0.00083242, VF1: 96.8037, VLoss: 0.00164839,
Train: train_drebin_old [0007] TF1: 99.4118, Tloss: 0.00059410, VF1: 98.2103, VLoss: 0.00119768,
Train: train_drebin_old [0008] TF1: 99.1649, 