In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from torch.optim.lr_scheduler import ReduceLROnPlateau
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
from torch.utils.tensorboard import SummaryWriter

import numpy as np
import pandas as pd
import hashlib
import glob
import time
import re
import os

from tqdm import tqdm
from datetime import datetime
from sklearn.metrics import f1_score, recall_score, precision_score, accuracy_score

from sklearn.feature_extraction.text import HashingVectorizer

#import umap
import scipy
from sklearn.decomposition import PCA

import sklearn.metrics as metrics
import hdbscan


class Net(nn.Module):
    """Symmetric fully-connected autoencoder over fixed-size feature vectors.

    The encoder narrows ``sequenceSize -> 512 -> 256 -> 128 -> 64`` and the
    decoder mirrors it back to ``sequenceSize``. Every linear layer is
    followed by ``BatchNorm1d`` and ``Tanh`` (including the last decoder
    layer).
    """

    def __init__(self, sequenceSize=8000):
        super().__init__()
        self.sequenceSize = sequenceSize

        # Layer widths from the input down to the 64-dimensional code; the
        # decoder walks the same widths in reverse.
        encoder_widths = [self.sequenceSize, 512, 256, 128, 64]
        decoder_widths = encoder_widths[::-1]

        self.encoder = nn.Sequential(*self._stack(encoder_widths))
        self.decoder = nn.Sequential(*self._stack(decoder_widths))

    @staticmethod
    def _stack(widths):
        """Return Linear/BatchNorm1d/Tanh triples joining consecutive widths."""
        layers = []
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            layers += [
                nn.Linear(fan_in, fan_out),
                nn.BatchNorm1d(fan_out),
                nn.Tanh(),
            ]
        return layers

    def forward(self, x):
        """Return ``(reconstruction, x)`` so callers can compare the two."""
        code = self.encoder(x)
        reconstruction = self.decoder(code)
        return reconstruction, x

    def sHash(self, x):
        """Return only the 64-dimensional encoder output for ``x``."""
        return self.encoder(x)

class SampleDataset(Dataset):
    """Dataset that turns one parquet file per sample into a hashed n-gram vector.

    Args:
        filePathList: indexable collection of parquet file paths, one per sample.
        labels: indexable collection of labels aligned with ``filePathList``.
        sequenceSize: number of hashed features (width of the output vector).
        featureName: parquet column whose rows are concatenated into the
            token stream (presumably each row is an array of tokens -
            TODO confirm against the files).
    """

    def __init__(self, filePathList, labels, sequenceSize=8000, featureName='functionMethodCallsArgs'):
        self.filePathList = filePathList
        self.labels       = labels
        self.sequenceSize = sequenceSize
        self.featureName  = featureName
        # str.split behaves exactly like ``lambda x: x.split()`` but, unlike a
        # lambda, can be pickled - required when DataLoader workers are
        # started with the "spawn" method and the dataset is sent to them.
        self.fhVectorizer = HashingVectorizer(tokenizer=str.split,
                                              ngram_range=(4, 4),
                                              decode_error='replace',
                                              n_features=self.sequenceSize)

    def __len__(self):
        """Return the number of samples (one per file path)."""
        return len(self.filePathList)

    def __getitem__(self, idx):
        """Return ``(vector, label)`` for sample ``idx``.

        ``vector`` is a float tensor of shape ``(1, sequenceSize)``: the
        vectorizer is fed a single document, so it yields a single row.
        """
        df = pd.read_parquet(self.filePathList[idx])

        # Flatten every row of the feature column into one whitespace-joined
        # document so the vectorizer can build 4-grams across the whole file.
        contentList = np.concatenate(df[self.featureName]).tolist()
        content     = ' '.join(str(item) for item in contentList)

        vec = self.fhVectorizer.transform([content]).toarray()

        return (torch.from_numpy(vec).float(), self.labels[idx])

def train(model, optimizer, dataLoader, device):
    """Run one reconstruction-training pass over ``dataLoader``.

    Args:
        model: module whose call returns ``(reconstruction, target)``.
        optimizer: optimizer stepping ``model``'s parameters.
        dataLoader: iterable of ``(inputs, labels)`` batches; labels are used
            only to count samples.
        device: device the inputs are moved to.

    Returns:
        The sum of the per-batch MSE loss values divided by the number of
        samples seen (note: samples, not batches - kept as in the original
        so logged values stay comparable).

    Raises:
        ValueError: if ``dataLoader`` yields no samples.
    """
    running_loss = 0.0
    num_samples  = 0

    model.train()
    for inputs, labels in dataLoader:
        # Drop only the singleton axis 1 of a (batch, 1, features) batch.
        # A bare squeeze() would also drop the batch axis when the batch
        # holds one sample, handing the model a 1-D tensor.
        inputs = inputs.squeeze(1).to(device)

        optimizer.zero_grad()

        reconstruction, target = model(inputs)
        loss = F.mse_loss(reconstruction, target)

        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        num_samples  += len(labels)

    if num_samples == 0:
        raise ValueError('dataLoader produced no samples')

    return running_loss / num_samples

def assess(model, dataLoader, device):
    """Embed every sample with ``model.sHash`` and score an HDBSCAN clustering.

    Args:
        model: module exposing ``sHash(inputs)``; it is switched to eval mode
            and left in that mode.
        dataLoader: iterable of ``(inputs, labels)`` tensor batches.
        device: device the inputs are moved to.

    Returns:
        A 7-tuple ``(homogeneity_score, numClusters, coverage,
        c_homogeneity_score, c_numClusters, c_coverage, c_size)``. The first
        three are computed over all samples (the ``-1`` label counts as one
        more cluster); the ``c_`` values are computed after dropping samples
        labelled ``-1``, and ``c_size`` is how many samples remain.
    """
    label_lst = []
    hash_lst  = []

    model.eval()
    with torch.no_grad():
        for inputs, labels in dataLoader:
            # Drop only the singleton axis 1 of a (batch, 1, features) batch;
            # a bare squeeze() would also drop the batch axis of a one-sample
            # batch.
            inputs = inputs.squeeze(1).to(device)

            outputs = model.sHash(inputs)

            label_lst.append(labels.cpu().numpy())
            # Keep one row per sample so batches of any size (including 1)
            # concatenate into a single 2-D array.
            hash_lst.append(outputs.reshape(outputs.size(0), -1).cpu().numpy())

    labels = np.concatenate(label_lst)
    hashes = np.concatenate(hash_lst)

    hdb = hdbscan.HDBSCAN(min_samples=10, min_cluster_size=50, alpha=1.0)
    cluster_ids = hdb.fit_predict(hashes)

    homogeneity_score = metrics.homogeneity_score(labels, cluster_ids)
    numClusters       = len(set(cluster_ids))
    # One assignment per embedded sample, so this ratio is 1.0 by
    # construction; kept so the returned tuple keeps its shape.
    coverage          = len(cluster_ids) / len(labels)

    # Samples that HDBSCAN actually placed in a cluster (label != -1).
    clustered = cluster_ids != -1
    c_labels  = labels[clustered]
    c_result  = cluster_ids[clustered]

    c_homogeneity_score = metrics.homogeneity_score(c_labels, c_result)
    c_numClusters       = len(set(c_result))
    c_coverage          = len(c_result) / len(labels)

    return homogeneity_score, numClusters, coverage, c_homogeneity_score, c_numClusters, c_coverage, len(c_result)

def getValid(batchSize=128, numWorkers=32):
    """Build the validation ``DataLoader`` for the currently selected dataset.

    Reads the module-level ``malwareMetaPath`` (metadata file) and
    ``malware_rootDir`` (directory of per-sample feature files), keeps the
    metadata rows whose ``md5`` matches a file name in that directory, and
    labels each row by factorizing its ``family`` column.

    Args:
        batchSize: batch size of the returned loader. Previously ignored in
            favour of a hard-coded 32; it is now honoured.
        numWorkers: number of loader worker processes. Previously ignored in
            favour of a hard-coded 32; it is now honoured.

    Returns:
        A non-shuffling ``DataLoader`` over a ``SampleDataset``.
    """
    # NOTE(review): the recorded notebook output shows a deprecation warning
    # for read_msgpack; newer pandas releases may no longer provide it - verify.
    eval_df = pd.read_msgpack(malwareMetaPath)

    # Keep only samples whose feature file is actually present on disk.
    availableFiles = [fileName.split('/')[-1] for fileName in glob.glob(malware_rootDir + '*')]
    eval_df = eval_df.loc[eval_df.md5.isin(availableFiles)]
    eval_df['filePath'] = malware_rootDir + eval_df.md5
    eval_df['label']    = pd.factorize(eval_df.family)[0]

    validDataset = SampleDataset(eval_df.filePath.values, eval_df.label.values)
    validLoader  = DataLoader(validDataset, batch_size=batchSize, shuffle=False, num_workers=numWorkers)

    return validLoader



In [2]:
#
# Workspace name; every artefact of this run lives under ./traces/<ws>/.
ws = 'clusteringWS01'

# Text log (appended to by each evaluation cell) and the directory that
# receives the per-dataset result CSVs.
outputlogFilePath = f'./traces/{ws}/logs'
outputtracesPath  = f'./traces/{ws}/'
# makedirs creates the missing ./traces parent as well, and exist_ok lets the
# cell be re-run without failing on a directory left by an earlier run.
os.makedirs(outputtracesPath, exist_ok=True)

In [3]:
#
# NOTE(review): the model is freshly constructed and no weights are loaded in
# the visible cells, so the evaluations below appear to score an untrained
# network - confirm this is the intended baseline.
model = Net()
# Prefer the second GPU as before, but fall back to the first GPU or the CPU
# so the notebook still runs on hosts with fewer devices.
if torch.cuda.device_count() > 1:
    device = torch.device('cuda:1')
elif torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')
model = model.to(device)

In [4]:
# AMD dataset: embed every available sample with the current model, cluster
# the embeddings and record the clustering quality.
malware_rootDir = '/ws/mnt/local/data/output/datasets/amd/'
malwareMetaPath = '/ws/mnt/habouch/datasets/android_dataset/meta/amd_meta.msg'

eval_df = pd.read_msgpack(malwareMetaPath)

# Keep only metadata rows whose md5 names a file present in the dataset directory.
presentFiles = [fileName.split('/')[-1] for fileName in glob.glob(malware_rootDir + '*')]
eval_df = eval_df.loc[eval_df.md5.isin(presentFiles)]
eval_df['filePath'] = malware_rootDir + eval_df.md5
eval_df['label'] = pd.factorize(eval_df.family)[0]

validDataset = SampleDataset(eval_df.filePath.values, eval_df.label.values)
validLoader = DataLoader(validDataset, batch_size=32, shuffle=False, num_workers=32)

print('----------')
epoch = 0

measures = assess(model, validLoader, device)
(homogeneity_score, numClusters, coverage,
 c_homogeneity_score, c_numClusters, c_coverage, c_size) = measures

# One-row table of the measures, saved next to the run log.
resultColumns = ['homogeneity_score', 'numClusters', 'coverage', 'c_homogeneity_score', 'c_numClusters', 'c_coverage', 'c_size']
Results_df = pd.DataFrame([measures], columns=resultColumns)
Results_df.to_csv(outputtracesPath + 'amdResults_df.csv')

# Single summary line: dataset tag followed by each measure.
message = ''.join([
    'AMD [{:04d}] '.format(epoch),
    f'homogeneity_score:{homogeneity_score:2.5f} ',
    f'numClusters:{numClusters:03d} ',
    f'coverage:{coverage:2.5f} ',
    f'c_homogeneity_score:{c_homogeneity_score:2.5f} ',
    f'c_numClusters:{c_numClusters:03d} ',
    f'c_coverage:{c_coverage:2.5f} ',
    f'c_size:{c_size:05d}',
])

with open(outputlogFilePath, 'a') as writer:
    writer.write(message + '\n')

print(message)

print('----------')

It is recommended to use pyarrow for on-the-wire transmission of pandas objects.
  exec(code_obj, self.user_global_ns, self.user_ns)


----------
AMD [0000] homogeneity_score:0.49103 numClusters:101 coverage:1.00000 c_homogeneity_score:0.96558 c_numClusters:100 c_coverage:0.49908 c_size:11609
----------


In [5]:
# DREBIN dataset: embed every available sample with the current model,
# cluster the embeddings and record the clustering quality.
malware_rootDir = '/ws/mnt/local/data/output/datasets/drebin/'
malwareMetaPath = '/ws/mnt/habouch/datasets/android_dataset/meta/drebin_meta.msg'

eval_df = pd.read_msgpack(malwareMetaPath)

# Keep only metadata rows whose sha256 names a file present in the dataset directory.
presentFiles = [fileName.split('/')[-1] for fileName in glob.glob(malware_rootDir + '*')]
eval_df = eval_df.loc[eval_df.sha256.isin(presentFiles)]
eval_df['filePath'] = malware_rootDir + eval_df.sha256
eval_df['label'] = pd.factorize(eval_df.drebin)[0]

validDataset = SampleDataset(eval_df.filePath.values, eval_df.label.values)
validLoader = DataLoader(validDataset, batch_size=32, shuffle=False, num_workers=32)

print('----------')
epoch = 0

measures = assess(model, validLoader, device)
(homogeneity_score, numClusters, coverage,
 c_homogeneity_score, c_numClusters, c_coverage, c_size) = measures

# One-row table of the measures, saved next to the run log.
resultColumns = ['homogeneity_score', 'numClusters', 'coverage', 'c_homogeneity_score', 'c_numClusters', 'c_coverage', 'c_size']
Results_df = pd.DataFrame([measures], columns=resultColumns)
Results_df.to_csv(outputtracesPath + 'drebinResults_df.csv')

# Single summary line: dataset tag followed by each measure.
message = ''.join([
    'DREBIN [{:04d}] '.format(epoch),
    f'homogeneity_score:{homogeneity_score:2.5f} ',
    f'numClusters:{numClusters:03d} ',
    f'coverage:{coverage:2.5f} ',
    f'c_homogeneity_score:{c_homogeneity_score:2.5f} ',
    f'c_numClusters:{c_numClusters:03d} ',
    f'c_coverage:{c_coverage:2.5f} ',
    f'c_size:{c_size:05d}',
])

with open(outputlogFilePath, 'a') as writer:
    writer.write(message + '\n')

print(message)

print('----------')

----------


It is recommended to use pyarrow for on-the-wire transmission of pandas objects.
  exec(code_obj, self.user_global_ns, self.user_ns)


DREBIN [0000] homogeneity_score:0.38790 numClusters:028 coverage:1.00000 c_homogeneity_score:0.92282 c_numClusters:027 c_coverage:0.49310 c_size:02645
----------


In [6]:
# MALDOZER dataset: embed every available sample with the current model,
# cluster the embeddings and record the clustering quality.
malware_rootDir = '/ws/mnt/local/data/output/datasets/maldozer/'
malwareMetaPath = '/ws/mnt/habouch/datasets/android_dataset/meta/maldozer_meta.msg'

eval_df = pd.read_msgpack(malwareMetaPath)

# Keep only metadata rows whose md5 names a file present in the dataset directory.
presentFiles = [fileName.split('/')[-1] for fileName in glob.glob(malware_rootDir + '*')]
eval_df = eval_df.loc[eval_df.md5.isin(presentFiles)]
eval_df['filePath'] = malware_rootDir + eval_df.md5
# The existing label column is replaced in place by its integer codes.
eval_df['label'] = pd.factorize(eval_df.label)[0]

validDataset = SampleDataset(eval_df.filePath.values, eval_df.label.values)
validLoader = DataLoader(validDataset, batch_size=32, shuffle=False, num_workers=32)

print('----------')
epoch = 0

measures = assess(model, validLoader, device)
(homogeneity_score, numClusters, coverage,
 c_homogeneity_score, c_numClusters, c_coverage, c_size) = measures

# One-row table of the measures, saved next to the run log.
resultColumns = ['homogeneity_score', 'numClusters', 'coverage', 'c_homogeneity_score', 'c_numClusters', 'c_coverage', 'c_size']
Results_df = pd.DataFrame([measures], columns=resultColumns)
Results_df.to_csv(outputtracesPath + 'maldozerResults_df.csv')

# Single summary line: dataset tag followed by each measure.
message = ''.join([
    'MALDOZER [{:04d}] '.format(epoch),
    f'homogeneity_score:{homogeneity_score:2.5f} ',
    f'numClusters:{numClusters:03d} ',
    f'coverage:{coverage:2.5f} ',
    f'c_homogeneity_score:{c_homogeneity_score:2.5f} ',
    f'c_numClusters:{c_numClusters:03d} ',
    f'c_coverage:{c_coverage:2.5f} ',
    f'c_size:{c_size:05d}',
])

with open(outputlogFilePath, 'a') as writer:
    writer.write(message + '\n')

print(message)

print('----------')

It is recommended to use pyarrow for on-the-wire transmission of pandas objects.
  exec(code_obj, self.user_global_ns, self.user_ns)


----------
MALDOZER [0000] homogeneity_score:0.50003 numClusters:098 coverage:1.00000 c_homogeneity_score:0.91274 c_numClusters:097 c_coverage:0.55657 c_size:11048
----------


In [7]:
# GENOME dataset: embed every available sample with the current model,
# cluster the embeddings and record the clustering quality.
malware_rootDir = '/ws/mnt/local/data/output/datasets/genome/'
malwareMetaPath = '/ws/mnt/habouch/datasets/android_dataset/meta/genome_meta.msg'

eval_df = pd.read_msgpack(malwareMetaPath)

# Keep only metadata rows whose sha1 names a file present in the dataset directory.
presentFiles = [fileName.split('/')[-1] for fileName in glob.glob(malware_rootDir + '*')]
eval_df = eval_df.loc[eval_df.sha1.isin(presentFiles)]
eval_df['filePath'] = malware_rootDir + eval_df.sha1
eval_df['label'] = pd.factorize(eval_df.genome)[0]

validDataset = SampleDataset(eval_df.filePath.values, eval_df.label.values)
validLoader = DataLoader(validDataset, batch_size=32, shuffle=False, num_workers=32)

print('----------')
epoch = 0

measures = assess(model, validLoader, device)
(homogeneity_score, numClusters, coverage,
 c_homogeneity_score, c_numClusters, c_coverage, c_size) = measures

# One-row table of the measures, saved next to the run log.
resultColumns = ['homogeneity_score', 'numClusters', 'coverage', 'c_homogeneity_score', 'c_numClusters', 'c_coverage', 'c_size']
Results_df = pd.DataFrame([measures], columns=resultColumns)
Results_df.to_csv(outputtracesPath + 'genomeResults_df.csv')

# Single summary line: dataset tag followed by each measure.
message = ''.join([
    'GENOME [{:04d}] '.format(epoch),
    f'homogeneity_score:{homogeneity_score:2.5f} ',
    f'numClusters:{numClusters:03d} ',
    f'coverage:{coverage:2.5f} ',
    f'c_homogeneity_score:{c_homogeneity_score:2.5f} ',
    f'c_numClusters:{c_numClusters:03d} ',
    f'c_coverage:{c_coverage:2.5f} ',
    f'c_size:{c_size:05d}',
])

with open(outputlogFilePath, 'a') as writer:
    writer.write(message + '\n')

print(message)

print('----------')

----------


It is recommended to use pyarrow for on-the-wire transmission of pandas objects.
  exec(code_obj, self.user_global_ns, self.user_ns)


GENOME [0000] homogeneity_score:0.30646 numClusters:007 coverage:1.00000 c_homogeneity_score:0.88998 c_numClusters:006 c_coverage:0.37381 c_size:00471
----------
