In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from torch.optim.lr_scheduler import ReduceLROnPlateau
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
from torch.utils.tensorboard import SummaryWriter

import numpy as np
import pandas as pd
import hashlib
import shutil
import glob
import time
import re
import os

from tqdm import tqdm
from datetime import datetime
from sklearn.metrics import f1_score, recall_score, precision_score, accuracy_score
    
class Net(nn.Module):
    def __init__(self, sequenceSize=20000, embeddingDim=128, vocabularySize=2**16, filterWidth=5, filterNumber=1024):
        super(Net, self).__init__()
        self.sequenceSize   = sequenceSize
        self.embeddingDim   = embeddingDim
        self.vocabularySize = vocabularySize
        self.filterWidth    = filterWidth
        self.filterNumber   = filterNumber 
        
        self.embedding = nn.Embedding(self.vocabularySize, self.embeddingDim)
        self.conv = nn.Sequential(
                            nn.Conv2d(1, self.filterNumber, (self.filterWidth, self.embeddingDim)),
                            nn.BatchNorm2d(self.filterNumber),
                            nn.ReLU()
                        )
        
        self.fc = nn.Sequential(
                        nn.Linear(self.filterNumber , 512),
                        nn.BatchNorm1d(512),
                        nn.ReLU(),
            
                        nn.Linear(512, 256),
                        nn.BatchNorm1d(256),
                        nn.ReLU(),
                        
                        nn.Linear(256, 1),
                        nn.Sigmoid()
                    )

    def forward(self, x):
        x = self.embedding(x)
        #print(x.size())
        
        x = self.conv(x)
        #print(x.size())
        
        x = x.max(dim=2)[0]
        #print(x.size())

        x = x.view(-1,  self.filterNumber)
        x = self.fc(x)
        return x

class SampleDataset(Dataset):
    def __init__(self, filePathList, labels, sequenceSize=20000, featureName='functionMethodCallsArgs'):
        self.filePathList = filePathList
        self.labels = labels
        self.sequenceSize = sequenceSize
        self.featureName = featureName
        
    def __len__(self):
        return len(self.filePathList)

    def __getitem__(self, idx):
        df = pd.read_parquet(self.filePathList[idx])
        seed = int(round(time.time()%1, 6) * 1000000)
        x = np.concatenate(df.iloc[np.random.RandomState(seed).permutation(len(df))][self.featureName].values)

        if len(x) > self.sequenceSize:
            x = x[:self.sequenceSize]
        else:
            x = np.concatenate((x, np.zeros([self.sequenceSize - len(x)])))
            
        sample = torch.from_numpy(x)
        return (sample.long(), self.labels[idx], self.filePathList[idx])

def train(model, optimizer, dataLoader, device):
    running_loss  = 0.0  
    label_lst     = list()
    predicted_lst = list()

    model.train()
    for inputs, labels, _ in dataLoader:
        
        #
        inputs = inputs.unsqueeze(1).to(device)
        labels = labels.to(device)

        #
        optimizer.zero_grad()

        #
        outputs = model(inputs)
        predicted = (outputs > 0.5).squeeze().long()
        loss = F.binary_cross_entropy(outputs.squeeze(), labels.float())

        #
        loss.backward()
        optimizer.step()

        #
        label_lst.append(labels.cpu().numpy())
        predicted_lst.append(predicted.cpu().numpy())        
        running_loss += loss.item() 

    labels    = np.concatenate(label_lst)
    predicted = np.concatenate(predicted_lst)
    loss      = running_loss / len(predicted)
    
    return labels, predicted, loss

def assess(model, dataLoader, device):
    running_loss  = 0.0  
    label_lst     = list()
    predicted_lst = list()
    proba_lst     = list()
    path_lst      = list()

    with torch.no_grad():
        model.eval()
        for inputs, labels, paths in dataLoader:
            #
            inputs = inputs.unsqueeze(1).to(device)
            labels = labels.to(device)

            #
            outputs = model(inputs)
            predicted = (outputs > 0.5).squeeze().long()
            loss = F.binary_cross_entropy(outputs.squeeze(), labels.float())

            #
            if len(inputs) > 1:
                label_lst.append(labels.cpu().numpy())
                predicted_lst.append(predicted.cpu().numpy())
                proba_lst.append(outputs.squeeze().cpu().numpy())
                path_lst.append(paths)
                running_loss += loss.item() 
    
    labels    = np.concatenate(label_lst)
    predicted = np.concatenate(predicted_lst)
    proba     = np.concatenate(proba_lst)
    paths     = np.concatenate(path_lst)
    loss      = running_loss / len(predicted)
    
    return labels, predicted, loss, proba, paths

def trainModel(ws, modelTag, epochNum, trainLoader, validLoader, device, lr=3e-4, weightDecay=9e-5):
    #
    model  = Net()
    model  = model.to(device)
    optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=weightDecay)
    scheduler = ReduceLROnPlateau(optimizer, 'min', verbose=True, patience=5, factor=0.8)

    outputlogFilePath = f'./traces/{ws}/logs'
    outputtracesPath  = f'./traces/{ws}'
    #shutil.rmtree(outputtracesPath)
    #os.mkdir(outputtracesPath)

    result_lst = list()

    message = '----------'
    with open(outputlogFilePath, 'a') as writer:
        writer.write(message + '\n')
    print(message)
    
    for epoch in range(epochNum):

        tlabel, tpredicted, tloss = train(model, optimizer, trainLoader, device)
        vlabel, vpredicted, vloss, vproba, vproba = assess(model, validLoader, device)

        message  = f'Train: {modelTag} '
        message += '[{:04d}] '.format(epoch)

        tf1score  = f1_score(tlabel, tpredicted)
        message  += 'TF1: {:2.4f}, '.format(tf1score*100)
        message  += 'Tloss: {:2.8f}, '.format(tloss)

        vf1score  = f1_score(vlabel, vpredicted)
        message  += 'VF1: {:2.4f}, '.format(vf1score*100)
        message  += 'VLoss: {:2.8f},'.format(vloss)  
    
        with open(outputlogFilePath, 'a') as writer:
            writer.write(message + '\n')
        print(message)

        modelOutputPath = f'{outputtracesPath}/model_{modelTag}_{epoch:03d}.pth'
        torch.save(model.state_dict(), modelOutputPath)
        result_lst.append((epoch, modelOutputPath, vlabel, vpredicted, vproba, vf1score, vloss, tf1score, tloss))

        scheduler.step(tloss)

    df = pd.DataFrame(result_lst, 
                      columns=['epoch', 'path', 'labels', 'predicted', 'proba', 'vf1score', 'vloss', 'tf1score', 'tloss'])
    df.to_parquet(f'{outputtracesPath}/{modelTag}.parquet')

    message = '----------'
    with open(outputlogFilePath, 'a') as writer:
        writer.write(message + '\n')
    print(message)

    return df

def evaluate(ws, modelPathList, dataloader, device, numberFragments=1):
    modelResultList = []
    outputlogFilePath = f'./traces/{ws}/logs'
    
    for modelPath in modelPathList:
        for fragment in range(numberFragments):
            mdl = Net().to(device)
            mdl.load_state_dict(torch.load(modelPath))
            mdl.eval()
            modelResult = assess(mdl, dataloader, device)
            modelF1Score = f1_score(modelResult[0], modelResult[1])
            modelResultList.append((modelPath, modelF1Score,) + modelResult)
            message  = f'Evaluate: '
            message += f'ModelPath={modelPath} Fragment={fragment:02d} '
            message += f'score={modelF1Score}'
            print(message)
            with open(outputlogFilePath, 'a') as writer:
                writer.write(message + '\n')
    return pd.DataFrame(modelResultList, columns=['name', 'f1score', 'Truth', 'Predicted', 'loss', 'Proba', 'Path'])

def extendDataset(ws, result_df, probaUpperBorn = 0.8,  probaLowerBorn = 0.2):
    outputlogFilePath = f'./traces/{ws}/logs'
    results   = np.vstack(result_df.Proba.values)

    truth       = result_df.Truth.iloc[0]
    paths       = result_df.Path.iloc[0]
    result_mean = results.mean(axis=0)
    predicted   = (result_mean > 0.5).astype('int')
    f1score     = f1_score(truth, predicted)

    vtruth        = truth[(result_mean >= probaUpperBorn) | (result_mean <= probaLowerBorn)]
    vpaths        = paths[(result_mean >= probaUpperBorn) | (result_mean <= probaLowerBorn)]
    vresult_prob  = result_mean[(result_mean >= probaUpperBorn) | (result_mean <= probaLowerBorn)]
    vpredicted    = (vresult_prob > 0.5).astype('int')
    vcoverage     = (len(vtruth)/len(truth))
    vextendSize   = len(vtruth)
    vf1score      = f1_score(vtruth, vpredicted)

    etruth       = truth[(result_mean < probaUpperBorn) & (result_mean > probaLowerBorn)]
    epaths       = paths[(result_mean < probaUpperBorn) & (result_mean > probaLowerBorn)]
    eresult_prob = result_mean[(result_mean < probaUpperBorn) & (result_mean > probaLowerBorn)]
    epredicted    = (eresult_prob > 0.5).astype('int')
    ecoverage     = (len(etruth)/len(truth))
    erestSize     = len(etruth)
    ef1score      = f1_score(etruth, epredicted)

    message  = f'Extend: '
    message += f'f1score={f1score*100:2.4f}, '
    message += f'vcoverage={vcoverage*100:2.4f}, vf1score={vf1score*100:2.4f}, vexentdSize={vextendSize}, '
    message += f'ecoverage={ecoverage*100:2.4f}, ef1score={ef1score*100:2.4f}, erestSize={erestSize}'

    print(message)
    with open(outputlogFilePath, 'a') as writer:
        writer.write(message + '\n')
    extend_df = dataset_df = pd.DataFrame( {'filePath': vpaths, 
                                            'label'   : vtruth })
    
    rest_df = dataset_df = pd.DataFrame( {'filePath': vpaths, 
                                            'label'   : vtruth })    
    
    return extend_df, rest_df

def getDataloaders(dataset_df, otest_df, ntest_df, batchSize=32, numWorkers=16, trainPercentage = 0.8):
    rand_idx = np.random.RandomState(seed=54).permutation(len(dataset_df))
    train_df = dataset_df.iloc[rand_idx[:int(trainPercentage * len(dataset_df))]]
    valid_df = dataset_df.iloc[rand_idx[int(trainPercentage * len(dataset_df)):]]

    print(len(train_df))
    print(train_df.label.value_counts())
    print(len(valid_df))
    print(valid_df.label.value_counts())
    print(len(otest_df))
    print(otest_df.label.value_counts())
    print(len(ntest_df))
    print(ntest_df.label.value_counts())
    
    trainDataset = SampleDataset(train_df.filePath.values, train_df.label.values)
    trainLoader  = DataLoader(trainDataset, batch_size=batchSize, shuffle=True, num_workers=numWorkers)

    validDataset = SampleDataset(valid_df.filePath.values, valid_df.label.values)
    validLoader  = DataLoader(validDataset, batch_size=2*batchSize, shuffle=False, num_workers=numWorkers)

    otestDataset = SampleDataset(otest_df.filePath.values, otest_df.label.values)
    otestLoader  = DataLoader(otestDataset,  batch_size=2*batchSize, shuffle=False, num_workers=numWorkers)
    
    ntestDataset = SampleDataset(ntest_df.filePath.values, ntest_df.label.values)
    ntestLoader  = DataLoader(ntestDataset,  batch_size=2*batchSize, shuffle=False, num_workers=numWorkers)
    
    return trainLoader, validLoader, otestLoader, ntestLoader

In [2]:
#
ws               = 'studyWS05'
epochNum         = 50
dataset_rootDir  = '/ws/mnt/local/data/zoo/'
device           = torch.device('cuda:1')
ensembleSize     = 6

In [3]:
outputlogFilePath = f'./traces/{ws}/logs'
outputtracesPath  = f'./traces/{ws}'
os.mkdir(outputtracesPath)

In [4]:
timedf = pd.read_parquet('dataset/timedf.parquet')
years = [2013, 2014, 2015, 2016, 2017, 2018, 2019]

In [5]:
dataset_lst     = list()
overtime_result = list()

initial_df = timedf.loc[timedf.year == 2013]
dataset_lst.append(initial_df)

In [None]:
for idx in range(1, len(years)):
    currentTag = str(years[idx])
    
    message  = '######## '
    message += currentTag

    with open(outputlogFilePath, 'a') as writer:
        writer.write(message + '\n')
    print(message)
    
    #
    otest_df = timedf.loc[timedf.year == years[idx-1]]
    ntest_df = timedf.loc[timedf.year == years[idx]]
    
    dataset_df = pd.concat(dataset_lst)
    trainLoader, validLoader, otestLoader, ntestLoader = getDataloaders(dataset_df, otest_df, ntest_df, trainPercentage=0.8)

    #
    models_df = trainModel(ws, f'train_{currentTag}', epochNum, trainLoader, validLoader, device)
    models_df.sort_values(by=['vloss', 'tloss'], inplace=True)
    selectedModelPaths = models_df.path.iloc[:ensembleSize].tolist()
    
    evalresult_df = evaluate(ws, selectedModelPaths, ntestLoader, device)
    exresult_df   = evaluate(ws, selectedModelPaths, otestLoader, device)
    
    extend_df, _  = extendDataset(ws, evalresult_df, probaUpperBorn = 0.9, probaLowerBorn = 0.1)
    _, rest_df    = extendDataset(ws, exresult_df,   probaUpperBorn = 0.9, probaLowerBorn = 0.1)

    #
    dataset_lst.append(extend_df)

    #
    currentResults = pd.DataFrame([(currentTag, models_df, evalresult_df, exresult_df, dataset_lst, rest_df)], 
                                     columns=['TimeTag', 'models', 'evalResuls', 
                                              'extendResults', 'datasetList', 
                                              'restDataset'])
    
    #
    outputPath = f'traces/{ws}/{currentTag}.pickle'
    currentResults.to_pickle(outputPath)

    message = '########'
    with open(outputlogFilePath, 'a') as writer:
        writer.write(message + '\n')
    print(message)
    
    #break

######## 2014
8000
1    4043
0    3957
Name: label, dtype: int64
2000
0    1043
1     957
Name: label, dtype: int64
10000
1    5000
0    5000
Name: label, dtype: int64
10000
1    5000
0    5000
Name: label, dtype: int64
----------
Train: train_2014 [0000] TF1: 94.7922, Tloss: 0.00454611, VF1: 91.8033, VLoss: 0.00340146,
Train: train_2014 [0001] TF1: 97.5221, Tloss: 0.00228906, VF1: 95.9481, VLoss: 0.00193433,
Train: train_2014 [0002] TF1: 98.1249, Tloss: 0.00181175, VF1: 85.8951, VLoss: 0.00548506,
Train: train_2014 [0003] TF1: 98.4894, Tloss: 0.00146706, VF1: 88.3613, VLoss: 0.00493631,
Train: train_2014 [0004] TF1: 98.6240, Tloss: 0.00126724, VF1: 78.4858, VLoss: 0.00824524,
Train: train_2014 [0005] TF1: 98.4871, Tloss: 0.00139749, VF1: 94.4112, VLoss: 0.00258923,
Train: train_2014 [0006] TF1: 99.1087, Tloss: 0.00096319, VF1: 96.3402, VLoss: 0.00206205,
Train: train_2014 [0007] TF1: 98.9967, Tloss: 0.00101753, VF1: 93.2612, VLoss: 0.00300264,
Train: train_2014 [0008] TF1: 98.7013, Tl

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  from ipykernel import kernelapp as app


----------
Train: train_2015 [0000] TF1: 96.1102, Tloss: 0.00351723, VF1: 88.1081, VLoss: 0.00439404,
Train: train_2015 [0001] TF1: 97.8409, Tloss: 0.00193428, VF1: 95.9210, VLoss: 0.00163108,
Train: train_2015 [0002] TF1: 98.0387, Tloss: 0.00175371, VF1: 96.8233, VLoss: 0.00143411,
Train: train_2015 [0003] TF1: 98.5274, Tloss: 0.00138879, VF1: 98.3931, VLoss: 0.00087439,
Train: train_2015 [0004] TF1: 98.7115, Tloss: 0.00121834, VF1: 98.4170, VLoss: 0.00082409,
Train: train_2015 [0005] TF1: 98.4395, Tloss: 0.00137598, VF1: 97.1932, VLoss: 0.00126544,
Train: train_2015 [0006] TF1: 98.8851, Tloss: 0.00104481, VF1: 95.0127, VLoss: 0.00229119,
Train: train_2015 [0007] TF1: 98.7706, Tloss: 0.00109183, VF1: 95.1891, VLoss: 0.00197351,
Train: train_2015 [0008] TF1: 98.9269, Tloss: 0.00103572, VF1: 97.8445, VLoss: 0.00106170,
Train: train_2015 [0009] TF1: 99.0082, Tloss: 0.00090693, VF1: 86.4190, VLoss: 0.00481715,
Train: train_2015 [0010] TF1: 98.9191, Tloss: 0.00090509, VF1: 88.7029, VLoss: 