General colab instantiations

In [0]:
%matplotlib inline

In [0]:
# Load the Colab Drive helper; the project folder used below lives on Google Drive.
from google.colab import drive

# Mount Drive at /content/drive. This will prompt for authorization.
drive.mount('/content/drive')


In [0]:
import os

# Absolute path of the project folder inside the Drive mounted at /content/drive
# (edit this to match the location of the project folder in your G-drive).
# An absolute path keeps this cell re-runnable: a relative path only resolves
# the first time, before the working directory has been changed.
path = '/content/drive/My Drive/TUM/adversarial-toxic'

os.chdir(path)

Code imports

In [0]:
import argparse
import copy
import os
import shutil

import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.metrics import roc_curve, auc, f1_score
from torch.autograd import Variable
from torch.utils.data import DataLoader, Dataset

# local imports
import model
import scoring
import transformer
import advloaddata
import dataloader

GPU related init.

In [0]:
# gpu related init
# Seed the CPU random number generator and the generators of all CUDA devices.
torch.manual_seed(7)
torch.cuda.manual_seed_all(7)

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

General variables


In [0]:
# Notebook-wide settings. Fields marked "not read" are not used by any cell
# visible in this notebook (they may be consumed by the local modules).
args = argparse.Namespace(
  data = 0,                # dataset id: passed to advloaddata and embedded in output file names
  charlength = 1014,       # only overwritten (to 300) for "smallcharrnn"; otherwise not read
  wordlength = 500,        # passed as datalen when loading the data; also part of the adv sample file name
  space = False,           # not read
  trans = False,           # not read
  backward = -1,           # not read
  batchsize = 128,         # batch size of every DataLoader
  maxnorm = 400,           # not read
  maxbatches = None,       # upper bound on attacked batches (None = attack every batch)
  advsamplepath = None,    # output file of the attack; None = derive the name from the attack settings
  externaldata = '',       # not read
  dictionarysize = 20000,  # passed as nb_words to the tokenizer; also bounds the index2word table
  lr = 0.0005,             # Adam learning rate
  epochs = 10,             # the training loop iterates over range(epochs + 1)
  power = 1                # number of words modified per sample during the attack
)

# Verbosity switches consumed by debug(): level 1 and level 2 messages.
DEBUG_L1 = True
DEBUG_L2 = True

# Model types to train and, position by position, the kind of input each one takes.
models = ["bilstm","simplernn","wordcnn"]
datatypes = ["word","word","word"]

Helper functions

In [0]:
def debug(message, b):
  """
    Conditionally prints a debugging message.

    Parameters:
    message: the object to print (anything python can print)
    b: flag of the debugging level; the message is printed only when it is truthy
  """
  if not b:
    return
  print(message)

def save_checkpoint(state, is_best, filename='checkpoint.dat'):
  """
    Saves a trained model.

    Parameters:
    state: a dictionary object containing state info on the model.
    is_best: boolean to make an extra copy of the best model.
    filename: path prefix of the output files. The state is written to
      filename + '_checkpoint.dat' and, when is_best is true, copied to
      filename + '_bestmodel.dat'.
  """
  checkpoint_path = filename + '_checkpoint.dat'

  # torch.save does not create missing folders (e.g. "models/"), so make sure
  # the target directory exists before writing.
  target_dir = os.path.dirname(checkpoint_path)
  if target_dir:
    os.makedirs(target_dir, exist_ok=True)

  torch.save(state, checkpoint_path)
  if is_best:
    shutil.copyfile(checkpoint_path, filename + "_bestmodel.dat")

def plot_roc(fpr_l, tpr_l, auc_l):
  """
    Draws several ROC curves on one figure and writes it to "training_rocs.png".

    Parameters:
    fpr_l: list of false positive rates, one entry per curve (x values)
    tpr_l: list of true positive rates, one entry per curve (y values)
    auc_l: list of area-under-curve values, one per curve; its length decides
      how many curves are drawn.
  """
  line_width = 2
  palette = ['darkorange','green','magenta']
  display_names = ["Bi-LSTM","Simple RNN","Simple CNN"]

  plt.figure()
  for idx, area in enumerate(auc_l):
    legend_entry = '%s (area = %0.2f)' % (display_names[idx], area)
    plt.plot(fpr_l[idx], tpr_l[idx], color=palette[idx],
      lw=line_width, label=legend_entry)

  # dashed diagonal from (0, 0) to (1, 1) as a visual reference
  plt.plot([0, 1], [0, 1], color='navy', lw=line_width, linestyle='--')

  plt.xlim([0.0, 1.0])
  plt.ylim([0.0, 1.05])
  plt.xlabel('False Positive Rate')
  plt.ylabel('True Positive Rate')
  plt.title('Receiver operating characteristic for models')
  plt.legend(loc="lower right")
  plt.savefig("training_rocs.png")

Loading data


In [0]:
debug("Loading data..", DEBUG_L1)
# data for word models
# Unpacks three splits (train/adv/test), the tokenizer, the class count and the raw
# counterpart of every split (requested via withraw = True). The exact contents are
# defined by advloaddata, which is not visible in this notebook.
(train, adv, test, tokenizer, numclass, rawtrain, rawadv, rawtest) = advloaddata.loaddatawithtokenize(args.data, nb_words = args.dictionarysize, datalen = args.wordlength, withraw = True)
# word -> index table of the tokenizer; used later to build index2word for the attack
word_index = tokenizer.word_index
# getidx = True together with rawdata: batches are unpacked downstream as
# (inputs, target, idx, raw)
trainword_set = dataloader.Worddata(train, getidx = True, rawdata = rawtrain)
advword_set = dataloader.Worddata(adv, getidx = True, rawdata = rawadv)
testword_set = dataloader.Worddata(test, getidx = True, rawdata = rawtest)
# train/adv loaders are shuffled; the test loader keeps the dataset order
trainword_loader = DataLoader(trainword_set, batch_size = args.batchsize, num_workers = 4, shuffle = True)
advword_loader = DataLoader(advword_set, batch_size = args.batchsize, num_workers = 4, shuffle = True)
testword_loader = DataLoader(testword_set, batch_size = args.batchsize, num_workers = 4)
# NOTE(review): maxlength is not read by any other cell visible in this notebook
maxlength =  args.wordlength

# feedback on loading
debug("Data loaded.", DEBUG_L1)
debug("Size of training set: %d" % len(trainword_set.inputs), DEBUG_L1)
debug("Size of test set: %d" % len(testword_set.inputs), DEBUG_L1)
debug("Size of adversarial set: %d" % len(advword_set.inputs), DEBUG_L1)

In [0]:
# get basic statistics on dataset
train_labels = trainword_set.labels
test_labels = testword_set.labels
adv_labels = advword_set.labels

debug("====== train ======", DEBUG_L1)
debug("Total number of instances: %d" % len(train_labels), DEBUG_L1)
debug("Total number of positive class instances: %d" % len(train_labels[train_labels == 1]), DEBUG_L1)
debug("Total number of negative class instances: %d" % len(train_labels[train_labels == 0]), DEBUG_L1)

debug("====== test ======", DEBUG_L1)
debug("Total number of instances: %d" % len(test_labels), DEBUG_L1)
debug("Total number of positive class instances: %d" % len(test_labels[test_labels == 1]), DEBUG_L1)
debug("Total number of negative class instances: %d" % len(test_labels[test_labels == 0]), DEBUG_L1)

debug("====== adv ======", DEBUG_L1)
debug("Total number of instances: %d" % len(adv_labels), DEBUG_L1)
debug("Total number of positive class instances: %d" % len(adv_labels[adv_labels == 1]), DEBUG_L1)
debug("Total number of negative class instances: %d" % len(adv_labels[adv_labels == 0]), DEBUG_L1)

In [0]:
def get_model(model_name, numclass):
  """
    Generates a model of a specified kind and number of output classes.

    Returns:
    Generated model moved to the global `device` (CUDA if available).

    Parameters:
    model_name: type of model to generate; one of "charcnn", "simplernn",
      "bilstm", "smallcharrnn" or "wordcnn".
    numclass: number of output classes of the model

    Raises:
    ValueError: if model_name is not one of the supported types.
  """
  # Constructors are wrapped in lambdas so that only the requested model is built.
  builders = {
    "charcnn": lambda: model.CharCNN(classes = numclass),
    "simplernn": lambda: model.smallRNN(classes = numclass),
    "bilstm": lambda: model.smallRNN(classes = numclass, bidirection = True),
    "smallcharrnn": lambda: model.smallcharRNN(classes = numclass),
    "wordcnn": lambda: model.WordCNN(classes = numclass),
  }
  if model_name not in builders:
    # An unknown name used to fall through every branch and fail further down
    # with an unrelated UnboundLocalError.
    raise ValueError("Unknown model type %r; expected one of %s" % (model_name, sorted(builders)))

  model_instance = builders[model_name]().to(device)
  debug(model_instance, DEBUG_L1)

  return model_instance

def train_model(model_name, model, train_loader, test_loader):
  """
    Trains a given model on train_loader and tests it on test_loader in every epoch.
    A checkpoint is written after every epoch through save_checkpoint, and the
    epoch with the best test accuracy so far is additionally kept as the best model.

    Returns:
    the test-set predictions (numpy array) of the best model achieved.

    Parameters:
    model_name: name/type of model being trained (used to save the model state to a file)
    model: the model to train.
    train_loader: pytorch data loader for trainset; batches start with (inputs, target),
      any further fields (idx, raw) are ignored.
    test_loader: pytorch data loader for testset; same batch layout as train_loader.
  """
  optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)

  # The checkpoint prefix does not depend on the epoch, so build it once.
  if args.dictionarysize != 20000:
    fname = "models/advtrain_combined_swap_" + model_name + str(args.dictionarysize) + "_" + str(args.data)
  else:
    fname = "models/advtrain_combined_swap_" + model_name + "_" + str(args.data)

  bestacc = 0
  best_pred = None
  # epoch indices run from 0 to args.epochs inclusive
  for epoch in range(args.epochs+1):
    debug('Start epoch %d' % epoch, DEBUG_L1)
    model.train()
    for data in train_loader:
      # Index instead of unpacking so that 2-, 3- and 4-field batches all work.
      inputs, target = data[0].to(device), data[1].to(device)
      output = model(inputs)

      loss = F.nll_loss(output, target)

      optimizer.zero_grad()
      loss.backward()
      optimizer.step()

    correct = .0
    total_loss = 0
    model.eval()
    all_pred = []
    # No gradients are needed for evaluation.
    with torch.no_grad():
      for data in test_loader:
        inputs, target = data[0].to(device), data[1].to(device)
        output = model(inputs)
        # nll_loss returns the batch mean; weight it by the batch size so that
        # dividing by the dataset size below yields the per-sample average.
        total_loss += F.nll_loss(output, target).item() * inputs.size(0)
        pred = output.max(1, keepdim=True)[1]
        all_pred.append(pred.cpu())
        correct += pred.eq(target.view_as(pred)).sum().item()

    # Concatenating the whole list also works when the test set is a single batch.
    all_pred = torch.cat(all_pred, dim=0)

    acc = correct/len(test_loader.dataset)
    avg_loss = total_loss/len(test_loader.dataset)
    debug('Epoch %d : Loss %.4f Accuracy %.5f' % (epoch, avg_loss, acc), DEBUG_L1)
    debug('All pred dim: %d' % len(all_pred), DEBUG_L2)

    # The first evaluated epoch always counts as best, so best_pred is defined
    # even when the accuracy never rises above 0.
    is_best = best_pred is None or acc > bestacc
    if is_best:
        bestacc = acc
        best_pred = all_pred.numpy()

    save_checkpoint({'epoch': epoch + 1,
              'state_dict': model.state_dict(),
              'bestacc': bestacc,
              'optimizer' : optimizer.state_dict(),
          }, is_best, filename = fname)

  return best_pred

def test_model(model, data_loader):
  """
    Test a given model on the data provided.
    A fresh model of the requested type is built and initialised from the best
    checkpoint written by train_model.

    Returns:
    all_probs: column 1 of the model output for every sample, i.e. the score of the
      positive class (presumably a log-probability, since the training code feeds
      the model output to nll_loss).
    all_preds: class predictions (argmax of the model output), one row per sample.

    Parameters:
    model: name/type of the model to be tested (a string accepted by get_model).
    data_loader: a pytorch data loader of the data to be used for testing; the
      first field of every batch is taken as the inputs.
  """
  curr_model = get_model(model, numclass)

  # Same prefix that train_model uses when saving, so both stay in sync for any
  # args.data / args.dictionarysize.
  if args.dictionarysize != 20000:
    prefix = "models/advtrain_combined_swap_" + model + str(args.dictionarysize) + "_" + str(args.data)
  else:
    prefix = "models/advtrain_combined_swap_" + model + "_" + str(args.data)
  model_path = prefix + "_bestmodel.dat"
  # map_location lets a checkpoint saved on GPU be loaded on a CPU-only runtime.
  state = torch.load(model_path, map_location=device)

  try:
      curr_model.load_state_dict(state['state_dict'])
  except RuntimeError:
      # Presumably the checkpoint was saved from a DataParallel-wrapped model:
      # wrap the module (not its name), load, then unwrap again.
      curr_model = torch.nn.DataParallel(curr_model).to(device)
      curr_model.load_state_dict(state['state_dict'])
      curr_model = curr_model.module

  debug("Model ready.", DEBUG_L2)

  curr_model.eval()

  all_probs = []
  all_preds = []
  with torch.no_grad():
    for dataid, data in enumerate(data_loader):
        debug("Test instance #%d..." % dataid, DEBUG_L2)
        inputs = data[0].to(device)
        output = curr_model(inputs)
        all_probs.append(output[:, 1].cpu())
        all_preds.append(output.max(1, keepdim=True)[1].cpu())

  # Concatenating the whole lists also works when there is only one batch.
  all_probs = torch.cat(all_probs, dim = 0)
  all_preds = torch.cat(all_preds, dim = 0)

  return all_probs, all_preds

Train classifiers

In [0]:
def train_classifiers(train_loader, test_loader):
  """
    Builds and trains every model listed in the global 'models'.

    Parameters:
    train_loader: the training data.
    test_loader: test data used to evaluate each model after every epoch.
  """
  for position, kind in enumerate(models):
    debug("====== model #%d =========" % position, DEBUG_L2)
    debug(kind, DEBUG_L2)

    # minor modification for smallcharrnn: it uses a character length of 300
    if kind == "smallcharrnn":
        args.charlength = 300

    network = get_model(kind, numclass)

    # word and char models go through the same training routine; entries with
    # any other datatype are built but not trained
    if datatypes[position] in ('word', 'char'):
      train_model(kind, network, train_loader, test_loader)

train_classifiers(trainword_loader, testword_loader)


**The code below is still being refactored.**

In [0]:
def evaluate_classifiers(data_loader, labels):
  """
    Evaluates the stored classifiers on data_loader: logs the F1 score of each
    one to 'train_f1.txt' and plots all ROC curves through plot_roc.

    Parameters:
    data_loader: pytorch data loader with the samples to evaluate.
    labels: ground-truth labels, in the same order in which data_loader yields
      the samples (predictions and labels are paired by position).
  """
  fpr_l = []
  tpr_l = []
  auc_l = []

  # This local list shadows the global `models`, so the evaluated set does not
  # change when the global is rebound elsewhere in the notebook.
  models = ["bilstm","simplernn","wordcnn"]
  for model_name in models:
    debug("Loaded data.", DEBUG_L2)
    debug("Creating model...", DEBUG_L2)

    probs, preds = test_model(model_name, data_loader)

    f1 = f1_score(labels, preds)

    # Context manager: the handle is flushed and closed after every write.
    with open('train_f1.txt','a') as log:
      log.write('%d\t%s\t%.2f\n' % (args.data, model_name, 100*f1))

    debug('F1 score %.5f' % f1, DEBUG_L1)
    debug("Done testing. Computing ROC...", DEBUG_L2)

    fpr, tpr, _ = roc_curve(labels, probs)
    area = auc(fpr,tpr)

    fpr_l.append(fpr)
    tpr_l.append(tpr)
    auc_l.append(area)

  plot_roc(fpr_l, tpr_l, auc_l)
  debug("Done. ROC plot generated.", DEBUG_L2)


In [0]:
# NOTE(review): these two globals are not read by any cell visible in this notebook.
alltimebest = 0
bestfeature = []
def recoveradv(rawsequence, index2word, inputs, advwords):
    """
    Rebuilds the raw text of one sample with its words replaced by `advwords`.

    The token ids in `inputs` are visited from last to first. Each token's word is
    searched backwards in `rawsequence`, starting from the previous match, and
    the entry of `advwords` at the same position is spliced into a copy of the
    text. Tokens with an id below 3 only move the search position and are never
    replaced. A word that cannot be found is skipped.

    Parameters:
    rawsequence: the original text (string).
    index2word: dictionary mapping token ids to words.
    inputs: 1-D tensor of token ids of the sample.
    advwords: sequence with one replacement string per position of `inputs`.

    Returns:
    the text with the replacements applied. If an error occurs while
    processing, a message is printed and the text built so far is returned.
    """
    rear_ct = len(rawsequence)
    advsequence = rawsequence[:]
    try:
        for i in range(inputs.size()[0]-1,-1,-1):
            token_id = inputs[i].item()
            wordi = index2word[token_id]
            found_at = rawsequence[:rear_ct].rfind(wordi)
            if found_at < 0:
                # rfind signals "not found" with -1; using that as a position
                # would splice at the wrong place and derail later searches.
                continue
            rear_ct = found_at
            if token_id >= 3:
                # Replacements are applied back to front, so offsets before
                # rear_ct are still identical in rawsequence and advsequence.
                advsequence = advsequence[:rear_ct] + advwords[i] + advsequence[rear_ct + len(wordi):]
    except Exception:
        print('something went wrong')
    return advsequence
    
def attackword(model_name, model_instance, data_loader, scoring_fn, trans_fn, maxbatch = None):
    """
    Attacks a word-level model and stores the generated adversarial samples.

    For every batch the tokens are scored with the scoring function named
    `scoring_fn`. In each sample, the `args.power` highest-scored tokens with an
    id above 3 are rewritten with the transformer named `trans_fn`. The model is
    then evaluated on the modified batch. The accuracy is appended to
    'attack_log.txt' and the samples are saved with torch.save.

    Relies on the globals `index2word` and `word_index` being defined.

    Parameters:
    model_name: name of the model (used in the log and in the output file name).
    model_instance: the model under attack.
    data_loader: pytorch data loader yielding (inputs, target, idx, raw) batches.
    scoring_fn: name of the scoring function, resolved through scoring.scorefunc.
    trans_fn: name of the transformer, resolved through transformer.transform.
    maxbatch: maximum number of batches to attack (None = all batches).
    """
    corrects = .0
    model_instance.eval()
    wordinput = []
    tgt = []
    adv = []
    origsample = []
    origsampleidx = []

    for dataid, data in enumerate(data_loader):
        debug(dataid, DEBUG_L1)
        if maxbatch is not None and dataid >= maxbatch:
            break
        inputs, target, idx, raw = data
        inputs, target = inputs.to(device), target.to(device)
        origsample.append(inputs)
        origsampleidx.append(idx)
        tgt.append(target)
        wtmp = []
        output = model_instance(inputs)
        pred = torch.max(output, 1)[1].view(target.size())

        losses = scoring.scorefunc(scoring_fn)(model_instance, inputs, pred, numclass)

        # Only the ordering is needed; the sorted values themselves are unused.
        _, indices = torch.sort(losses, dim=1, descending=True)

        advinputs = inputs.clone()

        # wtmp[k][i]: word at position i of sample k ('' for ids up to 3)
        for k in range(inputs.size()[0]):
            wtmp.append([])
            for i in range(inputs.size()[1]):
                if advinputs[k,i].item()>3: # word beyond padding
                    wtmp[-1].append(index2word[advinputs[k,i].item()])
                else:
                    wtmp[-1].append('')
        # change important words
        debug("========= words changed ========", DEBUG_L2)
        for k in range(inputs.size()[0]):
            j = 0  # words changed so far in sample k
            t = 0  # rank in the score ordering currently inspected
            while j < args.power and t<inputs.size()[1]:
                if advinputs[k,indices[k][t]].item()>3:
                  word, advinputs[k,indices[k][t]] = transformer.transform(trans_fn)(advinputs[k,indices[k][t]].item(), word_index, index2word, top_words = args.dictionarysize)
                  wtmp[k][indices[k][t]] = word
                  debug(word, DEBUG_L2)
                  j+=1
                t+=1
        adv.append(advinputs)

        output2 = model_instance(advinputs)
        pred2 = torch.max(output2, 1)[1].view(target.size())

        corrects += (pred2 == target).sum().item()
        # Keep the reconstructed text of every sample whose prediction flipped.
        for i in range(len(wtmp)):
          if pred[i].item() != pred2[i].item():
            debug(raw[i], DEBUG_L2)
            debug(pred[i].item(), DEBUG_L2)
            wordinputi = recoveradv(raw[i], index2word, inputs[i], wtmp[i])
            debug(wordinputi, DEBUG_L2)
            wordinput.append(wordinputi)
            debug(pred2[i].item(), DEBUG_L2)

    target = torch.cat(tgt)
    advinputs = torch.cat(adv)
    origsamples = torch.cat(origsample)
    origsampleidx = torch.cat(origsampleidx)
    acc = corrects/advinputs.size(0)
    debug('Accuracy %.5f' % acc, DEBUG_L1)
    # Context manager: the log entry is flushed and the handle closed right away.
    with open('attack_log.txt','a') as f:
        f.write('%d\t%d\t%s\t%s\t%s\t%d\t%.2f\n' % (args.data, args.wordlength, model_name, scoring_fn, trans_fn, args.power, 100*acc))
    if args.advsamplepath is None:
        advsamplepath = 'advsamples/test_%s_%d_%s_%s_%d_%d.dat' % (model_name, args.data, scoring_fn, trans_fn, args.power, args.wordlength)
    else:
        advsamplepath = args.advsamplepath
    torch.save({'original':origsamples, 'sampleid':origsampleidx, 'wordinput':wordinput, 'advinputs':advinputs, 'labels':target}, advsamplepath)


In [0]:
# load adv examples
# NOTE: advData is defined in the next cell of this notebook; that cell has to be
# executed before this one.

# NOTE(review): adv_paths is not read by any cell visible in this notebook.
adv_paths = ["advsamples/bilstm_0_combined_swap_1_500.dat",
         "advsamples/bilstm_0_combined_homoglyph_1_500.dat",
         "advsamples/bilstm_0_random_swap_1_500.dat",
         "advsamples/bilstm_0_random_homoglyph_1_500.dat"]

path = "advsamples/test_bilstm_0_combined_homoglyph_1_500.dat"
data = torch.load(path)

debug("Loaded the following data items: %s" % data.keys(), DEBUG_L2)

inputs = data['advinputs'].cpu()
labels = data['labels'].cpu()

debug(inputs, DEBUG_L2)

debug("Loaded adv samples of size: %d" % len(inputs), DEBUG_L2)
debug("Loaded target data of size: %d" % len(labels), DEBUG_L2)

adv_set = advData(inputs,labels)

# The loader must keep the dataset order: evaluate_classifiers pairs the
# predictions with `labels` by position, so shuffling would scramble the metrics.
adv_loader = DataLoader(adv_set, batch_size = args.batchsize, num_workers = 4, shuffle = False)


In [0]:
class advData(Dataset):
    """
    Dataset over already prepared inputs and labels (e.g. adversarial samples).

    Items are (x, y) by default. With getidx = True they are (x, y, idx), or
    (x, y, idx, raw[idx]) when raw data was supplied.
    """
    def __init__(self, inputs, outputs, tokenizer = True, length=1014, space = False, backward = -1, getidx = False, rawdata = None):
        """
        Parameters:
        inputs: indexable collection of samples; stored as given.
        outputs: indexable collection of labels; stored as given.
        tokenizer, space: accepted but not used by this class.
        length, backward: stored on the instance; not used by this class.
        getidx: when True, items also carry their index (and raw data if present).
        rawdata: optional indexable collection with the raw form of every sample.
        """
        self.backward = backward
        self.length = length
        (self.inputs,self.labels) = (inputs,outputs)
        self.getidx = getidx
        # Always define self.raw so __getitem__ can test it. Missing or empty
        # raw data is normalised to None; the explicit length test also works
        # for arrays and tensors, whose truth value is ambiguous.
        if rawdata is not None and len(rawdata) > 0:
            self.raw = rawdata
        else:
            self.raw = None
    def __len__(self):
        return len(self.inputs)
    def __getitem__(self,idx):
        x = self.inputs[idx]
        y = self.labels[idx]
        if self.getidx==True:
            if self.raw is not None:
                return x,y,idx,self.raw[idx]
            else:
                return x,y,idx
        else:
            return x,y


In [0]:
# Build an adversarially augmented training set: the original training samples
# followed by the adversarial samples stored in `path`.
# NOTE(review): the next cell repeats these steps with a different `path` and
# rebinds adv_train_set / adv_train_loader, so when the notebook is run top to
# bottom the objects created here are replaced.
# NOTE(review): `paths` is not read by any cell visible in this notebook.
paths = ["advsamples/bilstm_0_combined_swap_1_500.dat", 
         "advsamples/bilstm_0_combined_homoglyph_1_500.dat",
         "advsamples/bilstm_0_random_swap_1_500.dat",
         "advsamples/bilstm_0_random_homoglyph_1_500.dat"]

path = "advsamples/bilstm_0_random_homoglyph_1_500.dat"
data = torch.load(path)

debug("Loaded the following data items: %s" % data.keys(), DEBUG_L2)

# move the stored tensors to the CPU before wrapping them in a dataset
inputs = data['advinputs'].cpu()
labels = data['labels'].cpu()

debug(inputs, DEBUG_L2)

debug("Loaded adv samples of size: %d" % len(inputs), DEBUG_L2)
debug("Loaded target data of size: %d" % len(labels), DEBUG_L2)

adv_set = advData(inputs,labels)

# adv_loader = DataLoader(adv_set, batch_size = args.batchsize, num_workers = 4, shuffle = True)

# augment adversarial set
# clone so that the tensors of trainword_set itself stay untouched
adv_train_set_inputs = trainword_set.inputs.clone()
adv_train_set_labels = trainword_set.labels.clone()

debug("Train samples of size: %d" % len(adv_train_set_inputs), DEBUG_L2)
debug("Train target samples of size: %d" % len(adv_train_set_labels), DEBUG_L2)

# append the adversarial samples (and their labels) after the training samples
adv_train_set_inputs = torch.cat((adv_train_set_inputs, adv_set.inputs), dim=0)
adv_train_set_labels = torch.cat((adv_train_set_labels, adv_set.labels), dim=0)

debug("Adv train samples of size: %d" % len(adv_train_set_inputs), DEBUG_L2)
debug("Adv train target samples of size: %d" % len(adv_train_set_labels), DEBUG_L2)

# getidx keeps its default (False), so this dataset yields plain (x, y) pairs
adv_train_set = advData(adv_train_set_inputs, adv_train_set_labels)

adv_train_loader = DataLoader(adv_train_set, batch_size = args.batchsize, num_workers = 4, shuffle = True)


In [0]:
 import copy

# Build the adversarially augmented training set from the "combined homoglyph"
# samples: the original training samples followed by the adversarial samples
# stored in `path`. This rebinds adv_set, adv_train_set and adv_train_loader.
# NOTE(review): `paths` is not read by any cell visible in this notebook.
paths = ["advsamples/bilstm_0_combined_swap_1_500.dat", 
         "advsamples/bilstm_0_combined_homoglyph_1_500.dat",
         "advsamples/bilstm_0_random_swap_1_500.dat",
         "advsamples/bilstm_0_random_homoglyph_1_500.dat"]

path = "advsamples/bilstm_0_combined_homoglyph_1_500.dat"
data = torch.load(path)

debug("Loaded the following data items: %s" % data.keys(), DEBUG_L2)

# move the stored tensors to the CPU before wrapping them in a dataset
inputs = data['advinputs'].cpu()
labels = data['labels'].cpu()

debug(inputs, DEBUG_L2)

debug("Loaded adv samples of size: %d" % len(inputs), DEBUG_L2)
debug("Loaded target data of size: %d" % len(labels), DEBUG_L2)

adv_set = advData(inputs,labels)

# adv_loader = DataLoader(adv_set, batch_size = args.batchsize, num_workers = 4, shuffle = True)

# augment adversarial set
# clone so that the tensors of trainword_set itself stay untouched
adv_train_set_inputs = trainword_set.inputs.clone()
adv_train_set_labels = trainword_set.labels.clone()

debug("Train samples of size: %d" % len(adv_train_set_inputs), DEBUG_L2)
debug("Train target samples of size: %d" % len(adv_train_set_labels), DEBUG_L2)

# append the adversarial samples (and their labels) after the training samples
adv_train_set_inputs = torch.cat((adv_train_set_inputs, adv_set.inputs), dim=0)
adv_train_set_labels = torch.cat((adv_train_set_labels, adv_set.labels), dim=0)

debug("Adv train samples of size: %d" % len(adv_train_set_inputs), DEBUG_L2)
debug("Adv train target samples of size: %d" % len(adv_train_set_labels), DEBUG_L2)

# getidx keeps its default (False), so this dataset yields plain (x, y) pairs
adv_train_set = advData(adv_train_set_inputs, adv_train_set_labels)

adv_train_loader = DataLoader(adv_train_set, batch_size = args.batchsize, num_workers = 4, shuffle = True)




In [0]:
# attack models
# NOTE: this rebinds the global `models` / `datatypes` lists for the rest of the notebook.
models = ["bilstm"]
datatypes = ["word"]

# transformations and scoring functions to try, per input kind
transformers = {
    'word': ["homoglyph"]
}

scorings = {
    'word': ["combined"]
}

# transformers = {
#     'word': ["swap","flip","f2","insert","remove","r2","homoglyph"],
#     'char': ["remove","flip","homoglyph"]
# }

# scorings = {
#     'word': ["temporal","tail","combined","replaceone","random","ucgrad","grad"],
#     'char': ["temporal","tail","combined","replaceone","random","grad"]
# }

torch.manual_seed(8)
torch.cuda.manual_seed(8)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Token id -> word table, read as a global by attackword. Ids 0-3 are reserved
# and the tokenizer ranks are shifted by 3. The table does not depend on the
# attacked model, so it is built once instead of once per attack.
index2word = {0: '[PADDING]', 1: '[START]', 2: '[UNKNOWN]', 3: ''}
for word, rank in word_index.items():
  if rank + 3 < args.dictionarysize:
    index2word[rank + 3] = word

debug("Loading model state...", DEBUG_L2)

for model_name, datatype in zip(models, datatypes):
  for trans_fn in transformers[datatype]:
    for scoring_fn in scorings[datatype]:

      # get_model already moves the instance to `device`
      model_instance = get_model(model_name, numclass)
      model_path = "models/"+model_name+"_0_bestmodel.dat"
      # map_location lets a checkpoint saved on GPU be loaded on a CPU-only runtime
      state = torch.load(model_path, map_location=device)

      try:
          model_instance.load_state_dict(state['state_dict'])
      except RuntimeError:
          # Presumably the checkpoint comes from a DataParallel-wrapped model:
          # wrap, load, then unwrap again.
          model_instance = torch.nn.DataParallel(model_instance).to(device)
          model_instance.load_state_dict(state['state_dict'])
          model_instance = model_instance.module

      debug("Model state loaded...", DEBUG_L2)

      debug("Attacking model...", DEBUG_L2)
      if datatype == "char":
        # NOTE(review): attackchar and advchar_loader are not defined in the
        # visible part of this notebook; this branch is unreachable with the
        # settings above.
        attackchar(model_name, model_instance, advchar_loader, scoring_fn, trans_fn, maxbatch = args.maxbatches)
      elif datatype == "word":
        attackword(model_name, model_instance, testword_loader, scoring_fn, trans_fn, maxbatch = args.maxbatches)


# **Stage 1:** Regular detection


In [0]:
# code below will generate AUC-ROC curves and F1-scores 
# for the performance of all classifiers on test data.
# The labels must belong to the evaluated loader: use the labels of the test
# split (testword_loader is not shuffled), not the global `labels`, which holds
# the labels loaded together with the adversarial samples.

evaluate_classifiers(testword_loader, test_labels)

# **Stage 2:** Adversarial attack

In [0]:
# Evaluate the classifiers on the adversarial samples. `labels` has to be in the
# order in which adv_loader yields its samples.
# NOTE(review): adv_loader was built from
# "advsamples/test_bilstm_0_combined_homoglyph_1_500.dat", while `labels` is
# rebound by later cells that load other files - confirm they hold the same
# labels in the same order.
evaluate_classifiers(adv_loader, labels)

# **Stage 3:** Retraining on adversarially augmented data

In [0]:
# Retrain on the augmented training set. The loader built above is named
# adv_train_loader; `adv_trainword_loader` is not defined anywhere in this notebook.
train_classifiers(adv_train_loader, testword_loader)

# **Stage 4:** Evaluating performance on adversarial sample after retraining 

In [0]:
# Re-evaluate on the adversarial samples after retraining. evaluate_classifiers
# loads the "advtrain_combined_swap_*" best-model checkpoints written by
# train_model, so this run reflects the models trained in the previous stage.
# `labels` has to be in the order in which adv_loader yields its samples.
evaluate_classifiers(adv_loader, labels)