In [1]:
%matplotlib inline
import sys, os, pdb, warnings, torch, time
sys.path.insert(0, './core/')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from functools import partial
from itertools import product
from copy import deepcopy
from torch.utils.data import DataLoader, Dataset

from minilib import *
from utmLib import utils
from utmLib.clses import Logger

np.set_printoptions(suppress=True, linewidth=120, precision=4)
pd.set_option('display.max_columns', 15)
pd.set_option('display.width', 1000)
plt.rcParams["figure.figsize"] = [10,2]

# this is mainly for papermill parameter detection
try:
    dummy_x89757 = data_name
except:
    data_name = 'parkinson'
    log_file = 'results/exp.log'
    G_delta = 0.5
    testing = 0

logger = Logger(log_file)
res_log = Logger(log_file + '.res', with_time = False)


# Utility functions 

In [2]:
def visualize_imgs(img_array, rows = 2, cols = 8, selected = None, label_array = None):
    if img_array[0].size == 196:
        process = lambda x:x.reshape(14,14)
    else:
        process = lambda x:x.reshape(28,28)
    
    # random select some examples for display if not specified
    if selected is None:
        assert(img_array.shape[0] >= rows * cols)
        selected = np.random.choice(img_array.shape[0], rows * cols, replace = False)
    else:
        assert(selected.size >= rows * cols)
    
    k = 1
    labels = []
    fid = plt.figure()
    
    for i in range(rows):
        for j in range(cols):
            plt.subplot(rows, cols, k)
            plt.imshow(process(img_array[selected[k-1]]), cmap='gray' )
            if label_array is not None:
                labels.append(label_array[selected[k-1]])
            k += 1
            plt.axis('off')
    
    if len(labels):
        print(labels)
    plt.show()

In [3]:
def adversial_step(masses, delta = 0.5):
    
    def get_w(alpha):
        # w = np.exp( -masses / alpha )
        # w = w / np.sum(w) * N
        # return w
        vec = -masses / alpha
        vec -= max(vec)
        w = np.exp(vec)
        w = w/w.sum()
        w *= N
        return w
        
    
    def valid(alpha):
        w = get_w(alpha)
        return np.log(np.power(w,w)).sum() <= M
    
    # use the idea of binary search, time complexity O(N * lgN)
    N = masses.size
    M = N * delta
    l = 0.1
    r = 2 ** 10
    
    # need to gurantee that r is big enough 
    while not valid(r):
        r = r * 2
    
    # binary search a valid alpha in range [l,r] 
    while r - l > 1e-2:
        m = (l+r) / 2

        if valid(m):
            r = m 
        else:
            l = m
            
    return get_w(r)


In [4]:
def fit_weighted_mg(data, R):
    # mg allows exact close form solution
    N, D = data.shape 
    mg = MultivariateGaussain()
    R = R.reshape(N, 1)
    mg.mu = np.mean( R * data, axis = 0)
    
    mat = data - mg.mu.reshape(1, D)
    mat2 = mat.copy()
    mat2 = mat2 * R / N
    S2 = mat.T @ mat2
    
    mg.S = S2
    return mg


In [5]:
def five_number_statistic(logmass):
    p25, median, p75 = np.percentile(logmass, [25,50,75])
    average = np.mean(logmass)
    std = np.std(logmass)
    ret = (p25, median, p75, average, std)
    return list(np.round(ret, 4))

# training and experiment helpers 

In [6]:
def compute_neighbor_ll(model, data, seed = 7, num_nb = 500, eps = 0.2):
    '''
    compute the worst and average ll for each sample in the data 
    '''
    N, D = data.shape 
    rng = np.random.RandomState(seed)
    
    results = [] 
    for x in data:
        pertubation = rng.rand(num_nb, D)
        pertubation = (pertubation - 0.5) * 2 * eps
        nb_data = x.reshape(1,-1) + pertubation
        nb_ll = model.mass(nb_data, logmode = True)

        # we only care about the worst and average 
        results.append([np.min(nb_ll), np.mean(nb_ll)])

    min_nb_ll, avg_nb_ll = list(zip(*results))
    return min_nb_ll, avg_nb_ll
    

In [7]:
import random 

def evaluate(model, dataset):
    avg_stats = []
    for name, test_X in zip(['uncorrupted', 'Gaussian-Noise', 'Random-jitter'], dataset.test):
        cur_mass = model.mass(test_X, logmode = True)
        mass_stat = five_number_statistic(cur_mass)
        logger.write('{:<20} p25:{} median:{} p75:{} Mean:{} Std:{}'.format(name, *mass_stat))
        avg_stats.append( str(mass_stat[3]) )

    for cur_mass, name in zip(compute_neighbor_ll(model, dataset.test[0]),
                            ['Worst NB LL','Avg NB LL']):
                                
        mass_stat = five_number_statistic(cur_mass)
        logger.write('{:<20} p25:{} median:{} p75:{} Mean:{} Std:{}'.format(name, *mass_stat))
        avg_stats.append( str(mass_stat[3]) )
    
    res_log.write(','.join(avg_stats), echo = 0)
    
        

# Experiement of Loglikelihoods

In [8]:
# define some common parameters across experiments
from utmLib.ml.GBN import GBN
from models.ours.Gaussians import MultivariateGaussain
from utmLib.clses import MyObject
############################################################
np.random.seed(7)

# Gloable variables
G_value_scale = 0.2          # the scale we mess up the test set
TRAINING_RATIO = 0.85
VALID_RATIO = 0.2
EPS = 1e-2
VERBOSE = 1
############################################################

# all dataset meta-data specify
default_options = MyObject()
default_options.root_dir = '/home/leondong/proj/robust/dataset/'
default_options.down_sample = False
default_options.with_label = False
default_options.num_per_class = 1000
default_options.normalize = True
default_options.transform = None

dataset_names = [('mnist', 'mnist/'), ('airquality','airquality/AirQualityUCI.csv'), 
                 ('parkinson', 'parkinson/parkinsons_updrs.data'),
                ('energy','energy/data.csv'), ('hepmass','hepmass/hepmass.h5'),
                ('miniboone', 'miniboone/miniboone.h5'), 
                 ('onlinenews','onlinenews/OnlineNewsPopularity.csv'),
                ('superconduct','superconduct/data.csv'),
                ('sdd', 'SDD/Sensorless_drive_diagnosis.txt')]

loader_options = MyObject()
for name, path in dataset_names: 
    l_op = deepcopy(default_options)
    l_op.data_path = path
    loader_options[name] = l_op

# custom field for some dataset
loader_options.mnist.transform = './output/vae-mnist-e250-d20.pkl'
    

In [9]:
import importlib
from sklearn.preprocessing import StandardScaler
# load the given dataset
cur_options = loader_options[data_name]
loader_module = importlib.import_module('loaders.{}'.format(data_name))
dataset = loader_module.load_data(cur_options)

# train, test split
if len(dataset) == 2:
    train, test = dataset
else:
    np.random.shuffle(dataset)
    n_train = int(dataset.shape[0] * TRAINING_RATIO)
    train = dataset[: n_train]
    test = dataset[n_train:]

# convert to float32 type
train = train.astype('f4')
test = test.astype('f4')

# handle test case 
if testing:
    train = train[:1000]
    test = test[:100]

if cur_options.transform is None:
    # do standardize
    scaler = StandardScaler().fit(train)
    train, test = [scaler.transform(x) for x in [train,test]]
      
# shuffle the train split
assert(len(train.shape) == 2), "Data size does not match"
np.random.shuffle(train)
    
# train, valid split
n_valid = int(train.shape[0] * VALID_RATIO)
valid = train[:n_valid]
train = train[n_valid:]

# random messup the test
test_gaussian = gaussian_noise(test, 1, G_value_scale)
test_pj = pixel_jitter(test, 0.25, -G_value_scale, G_value_scale)

# conduct transformation 
if cur_options.transform is not None:
    model = utils.pkload(cur_options.transform)
    model.model.to('cpu')
    model.device = 'cpu'
    train,valid,test,test_gaussian,test_pj = [model.transform(data) 
                             for data in [train,valid,test,test_gaussian,test_pj]]
    

# wrap up the dataset together
dataset = MyObject()
dataset.train = train
dataset.valid = valid
# dataset.valid = np.vstack([valid, gaussian_noise(valid, 1, G_value_scale)])
dataset.test = [test, test_gaussian, test_pj]

logger.write(f'{data_name} load complete!')

Loading parkinson data .....
parkinson load complete!


# Main

In [10]:
train_conf = MyObject()
train_conf.final_iter = 75
train_conf.update_iter = 25
train_conf.n_step = 150
train_conf.delta = float(G_delta)
train_conf.n_comp = 3

if testing:
    train_conf.n_iter = 10

In [11]:
# auto determine the number of components from 2-10
std_best = (-1e50, None)
adv_best = (-1e50, None)

for n_comp in range(3, 10):
    print(f'working on {n_comp} ....')
    R = np.ones( dataset.train.shape[0] )
    # normal mixmg
    learner =  MixMGLearner(max_iter = train_conf.final_iter + train_conf.update_iter, 
                            n_components = n_comp,
                            reg_covar = 1e-4).fit(dataset.train)
    score = learner.get_model().mass(dataset.valid, logmode = True).mean()
    std_best = ( score, deepcopy(learner) ) if score > std_best[0] else std_best
    
    # adv mixmg
    for i in range(train_conf.n_step):
        masses = learner.get_model().mass(dataset.train, logmode = True)
        R = adversial_step(masses, train_conf.delta)
        cur_iter = train_conf.update_iter + ( train_conf.final_iter if i+1 == train_conf.n_step else 0 )
        for _ in range( cur_iter ):
            learner._estep()
            learner._mstep(dataset.train, R)
        score = learner.get_model().mass(dataset.valid, logmode = True).mean()
        adv_best = ( score, deepcopy(learner) ) if score > adv_best[0] else adv_best

evaluate(std_best[1].get_model(), dataset)
print('-' * 50)
evaluate(adv_best[1].get_model(), dataset)

working on 2 ....


  return np.log(np.power(w,w)).sum() <= M


working on 3 ....


  return np.log(np.power(w,w)).sum() <= M


working on 4 ....


  return np.log(np.power(w,w)).sum() <= M


working on 5 ....


  return np.log(np.power(w,w)).sum() <= M


working on 6 ....
working on 7 ....


  return np.log(np.power(w,w)).sum() <= M


working on 8 ....


  return np.log(np.power(w,w)).sum() <= M


working on 9 ....


  return np.log(np.power(w,w)).sum() <= M


uncorrupted          p25:-6.4046 median:-2.1184 p75:0.9453 Mean:-3.8103 Std:11.1338
Gaussian-Noise       p25:-19.1105 median:-15.2832 p75:-11.8463 Mean:-16.1342 Std:9.8582
Random-jitter        p25:-23.3147 median:-14.8339 p75:-8.2782 Mean:-20.4697 Std:33.9389
Worst NB LL          p25:-22.504 median:-19.6241 p75:-17.2737 Mean:-21.2031 Std:13.2773
Avg NB LL            p25:-11.7273 median:-9.0438 p75:-7.4841 Mean:-10.5703 Std:9.9372
--------------------------------------------------
uncorrupted          p25:-7.9398 median:-5.0228 p75:-1.9355 Mean:-5.4674 Std:7.1436
Gaussian-Noise       p25:-17.312 median:-13.1382 p75:-10.068 Mean:-14.299 Std:5.916
Random-jitter        p25:-23.3448 median:-12.8353 p75:-7.6644 Mean:-15.8827 Std:10.7533
Worst NB LL          p25:-21.1918 median:-17.6062 p75:-15.3712 Mean:-18.8252 Std:4.6656
Avg NB LL            p25:-11.0902 median:-8.6016 p75:-7.0249 Mean:-10.0709 Std:5.0567
