In [1]:

from __future__ import absolute_import, division, print_function

import pprint
import argparse
import logging
import os
import random
import sys
import pickle
import copy
import collections
import math

import numpy as np
import numpy
import torch
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler,TensorDataset
os.environ["CUDA_VISIBLE_DEVICES"]="0" # Set GPU Index to use
from torch.nn import CrossEntropyLoss, MSELoss

from transformer import BertForSequenceClassification,WEIGHTS_NAME, CONFIG_NAME
from transformer.modeling_quant import BertForSequenceClassification as QuantBertForSequenceClassification
from transformer import BertTokenizer
from transformer import BertAdam
from transformer import BertConfig
from transformer import QuantizeLinear, QuantizeAct, BertSelfAttention, FP_BertSelfAttention, ClipLinear, BertAttention, FP_BertAttention
from utils_glue import *
from bertviz import model_view

from tqdm import tqdm
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import torch.nn.functional as F

class AverageMeter(object):
    """Track the most recent value and the running weighted average of a metric.

    The statistics are exposed as plain attributes:
    ``val`` is the last value passed to ``update``, ``sum`` the weighted sum of
    all values, ``count`` the total weight and ``avg`` equals ``sum / count``.
    """

    def __init__(self):
        self.reset()

    def reset(self):
        """Set every statistic back to zero."""
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        """Record ``val`` with weight ``n`` and refresh the running average.

        Args:
            val: The new measurement.
            n: Weight of the measurement (how many samples it stands for).
        """
        self.val = val
        self.sum += val * n
        self.count += n
        # A zero total weight (e.g. update(x, n=0) right after reset) used to
        # raise ZeroDivisionError; keep the average at 0 in that case.
        self.avg = self.sum / self.count if self.count else 0

def do_eval(model, task_name, eval_dataloader,
            device, output_mode, eval_labels, num_labels, teacher_model=None):
    """Run ``model`` over ``eval_dataloader`` and compute the task metrics.

    Args:
        model: Called as ``model(input_ids, segment_ids, input_mask)``; it must
            return five values, the first of which is used as the logits.
        task_name: Task key forwarded to ``compute_metrics``.
        eval_dataloader: Iterable of 5-tuples
            ``(input_ids, input_mask, segment_ids, label_ids, seq_lengths)``.
        device: Device every tensor of a batch is moved to.
        output_mode: ``"classification"`` or ``"regression"``.
        eval_labels: Tensor of gold labels, passed to ``compute_metrics``
            together with the predictions.
        num_labels: Number of classes, used to reshape classification logits.
        teacher_model: Optional teacher. When given, its five outputs are
            forwarded to ``model`` through the ``teacher_outputs`` keyword.

    Returns:
        The result of ``compute_metrics`` with an added ``'eval_loss'`` entry
        holding the mean per-batch loss.

    Raises:
        ValueError: If ``output_mode`` is not supported or the dataloader
            yields no batches.
    """
    # Build the loss once instead of once per batch, and reject unknown modes
    # up front (previously this surfaced as an unbound `tmp_eval_loss`).
    if output_mode == "classification":
        loss_fct = CrossEntropyLoss()
    elif output_mode == "regression":
        loss_fct = MSELoss()
    else:
        raise ValueError("Unsupported output_mode: %r" % (output_mode,))

    eval_loss = 0
    nb_eval_steps = 0
    batch_logits = []

    for batch_ in tqdm(eval_dataloader, desc="Inference"):
        batch_ = tuple(t.to(device) for t in batch_)
        input_ids, input_mask, segment_ids, label_ids, seq_lengths = batch_

        with torch.no_grad():
            if teacher_model is not None:
                # Teacher forward pass first; all of its outputs are handed to the student.
                (teacher_logits, teacher_atts, teacher_reps,
                 teacher_probs, teacher_values) = teacher_model(input_ids, segment_ids, input_mask)
                logits, _, _, _, _ = model(
                    input_ids, segment_ids, input_mask,
                    teacher_outputs=(teacher_probs, teacher_values, teacher_reps,
                                     teacher_logits, teacher_atts))
            else:
                logits, _, _, _, _ = model(input_ids, segment_ids, input_mask)

            # Per-batch loss; the reshape depends on the task type.
            if output_mode == "classification":
                tmp_eval_loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1))
            else:
                tmp_eval_loss = loss_fct(logits.view(-1), label_ids.view(-1))

        eval_loss += tmp_eval_loss.mean().item()
        nb_eval_steps += 1
        # Collect per-batch logits and concatenate once after the loop;
        # repeated np.append re-copies the whole array on every batch.
        batch_logits.append(logits.detach().cpu().numpy())

    if nb_eval_steps == 0:
        raise ValueError("eval_dataloader yielded no batches")

    eval_loss = eval_loss / nb_eval_steps

    preds = np.concatenate(batch_logits, axis=0)
    if output_mode == "classification":
        preds = np.argmax(preds, axis=1)
    else:
        preds = np.squeeze(preds)
    result = compute_metrics(task_name, preds, eval_labels.numpy())
    result['eval_loss'] = eval_loss
    return result

# Task key -> processor class; the class is instantiated when a task is selected.
processors = dict([
    ("cola", ColaProcessor),
    ("mnli", MnliProcessor),
    ("mnli-mm", MnliMismatchedProcessor),
    ("mrpc", MrpcProcessor),
    ("sst-2", Sst2Processor),
    ("sts-b", StsbProcessor),
    ("qqp", QqpProcessor),
    ("qnli", QnliProcessor),
    ("rte", RteProcessor),
])

# Task key -> kind of prediction head: "classification" or "regression".
# "mnli-mm" is listed because `processors` registers it; without this entry
# `output_modes[task_name]` raises KeyError for that task.
output_modes = {
    "cola": "classification",
    "mnli": "classification",
    "mnli-mm": "classification",
    "mrpc": "classification",
    "sst-2": "classification",
    "sts-b": "regression",
    "qqp": "classification",
    "qnli": "classification",
    "rte": "classification",
}

# Per-task defaults: maximum sequence length, batch size and evaluation interval.
# "mnli-mm" mirrors "mnli" so that every task registered in `processors` has
# defaults (NOTE(review): values copied from "mnli" — confirm they are the intended ones).
default_params = {
    "cola": {"max_seq_length": 64, "batch_size": 16, "eval_step": 400},  # No Aug : 50 Aug : 400
    "mnli": {"max_seq_length": 128, "batch_size": 32, "eval_step": 8000},
    "mnli-mm": {"max_seq_length": 128, "batch_size": 32, "eval_step": 8000},
    "mrpc": {"max_seq_length": 128, "batch_size": 32, "eval_step": 20},
    "sst-2": {"max_seq_length": 64, "batch_size": 32, "eval_step": 100},
    "sts-b": {"max_seq_length": 128, "batch_size": 32, "eval_step": 100},
    "qqp": {"max_seq_length": 128, "batch_size": 32, "eval_step": 1000},
    "qnli": {"max_seq_length": 128, "batch_size": 32, "eval_step": 1000},
    "rte": {"max_seq_length": 128, "batch_size": 32, "eval_step": 100},
}

def get_tensor_data(output_mode, features):
    """Pack a list of feature objects into a ``TensorDataset``.

    Args:
        output_mode: ``"classification"`` (labels stored as ``torch.long``) or
            ``"regression"`` (labels stored as ``torch.float``).
        features: Sequence of objects exposing ``input_ids``, ``input_mask``,
            ``segment_ids``, ``label_id`` and ``seq_length``.

    Returns:
        A tuple ``(tensor_data, all_label_ids)``. ``tensor_data`` yields
        ``(input_ids, input_mask, segment_ids, label_id, seq_length)`` per
        example; ``all_label_ids`` is the label tensor on its own.

    Raises:
        ValueError: If ``output_mode`` is neither of the supported values
            (previously this surfaced as an UnboundLocalError).
    """
    if output_mode == "classification":
        label_dtype = torch.long
    elif output_mode == "regression":
        label_dtype = torch.float
    else:
        raise ValueError("Unsupported output_mode: %r" % (output_mode,))

    all_label_ids = torch.tensor([f.label_id for f in features], dtype=label_dtype)
    all_seq_lengths = torch.tensor([f.seq_length for f in features], dtype=torch.long)
    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)

    # Field order matters: consumers unpack batches positionally.
    tensor_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                                all_label_ids, all_seq_lengths)
    return tensor_data, all_label_ids


In [2]:
# Experiment selection: task key and BERT size.
task_name = "cola"
bert_size = "base"

# Layer / attention-head counts for the chosen size:
# "large" -> 24 layers, 16 heads; anything else -> 12 layers, 12 heads.
layer_num, head_num = (24, 16) if bert_size == "large" else (12, 12)

# No teacher model is loaded in this cell.
teacher_model = None
# torch.cuda.empty_cache()
# !nvidia-smi

# DEVICE / DATASET

In [3]:
# Use the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Base directories; BERT-large runs use a "BERT_large" sub-folder of each.
model_dir, output_dir = "models", "output"
if bert_size == "large":
    model_dir, output_dir = (
        os.path.join(model_dir, "BERT_large"),
        os.path.join(output_dir, "BERT_large"),
    )

teacher_model_dir = os.path.join(model_dir, task_name)

# Processor & Task Info
processor = processors[task_name]()
output_mode = output_modes[task_name]
label_list = processor.get_labels()
num_labels = len(label_list)

# Pick up the per-task defaults when the task has an entry.
if task_name in default_params:
    batch_size, max_seq_length, eval_step = (
        default_params[task_name][key]
        for key in ("batch_size", "max_seq_length", "eval_step")
    )

# Tokenizer (vocabulary is read from the teacher model directory)
tokenizer = BertTokenizer.from_pretrained(teacher_model_dir, do_lower_case=True)


# Load Dataset
data_dir = os.path.join("data", task_name)
processed_data_dir = os.path.join(data_dir, 'preprocessed')

train_examples = processor.get_train_examples(data_dir)
train_features = convert_examples_to_features(train_examples, label_list,
                                              max_seq_length, tokenizer, output_mode)

# Fraction of the training features to keep; the factor 1 keeps all of them.
len_train_data = int(len(train_features) * 1)
train_features = train_features[:len_train_data]

# The dev examples are read once and reused below (they used to be fetched twice).
eval_examples = processor.get_dev_examples(data_dir)
eval_features = convert_examples_to_features(eval_examples, label_list, max_seq_length, tokenizer, output_mode)
# dev_file = train_file = os.path.join(processed_data_dir,'dev.pkl')
# eval_features = pickle.load(open(dev_file,'rb'))

train_data, train_labels = get_tensor_data(output_mode, train_features)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

# Build the eval tensors once, with the task's own output_mode. The loader was
# previously built from tensors created with a hard-coded "classification",
# which casts regression labels to integers inside every eval batch.
eval_data, eval_labels = get_tensor_data(output_mode, eval_features)
eval_sampler = SequentialSampler(eval_data)
# batch_size=1: the eval loader yields one example per batch, in dataset order.
eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=1)

# Sampling Sentence
i = 0
# num = 3
num = 1



07/13 01:06:01 AM Writing example 0 of 8551
07/13 01:06:01 AM *** Example ***
07/13 01:06:01 AM guid: train-0
07/13 01:06:01 AM tokens: [CLS] our friends won ' t buy this analysis , let alone the next one we propose . [SEP]
07/13 01:06:01 AM input_ids: 101 2256 2814 2180 1005 1056 4965 2023 4106 1010 2292 2894 1996 2279 2028 2057 16599 1012 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
07/13 01:06:01 AM input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
07/13 01:06:01 AM segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
07/13 01:06:01 AM label: 1
07/13 01:06:01 AM label_id: 1
07/13 01:06:02 AM Writing example 0 of 1043
07/13 01:06:02 AM *** Example ***
07/13 01:06:02 AM guid: dev-0
07/13 01:06:02 AM tokens: [CLS] the sailors rode the breeze clear of the rocks . 

In [4]:
# teacher_model = BertForSequenceClassification.from_pretrained(teacher_model_dir, num_labels=num_labels)
# teacher_model.to(device)
# teacher_model.eval()

# Checkpoint to analyse: <output_dir>/<task_name>/exploration/<st_model_name>
st_model_name = "1SB_M_100"
student_model_dir = os.path.join(output_dir, task_name, "exploration", st_model_name)

# Read the saved config first, then hand it to the quantized classifier.
student_config = BertConfig.from_pretrained(student_model_dir)
student_model = QuantBertForSequenceClassification.from_pretrained(
    student_model_dir,
    config=student_config,
    num_labels=num_labels,
)
student_model.to(device)
# Ends the cell on a statement (an empty line is printed) instead of an expression value.
print()

07/13 01:06:02 AM loading configuration file output/cola/exploration/1SB_M_100/config.json
07/13 01:06:04 AM Loading model output/cola/exploration/1SB_M_100/pytorch_model.bin
07/13 01:06:05 AM loading model...
07/13 01:06:05 AM done!



In [None]:
import csv  # used by the writer below; previously never imported explicitly

import ops.tests as tests
import ops.datasets as datasets
import ops.loss_landscapes as lls

scale = 1e-0   # half-width of the explored square: x and y both span [-scale, scale]
n = 21         # number of grid points along each axis
gpu = torch.cuda.is_available()

# Resolve the output path and create its directory BEFORE the sweep, so a
# missing "lls_logs" folder cannot discard the results after the long run.
metrics_dir = os.path.join("lls_logs", "%s_long_losslandscape.csv" % (task_name))
os.makedirs(os.path.dirname(metrics_dir), exist_ok=True)

metrics_grid = lls.get_loss_landscape(
    student_model, 1, train_dataloader, transform=None,
    kws=["pos_embed", "relative_position"],
    x_min=-1.0 * scale, x_max=1.0 * scale, n_x=n, y_min=-1.0 * scale, y_max=1.0 * scale, n_y=n, gpu=gpu
)

# One CSV row per grid point: the grid coordinates followed by the metric value.
metrics_list = [[*grid, metrics] for grid, metrics in metrics_grid.items()]

# newline='' is what the csv module requires of files handed to csv.writer;
# without it extra blank rows appear on platforms that translate line endings.
with open(metrics_dir, 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerows(metrics_list)

Grid:  [-1. -1.], Grid:  [-0.9 -1. ], Grid:  [-0.8 -1. ], Grid:  [-0.7 -1. ], Grid:  [-0.6 -1. ], Grid:  [-0.5 -1. ], Grid:  [-0.4 -1. ], Grid:  [-0.3 -1. ], Grid:  [-0.2 -1. ], Grid:  [-0.1 -1. ], Grid:  [ 0. -1.], Grid:  [ 0.1 -1. ], Grid:  [ 0.2 -1. ], Grid:  [ 0.3 -1. ], Grid:  [ 0.4 -1. ], Grid:  [ 0.5 -1. ], Grid:  [ 0.6 -1. ], Grid:  [ 0.7 -1. ], Grid:  [ 0.8 -1. ], Grid:  [ 0.9 -1. ], Grid:  [ 1. -1.], Grid:  [-1.  -0.9], Grid:  [-0.9 -0.9], Grid:  [-0.8 -0.9], Grid:  [-0.7 -0.9], Grid:  [-0.6 -0.9], Grid:  [-0.5 -0.9], Grid:  [-0.4 -0.9], Grid:  [-0.3 -0.9], Grid:  [-0.2 -0.9], Grid:  [-0.1 -0.9], Grid:  [ 0.  -0.9], Grid:  [ 0.1 -0.9], Grid:  [ 0.2 -0.9], Grid:  [ 0.3 -0.9], Grid:  [ 0.4 -0.9], Grid:  [ 0.5 -0.9], Grid:  [ 0.6 -0.9], Grid:  [ 0.7 -0.9], Grid:  [ 0.8 -0.9], Grid:  [ 0.9 -0.9], Grid:  [ 1.  -0.9], Grid:  [-1.  -0.8], Grid:  [-0.9 -0.8], Grid:  [-0.8 -0.8], Grid:  [-0.7 -0.8], Grid:  [-0.6 -0.8], Grid:  [-0.5 -0.8], Grid:  [-0.4 -0.8], Grid:  [-0.3 -0.8], Grid: 

In [None]:
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import cm


# Load the loss-landscape CSV at `metrics_dir`: one row per grid point with
# the columns x, y and the metric value (read here as "NLL").
# names = ["x", "y", "l1", "l2", "NLL", "Cutoff1", "Cutoff2", "Acc", "Acc-90", "Unc", "Unc-90", "IoU", "IoU-90", "Freq", "Freq-90", "Top-5", "Brier", "ECE", "ECSE"]
# path = "%s/resources/results/cifar100_vit_ti_losslandscape.csv" % root  # for ViT-Ti

names = ["x", "y", "NLL"]
data = pd.read_csv(metrics_dir, names=names)
data["loss"] = data["NLL"] # + optim_args["weight_decay"] * data["l2"]  # NLL + l2

# prepare data: the rows are reshaped into a square p x p grid
p = int(math.sqrt(len(data)))
shape = [p, p]
xs = data["x"].to_numpy().reshape(shape)
ys = data["y"].to_numpy().reshape(shape)
zs = data["loss"].to_numpy().reshape(shape)

# Shift so the minimum finite loss is 0, then blank out entries more than 42
# above it (presumably so extreme values do not dominate the surface).
zs = zs - zs[np.isfinite(zs)].min()
zs[zs > 42] = np.nan

norm = plt.Normalize(zs[np.isfinite(zs)].min(), zs[np.isfinite(zs)].max())  # normalize to [0,1]
colors = cm.plasma(norm(zs))
# Only the grid dimensions of `colors` are used, as the surface sampling counts.
rcount, ccount, _ = colors.shape

fig = plt.figure(figsize=(4.2, 4), dpi=120)
# `fig.gca(projection="3d")` is deprecated since Matplotlib 3.4 and rejected by
# later releases; add_subplot is the supported way to create the 3D axes.
ax = fig.add_subplot(projection="3d")
ax.view_init(elev=10, azim=30)  # angle

# make the panes transparent
ax.xaxis.set_pane_color((1.0, 1.0, 1.0, 0.0))
ax.yaxis.set_pane_color((1.0, 1.0, 1.0, 0.0))
ax.zaxis.set_pane_color((1.0, 1.0, 1.0, 0.0))
# make the grid lines transparent
ax.xaxis._axinfo["grid"]['color'] =  (1,1,1,0)
ax.yaxis._axinfo["grid"]['color'] =  (1,1,1,0)
ax.zaxis._axinfo["grid"]['color'] =  (1,1,1,0)

surf = ax.plot_surface(
    xs, ys, zs,
    rcount=rcount, ccount=ccount,
    cmap=plt.cm.coolwarm, shade=False,
)
# Fully transparent faces: only the coloured mesh edges stay visible.
surf.set_facecolor((0,0,0,0))

# remove white spaces
adjust_lim = 1
ax.set_xlim(-1 * adjust_lim, 1 * adjust_lim)
ax.set_ylim(-1 * adjust_lim, 1 * adjust_lim)
ax.set_zlim(0, 1)
fig.subplots_adjust(left=0, right=1, bottom=0, top=1)
# ax.axis('off')

ls = 9  # tick label size
ax.tick_params(axis="x", labelsize=ls)
ax.tick_params(axis="y", labelsize=ls)
ax.tick_params(axis="z", labelsize=ls)

plt.show()

In [76]:
# Spot-check of one grid entry. A notebook cell displays only its last
# expression, so the value shown is ys[0,9]; xs[0,9] is evaluated but not shown.
xs[0,9]
ys[0,9]

-1.0