In [41]:
!pip install transformers
!pip install datasets

import os
import random
from sklearn.metrics import accuracy_score
import numpy as np
import torch
from torch.utils.data import TensorDataset, DataLoader, SequentialSampler

import pandas as pd
from tqdm import tqdm

from tensorflow.keras.preprocessing.sequence import pad_sequences

from transformers import BertTokenizerFast
from transformers import BertForSequenceClassification

# for graphing
import seaborn as sns
import matplotlib.pyplot as plt

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [42]:
from datasets import load_dataset

dataset = load_dataset("klue", "nli")

Found cached dataset klue (/Users/hwangtaegyeong/.cache/huggingface/datasets/klue/nli/1.0.0/e0fc3bc3de3eb03be2c92d72fd04a60ecc71903f821619cb28ca0e1e29e4233e)


  0%|          | 0/2 [00:00<?, ?it/s]

In [43]:
test = dataset['validation']
test

Dataset({
    features: ['guid', 'source', 'premise', 'hypothesis', 'label'],
    num_rows: 3000
})

In [44]:
# argment setting
model_checkpoint = "./FP32" #"bert-base-multilingual-cased"
batch_size_per_device = 32
max_length = 128
n_epochs = 3
warmup_ratio = .2
lr = 5e-5

In [45]:
# 디바이스 설정
if torch.cuda.is_available():    
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    device = torch.device("cpu")
    print('No GPU available, using the CPU instead.')
# torch.device("cpu") 

No GPU available, using the CPU instead.


In [46]:
# load model&tokenizer
tokenizer = BertTokenizerFast.from_pretrained(model_checkpoint)

In [47]:
model = BertForSequenceClassification.from_pretrained(model_checkpoint, num_labels=3)
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elemen

In [48]:
model_dynamic_quantized = torch.quantization.quantize_dynamic(
    model, {torch.nn.Linear}, dtype=torch.qint8,
)
model_dynamic_quantized.to(device)
model_dynamic_quantized

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): DynamicQuantizedLinear(in_features=768, out_features=768, dtype=torch.qint8, qscheme=torch.per_tensor_affine)
              (key): DynamicQuantizedLinear(in_features=768, out_features=768, dtype=torch.qint8, qscheme=torch.per_tensor_affine)
              (value): DynamicQuantizedLinear(in_features=768, out_features=768, dtype=torch.qint8, qscheme=torch.per_tensor_affine)
              (dropout): Dropout(p=0.1, inplace=False)
            )
     

In [49]:
def print_size_of_model(model, label=""):
    torch.save(model.state_dict(), "temp.p")
    size=os.path.getsize("temp.p")
    print("model: ",label,' \t','Size (KB):', size/1e3)
    os.remove('temp.p')
    return size

# compare the sizes
f=print_size_of_model(model,"fp32")
print(sum(p.numel() for p in model.parameters()))

q=print_size_of_model(model_dynamic_quantized,"int8")
print(sum(p.numel() for p in model_dynamic_quantized.parameters()))

print("{0:.2f} times smaller".format(f/q))

model:  fp32  	 Size (KB): 711492.297
177855747
model:  int8  	 Size (KB): 454969.253
92245248
1.56 times smaller


In [50]:
from datasets import list_metrics, load_metric
metrics_list = list_metrics()
len(metrics_list)
print(', '.join(metric for metric in metrics_list))

metric_accuracy = load_metric('accuracy')

accuracy, bertscore, bleu, bleurt, brier_score, cer, character, charcut_mt, chrf, code_eval, comet, competition_math, coval, cuad, exact_match, f1, frugalscore, glue, google_bleu, indic_glue, mae, mahalanobis, mape, mase, matthews_correlation, mauve, mean_iou, meteor, mse, nist_mt, pearsonr, perplexity, poseval, precision, recall, rl_reliability, roc_auc, rouge, sacrebleu, sari, seqeval, smape, spearmanr, squad, squad_v2, super_glue, ter, trec_eval, wer, wiki_split, xnli, xtreme_s, BucketHeadP65/confusion_matrix, BucketHeadP65/roc_curve, Drunper/metrica_tesi, Felipehonorato/my_metric, GMFTBY/dailydialog_evaluate, GMFTBY/dailydialogevaluate, JP-SystemsX/nDCG, Josh98/nl2bash_m, KevinSpaghetti/accuracyk, NCSOFT/harim_plus, NikitaMartynov/spell-check-metric, NimaBoscarino/weat, Ochiroo/rouge_mn, Vertaix/vendiscore, Viona/infolm, Vlasta/pr_auc, abdusah/aradiawer, abidlabs/mean_iou, abidlabs/mean_iou2, angelina-wang/directional_bias_amplification, anz2/iliauniiccocrevaluation, bstrai/classif

In [51]:
def get_input_ids(premise, hypothesis):
  document_bert = ["[CLS] " + str(s[0]) + " [SEP]" + str(s[1]) + " [SEP]" for s in zip(premise, hypothesis)]
  tokenized_texts = [tokenizer.tokenize(s) for s in tqdm(document_bert, "Tokenizing")]
  input_ids = [tokenizer.convert_tokens_to_ids(x) for x in tqdm(tokenized_texts, "Converting tokens to ids")]
  print("Padding sequences...")
  input_ids = pad_sequences(input_ids, maxlen=max_length, dtype='long', truncating='post', padding='post')
  return input_ids

def get_attention_masks(input_ids):
  attention_masks = []
  for seq in tqdm(input_ids, "Generating attention masks"):
      seq_mask = [float(i > 0) for i in seq]
      attention_masks.append(seq_mask)
  return attention_masks

In [101]:
def preprocess_function(examples):
    return tokenizer(examples['premise'], examples['hypothesis'],
                     truncation=True, max_length=max_length, padding='max_length')
dataset = dataset.map(preprocess_function, batched=True)


  0%|          | 0/25 [00:00<?, ?ba/s]

  0%|          | 0/3 [00:00<?, ?ba/s]

In [102]:
test_ids = torch.tensor(get_input_ids(test['premise'], test['hypothesis']))
test_masks = torch.tensor(get_attention_masks(test_ids))
test_labels = torch.tensor(test['label'])

Tokenizing: 100%|█████████████████████████| 3000/3000 [00:00<00:00, 4306.40it/s]
Converting tokens to ids: 100%|██████████| 3000/3000 [00:00<00:00, 24753.29it/s]


Padding sequences...


Generating attention masks: 100%|█████████| 3000/3000 [00:02<00:00, 1271.19it/s]


In [103]:
test_token_type_ids = torch.tensor(dataset["validation"]["token_type_ids"])

In [104]:
test_data = TensorDataset(test_ids, test_masks, test_token_type_ids, test_labels)
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size_per_device)

In [105]:
from sklearn.metrics import accuracy_score

def predict(model, data_loader):
    print('start predict')

    model.eval()

    total_preds = []
    total_labels = []

    for step, batch in tqdm(enumerate(data_loader)):
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_token_type_ids, b_labels = batch
        
        with torch.no_grad():
            outputs = model(b_input_ids,
                            token_type_ids=b_token_type_ids,
                            attention_mask=b_input_mask)
        logit = outputs[0]

        logit = logit.detach().cpu().numpy()
        label = b_labels.cpu().numpy()
        
        total_preds += np.argmax(logit, axis=1).tolist()
        total_labels += label.tolist()

    avg = accuracy_score(total_labels, total_preds) * 100
    return avg

In [106]:
def print_size_of_model(model, label=""):
    torch.save(model.state_dict(), "temp.p")
    size=os.path.getsize("temp.p")
    print("model: ",label,' \t','Size (KB):', size/1e3)
    os.remove('temp.p')
    return size

# compare the sizes
f=print_size_of_model(model,"fp32")
print(sum(p.numel() for p in model.parameters()))

q=print_size_of_model(model_dynamic_quantized,"int8")
print(sum(p.numel() for p in model_dynamic_quantized.parameters()))

print("{0:.2f} times smaller".format(f/q))

model:  fp32  	 Size (KB): 711492.297
177855747
model:  int8  	 Size (KB): 454969.253
92245248
1.56 times smaller


In [107]:
import time

def time_model_evaluation(model, tokenizer):
    eval_start_time = time.time()
    avg_test_accuracy = predict(model, test_dataloader)
    eval_end_time = time.time()
    eval_duration_time = eval_end_time - eval_start_time
    print("Accuracy : {0:.4f}".format(avg_test_accuracy))
    print("Evaluate total time (seconds) : {0:.1f}".format(eval_duration_time))

# Evaluate the original FP32 BERT model
time_model_evaluation(model, tokenizer)

# Evaluate the INT8 BERT model after the dynamic quantization
time_model_evaluation(model_dynamic_quantized, tokenizer)

start predict


94it [07:08,  4.56s/it]


Accuracy : 73.1000
Evaluate total time (seconds) : 428.8
start predict


94it [08:10,  5.22s/it]

Accuracy : 68.5333
Evaluate total time (seconds) : 490.5





In [38]:
import time

from sklearn.metrics import f1_score

def sample(n):
  data = {
      'sec1': test['premise'],
      'sec2': test['hypothesis'],
      'label': test['label']
  }
  return pd.DataFrame(data).sample(n)

def eval(model, d):

  total_preds = []
  total_labels = d['label']

  eval_start_time = time.time()
  for step, sec in tqdm(enumerate(zip(d['sec1'], d['sec2']))):
    sec1, sec2 = sec
    output = torch.tensor([tokenizer.encode(sec1, sec2)]).to(device)
    with torch.no_grad():
      preds = model(output).logits.cpu()

    pred = preds[:, 0]
    total_preds.append(pred)

  eval_end_time = time.time()
  eval_duration_time = eval_end_time - eval_start_time

  pearsonr = metric_accuracy.compute(references=total_labels, predictions=total_preds)['accuracy']
  print("pearsonr : {0:.4f}".format(pearsonr))
  print("Evaluate total time (seconds) : {0:.1f}".format(eval_duration_time))

In [71]:
d = sample(100) # test
eval(model, d)
eval(model_dynamic_quantized, d)

100it [00:04, 21.59it/s]


pearsonr : 0.2600
Evaluate total time (seconds) : 4.6


100it [00:17,  5.56it/s]

pearsonr : 0.2700
Evaluate total time (seconds) : 18.0



