In [200]:
!pip install transformers
!pip install datasets

import os
import random
from sklearn.metrics import accuracy_score
import numpy as np
import torch
from torch.utils.data import TensorDataset, DataLoader, SequentialSampler

import pandas as pd
from tqdm import tqdm

from tensorflow.keras.preprocessing.sequence import pad_sequences

from transformers import BertTokenizerFast
from transformers import BertForSequenceClassification

# for graphing
import seaborn as sns
import matplotlib.pyplot as plt

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [201]:
from datasets import load_dataset

dataset = load_dataset("klue", "sts")
dataset = dataset.flatten()
dataset = dataset.rename_column('labels.real-label','labels')

Found cached dataset klue (/Users/hwangtaegyeong/.cache/huggingface/datasets/klue/sts/1.0.0/e0fc3bc3de3eb03be2c92d72fd04a60ecc71903f821619cb28ca0e1e29e4233e)


  0%|          | 0/2 [00:00<?, ?it/s]

In [202]:
# argment setting
model_checkpoint = "./FP32" #"bert-base-multilingual-cased"
batch_size_per_device = 32
max_length = 256
n_epochs = 3
warmup_ratio = .2
lr = 5e-5

In [203]:
# 디바이스 설정
if torch.cuda.is_available():    
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    device = torch.device("cpu")
    print('No GPU available, using the CPU instead.')
# torch.device("cpu") 

No GPU available, using the CPU instead.


In [204]:
# load model&tokenizer
tokenizer = BertTokenizerFast.from_pretrained(model_checkpoint)

In [205]:
model = BertForSequenceClassification.from_pretrained(model_checkpoint, num_labels=1)
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elemen

In [206]:
model_dynamic_quantized = torch.quantization.quantize_dynamic(
    model, {torch.nn.Linear}, dtype=torch.qint8,
)
model_dynamic_quantized.to(device)
model_dynamic_quantized

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): DynamicQuantizedLinear(in_features=768, out_features=768, dtype=torch.qint8, qscheme=torch.per_tensor_affine)
              (key): DynamicQuantizedLinear(in_features=768, out_features=768, dtype=torch.qint8, qscheme=torch.per_tensor_affine)
              (value): DynamicQuantizedLinear(in_features=768, out_features=768, dtype=torch.qint8, qscheme=torch.per_tensor_affine)
              (dropout): Dropout(p=0.1, inplace=False)
            )
     

In [207]:
def print_size_of_model(model, label=""):
    torch.save(model.state_dict(), "temp.p")
    size=os.path.getsize("temp.p")
    print("model: ",label,' \t','Size (KB):', size/1e3)
    os.remove('temp.p')
    return size

# compare the sizes
f=print_size_of_model(model,"fp32")
print(sum(p.numel() for p in model.parameters()))

q=print_size_of_model(model_dynamic_quantized,"int8")
print(sum(p.numel() for p in model_dynamic_quantized.parameters()))

print("{0:.2f} times smaller".format(f/q))

model:  fp32  	 Size (KB): 711486.153
177854209
model:  int8  	 Size (KB): 454967.717
92245248
1.56 times smaller


In [208]:
from datasets import list_metrics, load_metric
metrics_list = list_metrics()
len(metrics_list)
print(', '.join(metric for metric in metrics_list))

metric_pearsonr = load_metric('pearsonr')

accuracy, bertscore, bleu, bleurt, brier_score, cer, character, charcut_mt, chrf, code_eval, comet, competition_math, coval, cuad, exact_match, f1, frugalscore, glue, google_bleu, indic_glue, mae, mahalanobis, mape, mase, matthews_correlation, mauve, mean_iou, meteor, mse, nist_mt, pearsonr, perplexity, poseval, precision, recall, rl_reliability, roc_auc, rouge, sacrebleu, sari, seqeval, smape, spearmanr, squad, squad_v2, super_glue, ter, trec_eval, wer, wiki_split, xnli, xtreme_s, BucketHeadP65/confusion_matrix, BucketHeadP65/roc_curve, Drunper/metrica_tesi, Felipehonorato/my_metric, GMFTBY/dailydialog_evaluate, GMFTBY/dailydialogevaluate, JP-SystemsX/nDCG, Josh98/nl2bash_m, KevinSpaghetti/accuracyk, NCSOFT/harim_plus, NikitaMartynov/spell-check-metric, NimaBoscarino/weat, Ochiroo/rouge_mn, Vertaix/vendiscore, Viona/infolm, Vlasta/pr_auc, abdusah/aradiawer, abidlabs/mean_iou, abidlabs/mean_iou2, angelina-wang/directional_bias_amplification, anz2/iliauniiccocrevaluation, bstrai/classif

In [209]:
def preprocess_function(examples):
    return tokenizer(examples['sentence1'], examples['sentence2'],
                     truncation=True, max_length=max_length, padding=True)

In [210]:
encoded_dataset = dataset.map(preprocess_function, batched=True)

Loading cached processed dataset at /Users/hwangtaegyeong/.cache/huggingface/datasets/klue/sts/1.0.0/e0fc3bc3de3eb03be2c92d72fd04a60ecc71903f821619cb28ca0e1e29e4233e/cache-93031d45f4537168.arrow
Loading cached processed dataset at /Users/hwangtaegyeong/.cache/huggingface/datasets/klue/sts/1.0.0/e0fc3bc3de3eb03be2c92d72fd04a60ecc71903f821619cb28ca0e1e29e4233e/cache-ad21866d485645ec.arrow


In [243]:
encoded_dataset['validation']

Dataset({
    features: ['guid', 'source', 'sentence1', 'sentence2', 'labels.label', 'labels', 'labels.binary-label', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 519
})

In [244]:
test_ids = torch.tensor(encoded_dataset['validation']['input_ids'])
test_masks = torch.tensor(encoded_dataset['validation']['attention_mask'])
test_token_type_ids = torch.tensor(encoded_dataset['validation']['token_type_ids'])
test_labels = torch.tensor(encoded_dataset['validation']['labels'])

In [245]:
test_data = TensorDataset(test_ids, test_masks, test_token_type_ids, test_labels)
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size_per_device)

In [246]:
metric_pearsonr = load_metric('pearsonr')
metric_f1 = load_metric("f1")

total_preds
total_labels
    
def predict(model, data_loader):
    print('start predict')

    model.eval()
    
    total_preds = []
    total_labels = []
    for step, batch in tqdm(enumerate(data_loader)):
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_token_type_ids, b_labels = batch
        
        with torch.no_grad():
            outputs = model(b_input_ids,
                            token_type_ids=b_token_type_ids,
                            attention_mask=b_input_mask)
        logit = outputs[0]
        
        logit = logit.detach().cpu().numpy()
        label = b_labels.cpu().numpy()
            
        total_preds += logit[:, 0].tolist()
        total_labels += label.tolist()

    pearsonr = metric_pearsonr.compute(references=total_labels, predictions=total_preds)['pearsonr']
    f1 = metric_f1.compute(predictions=total_preds, references=total_labels, average='macro')['f1']
    return pearsonr, f1

In [248]:
import time

def time_model_evaluation(model, tokenizer):
    eval_start_time = time.time()
    pearsonr, f1 = predict(model, test_dataloader)
    eval_end_time = time.time()
    eval_duration_time = eval_end_time - eval_start_time
    print("pearsonr : {0:.4f}".format(pearsonr))
    print("f1 score : {0:.4f}".format(f1))
    print("Evaluate total time (seconds) : {0:.1f}".format(eval_duration_time))

# Evaluate the original FP32 BERT model
time_model_evaluation(model, tokenizer)

# Evaluate the INT8 BERT model after the dynamic quantization
time_model_evaluation(model_dynamic_quantized, tokenizer)

start predict


17it [01:17,  4.54s/it]


pearsonr : 0.8505
f1 score : 0.4261
Evaluate total time (seconds) : 78.3
start predict


17it [01:07,  3.95s/it]


pearsonr : 0.8204
f1 score : 0.3531
Evaluate total time (seconds) : 68.2


In [None]:
import time

metric_pearsonr = load_metric('pearsonr')
metric_f1 = load_metric("f1")

def sample(n):
  data = {
      'sentence1':test['sentence1'],
      'sentence2':test['sentence2'],
      'labels':test['labels']
  }
  return pd.DataFrame(data).sample(n)

def eval(model, d):
  
  total_preds = []
  total_labels = d['labels']

  eval_start_time = time.time()
  for step, sec in tqdm(enumerate(zip(d['sentence1'], d['sentence2']))):
    sec1, sec2 = sec
    output = torch.tensor([tokenizer.encode(sec1, sec2)]).to(device)
    with torch.no_grad():
      preds = model(output).logits.cpu()

    pred = preds[:, 0]
    total_preds.append(pred)

  eval_end_time = time.time()
  eval_duration_time = eval_end_time - eval_start_time
    
  pearsonr = metric_pearsonr.compute(references=total_labels, predictions=total_preds)['pearsonr']
  f1 = metric_f1.compute(predictions=total_preds, references=total_labels, average='macro')['f1']
  
  print("pearsonr : {0:.4f}".format(pearsonr))
  print("f1 score : {0:.4f}".format(f1))
  print("Evaluate total time (seconds) : {0:.1f}".format(eval_duration_time))

In [215]:
d=sample(500)
eval(model, d)

500it [00:46, 10.70it/s]


pearsonr : 0.6254
f1 score : 0.1321
Evaluate total time (seconds) : 46.7
