## Pipeline 示例：智能客服

对用户的一次提问（query）做意图识别：输入一句话，输出它所属的意图类别。

## Pipeline 加载

In [2]:
import tempfile
from pathlib import Path
from time import perf_counter

import numpy as np
import torch
from tqdm import tqdm
from transformers import pipeline

In [3]:
# import os
# os.environ['HTTP_PROXY'] = 'http://127.0.0.1:7890'
# os.environ['HTTPS_PROXY'] = 'http://127.0.0.1:7890'

In [4]:
# Checkpoint used for the intent-classification pipeline.
bert_ckpt = 'transformersbook/bert-base-uncased-finetuned-clinc'
# Use the first GPU when CUDA is available; otherwise fall back to CPU (-1)
# so this cell does not fail on machines without a GPU.
pipe = pipeline(
    'text-classification',
    model=bert_ckpt,
    device=0 if torch.cuda.is_available() else -1,
)

In [5]:
# Show which device the model weights live on (device of the first parameter).
next(pipe.model.parameters()).device

device(type='cuda', index=0)

In [6]:
# Example user utterance; the pipeline returns the top predicted label and its score.
query = (
    "Hey, I'd like to rent a vehicle from Nov 1st to Nov 15th in Paris "
    "and I need a 15 passenger van"
)
pipe(query)

[{'label': 'car_rental', 'score': 0.5490034222602844}]

## Pipeline 模型结构

In [7]:
# Display the model architecture; the classifier head performs 151-way classification.
pipe.model

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

## 模型性能评估

- Model performance
    - dataset accuracy
- Latency
    - query/inference time
- Memory
    - model size

### datasets

In [8]:
from datasets import load_dataset

In [9]:
# Load the "plus" configuration of the clinc_oos dataset.
clinc = load_dataset("clinc_oos", name="plus")

In [10]:
# Inspect one test example: a record with a 'text' string and an integer 'intent' id.
clinc['test'][42]

{'text': 'transfer $100 from my checking to saving account', 'intent': 133}

### metrics

In [11]:
# ClassLabel feature of the 'intent' column: maps between intent names and
# their integer ids (e.g. via str2int).
intents = clinc['test'].features['intent']
intents

ClassLabel(names=['restaurant_reviews', 'nutrition_info', 'account_blocked', 'oil_change_how', 'time', 'weather', 'redeem_rewards', 'interest_rate', 'gas_type', 'accept_reservations', 'smart_home', 'user_name', 'report_lost_card', 'repeat', 'whisper_mode', 'what_are_your_hobbies', 'order', 'jump_start', 'schedule_meeting', 'meeting_schedule', 'freeze_account', 'what_song', 'meaning_of_life', 'restaurant_reservation', 'traffic', 'make_call', 'text', 'bill_balance', 'improve_credit_score', 'change_language', 'no', 'measurement_conversion', 'timer', 'flip_coin', 'do_you_have_pets', 'balance', 'tell_joke', 'last_maintenance', 'exchange_rate', 'uber', 'car_rental', 'credit_limit', 'oos', 'shopping_list', 'expiration_date', 'routing', 'meal_suggestion', 'tire_change', 'todo_list', 'card_declined', 'rewards_balance', 'change_accent', 'vaccines', 'reminder_update', 'food_last', 'change_ai_name', 'bill_due', 'who_do_you_work_for', 'share_location', 'international_visa', 'calendar', 'translate',

In [13]:
# Load the 'accuracy' metric from the evaluate library; its compute() method
# takes `predictions` and `references` and returns a dict with an 'accuracy' key.
from evaluate import load
accuracy_score = load('accuracy')

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [14]:
class PerformanceBenchmark:
    """Benchmark a text-classification pipeline on size, latency and accuracy.

    Args:
        pipe: Callable pipeline. ``pipe(text)`` must return a list whose first
            element is a dict with a ``'label'`` key, and ``pipe.model`` must
            provide ``state_dict()``.
        dataset: Iterable of examples exposing ``'text'`` and ``'intent'``
            keys. If it has a ``features`` mapping, its ``'intent'`` entry is
            used to convert predicted label names to integer ids.
        optim_type: Key under which ``run_benchmark`` reports the metrics.
    """

    def __init__(self, pipe, dataset, optim_type='BERT baseline'):
        self.pipe = pipe
        self.dataset = dataset
        self.optim_type = optim_type

    def compute_accuracy(self):
        """Classify every example in the dataset and compute the accuracy.

        Returns:
            The dict produced by the module-level ``accuracy_score`` metric
            (it contains an ``'accuracy'`` key).
        """
        # Prefer the label mapping carried by the dataset itself so the result
        # does not depend on notebook-global state; fall back to the global
        # `intents` for datasets that expose no `features`.
        features = getattr(self.dataset, 'features', None)
        label_feature = features['intent'] if features is not None else intents

        preds, labels = [], []
        # Could be reworked to feed the pipeline batched input.
        for example in tqdm(self.dataset, desc='evaluate on test dataset'):
            pred = self.pipe(example['text'])[0]['label']
            preds.append(label_feature.str2int(pred))
            labels.append(example['intent'])
        accuracy = accuracy_score.compute(predictions=preds, references=labels)
        print(f'Accuracy on test set: {accuracy["accuracy"]:.3f}')
        return accuracy

    def compute_size(self):
        """Measure the serialized size of the model's state dict.

        Returns:
            ``{'size_mb': <size of the saved state dict in MiB>}``.
        """
        state_dict = self.pipe.model.state_dict()
        # Save into a private temporary directory: this never clobbers an
        # existing 'model.pth' in the working directory, and the file is
        # removed even if saving raises.
        with tempfile.TemporaryDirectory() as tmp_dir:
            tmp_path = Path(tmp_dir) / 'model.pth'
            torch.save(state_dict, tmp_path)
            size_mb = tmp_path.stat().st_size / (1024 * 1024)
        print(f'Model size (MB): {size_mb:.2f}')
        return {'size_mb': size_mb}

    def time_pipeline(self, query='what is the pin number of my account'):
        """Time single-query calls to the pipeline.

        Runs 10 untimed warm-up calls followed by 100 timed calls.

        Args:
            query: Text passed to the pipeline on every call.

        Returns:
            ``{'time_avg_ms': mean, 'time_std_ms': std}`` of the per-call
            wall-clock latency, in milliseconds.
        """
        latencies = []

        # warmup
        for _ in range(10):
            _ = self.pipe(query)

        # timed run
        for _ in range(100):
            start_time = perf_counter()
            _ = self.pipe(query)
            latencies.append(perf_counter() - start_time)

        # run stats, converted from seconds to milliseconds
        time_avg_ms = 1000 * np.mean(latencies)
        time_std_ms = 1000 * np.std(latencies)
        # '+/-' replaces the former '+\-', which is an invalid escape sequence.
        print(f'Average latency (ms): {time_avg_ms:.2f} +/- {time_std_ms:.2f}')
        return {'time_avg_ms': time_avg_ms, 'time_std_ms': time_std_ms}

    def run_benchmark(self):
        """Run the size, latency and accuracy measurements, in that order.

        Returns:
            ``{optim_type: metrics}`` where ``metrics`` merges the dicts
            returned by ``compute_size``, ``time_pipeline`` and
            ``compute_accuracy``.
        """
        metrics = {}
        metrics[self.optim_type] = self.compute_size()
        metrics[self.optim_type].update(self.time_pipeline())
        metrics[self.optim_type].update(self.compute_accuracy())
        return metrics

In [15]:
# Benchmark the baseline pipeline on the CLINC test split and display the metrics.
benchmark = PerformanceBenchmark(pipe=pipe, dataset=clinc['test'])
benchmark.run_benchmark()

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


Model size (MB): 418.15
Average latency (ms): 6.10 +\- 0.57


evaluate on test dataset: 100%|██████████| 5500/5500 [00:33<00:00, 162.43it/s]

Accuracy on test set: 0.867





{'BERT baseline': {'size_mb': 418.1497859954834,
  'time_avg_ms': 6.103005185723305,
  'time_std_ms': 0.5661808087132775,
  'accuracy': 0.8672727272727273}}