# Environment setup


In [1]:
# Show the GPU attached to this runtime together with the driver and CUDA versions.
!nvidia-smi

Wed Oct  4 04:45:50 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.105.17   Driver Version: 525.105.17   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   42C    P8     9W /  70W |      0MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [1]:
!pip install transformers
!pip install datasets
!pip install onnx
!pip install onnxruntime

Collecting transformers
  Downloading transformers-4.34.0-py3-none-any.whl (7.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m59.8 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers)
  Downloading huggingface_hub-0.17.3-py3-none-any.whl (295 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m295.0/295.0 kB[0m [31m35.3 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.15,>=0.14 (from transformers)
  Downloading tokenizers-0.14.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m106.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m78.1 MB/s[0m eta [36m0:00:00[0m
Coll

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, TensorDataset
from transformers import BertTokenizer, BertModel
from transformers import AutoTokenizer
from transformers import AdamW, get_linear_schedule_with_warmup
from transformers import AutoModelForSequenceClassification
import transformers.convert_graph_to_onnx as onnx_convert
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from pathlib import Path

from onnxruntime.quantization import quantize_dynamic, QuantType

# Data Processing

In [4]:
# Base checkpoint to fine-tune; its tokenizer is used for the text preprocessing below.
model_name = 'microsoft/xtremedistil-l6-h384-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [5]:
from datasets import load_dataset
# Load the "raw" configuration of the go_emotions dataset.
dataset = load_dataset("go_emotions", "raw")

In [6]:
# Convert the training split to a DataFrame. `Dataset.to_pandas()` converts the
# underlying Arrow table directly rather than going through pandas' generic
# (Python-side) conversion of the Dataset object.
df_train = dataset["train"].to_pandas()

# Show the first few rows; a bare last expression renders as a rich table
# instead of wrapped plain text.
df_train.head()

                                                text       id  \
0                                    That game hurt.  eew5j0j   
1   >sexuality shouldn’t be a grouping category I...  eemcysk   
2     You do right, if you don't care then fuck 'em!  ed2mah1   
3                                 Man I love reddit.  eeibobj   
4  [NAME] was nowhere near them, he was by the Fa...  eda6yn6   

                author            subreddit    link_id   parent_id  \
0                Brdd9                  nrl  t3_ajis4z  t1_eew18eq   
1          TheGreen888     unpopularopinion  t3_ai4q37   t3_ai4q37   
2             Labalool          confessions  t3_abru74  t1_ed2m7g7   
3        MrsRobertshaw             facepalm  t3_ahulml   t3_ahulml   
4  American_Fascist713  starwarsspeculation  t3_ackt2f  t1_eda65q2   

    created_utc  rater_id  example_very_unclear  admiration  ...  love  \
0  1.548381e+09         1                 False           0  ...     0   
1  1.548084e+09        37               

In [7]:
# Names of the 28 label columns, in the order in which they are packed into the
# per-example "labels" vector further down.
emotions = [
    "admiration", "amusement", "anger", "annoyance",
    "approval", "caring", "confusion", "curiosity",
    "desire", "disappointment", "disapproval", "disgust",
    "embarrassment", "excitement", "fear", "gratitude",
    "grief", "joy", "love", "nervousness",
    "optimism", "pride", "realization", "relief",
    "remorse", "sadness", "surprise", "neutral",
]

In [8]:
def _collect_labels(example):
    """Pack the per-emotion columns of one example into a single "labels" list."""
    return {"labels": [example[name] for name in emotions]}


dataset = dataset.map(_collect_labels)

In [9]:
def tokenize_fn(examples):
    """Tokenize a batch of texts, padding/truncating each one to 64 tokens."""
    encode_options = dict(padding="max_length", truncation=True, max_length=64)
    return tokenizer(examples["text"], **encode_options)


# Every original column except "labels" is dropped once the text is tokenized.
cols = dataset["train"].column_names
cols.remove("labels")
dataset_encode = dataset.map(tokenize_fn, remove_columns=cols, batched=True)
dataset_encode

DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 211225
    })
})

In [10]:
# Have the dataset return torch tensors, then cast the labels to float via a
# temporary "float_labels" column that is renamed back to "labels".
dataset_encode.set_format("torch")


def _labels_as_float(example):
    """Return the example's labels as a float tensor under a temporary column name."""
    return {"float_labels": example["labels"].to(torch.float)}


dataset_encode = dataset_encode.map(_labels_as_float, remove_columns=["labels"])
dataset_encode = dataset_encode.rename_column("float_labels", "labels")

In [11]:
# Inspect the column schema (names and dtypes) of the encoded training split.
dataset_encode['train'].features

{'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None),
 'token_type_ids': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None),
 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None),
 'labels': Sequence(feature=Value(dtype='float32', id=None), length=-1, id=None)}

# Train Model

In [12]:
# Use the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [13]:
from transformers import AutoModelForSequenceClassification

# Sequence-classification head with one output per entry in `emotions`,
# configured for the multi-label problem type, then moved to the chosen device.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    problem_type="multi_label_classification",
    num_labels=len(emotions),
).to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at microsoft/xtremedistil-l6-h384-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


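Note: the runtime needs to be restarted at this point — after installing `accelerate` in the next cell, restart the runtime and re-run the cells above so the newly installed package is picked up.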

In [14]:
pip install accelerate -U



In [15]:
from transformers import Trainer, TrainingArguments

# Fine-tuning configuration: outputs are written under "test_trainer" and no
# evaluation is run during training.
training_args = TrainingArguments(
    output_dir="test_trainer",
    num_train_epochs=4,
    per_device_train_batch_size=128,
    learning_rate=3e-05,
    evaluation_strategy="no",
)

trainer = Trainer(
    model=model,
    train_dataset=dataset_encode["train"],
    args=training_args,
)

In [16]:
# Run the fine-tuning loop with the Trainer built in the previous cell.
trainer.train()

Step,Training Loss
500,0.26
1000,0.1572
1500,0.1516
2000,0.1459
2500,0.1399
3000,0.1342
3500,0.1308
4000,0.1278
4500,0.1263
5000,0.1241


TrainOutput(global_step=6604, training_loss=0.14319511199271873, metrics={'train_runtime': 1388.3174, 'train_samples_per_second': 608.578, 'train_steps_per_second': 4.757, 'total_flos': 3505971733555200.0, 'train_loss': 0.14319511199271873, 'epoch': 4.0})

# Convert

In [3]:
import transformers
import transformers.convert_graph_to_onnx as onnx_convert
from pathlib import Path

In [5]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
# The tokenizer is loaded from the base checkpoint.
model_name = 'microsoft/xtremedistil-l6-h384-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Alternative source for the model: a local checkpoint directory, e.g.
#model_path = '/content/test_trainer/checkpoint-6500/'
# The model that is converted below is loaded from this path instead.
model_path = 'bergum/xtremedistil-l6-h384-go-emotion'
model = AutoModelForSequenceClassification.from_pretrained(model_path)

Downloading (…)lve/main/config.json:   0%|          | 0.00/526 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/2.22k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Check whether the tokenizer is consistent: encode a fixed sentence and inspect the resulting token IDs.

In [7]:
# Encode a fixed sample sentence as a NumPy array and print the token IDs so they
# can be compared by eye (presumably against another tokenizer implementation —
# TODO confirm what the reference is).
sentence  = "{\"question\": \"Where is Bob Dylan From?\", \"context\": \"Bob Dylan is from Duluth, Minnesota and is an American singer-songwriter\"}"
tokens = tokenizer.encode(sentence, return_tensors="np")
print(tokens)

[[  101  1063  1000  3160  1000  1024  1000  2073  2003  3960  7758  2013
   1029  1000  1010  1000  6123  1000  1024  1000  3960  7758  2003  2013
  28218  1010  5135  1998  2003  2019  2137  3220  1011  6009  1000  1065
    102]]


In [45]:
# Display the tokenizer's configuration (class, vocabulary size, special tokens).
tokenizer

BertTokenizerFast(name_or_path='microsoft/xtremedistil-l6-h384-uncased', vocab_size=30522, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True)

In [6]:
# Move the model to the CPU; the pipeline and ONNX export in the following cells use it from there.
model = model.to("cpu")

In [7]:
# Wrap the model and tokenizer in a text-classification pipeline.
pipeline = transformers.pipeline(
    task="text-classification",
    model=model,
    tokenizer=tokenizer,
)

In [8]:
# Quick sanity check of the pipeline on a single sentence.
# Note: \uFF1F is the full-width question mark; the trailing "#@param" annotation
# turns this line into a Colab form field.
test_text = "why did you do that\uFF1F" #@param {type:"string"}
pipeline(test_text)

[{'label': 'curiosity 🤔', 'score': 0.33093440532684326}]

In [11]:
# Export the pipeline's PyTorch model to ONNX (opset 11, no external-data format).
onnx_path = Path("classifier.onnx")
onnx_convert.convert_pytorch(
    pipeline,
    opset=11,
    output=onnx_path,
    use_external_format=False,
)

Using framework PyTorch: 2.0.1+cu118
Found input input_ids with shape: {0: 'batch', 1: 'sequence'}
Found input token_type_ids with shape: {0: 'batch', 1: 'sequence'}
Found input attention_mask with shape: {0: 'batch', 1: 'sequence'}
Found output output_0 with shape: {0: 'batch'}
Ensuring inputs are in correct order
position_ids is not present in the generated input list.
Generated inputs order: ['input_ids', 'attention_mask', 'token_type_ids']
verbose: False, log level: Level.ERROR



In [12]:
from onnxruntime.quantization import QuantType, quantize_dynamic

# Dynamically quantize the exported model, storing weights as unsigned 8-bit
# integers, and write the result next to the original file.
quantize_dynamic(
    "classifier.onnx",
    "classifier_int8.onnx",
    weight_type=QuantType.QUInt8,
)



Ignore MatMul due to non constant B: /[/bert/encoder/layer.0/attention/self/MatMul]
Ignore MatMul due to non constant B: /[/bert/encoder/layer.0/attention/self/MatMul_1]
Ignore MatMul due to non constant B: /[/bert/encoder/layer.1/attention/self/MatMul]
Ignore MatMul due to non constant B: /[/bert/encoder/layer.1/attention/self/MatMul_1]
Ignore MatMul due to non constant B: /[/bert/encoder/layer.2/attention/self/MatMul]
Ignore MatMul due to non constant B: /[/bert/encoder/layer.2/attention/self/MatMul_1]
Ignore MatMul due to non constant B: /[/bert/encoder/layer.3/attention/self/MatMul]
Ignore MatMul due to non constant B: /[/bert/encoder/layer.3/attention/self/MatMul_1]
Ignore MatMul due to non constant B: /[/bert/encoder/layer.4/attention/self/MatMul]
Ignore MatMul due to non constant B: /[/bert/encoder/layer.4/attention/self/MatMul_1]
Ignore MatMul due to non constant B: /[/bert/encoder/layer.5/attention/self/MatMul]
Ignore MatMul due to non constant B: /[/bert/encoder/layer.5/atten

In [48]:
from google.colab import files

# Download the quantized model written by quantize_dynamic. The earlier target,
# "bert-go-emotion.onnx", is never created by this notebook (only classifier.onnx
# and classifier_int8.onnx are), so downloading it fails on a fresh run.
files.download("classifier_int8.onnx")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>