# 初始化环境

## 设置环境变量

In [1]:
import os

# Configure the process environment in one step, before any Hugging Face
# library is imported further down:
#   - blank out both proxy variables,
#   - point the Hugging Face home / hub cache at the shared model directory,
#   - send Hub requests to the hf-mirror.com endpoint.
os.environ.update(
    {
        'http_proxy': '',
        'https_proxy': '',
        'HF_HOME': '/root/onethingai-fs/models',
        'HF_HUB_CACHE': '/root/onethingai-fs/models/hub',
        'HF_ENDPOINT': 'https://hf-mirror.com',
    }
)

## 定义所需函数

In [2]:
from transformers.pipelines import SUPPORTED_TASKS, TASK_ALIASES
def get_default_modelname(taskname):
    """Return the default model registered for a pipeline task, as a string.

    The name is first resolved through ``TASK_ALIASES`` and then looked up in
    ``SUPPORTED_TASKS``. The result is stringified so callers can concatenate
    it straight into a message.

    Args:
        taskname: Task identifier, or an alias listed in ``TASK_ALIASES``.

    Returns:
        ``str()`` of the ``"model"`` entry of the task's ``"default"`` spec.
        If the spec has no top-level ``"model"`` key, the whole spec is
        stringified instead of failing with ``KeyError``.
        NOTE(review): some tasks (reportedly translation) nest their defaults
        under other keys -- confirm against the installed transformers version.

    Raises:
        ValueError: If the resolved task name is not in ``SUPPORTED_TASKS``.
            ``ValueError`` is a subclass of ``Exception``, so callers catching
            ``Exception`` keep working.
    """
    # Map an alias to its canonical task name; unknown names pass through.
    taskname = TASK_ALIASES.get(taskname, taskname)
    if taskname not in SUPPORTED_TASKS:
        # str() keeps the message well-formed even for non-string input.
        raise ValueError('There is no task with the name of ' + str(taskname))
    default_spec = SUPPORTED_TASKS[taskname]["default"]
    if isinstance(default_spec, dict) and "model" in default_spec:
        return str(default_spec["model"])
    # No flat "model" entry: describe the whole default spec instead.
    return str(default_spec)

In [3]:
# 文本分类

## 中文模型

In [4]:
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Load the tokenizer and the sequence-classification model from the
# `uer/roberta-base-finetuned-jd-binary-chinese` checkpoint.
tokenizer = AutoTokenizer.from_pretrained(
    "uer/roberta-base-finetuned-jd-binary-chinese"
)
model = AutoModelForSequenceClassification.from_pretrained(
    "uer/roberta-base-finetuned-jd-binary-chinese"
)

taskname = "sentiment-analysis"

# Show which model the task would use by default (for reference only).
defaultModelName = get_default_modelname(taskname)
print(f"{taskname} default Model: {defaultModelName}")

# Passing only the task would fall back to that default model (not
# recommended):
#   pipe = pipeline(task=taskname)
# Here the pipeline is built from the explicitly loaded model and tokenizer.
pipe = pipeline(task=taskname, model=model, tokenizer=tokenizer)
pipe("今儿上海可真冷啊")



sentiment-analysis default Model: {'pt': ('distilbert-base-uncased-finetuned-sst-2-english', 'af0f99b'), 'tf': ('distilbert-base-uncased-finetuned-sst-2-english', 'af0f99b')}


[{'label': 'positive (stars 4 and 5)', 'score': 0.9025527834892273}]

In [5]:
# Input (translated): "I think this restaurant's garlic pork tastes just average."
pipe("我觉得这家店蒜泥白肉的味道一般")

[{'label': 'positive (stars 4 and 5)', 'score': 0.5883451104164124}]

In [6]:
# Input (translated): "You learn really fast; you understood the theory class as soon as it was taught."
pipe("你学东西真的好快，理论课一讲就明白了")

[{'label': 'positive (stars 4 and 5)', 'score': 0.9434759616851807}]

In [7]:
# English sentence run through the same `pipe` object built from the Chinese checkpoint.
pipe("You learn things really quickly. You understand the theory class as soon as it is taught.")

[{'label': 'negative (stars 1, 2 and 3)', 'score': 0.6445640325546265}]

In [8]:
# Another English sentence run through the same `pipe` object.
pipe("Today Shanghai is really cold.")

[{'label': 'positive (stars 4 and 5)', 'score': 0.5239152312278748}]

## 批处理模型

In [9]:
# Several Chinese sentences (a film review, a dish review, a compliment and a
# phone review) to classify in a single call.
text_list = [
    "这部电影拍得不错",
    "我觉得这道咖喱猪肉菜，味道非常一般",
    "你学得非常快。老师刚讲完你就懂了",
    "大家不要买这款手机，质量非常差"
]

# Passing the whole list yields one result dict per input sentence.
pipe(text_list)

[{'label': 'positive (stars 4 and 5)', 'score': 0.9875775575637817},
 {'label': 'negative (stars 1, 2 and 3)', 'score': 0.8558918237686157},
 {'label': 'positive (stars 4 and 5)', 'score': 0.9480956196784973},
 {'label': 'negative (stars 1, 2 and 3)', 'score': 0.9836820960044861}]

# NLP

## NER

In [10]:
from transformers import AutoModelForTokenClassification,AutoTokenizer,pipeline

# Checkpoint used for token classification in this section.
model_name = 'uer/roberta-base-finetuned-cluener2020-chinese'
model = AutoModelForTokenClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

taskname = "ner"

# Show the task's registered default model (for reference only).
defaultModelName = get_default_modelname(taskname)
print(f"{taskname} default Model: {defaultModelName}")

# Build the pipeline from the explicitly loaded model and tokenizer.
classifier = pipeline(task=taskname, model=model, tokenizer=tokenizer)

ner default Model: {'pt': ('dbmdz/bert-large-cased-finetuned-conll03-english', 'f2482bf'), 'tf': ('dbmdz/bert-large-cased-finetuned-conll03-english', 'f2482bf')}


In [11]:
# Run NER on one sentence and keep a fixed set of fields per token, with the
# score rounded to 4 decimals for display.
preds = [
    {
        field: round(pred[field], 4) if field == "score" else pred[field]
        for field in ("entity", "score", "index", "word", "start", "end")
    }
    for pred in classifier("Hugging Face is a French company based in New York City.")
]
# One prediction dict per output line.
print(*preds, sep="\n")

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


{'entity': 'B-address', 'score': 0.8887, 'index': 14, 'word': 'new', 'start': 42, 'end': 45}
{'entity': 'I-address', 'score': 0.9487, 'index': 15, 'word': 'york', 'start': 46, 'end': 50}
{'entity': 'I-address', 'score': 0.9567, 'index': 16, 'word': 'city', 'start': 51, 'end': 55}


In [12]:
## 合并实体

In [13]:
# Rebuild the NER pipeline so adjacent tokens of one entity are merged into a
# single span. `aggregation_strategy="simple"` replaces the deprecated
# `grouped_entities=True` flag, which maps to the same strategy.
classifier = pipeline(
    task="ner",
    aggregation_strategy="simple",
    model=model,
    tokenizer=tokenizer,
)
classifier("Hugging Face is a French company based in New York City.")



[{'entity_group': 'address',
  'score': 0.9313647,
  'word': 'new york city',
  'start': 42,
  'end': 55}]

# Question Answering


## 默认的模型，又快又好用

In [14]:
from transformers import AutoModelForQuestionAnswering,AutoModelForCausalLM,AutoTokenizer,pipeline

# Checkpoint for question answering; this one is the task default.
modelname = 'distilbert/distilbert-base-cased-distilled-squad'
# Alternatives that were tried:
#   modelname = 'uer/roberta-base-chinese-extractive-qa'
#       (author's note: it could not even answer the China/Beijing question correctly)
#   modelname = 'FlagAlpha/Llama2-Chinese-7b-Chat'
#       (would be loaded with: model = AutoModelForCausalLM.from_pretrained(modelname))
model = AutoModelForQuestionAnswering.from_pretrained(modelname)
tokenizer = AutoTokenizer.from_pretrained(modelname)

taskname = 'question-answering'
defaultModelName = get_default_modelname(taskname)
print(f'{taskname} default model: {defaultModelName}')

# Build the pipeline from the explicitly loaded model and tokenizer.
question_answerer = pipeline(task=taskname, model=model, tokenizer=tokenizer)


question-answering default model: {'pt': ('distilbert-base-cased-distilled-squad', '626af31'), 'tf': ('distilbert-base-cased-distilled-squad', '626af31')}


In [15]:
# Ask a question whose answer is stated in the context, then print the score
# (rounded to 4 decimals), the answer's start/end offsets and the answer text.
preds = question_answerer(
    context="The name of the repository is huggingface/transformers",
    question="What is the name of the repository?",
)
print(
    "score: {}, start: {}, end: {}, answer: {}".format(
        round(preds['score'], 4), preds['start'], preds['end'], preds['answer']
    )
)

score: 0.9327, start: 30, end: 54, answer: huggingface/transformers


In [16]:
# Ask for China's capital with a context that mentions Beijing, then print the
# score (rounded to 4 decimals), the answer's offsets and the answer text.
preds = question_answerer(
    context="On 1 October 1949, CCP Chairman Mao Zedong formally proclaimed the People's Republic of China in Tiananmen Square, Beijing.",
    question="What is the capital of China?",
)
print(
    "score: {}, start: {}, end: {}, answer: {}".format(
        round(preds['score'], 4), preds['start'], preds['end'], preds['answer']
    )
)

score: 0.9458, start: 115, end: 122, answer: Beijing


## Summarization

In [17]:
from transformers import pipeline
taskname = "summarization"

# Show the task's registered default model (for reference only).
defaultModelName = get_default_modelname(taskname)
print(f"{taskname} default Model: {defaultModelName}")

# Use t5-small explicitly; min_length / max_length are supplied when the
# pipeline is constructed.
summarizer = pipeline(
    task=taskname,
    model="t5-small",
    min_length=8,
    max_length=32,
)

summarization default Model: {'pt': ('sshleifer/distilbart-cnn-12-6', 'a4f8f3e'), 'tf': ('t5-small', 'd769bba')}


In [18]:
# Summarize a short passage describing the Transformer model. The triple-quoted
# text is passed as-is, including its leading indentation and line breaks.
summarizer(
    """
    In this work, we presented the Transformer, the first sequence transduction model based entirely on attention, 
    replacing the recurrent layers most commonly used in encoder-decoder architectures with multi-headed self-attention. 
    For translation tasks, the Transformer can be trained significantly faster than architectures based on recurrent or convolutional layers. 
    On both WMT 2014 English-to-German and WMT 2014 English-to-French translation tasks, we achieve a new state of the art. 
    In the former task our best model outperforms even all previously reported ensembles.
    """
)


[{'summary_text': 'the Transformer replaces the recurrent layers most commonly used in encoder-decoder architectures with multi-headed self-attention . the'}]

# Audio 音频处理任务

In [19]:
## 使用本地文件


In [20]:
from transformers import pipeline

# Classify a local audio file with the superb/hubert-base-superb-er checkpoint
# and keep only the score (rounded to 4 decimals) and label of each prediction.
classifier = pipeline(
    task="audio-classification",
    model="superb/hubert-base-superb-er",
)
preds = [
    dict(score=round(pred["score"], 4), label=pred["label"])
    for pred in classifier("/root/tools/data/audio/mlk.flac")
]
preds

Some weights of the model checkpoint at superb/hubert-base-superb-er were not used when initializing HubertForSequenceClassification: ['hubert.encoder.pos_conv_embed.conv.weight_g', 'hubert.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing HubertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing HubertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of HubertForSequenceClassification were not initialized from the model checkpoint at superb/hubert-base-superb-er and are newly initialized: ['hubert.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'hubert.encoder.pos_conv_embed.conv.parametriza

[{'score': 0.4532, 'label': 'hap'},
 {'score': 0.3622, 'label': 'sad'},
 {'score': 0.0943, 'label': 'neu'},
 {'score': 0.0903, 'label': 'ang'}]

# ASR

In [21]:
from transformers import pipeline

taskname = 'automatic-speech-recognition'

# Show the task's registered default model (for reference only).
defaultModelName = get_default_modelname(taskname)
print(f'{taskname} default model: {defaultModelName}')

# No `model` argument is given, so the pipeline uses the task default.
# To choose a model explicitly, pass `model`, e.g.:
#   transcriber = pipeline(task=taskname, model="openai/whisper-small")
transcriber = pipeline(taskname)

No model was supplied, defaulted to facebook/wav2vec2-base-960h and revision 55bb623 (https://hf-mirror.com/facebook/wav2vec2-base-960h).
Using a pipeline without specifying a model name and revision in production is not recommended.


automatic-speech-recognition default model: {'pt': ('facebook/wav2vec2-base-960h', '55bb623')}


Some weights of the model checkpoint at facebook/wav2vec2-base-960h were not used when initializing Wav2Vec2ForCTC: ['wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'wav2vec2.masked_spec_embed']
You sho

In [22]:
# Transcribe the local recording; the bare name on the last line displays the result.
text = transcriber("/root/tools/data/audio/mlk.flac")
text

{'text': 'I HAVE A DREAM BUT ONE DAY THIS NATION WILL RISE UP LIVE UP THE TRUE MEANING OF ITS TREES'}

## 比较默认的wav2vec2-base-960h vs whisper-small
whisper-small: {'text': ' I have a dream that one day this nation will rise up and live out the true meaning of its creed.'}
default: {'text': 'I HAVE A DREAM BUT ONE DAY THIS NATION WILL RISE UP LIVE UP THE TRUE MEANING OF ITS TREES'}

明显whisper-small更优一些

# Computer Vision 计算机视觉
## Image Classification

In [23]:
from transformers import pipeline

taskname = 'image-classification'

# Show the task's registered default model (for reference only).
defaultModelName = get_default_modelname(taskname)
print(f'{taskname} default model: {defaultModelName}')

# No `model` argument is given, so the pipeline uses the task default.
# Alternative that was compared:
#   classifier = pipeline(task=taskname, model="victor/animals-classifier")
classifier = pipeline(taskname)


No model was supplied, defaulted to google/vit-base-patch16-224 and revision 5dca96d (https://hf-mirror.com/google/vit-base-patch16-224).
Using a pipeline without specifying a model name and revision in production is not recommended.


image-classification default model: {'pt': ('google/vit-base-patch16-224', '5dca96d'), 'tf': ('google/vit-base-patch16-224', '5dca96d')}




## 使用本地图片

In [24]:
# Classify a local image and keep only the score (rounded to 4 decimals) and
# label of each prediction, printed one per line.
preds = [
    dict(score=round(pred["score"], 4), label=pred["label"])
    for pred in classifier("/root/tools/data/image/cat-chonk.jpeg")
]
print(*preds, sep="\n")

{'score': 0.4335, 'label': 'lynx, catamount'}
{'score': 0.0348, 'label': 'cougar, puma, catamount, mountain lion, painter, panther, Felis concolor'}
{'score': 0.0324, 'label': 'snow leopard, ounce, Panthera uncia'}
{'score': 0.0239, 'label': 'Egyptian cat'}
{'score': 0.0229, 'label': 'tiger cat'}


### 比较
vit-base-patch16-224 vs animals-classifier

#### animals-classifier
```json
{'score': 0.6456, 'label': 'lion'}
{'score': 0.1472, 'label': 'giraffe'}
{'score': 0.0771, 'label': 'hippo'}
{'score': 0.0669, 'label': 'elephant'}
{'score': 0.0632, 'label': 'dolphin'}
```
#### vit-base-patch16-224 (默认，相对较准)
```json
{'score': 0.4335, 'label': 'lynx, catamount'}
{'score': 0.0348, 'label': 'cougar, puma, catamount, mountain lion, painter, panther, Felis concolor'}
{'score': 0.0324, 'label': 'snow leopard, ounce, Panthera uncia'}
{'score': 0.0239, 'label': 'Egyptian cat'}
{'score': 0.0229, 'label': 'tiger cat'}
```

## Object Detection

In [25]:
from transformers import pipeline

taskname = "object-detection"

# Show the task's registered default model (for reference only).
defaultModelName = get_default_modelname(taskname)
print(f"{taskname} default Model: {defaultModelName}")

# No `model` argument is given, so the pipeline uses the task default.
# Alternative that was compared:
#   detector = pipeline(task=taskname, model="hustvl/yolos-tiny")
detector = pipeline(taskname)

No model was supplied, defaulted to facebook/detr-resnet-50 and revision 2729413 (https://hf-mirror.com/facebook/detr-resnet-50).
Using a pipeline without specifying a model name and revision in production is not recommended.


object-detection default Model: {'pt': ('facebook/detr-resnet-50', '2729413')}


Some weights of the model checkpoint at facebook/detr-resnet-50 were not used when initializing DetrForObjectDetection: ['model.backbone.conv_encoder.model.layer1.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer2.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer3.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer4.0.downsample.1.num_batches_tracked']
- This IS expected if you are initializing DetrForObjectDetection from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DetrForObjectDetection from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [26]:
# Detect objects in the cat image; keep score (rounded to 4 decimals), label
# and bounding box for each detection.
preds = [
    dict(score=round(pred["score"], 4), label=pred["label"], box=pred["box"])
    for pred in detector("/root/tools/data/image/cat-chonk.jpeg")
]
preds

[{'score': 0.9864,
  'label': 'cat',
  'box': {'xmin': 178, 'ymin': 154, 'xmax': 882, 'ymax': 598}}]

In [27]:
# Detect objects in the dog-and-cat image; keep score (rounded to 4 decimals),
# label and bounding box for each detection.
preds = [
    dict(score=round(pred["score"], 4), label=pred["label"], box=pred["box"])
    for pred in detector("/root/tools/data/image/dog-and-cat.jpg")
]
preds

[{'score': 0.9996,
  'label': 'dog',
  'box': {'xmin': 0, 'ymin': 56, 'xmax': 271, 'ymax': 389}},
 {'score': 0.9843,
  'label': 'cat',
  'box': {'xmin': 340, 'ymin': 41, 'xmax': 601, 'ymax': 390}}]

### 对比
#### facebook/detr-resnet-50 (default)
```json
[{'score': 0.9996,
  'label': 'dog',
  'box': {'xmin': 0, 'ymin': 56, 'xmax': 271, 'ymax': 389}},
 {'score': 0.9843,
  'label': 'cat',
  'box': {'xmin': 340, 'ymin': 41, 'xmax': 601, 'ymax': 390}}]
```
#### hustvl/yolos-tiny
```json
[{'score': 0.9846,
  'label': 'dog',
  'box': {'xmin': 339, 'ymin': 40, 'xmax': 600, 'ymax': 393}},
 {'score': 0.9994,
  'label': 'dog',
  'box': {'xmin': 0, 'ymin': 59, 'xmax': 271, 'ymax': 386}},
 {'score': 0.9637,
  'label': 'frisbee',
  'box': {'xmin': 461, 'ymin': 120, 'xmax': 557, 'ymax': 190}}]
```