In [None]:
from transformers import pipeline

## pipeline
`pipeline()` 函数将模型与其必要的预处理和后处理步骤连接起来，使我们能够直接输入任何文本并获得最终的答案。

将一些文本传递到pipeline时涉及三个主要步骤：
- 文本被预处理为模型可以理解的格式。
- 预处理的输入被传递给模型。
- 模型的预测结果经过后处理，转换为人类可以理解的最终结果。

In [None]:
# Sentiment analysis on a single input string.
# The task name passed to pipeline() selects which kind of pipeline is built;
# no model is named here, so the library picks its default for the task.
classifier = pipeline("sentiment-analysis")
answer = classifier(
    "I've been waiting for a HuggingFace course my whole life.")
print(answer)
# Output recorded by the original author. It is kept as a comment rather than
# a bare string literal so the cell does not echo a stale string as its
# Out value:
# [{'label': 'POSITIVE', 'score': 0.9598047137260437}]

In [None]:
# Several inputs in one call: pass a list of strings to the `classifier`
# pipeline object created earlier in the notebook.
answer = classifier(
    ["I've been waiting for a HuggingFace course my whole life.",
     "I hate this so much!"]
)
print(answer)
# Output recorded by the original author (one dict per input sentence). It is
# kept as a comment rather than a bare string literal so the cell does not
# echo a stale string as its Out value:
# [{'label': 'POSITIVE', 'score': 0.9598047137260437},
#  {'label': 'NEGATIVE', 'score': 0.9994558095932007}]

## 零样本分类（zero-shot）
无需在自己的数据上对模型进行微调即可直接使用。它可以针对您指定的任意候选标签列表直接返回概率分数。

In [None]:
# Zero-shot classification: the candidate labels are supplied at call time.
# NOTE: this rebinds `classifier`; any pipeline previously stored under that
# name is replaced from this cell onwards.
classifier = pipeline("zero-shot-classification")
answer = classifier(
    "This is a course about the Transformers library",
    candidate_labels=["education", "politics", "business"],
)
print(answer)
# Output recorded by the original author. It is kept as a comment rather than
# a bare string literal so the cell does not echo a stale string as its
# Out value:
# {'sequence': 'This is a course about the Transformers library',
#  'labels': ['education', 'business', 'politics'],
#  'scores': [0.8445963859558105, 0.111976258456707, 0.043427448719739914]}

## 文本生成（text generation）

In [None]:
# Text generation; no model is named, so the task's default model is used.
generator = pipeline("text-generation")
prompt = "In this course, we will teach you how to"
# Request 2 generated sequences, each produced with max_length=15.
generation_options = {"num_return_sequences": 2, "max_length": 15}
answer = generator(prompt, **generation_options)
print(answer)

In [None]:
# Same task as before, but the `model` argument names the checkpoint to use.
generator = pipeline("text-generation", model="distilgpt2")
prompt = "In this course, we will teach you how to"
# Request 2 generated sequences, each produced with max_length=30.
answer = generator(prompt, num_return_sequences=2, max_length=30)
print(answer)

## mask filling
填充给定文本中的空白  

注意：模型填充的是特殊的 `<mask>` 词，它通常被称为掩码标记。其他掩码填充模型可能使用不同的掩码标记。

In [None]:
# Fill-mask: predict the word hidden behind the mask token.
unmasker = pipeline("fill-mask")
# One word of the sentence is masked out. Author's note: the way a word is
# masked differs from model to model.
masked_sentence = "This course will teach you all about <mask> models."
# top_k=2 asks for two candidate answers.
answer = unmasker(masked_sentence, top_k=2)
print(answer)

## 命名实体识别（NER）

In [None]:
# Named-entity recognition with an explicitly chosen checkpoint.
# aggregation_strategy="simple" merges tokens that belong to the same entity
# into one result. It replaces the deprecated `grouped_entities=True` flag,
# which the library maps to this same strategy.
ner = pipeline(
    "ner",
    model="StanfordAIMI/stanford-deidentifier-base",
    aggregation_strategy="simple",
)
answer = ner("My name is Sylvain and I work at Hugging Face in Brooklyn.")
print(answer)

## 问答系统
使用来自给定上下文的信息回答问题

In [None]:
# Extractive question answering: the answer is taken from `context`.
question_answerer = pipeline("question-answering")
answer = question_answerer(
    question="Where do I work?",
    context="My name is Sylvain and I work at Hugging Face in Brooklyn",
)
print(answer)
# Output recorded by the original author; 'start'/'end' match the character
# span of 'Hugging Face' in the context string (context[33:45]). It is kept
# as a comment rather than a bare string literal so the cell does not echo a
# stale string as its Out value; stray paste residue that had crept into the
# literal was dropped:
# {'score': 0.6385916471481323, 'start': 33, 'end': 45, 'answer': 'Hugging Face'}

## 文本摘要
可以指定结果的 max_length 或 min_length

In [None]:
# Summarization; no model is named, so the task's default model is used.
summarizer = pipeline("summarization")

# The article is passed exactly as written below, including its leading
# newline and the indentation of each line.
article = """
    America has changed dramatically during recent years. Not only has the number of 
    graduates in traditional engineering disciplines such as mechanical, civil, 
    electrical, chemical, and aeronautical engineering declined, but in most of 
    the premier American universities engineering curricula now concentrate on 
    and encourage largely the study of engineering science. As a result, there 
    are declining offerings in engineering subjects dealing with infrastructure, 
    the environment, and related issues, and greater concentration on high 
    technology subjects, largely supporting increasingly complex scientific 
    developments. While the latter is important, it should not be at the expense 
    of more traditional engineering.

    Rapidly developing economies such as China and India, as well as other 
    industrial countries in Europe and Asia, continue to encourage and advance 
    the teaching of engineering. Both China and India, respectively, graduate 
    six and eight times as many traditional engineers as does the United States. 
    Other industrial countries at minimum maintain their output, while America 
    suffers an increasingly serious decline in the number of engineering graduates 
    and a lack of well-educated engineers.
"""

# Author's note: max_length / min_length control how long the summary is
# (neither is set in this call).
answer = summarizer(article)
print(answer)

## 翻译
可以指定结果的 max_length 或 min_length

In [None]:
# French -> English translation with an explicitly chosen checkpoint.
french_sentence = "Ce cours est produit par Hugging Face."
translator = pipeline("translation", model="Helsinki-NLP/opus-mt-fr-en")
# Author's note: max_length / min_length control how long the translation is
# (neither is set in this call).
answer = translator(french_sentence)
print(answer)

## 偏见与局限
使用原始模型时，很容易生成带有性别歧视、种族主义或恐同倾向的内容。这种固有偏见不会因为对模型进行微调而消失。

In [None]:
# Bias demonstration: fill the same masked slot for two prompts that differ
# only in the subject, and print the predicted tokens for each.
unmasker = pipeline("fill-mask", model="bert-base-uncased")
for prompt in ("This man works as a [MASK].", "This woman works as a [MASK]."):
    result = unmasker(prompt)
    print([candidate["token_str"] for candidate in result])

## datasets使用

In [None]:
from datasets import list_metrics, load_metric
from pprint import pprint
from datasets import list_datasets, load_dataset
# NOTE(review): list_datasets / list_metrics / load_metric appear to be
# deprecated in newer `datasets` releases -- confirm against the installed
# version before relying on this cell.

# Listing of all available datasets.
datasets_list = list_datasets()
# print(len(datasets_list))

# Load the training split of the SST dataset.
dataset = load_dataset('sst', split='train')
# print(len(dataset))

# Author's note: each sample is stored as a dict holding the label, the
# original sentence, the token sequence and the syntactic parse tree.
# pprint(dataset[0])

# Listing of all evaluation metrics offered by `datasets`.
metrics_list = list_metrics()
# print(metrics_list)

# Load the accuracy metric and compare ground-truth labels with predictions.
accuracy_metric = load_metric('accuracy')
true_labels = [0, 1, 0]
predicted_labels = [1, 1, 0]
results = accuracy_metric.compute(
    predictions=predicted_labels,
    references=true_labels,
)
print(results)