# Hugging Face Transformers

1. Text Classification
You can use the pipeline function for quick experimentation:

In [2]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.46.1-py3-none-any.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.1/44.1 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.23.2 (from transformers)
  Downloading huggingface_hub-0.26.2-py3-none-any.whl.metadata (13 kB)
Collecting safetensors>=0.4.1 (from transformers)
  Downloading safetensors-0.4.5-cp311-cp311-macosx_11_0_arm64.whl.metadata (3.8 kB)
Collecting tokenizers<0.21,>=0.20 (from transformers)
  Downloading tokenizers-0.20.2-cp311-cp311-macosx_11_0_arm64.whl.metadata (6.7 kB)
Downloading transformers-4.46.1-py3-none-any.whl (10.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.0/10.0 MB[0m [31m32.2 MB/s[0m eta [36m0:00:00[0m00:01[0m0:01[0m
[?25hDownloading huggingface_hub-0.26.2-py3-none-any.whl (447 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m447.5/447.5 kB[0m [31m24.5 MB/s[0m eta [36m0:00:00[0m
[

In [15]:
pip install datasets

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-18.0.0-cp311-cp311-macosx_12_0_arm64.whl.metadata (3.3 kB)
Collecting requests>=2.32.2 (from datasets)
  Downloading requests-2.32.3-py3-none-any.whl.metadata (4.6 kB)
Collecting tqdm>=4.66.3 (from datasets)
  Downloading tqdm-4.66.6-py3-none-any.whl.metadata (57 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.6/57.6 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-macosx_11_0_arm64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/

In [7]:
from transformers import pipeline

classifier = pipeline(
    "text-classification",
    model="distilbert/distilbert-base-uncased-finetuned-sst-2-english",
    framework="pt",
    device=0  # explicitly specify PyTorch (though this is default)
)
result = classifier("I love using Hugging Face Transformers!")
print(result)


[{'label': 'POSITIVE', 'score': 0.9971315860748291}]


This will give you a quick result on the sentiment of the text.

To fine-tune a text classification model:


In [22]:
!pip uninstall pyarrow
# pip install pyarrow==12.0.1

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Found existing installation: pyarrow 12.0.1
Uninstalling pyarrow-12.0.1:
  Would remove:
    /opt/anaconda3/lib/python3.11/site-packages/pyarrow-12.0.1.dist-info/*
    /opt/anaconda3/lib/python3.11/site-packages/pyarrow/*
Proceed (Y/n)? ^C
[31mERROR: Operation cancelled by user[0m[31m
[0m

In [23]:
# First, try just the dataset import
from datasets import load_dataset
dataset = load_dataset("imdb")
print("Dataset loaded successfully")

# Then try tokenizer
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
print("Tokenizer loaded successfully")

# Then try model
from transformers import AutoModelForSequenceClassification
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased")
print("Model loaded successfully")

AttributeError: module 'pyarrow.lib' has no attribute 'ListViewType'

In [13]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset
import torch


# Load dataset and model
dataset = load_dataset("imdb")
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=2  # binary classification for IMDB
)

# Set device
device = "mps" if torch.backends.mps.is_available() else "cpu"
model = model.to(device)

# Tokenize the dataset
def tokenize_function(examples):
    return tokenizer(
        examples["text"], 
        truncation=True, 
        padding=True,
        return_tensors="pt"  # explicitly request PyTorch tensors
    )

tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Fine-tuning
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    learning_rate=2e-5,
    no_cuda=True,  # disable CUDA
    use_mps_device=True if device == "mps" else False  # enable MPS if available
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"]
)

trainer.train()

RuntimeError: Failed to import transformers.trainer because of the following error (look up to see its traceback):
Failed to import transformers.integrations.integration_utils because of the following error (look up to see its traceback):
Failed to import transformers.modeling_tf_utils because of the following error (look up to see its traceback):
Your currently installed version of Keras is Keras 3, but this is not yet supported in Transformers. Please install the backwards-compatible tf-keras package with `pip install tf-keras`.

2. Named Entity Recognition (NER)
For Named Entity Recognition, you can use:

In [11]:
from transformers import pipeline

ner = pipeline("ner", grouped_entities=True)
result = ner("Hugging Face Inc. is a company based in New York City.")
print(result)

No model was supplied, defaulted to dbmdz/bert-large-cased-finetuned-conll03-english and revision 4c53496 (https://huggingface.co/dbmdz/bert-large-cased-finetuned-conll03-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


config.json:   0%|          | 0.00/998 [00:00<?, ?B/s]

RuntimeError: Failed to import transformers.models.bert.modeling_tf_bert because of the following error (look up to see its traceback):
Your currently installed version of Keras is Keras 3, but this is not yet supported in Transformers. Please install the backwards-compatible tf-keras package with `pip install tf-keras`.

In [None]:
from transformers import AutoTokenizer, AutoModelForTokenClassification, Trainer, TrainingArguments

tokenizer = AutoTokenizer.from_pretrained("dbmdz/bert-large-cased-finetuned-conll03-english")
model = AutoModelForTokenClassification.from_pretrained("dbmdz/bert-large-cased-finetuned-conll03-english")


3. Text Generation
To generate text using GPT-2:



In [None]:
from transformers import pipeline

generator = pipeline("text-generation", model="gpt2")
result = generator("Once upon a time", max_length=50, num_return_sequences=1)
print(result)

# If you want to fine-tune GPT-2, you can use:

from transformers import GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")


4. Text Summarization
For text summarization, use BART or T5:

In [None]:
from transformers import pipeline

summarizer = pipeline("summarization")
text = """
Hugging Face Inc. is a company based in New York City. Its headquarters are in DUMBO, therefore very close to the Manhattan Bridge.
Hugging Face is known for its open-source library Transformers, which provides pre-trained models for natural language processing.
"""
result = summarizer(text, max_length=50, min_length=25, do_sample=False)
print(result)

5. Question Answering
For question answering, you can do:

In [None]:
from transformers import pipeline

question_answerer = pipeline("question-answering")
result = question_answerer(question="Where is Hugging Face based?", context="Hugging Face Inc. is based in New York City.")
print(result)

6. Translation Example
For translation, you can use the mBART model:

In [None]:
from transformers import pipeline

translator = pipeline("translation_en_to_fr", model="Helsinki-NLP/opus-mt-en-fr")
result = translator("Hello, how are you?")
print(result)

7. 迁移学习 (Transfer Learning)

迁移学习是利用在大型数据集上训练的预训练模型（如 BERT、GPT-2），然后对特定任务进行微调。示例代码如上所示，BERT 微调情感分析任务即为迁移学习的典型应用。


8. 零样本学习 (Zero-Shot Learning)

使用零样本分类模型（例如 BART 或 RoBERTa）进行分类：

In [None]:
from transformers import pipeline

classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
sequence_to_classify = "This is a great product, I love it!"
candidate_labels = ["positive", "negative", "neutral"]
result = classifier(sequence_to_classify, candidate_labels)
print(result)

9. 少量样本学习 (Few-Shot Learning)

利用 GPT-3 或类似的大型语言模型，通过提供几个示例进行少量样本学习：

In [None]:
import openai

openai.api_key = 'your-api-key'

prompt = """
The following are examples of classifying movie reviews as positive or negative:

Review: "This movie was amazing, the acting was great!"
Label: Positive

Review: "The film was dull and boring."
Label: Negative

Review: "I had a great time watching it."
Label:
"""
response = openai.Completion.create(
    engine="text-davinci-003",
    prompt=prompt,
    max_tokens=5
)

print(response.choices[0].text.strip())

10. 从0训练模型 (Training from Scratch)

从头开始训练一个 Transformer 模型：

In [None]:
from transformers import AutoTokenizer, AutoConfig, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset

# 定义模型配置
config = AutoConfig.from_pretrained("bert-base-uncased")
model = AutoModelForSequenceClassification(config)

# 加载数据集
dataset = load_dataset("imdb")
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# 对数据进行tokenize
def tokenize_function(examples):
    return tokenizer(examples["text"], truncation=True, padding=True)

tokenized_datasets = dataset.map(tokenize_function, batched=True)

# 定义训练参数
training_args = TrainingArguments(output_dir="./results", evaluation_strategy="epoch")

# 使用 Trainer 进行训练
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"]
)

trainer.train()

有难度的类型

除了上面提到的常见任务，NLP 中还有一些更具挑战性的任务：

多任务学习 (Multi-Task Learning)：训练一个模型同时完成多个不同类型的任务，例如同时进行文本分类和命名实体识别。

领域自适应 (Domain Adaptation)：将模型从一个领域迁移到另一个领域，通常需要应对不同领域之间的差异，例如从新闻数据迁移到医学数据。

强化学习 (Reinforcement Learning for NLP)：在对话生成或摘要中使用强化学习来优化生成质量。

元学习 (Meta Learning)：通过学习如何学习，提高模型在少样本场景中的表现。

对抗性训练 (Adversarial Training)：训练模型在面对对抗性输入时具有更好的鲁棒性，防止模型受到对抗性攻击的影响。