In [1]:
!pip install datasets evaluate transformers[sentencepiece]
!pip install git-lfs

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m19.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m7

In [None]:
# Set the global git identity (author email and name) via `git config --global`.
# The values below are placeholders.
!git config --global user.email "you@example.com"
!git config --global user.name "Your Name"

In [None]:
# Log in to the Hugging Face Hub from inside the notebook.
from huggingface_hub import notebook_login

notebook_login()

In [None]:
from transformers import TrainingArguments

# TrainingArguments holds the settings that control the training process:
#   - output_dir: "bert-finetuned-mrpc" is the directory where training output is saved
#   - save_strategy="epoch": save once at the end of every epoch
#   - push_to_hub=True: push the model to the Hugging Face Model Hub after training
training_args = TrainingArguments(
    output_dir="bert-finetuned-mrpc",
    save_strategy="epoch",
    push_to_hub=True,
)

In [None]:
# CamemBERT is a pretrained model for the Masked Language Modeling (MLM) task.
from transformers import AutoModelForMaskedLM, AutoTokenizer

checkpoint = "camembert-base"

# The tokenizer and the model are both loaded from the same checkpoint name.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForMaskedLM.from_pretrained(checkpoint)

In [None]:
# Share the model and tokenizer by uploading them to the Hugging Face Hub
# with push_to_hub.

# Upload the model to the repository named "dummy-model".
model.push_to_hub("dummy-model")
# Upload the tokenizer to the same repository.
tokenizer.push_to_hub("dummy-model")
# Upload the tokenizer to a repository owned by the "huggingface" organization.
# The organization is given as the namespace part of the repo id
# ("<organization>/<repo_name>"); the separate `organization` keyword is deprecated.
tokenizer.push_to_hub("huggingface/dummy-model")
# Same upload, authenticating with an explicit access token.
# <TOKEN> is a placeholder that must be replaced with a real token; do not commit
# a real token into the notebook. `token` supersedes the deprecated
# `use_auth_token` keyword.
tokenizer.push_to_hub("huggingface/dummy-model", token="<TOKEN>")

In [None]:
# Import a selection of helper functions from huggingface_hub.
# NOTE(review): some of these names (e.g. list_metrics, update_repo_visibility)
# may be deprecated or absent in newer huggingface_hub releases — confirm against
# the installed version.

from huggingface_hub import(
    #User management
    login,
    logout,
    whoami, # returns the account information of the currently logged-in user, including the username and the permissions tied to the account

    #Repository creation and management
    create_repo,
    delete_repo,
    update_repo_visibility, # updates a repository's visibility setting; it can be set to public or private

    #Some methods to retrieve/change information about the content
    list_models,
    list_datasets,
    list_metrics,
    list_repo_files,
    upload_file,
    delete_file,
)


In [None]:
# Create a repository named "dummy-model".
from huggingface_hub import create_repo

create_repo(repo_id="dummy-model")

In [None]:
# Create a repository that belongs to an organization.
# The organization is expressed as the namespace part of the repo id
# ("<organization>/<repo_name>"), here the "huggingface" organization.
from huggingface_hub import create_repo

create_repo("huggingface/dummy-model")

In [None]:
# Upload a local file to a given repository on the Hugging Face Hub with upload_file.
# All arguments are passed by keyword so the call does not depend on the
# positional order of upload_file's parameters.
from huggingface_hub import upload_file

upload_file(
    path_or_fileobj="<path_to_file>/config.json",  # path of the local file to upload
    path_in_repo="config.json",  # path / file name the file gets inside the repository
    repo_id="<namespace>/dummy-model",  # repository identifier, e.g. "my-username/dummy-model" is the dummy-model repo owned by user my-username
)

In [None]:
# Use Repository to clone a repository from the Hugging Face Hub into a local
# directory, or to associate a local directory with the remote repository.
# Repository is imported here so the cell runs on a fresh kernel.
from huggingface_hub import Repository

# "<path_to_dummy_folder>" is the local directory in which the repository is
# created or cloned; "<namespace>/dummy-model" is the remote repository to clone.
repo = Repository("<path_to_dummy_folder>", clone_from="<namespace>/dummy-model")

In [None]:
# Git operations, typically used to synchronize changes between the local clone
# and the repository on the Hugging Face Hub.

repo.git_pull()  # pull the latest updates from the remote repository into the local one
repo.git_add()  # stage the local changes so they are ready to be committed
repo.git_commit()  # commit the staged changes; this modifies the local repository
repo.git_push()  # push the local commits; this modifies the remote repository
# Tag the current commit. Repository exposes tagging as add_tag(tag_name) rather
# than a git_tag method; "<tag_name>" is a placeholder for the real tag name.
repo.add_tag("<tag_name>")

In [None]:
# Save the state of the model and of the tokenizer to a local folder.
model.save_pretrained(save_directory="<path_to_dummy_folder>")
tokenizer.save_pretrained(save_directory="<path_to_dummy_folder>")

In [None]:
"""
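Load a pretrained CamemBERT model and tokenizer from the Hugging Face Hub.
Work with the model (training, fine-tuning, etc.).
Save the resulting model and tokenizer to a local folder for later use.

The complete workflow is shown below: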
"""
from transformers import AutoModelForMaskedLM, AutoTokenizer

checkpoint = "camembert-base"

# Load the tokenizer and the model from the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForMaskedLM.from_pretrained(checkpoint)

# Placeholder: do whatever is needed with the model here (train it, fine-tune it, ...).

# Write both objects to the same local folder.
model.save_pretrained(save_directory="<path_to_dummy_folder>")
tokenizer.save_pretrained(save_directory="<path_to_dummy_folder>")