# 预训练

## 安装相关依赖

In [3]:
# -q: 这是pip install命令的一个选项，表示静默安装（quiet mode），减少输出信息。
# -r ./requirements.txt: 这是pip install命令的一个选项，表示从指定的文件中读取要安装的包。
!pip install -r ./requirements.txt

Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple
Collecting torch==2.1.2 (from -r ./requirements.txt (line 8))
  Using cached https://pypi.tuna.tsinghua.edu.cn/packages/07/b4/a064d86c3879a13912e6bf6742934c11dc547e728064e58fd65073664e01/torch-2.1.2-cp39-none-macosx_11_0_arm64.whl (59.6 MB)
Installing collected packages: torch
  Attempting uninstall: torch
    Found existing installation: torch 2.1.0
    Uninstalling torch-2.1.0:
      Successfully uninstalled torch-2.1.0
Successfully installed torch-2.1.2


In [4]:
# Ignore insignificant warnings (ex: deprecations)
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Set a seed for reproducibility
import torch


def fix_torch_seed(seed=42):
    """Seed PyTorch's random number generators and make cuDNN deterministic.

    Args:
        seed: Integer seed handed to the PyTorch generators. Defaults to 42.
    """
    torch.manual_seed(seed)  # seed the default PyTorch generator
    # Seed every GPU, not only the current one. PyTorch documents this call
    # as silently ignored when CUDA is not available, so it is safe on CPU.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True  # make cuDNN pick deterministic algorithms
    torch.backends.cudnn.benchmark = False  # disable the cuDNN benchmark auto-tuner


fix_torch_seed()

## 加载预训练模型

In [None]:
# Path passed to from_pretrained when loading the general model and its tokenizer.
model_path_or_name = "./models/upstage/TinySolar-248m-4k"

In [None]:
from transformers import AutoModelForCausalLM
# Load the causal language model found at model_path_or_name.
tiny_general_model = AutoModelForCausalLM.from_pretrained(
    model_path_or_name,
    device_map="cpu",  # change to auto if you have access to a GPU
    torch_dtype=torch.bfloat16  # request bfloat16 as the torch dtype
)

In [None]:
from transformers import AutoTokenizer
# Load the tokenizer from model_path_or_name, the same variable used for the model load.
tiny_general_tokenizer = AutoTokenizer.from_pretrained(
    model_path_or_name
)

## 文本生成任务

In [None]:
# Natural-language prompt that the model is asked to continue.
prompt = "I am an engineer. I love"

In [None]:
# Tokenize the prompt into PyTorch tensors and move them to the model's device,
# so generation keeps working if the model is loaded on a GPU instead of the CPU.
inputs = tiny_general_tokenizer(prompt, return_tensors="pt").to(tiny_general_model.device)

In [None]:
from transformers import TextStreamer
# Streamer that decodes with the general tokenizer; passed to generate() as `streamer`.
streamer = TextStreamer(
    tiny_general_tokenizer,
    # If you set to false, the model will first return the prompt and then the generated text
    skip_prompt=True,  # whether to skip the input prompt and return only the generated text
    skip_special_tokens=True  # whether to skip special tokens such as <eos>
)

In [None]:
# Greedy generation streamed through `streamer`. No temperature is passed:
# temperature only applies to sampling, which do_sample=False turns off.
outputs = tiny_general_model.generate(
    **inputs,
    streamer=streamer,
    use_cache=True,  # whether to use the cache; False disables it
    max_new_tokens=128,
    do_sample=False,  # whether to sample; False means greedy search
    repetition_penalty=1.1  # repetition penalty, controls repeated tokens in the generated text
)

## Python代码生成

In [None]:
# Code prompt: the first line of a Python function definition for the model to complete.
prompt = "def find_max(numbers):"

In [None]:
# Tokenize the prompt into PyTorch tensors and move them to the general model's device.
inputs = tiny_general_tokenizer(prompt, return_tensors="pt").to(tiny_general_model.device)

# Streamer that decodes with the general tokenizer; passed to generate() as `streamer`.
streamer = TextStreamer(
    tiny_general_tokenizer,
    skip_prompt=True,  # Set to false to include the prompt in the output
    skip_special_tokens=True
)

In [None]:
# Greedy generation streamed through `streamer`. No temperature is passed:
# temperature only applies to sampling, which do_sample=False turns off.
outputs = tiny_general_model.generate(
    **inputs,
    streamer=streamer,
    use_cache=True,
    max_new_tokens=128,
    do_sample=False,
    repetition_penalty=1.1
)

## 使用针对Python代码微调后的模型

In [None]:
# Path to the code-instruct checkpoint, loaded below as the fine-tuned model and tokenizer.
model_path_or_name = "./models/upstage/TinySolar-248m-4k-code-instruct"

In [None]:
# Load the causal language model found at model_path_or_name on the CPU, requesting bfloat16.
tiny_finetuned_model = AutoModelForCausalLM.from_pretrained(
    model_path_or_name,
    device_map="cpu",
    torch_dtype=torch.bfloat16,
)

# Load the tokenizer from the same path as the model.
tiny_finetuned_tokenizer = AutoTokenizer.from_pretrained(
    model_path_or_name
)

In [None]:
# Code prompt: the first line of a Python function definition for the model to complete.
prompt = "def find_max(numbers):"

# Tokenize the prompt and move the tensors to the fine-tuned model's device.
inputs = tiny_finetuned_tokenizer(
    prompt, return_tensors="pt"
).to(tiny_finetuned_model.device)

# Streamer that decodes with the fine-tuned tokenizer; passed to generate() as `streamer`.
streamer = TextStreamer(
    tiny_finetuned_tokenizer,
    skip_prompt=True,
    skip_special_tokens=True
)

# Greedy generation streamed through `streamer`. No temperature is passed:
# temperature only applies to sampling, which do_sample=False turns off.
outputs = tiny_finetuned_model.generate(
    **inputs,
    streamer=streamer,
    use_cache=True,
    max_new_tokens=128,
    do_sample=False,
    repetition_penalty=1.1
)

- 仅经过预训练的模型已具备相关知识，但表达能力有限
- 通过微调可以提升模型的表达能力
- 二次预训练可以为模型提供更多相关知识

## 在大量Python代码上进行预训练的模型

In [None]:
# Path to the "-py" checkpoint, loaded below as the custom model and tokenizer.
model_path_or_name = "./models/upstage/TinySolar-248m-4k-py"

In [None]:
# Load the causal language model found at model_path_or_name on the CPU, requesting bfloat16.
tiny_custom_model = AutoModelForCausalLM.from_pretrained(
    model_path_or_name,
    device_map="cpu",
    torch_dtype=torch.bfloat16,
)

# Load the tokenizer from the same path as the model.
tiny_custom_tokenizer = AutoTokenizer.from_pretrained(
    model_path_or_name
)

In [None]:
# Code prompt: the first line of a Python function definition for the model to complete.
prompt = "def find_max(numbers):"

# Tokenize the prompt and move the tensors to the custom model's device.
inputs = tiny_custom_tokenizer(
    prompt, return_tensors="pt"
).to(tiny_custom_model.device)

# Streamer that decodes with the custom tokenizer; passed to generate() as `streamer`.
streamer = TextStreamer(
    tiny_custom_tokenizer,
    skip_prompt=True,
    skip_special_tokens=True
)

# Generate up to 128 new tokens with sampling disabled, streaming the text via `streamer`.
outputs = tiny_custom_model.generate(
    **inputs, streamer=streamer,
    use_cache=True,
    max_new_tokens=128,
    do_sample=False,
    repetition_penalty=1.1
)