# 源数据处理

## 1 导入相关包

In [3]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, DataCollatorForSeq2Seq, TrainingArguments, Trainer

## 2 加载数据集

In [4]:
data_train=r'./train.csv'
data_test=r'./test.csv'
ds=load_dataset('csv',data_files={'train':data_train, 'test': data_test},
                                split=['train', 'test'])
ds

Downloading and preparing dataset csv/default to C:/Users/Administrator/.cache/huggingface/datasets/csv/default-2e83d9102f537b73/0.0.0/eea64c71ca8b46dd3f537ed218fc9bf495d5707789152eb2764f5c78fa66d59d...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Dataset csv downloaded and prepared to C:/Users/Administrator/.cache/huggingface/datasets/csv/default-2e83d9102f537b73/0.0.0/eea64c71ca8b46dd3f537ed218fc9bf495d5707789152eb2764f5c78fa66d59d. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

[Dataset({
     features: ['src', 'tgt'],
     num_rows: 92644
 }),
 Dataset({
     features: ['src', 'tgt'],
     num_rows: 1000
 })]

## 4 数据处理

In [8]:
model_path='YeungNLP/bloom-6b4-zh'
tokenizer = AutoTokenizer.from_pretrained(model_path)

ValueError: check_hostname requires server_hostname

In [4]:
def process_func(examples):
    MAX_LENGTH = 150
    contents='机器翻译:\n' + examples['src']
    # 对输入与label进行编码
    inputs=tokenizer(contents)
    labels = tokenizer(text_target=examples['tgt'] + tokenizer.eos_token)
    input_ids=inputs["input_ids"]+labels["input_ids"]
    attention_mask=inputs["attention_mask"] + labels["attention_mask"]
    labels = [-100] * len(inputs["input_ids"]) + labels["input_ids"]
    # 数据截断
    if len(input_ids) > MAX_LENGTH:
        input_ids = input_ids[:MAX_LENGTH]
        attention_mask = attention_mask[:MAX_LENGTH]
        labels = labels[:MAX_LENGTH]
    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": labels
    }

In [5]:
tokenized_train=ds[0].map(process_func, remove_columns=ds[0].column_names)

## 5 创建模型

In [23]:
import torch
model = AutoModelForCausalLM.from_pretrained(model_path,low_cpu_mem_usage=True,torch_dtype=torch.half,
                                            load_in_4bit=True, # 4bit混合训练
                                            bnb_4bit_compute_dtype=torch.half,
                                             bnb_4bit_quant_type='nf4', # nf4 量化
                                             bnb_4bit_use_double_quant=True
                                            )

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [7]:
model.enable_input_require_grads() # 执行这行代码，在使用gradient_checkpointing时才不会报错

In [8]:
# 6.1 创建配置文件
from peft import LoraConfig,get_peft_model,TaskType
comfig = LoraConfig(task_type=TaskType.CAUSAL_LM)
comfig

LoraConfig(peft_type=<PeftType.LORA: 'LORA'>, auto_mapping=None, base_model_name_or_path=None, revision=None, task_type=<TaskType.CAUSAL_LM: 'CAUSAL_LM'>, inference_mode=False, r=8, target_modules=None, lora_alpha=8, lora_dropout=0.0, fan_in_fan_out=False, bias='none', modules_to_save=None, init_lora_weights=True, layers_to_transform=None, layers_pattern=None, rank_pattern={}, alpha_pattern={})

In [9]:
# 6.2 创建模型
model_lora = get_peft_model(model,comfig)

In [10]:
model_lora=model_lora.half()

In [11]:
model_lora.print_trainable_parameters()

trainable params: 3,932,160 || all params: 6,234,353,664 || trainable%: 0.06307245645536737


In [12]:
model.device

device(type='cuda', index=0)

## 7 配置训练参数

In [13]:
import os
os.environ["WANDB_DISABLED"] = "true" # 防止日志输出到wandb.ai
args= TrainingArguments(
                                  output_dir='./modelcheak/m5',
                                  logging_dir=r'./modelcheak/m5',
                                  per_device_train_batch_size=16,  # batch_size
                                  gradient_accumulation_steps=2,
                                  logging_steps=20,
                                  optim="paged_adamw_32bit",  # 分页优化器，QLora要使用
                                  num_train_epochs=1,
                                  gradient_checkpointing=True
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


## 8 创建训练器

In [14]:
trainr=Trainer(
    args=args,
    model=model_lora,
    train_dataset=tokenized_train,
    tokenizer=tokenizer,
    data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer)
)

In [15]:
trainr.train()

You're using a BloomTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Step,Training Loss
20,3.9167
40,3.5068
60,2.92
80,2.5355
100,2.3735
120,2.2766
140,2.2146
160,2.1853
180,2.2397
200,2.0766


TrainOutput(global_step=2895, training_loss=1.9793793515220208, metrics={'train_runtime': 5295.8774, 'train_samples_per_second': 17.494, 'train_steps_per_second': 0.547, 'total_flos': 8.78107053612073e+16, 'train_loss': 1.9793793515220208, 'epoch': 1.0})

## 9 权重合并与

In [19]:
model.device

device(type='cuda', index=0)

In [35]:
from peft import PeftModel
# model_id 是checkpoint那个路径
prft_model=PeftModel.from_pretrained(model=model,model_id=r"C:\Users\30535\Desktop\CodeProgram\Python\deepstudy\code2\使用Transformer进行中英文翻译\modelcheak\m5\checkpoint-2500")
# 权重合并
prft_model=prft_model.to('cuda')

In [None]:
# 模型保存
# merge_model.save_pretrained('./modelcheak/trans11')

In [39]:
import re
import sacrebleu
def is_english_sentence(sentence):
    # 使用正则表达式检查句子中是否包含英文字母
    english_pattern = re.compile(r'[a-zA-Z]')
    match = english_pattern.search(sentence)
    
    if match:
        return True
    else:
        return False
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.bleu_score import SmoothingFunction

smooth = SmoothingFunction().method1
bleu_scores=[]
m1,m2=[],[]
m3,m4=[],[]
import time
t=time.time()
for i in range(len(ds[1]['src'])):
    if i%40==0:
        print(i/len(ds[1]['src']))
    x="机器翻译:\n{}".format(ds[1]['src'][i]).strip()
    ipt = tokenizer(x,return_tensors='pt').to('cuda')
#     print('被翻译句子： ',ds[1]['src'][i])
    y=tokenizer.decode(prft_model.generate(**ipt,max_length=150, do_sample=False)[0],skip_special_tokens=True)[len(x):]
#     print('翻译结果: ',y)
#     print()
    if is_english_sentence(ds[1]['tgt'][i]):
        m1.append(ds[1]['tgt'][i])
        m2.append([y])
    else:
        m3.append(list(ds[1]['tgt'][i][:-1]))
        m4.append([list(y)[:-1]])
# print('时间',time.time()-t)
smooth = SmoothingFunction().method1
b1=[sacrebleu.sentence_bleu(candidate, refs).score for candidate, refs in zip(m1, m2)]
for i in range(len(m4)):
    b2 = sentence_bleu(m4[i], m3[i], weights=(0.25, 0.25, 0.25, 0.25), smoothing_function=smooth)*100
    b1.append(b2)
print(sum(b1)/len(ds[1]['src']))

0.0
被翻译句子：  我只是在帮她。
翻译结果 I'm just doing her a favor.

被翻译句子：  I imagined myself in a courtroom at his trial, facing down the bearded man who has haunted my dreams over the last nine years.
翻译结果 我幻想自己在他审判时，面对着那张胡须浓密的脸，他一直在我梦里纠缠不休。

被翻译句子：  There's a good place nearby. 
翻译结果 附近有个好地方。

被翻译句子：  他是从外地来的货郎 
翻译结果 He's a local landlord

被翻译句子：  但是不要忘记，埃利森是一个创业者和梦想家，他缔造了一个公司、一种文化，事实上缔造了整个行业。
翻译结果 But don't forget, Elison is a creator and a dreamer, he created a corporation, a culture, in fact he created the whole industry.

时间 4.347065210342407
0.3682862200723244


## 9 模型推理

In [None]:
from transformers import pipeline

In [None]:
pipe=pipeline('text2text-generation',model=merge_model,tokenizer=tokenizer,device=0)

In [None]:
pipe('机器翻译:\n'+'我有一个苹果',max_length=30,do_sample=False)