# 1. 准备基础环境

## 1.1 升级Python SDK

In [None]:
# Upgrade the AWS SDKs. %pip (rather than !pip) installs into the environment
# of the running kernel, so the upgraded packages are the ones imported below.
%pip install --upgrade boto3 sagemaker

## 1.2 获取Runtime资源配置

In [None]:
import boto3
import sagemaker
from sagemaker import get_execution_role

# SageMaker session, execution role and default bucket used throughout the notebook.
sess = sagemaker.Session()
role = get_execution_role()
sagemaker_default_bucket = sess.default_bucket()

# Region and account id are read from the boto3 session wrapped by the SageMaker session.
boto_sess = sess.boto_session
region = boto_sess.region_name
caller_identity = boto_sess.client("sts").get_caller_identity()
account = caller_identity["Account"]

## 1.3 更新Docker镜像存储位置

避免后续执行、训练出现“No Space Left”错误,提前更新Docker镜像存储位置<br>
只需执行一次,避免引起Docker运行错误

In [None]:
%%bash
# Move Docker's data directory to /home/ec2-user/SageMaker and leave a symlink
# at the original location. The symlink check makes the cell safe to re-run:
# a second run would otherwise move the symlink itself into the relocated
# directory.
if [ -L /var/lib/docker ]; then
    echo "/var/lib/docker is already a symlink; skipping relocation."
else
    sudo systemctl stop docker
    sudo systemctl stop docker.socket
    sudo mv /var/lib/docker /home/ec2-user/SageMaker
    sudo ln -s /home/ec2-user/SageMaker/docker /var/lib/docker
    sudo systemctl start docker.socket
    sudo systemctl start docker
fi

# 2. 准备微调模型

## 2.1 克隆ChatGLM代码

In [None]:
%%script bash
# Fresh checkout of ChatGLM-6B pinned to a fixed commit.
# Abort on the first error: without this, a failed clone or `cd` would let
# `git checkout` run in the notebook's own working directory.
set -euo pipefail

rm -rf ChatGLM-6B
git clone https://github.com/THUDM/ChatGLM-6B.git
cd ChatGLM-6B
git checkout 163f94e160f08751545e3722730f1832d73b92d1


## 2.2 下载数据集

广告数据集:根据输入,输出广告词,如下所示:

{  
    "content": "类型#上衣*版型#宽松*版型#显瘦*图案#线条*衣样式#衬衫*衣袖型#泡泡袖*衣款式#抽绳",  
    "summary": "这件衬衫的款式非常的宽松，利落的线条可以很好的隐藏身材上的小缺点，穿在身上有着很好的显瘦效果。领口装饰了一个可爱的抽绳，漂亮的绳结展现出了十足的个性，配合时尚的泡泡袖型，尽显女性甜美可爱的气息。"  
}

In [None]:
# Download the ADGEN dataset. The URL is quoted because it contains `?`,
# which the shell would otherwise treat as a glob character.
!wget -O AdvertiseGen.tar.gz "https://cloud.tsinghua.edu.cn/f/b3f119a008264b1cabd1/?dl=1"

# Unpack the dataset archive.
!tar -xzvf AdvertiseGen.tar.gz

## 2.3 下载ChatGLM原始模型

In [None]:
# %pip installs into the environment of the running kernel.
%pip install huggingface_hub

In [None]:
from huggingface_hub import snapshot_download
from pathlib import Path


# Hugging Face Hub repository holding the original ChatGLM-6B weights.
model_name = "THUDM/chatglm-6b"

# Local directory used as the Hub cache for the snapshot.
local_cache_path = Path("./model")
local_cache_path.mkdir(exist_ok=True)

# Restrict the download to files matching these patterns.
allow_patterns = [
    "*.json",
    "*.pt",
    "*.bin",
    "*.model",
    "*.py",
]

model_download_path = snapshot_download(
    repo_id=model_name,
    cache_dir=local_cache_path,
    allow_patterns=allow_patterns,
)

In [None]:
# Get the model files path
import os
from glob import glob


def find_model_dir(search_root, marker="config.json"):
    """Locate the directory below ``search_root`` that contains ``marker``.

    Args:
        search_root: Directory tree to walk.
        marker: File name that identifies the model directory.

    Returns:
        The matching directory path ending with a path separator, or ``None``
        when no directory contains ``marker``. If several directories match,
        each one is printed and the last one visited is returned.
    """
    found = None
    for root, _dirs, files in os.walk(search_root):
        if marker in files:
            # Joining with "" appends the trailing separator; the `s5cmd sync`
            # cell later in the notebook is given this path as its source.
            found = os.path.join(root, "")
            print(found)
    return found


local_model_path = find_model_dir("./model")
if local_model_path is None:
    print("Model download may have failed, please check the previous step!")

## 2.4 将数据集和模型拷贝到S3

In [None]:
%%script env sagemaker_default_bucket=$sagemaker_default_bucket local_model_path=$local_model_path bash
# Upload the downloaded model and the dataset to S3, then remove local copies.
# Abort on the first failure so nothing is deleted unless both uploads
# succeeded.
set -euo pipefail

chmod +x ./s5cmd
./s5cmd sync "${local_model_path}" "s3://${sagemaker_default_bucket}/llm/models/chatglm/original-6B/"
./s5cmd sync ./AdvertiseGen/ "s3://${sagemaker_default_bucket}/llm/datasets/chatglm/AdvertiseGen/"

rm -rf model
# rm -rf AdvertiseGen
rm -rf AdvertiseGen.tar.gz

# 3. 开始微调模型

## 3.1 准备微调代码

In [None]:
%%writefile ChatGLM-6B/ptuning/arguments.py

from dataclasses import dataclass, field
from typing import Optional


@dataclass
class ModelArguments:
    """
    Arguments pertaining to which model/config/tokenizer we are going to fine-tune from.

    ``model_output_s3_path`` and ``model_name_or_path`` have no default and are
    therefore required; every other field is optional.
    """

    model_output_s3_path: str = field(
        metadata={"help": "Path to model saved in s3 path using s5cmd utility"}
    )

    model_name_or_path: str = field(
        metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"}
    )
    # Optional because the default is None (no checkpoint to load).
    ptuning_checkpoint: Optional[str] = field(
        default=None, metadata={"help": "Path to p-tuning v2 checkpoints"}
    )
    config_name: Optional[str] = field(
        default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"}
    )
    tokenizer_name: Optional[str] = field(
        default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
    )
    cache_dir: Optional[str] = field(
        default=None,
        metadata={"help": "Where to store the pretrained models downloaded from huggingface.co"},
    )
    use_fast_tokenizer: bool = field(
        default=True,
        metadata={"help": "Whether to use one of the fast tokenizer (backed by the tokenizers library) or not."},
    )
    model_revision: str = field(
        default="main",
        metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."},
    )
    use_auth_token: bool = field(
        default=False,
        metadata={
            "help": (
                "Will use the token generated when running `huggingface-cli login` (necessary to use this script "
                "with private models)."
            )
        },
    )
    resize_position_embeddings: Optional[bool] = field(
        default=None,
        metadata={
            "help": (
                "Whether to automatically resize the position embeddings if `max_source_length` exceeds "
                "the model's position embeddings."
            )
        },
    )
    # When set, main.py passes this value to model.quantize().
    quantization_bit: Optional[int] = field(
        default=None
    )
    # Copied into the model config by main.py; when set, main.py takes its
    # P-tuning v2 branch (half-precision model, float prefix encoder).
    pre_seq_len: Optional[int] = field(
        default=None
    )
    # Copied into the model config by main.py as `prefix_projection`.
    prefix_projection: bool = field(
        default=False
    )


@dataclass
class DataTrainingArguments:
    """
    Arguments pertaining to what data we are going to input our model for training and eval.

    At least one of ``dataset_name``, ``train_file``, ``validation_file`` or
    ``test_file`` must be given; see ``__post_init__`` for the validation rules.
    """

    lang: Optional[str] = field(default=None, metadata={"help": "Language id for summarization."})

    dataset_name: Optional[str] = field(
        default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."}
    )
    dataset_config_name: Optional[str] = field(
        default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."}
    )
    prompt_column: Optional[str] = field(
        default=None,
        metadata={"help": "The name of the column in the datasets containing the full texts (for summarization)."},
    )
    response_column: Optional[str] = field(
        default=None,
        metadata={"help": "The name of the column in the datasets containing the summaries (for summarization)."},
    )
    history_column: Optional[str] = field(
        default=None,
        metadata={"help": "The name of the column in the datasets containing the history of chat."},
    )
    train_file: Optional[str] = field(
        default=None, metadata={"help": "The input training data file (a jsonlines or csv file)."}
    )
    validation_file: Optional[str] = field(
        default=None,
        metadata={
            "help": (
                "An optional input evaluation data file to evaluate the metrics (rouge) on (a jsonlines or csv file)."
            )
        },
    )
    test_file: Optional[str] = field(
        default=None,
        metadata={
            "help": "An optional input test data file to evaluate the metrics (rouge) on (a jsonlines or csv file)."
        },
    )
    overwrite_cache: bool = field(
        default=False, metadata={"help": "Overwrite the cached training and evaluation sets"}
    )
    preprocessing_num_workers: Optional[int] = field(
        default=None,
        metadata={"help": "The number of processes to use for the preprocessing."},
    )
    max_source_length: Optional[int] = field(
        default=1024,
        metadata={
            "help": (
                "The maximum total input sequence length after tokenization. Sequences longer "
                "than this will be truncated, sequences shorter will be padded."
            )
        },
    )
    max_target_length: Optional[int] = field(
        default=128,
        metadata={
            "help": (
                "The maximum total sequence length for target text after tokenization. Sequences longer "
                "than this will be truncated, sequences shorter will be padded."
            )
        },
    )
    val_max_target_length: Optional[int] = field(
        default=None,
        metadata={
            "help": (
                "The maximum total sequence length for validation target text after tokenization. Sequences longer "
                "than this will be truncated, sequences shorter will be padded. Will default to `max_target_length`."
                "This argument is also used to override the ``max_length`` param of ``model.generate``, which is used "
                "during ``evaluate`` and ``predict``."
            )
        },
    )
    pad_to_max_length: bool = field(
        default=False,
        metadata={
            "help": (
                "Whether to pad all samples to model maximum sentence length. "
                "If False, will pad the samples dynamically when batching to the maximum length in the batch. More "
                "efficient on GPU but very bad for TPU."
            )
        },
    )
    max_train_samples: Optional[int] = field(
        default=None,
        metadata={
            "help": (
                "For debugging purposes or quicker training, truncate the number of training examples to this "
                "value if set."
            )
        },
    )
    max_eval_samples: Optional[int] = field(
        default=None,
        metadata={
            "help": (
                "For debugging purposes or quicker training, truncate the number of evaluation examples to this "
                "value if set."
            )
        },
    )
    max_predict_samples: Optional[int] = field(
        default=None,
        metadata={
            "help": (
                "For debugging purposes or quicker training, truncate the number of prediction examples to this "
                "value if set."
            )
        },
    )
    num_beams: Optional[int] = field(
        default=None,
        metadata={
            "help": (
                "Number of beams to use for evaluation. This argument will be passed to ``model.generate``, "
                "which is used during ``evaluate`` and ``predict``."
            )
        },
    )
    ignore_pad_token_for_loss: bool = field(
        default=True,
        metadata={
            "help": "Whether to ignore the tokens corresponding to padded labels in the loss computation or not."
        },
    )
    source_prefix: Optional[str] = field(
        default="", metadata={"help": "A prefix to add before every source text (useful for T5 models)."}
    )

    forced_bos_token: Optional[str] = field(
        default=None,
        metadata={
            "help": (
                "The token to force as the first generated token after the decoder_start_token_id."
                "Useful for multilingual models like mBART where the first generated token"
                "needs to be the target language token (Usually it is the target language token)"
            )
        },
    )

    def __post_init__(self):
        """Validate the data sources and fill in ``val_max_target_length``.

        Raises:
            ValueError: if neither a dataset name nor any data file is given.
            AssertionError: if a supplied train/validation/test file does not
                end in ``csv`` or ``json``.
        """
        sources = (self.dataset_name, self.train_file, self.validation_file, self.test_file)
        if all(source is None for source in sources):
            raise ValueError("Need either a dataset name or a training/validation/test file.")

        # main.py uses the file extension as the dataset loader name, so every
        # supplied file (test_file included) must be csv or json.
        for arg_name in ("train_file", "validation_file", "test_file"):
            path = getattr(self, arg_name)
            if path is not None:
                extension = path.split(".")[-1]
                assert extension in ["csv", "json"], f"`{arg_name}` should be a csv or a json file."

        if self.val_max_target_length is None:
            self.val_max_target_length = self.max_target_length


In [None]:
%%writefile ChatGLM-6B/ptuning/sm_ptune_train.sh
#!/bin/bash
# P-tuning v2 launcher: fine-tunes ChatGLM-6B on the AdvertiseGen dataset.
# The bucket placeholder inside the two s3:// URLs below is replaced with the
# real bucket name by the `sed` cell that follows this one in the notebook.

# Exported so that the python3 child process receives the allocator setting;
# a plain assignment would stay local to this shell.
export PYTORCH_CUDA_ALLOC_CONF='max_split_size_mb:32'

TRAIN_DATASET='/opt/ml/input/data/AdvertiseGen/train.json'
TEST_DATASET='/opt/ml/input/data/AdvertiseGen/dev.json'
RESPONSE_COLUMN='summary'
PROMPT_COLUMN='content'
MODEL_NAME_OR_PATH='s3://sagemaker_default_bucket/llm/models/chatglm/original-6B/'
OUTPUT_DIR='/opt/ml/model/adgen-chatglm-6b-ft'
MODEL_OUTPUT_S3_PATH='s3://sagemaker_default_bucket/llm/models/chatglm/finetune-ptuning-adgen-notebook/'
TRAIN_STEPS=50
PRE_SEQ_LEN=128
LR=2e-2

# CUDA_VISIBLE_DEVICES is set for this one command only.
CUDA_VISIBLE_DEVICES=0 python3 main.py \
    --do_train \
    --train_file "${TRAIN_DATASET}" \
    --validation_file "${TEST_DATASET}" \
    --prompt_column "${PROMPT_COLUMN}" \
    --response_column "${RESPONSE_COLUMN}" \
    --overwrite_cache \
    --model_name_or_path "${MODEL_NAME_OR_PATH}" \
    --model_output_s3_path "${MODEL_OUTPUT_S3_PATH}" \
    --output_dir "${OUTPUT_DIR}" \
    --overwrite_output_dir \
    --max_source_length 64 \
    --max_target_length 64 \
    --per_device_train_batch_size 4 \
    --per_device_eval_batch_size 1 \
    --gradient_accumulation_steps 4 \
    --predict_with_generate \
    --max_steps "${TRAIN_STEPS}" \
    --logging_steps 10 \
    --save_steps "${TRAIN_STEPS}" \
    --learning_rate "${LR}" \
    --pre_seq_len "${PRE_SEQ_LEN}" \
    --quantization_bit 4


In [None]:
%%script env sagemaker_default_bucket=$sagemaker_default_bucket bash

# Show the bucket name, then write it over the placeholder in the training launcher.
printf '%s\n' "${sagemaker_default_bucket}"
sed -i "s/sagemaker_default_bucket/$sagemaker_default_bucket/g" ./ChatGLM-6B/ptuning/sm_ptune_train.sh

In [None]:
%%writefile ChatGLM-6B/ptuning/main.py

#!/usr/bin/env python
# coding=utf-8
# Copyright 2021 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Fine-tuning the library models for sequence to sequence.
"""
# You can also adapt this script on your own sequence to sequence task. Pointers for this are left as comments.

import logging
import os
import sys
import json

import numpy as np
from datasets import load_dataset
import jieba 
from rouge_chinese import Rouge
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
import torch

import transformers
from transformers import (
    AutoConfig,
    AutoModel,
    AutoTokenizer,
    DataCollatorForSeq2Seq,
    HfArgumentParser,
    Seq2SeqTrainingArguments,
    set_seed,
)
from trainer_seq2seq import Seq2SeqTrainer

from arguments import ModelArguments, DataTrainingArguments

logger = logging.getLogger(__name__)

def _run_command(args):
    """Run an external command without a shell and return its exit status.

    Args:
        args: Sequence of strings: the program followed by its arguments.

    Returns:
        int: the process return code (0 means success).
    """
    import subprocess  # local import: the module-level import block is left untouched

    return subprocess.run(args).returncode


def main():
    """Fine-tune, evaluate and/or predict with ChatGLM-6B as selected by the CLI flags.

    Parses ``ModelArguments``, ``DataTrainingArguments`` and
    ``Seq2SeqTrainingArguments`` from ``sys.argv`` (or from a single ``.json``
    file), loads the csv/json datasets, fetches the base model from S3 with
    ``s5cmd`` when ``model_name_or_path`` is an ``s3://`` URL, builds the
    trainer and runs the requested train / eval / predict stages. After
    training, the output directory is synced to ``model_output_s3_path``.

    Returns:
        dict: ``results``, which this function never populates; ``None`` when
        none of ``do_train`` / ``do_eval`` / ``do_predict`` was requested.

    Raises:
        ValueError: if no data file is supplied, or a requested stage has no
            matching dataset split.
        RuntimeError: if fetching the base model from S3 fails.
    """

    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, Seq2SeqTrainingArguments))
    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        handlers=[logging.StreamHandler(sys.stdout)],
    )

    if training_args.should_log:
        # The default of training_args.log_level is passive, so we set log level at info here to have that default.
        transformers.utils.logging.set_verbosity_info()

    log_level = training_args.get_process_log_level()
    logger.setLevel(log_level)
    # datasets.utils.logging.set_verbosity(log_level)
    transformers.utils.logging.set_verbosity(log_level)
    transformers.utils.logging.enable_default_handler()
    transformers.utils.logging.enable_explicit_format()

    # Log on each process the small summary:
    logger.warning(
        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
        + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
    )
    logger.info(f"Training/evaluation parameters {training_args}")

    # Set seed before initializing model.
    set_seed(training_args.seed)

    # Load dataset
    data_files = {}
    extension = None
    if data_args.train_file is not None:
        data_files["train"] = data_args.train_file
        extension = data_args.train_file.split(".")[-1]
    if data_args.validation_file is not None:
        data_files["validation"] = data_args.validation_file
        extension = data_args.validation_file.split(".")[-1]
    if data_args.test_file is not None:
        data_files["test"] = data_args.test_file
        extension = data_args.test_file.split(".")[-1]

    # The file extension doubles as the dataset loader name, so at least one
    # data file is required; fail with a clear message instead of a NameError.
    if extension is None:
        raise ValueError("Need a training/validation/test file to infer the dataset format.")

    raw_datasets = load_dataset(
        extension,
        data_files=data_files,
        cache_dir=model_args.cache_dir,
        use_auth_token=True if model_args.use_auth_token else None,
    )

    # If the model lives in S3, use s5cmd to download it to /tmp/orignal/ before loading.
    # Test the URL scheme rather than a substring, so local paths or Hub ids
    # that merely contain "s3" are not mistaken for S3 locations.
    if model_args.model_name_or_path.startswith("s3://"):
        local_model_dir = "/tmp/orignal/"
        download_steps = [
            ["cp", "./s5cmd", "/tmp/"],
            ["chmod", "+x", "/tmp/s5cmd"],
            ["/tmp/s5cmd", "sync", model_args.model_name_or_path + "*", local_model_dir],
        ]
        for step in download_steps:
            status = _run_command(step)
            if status != 0:
                # Loading from a partial download would only fail later with a
                # less helpful error, so stop here.
                raise RuntimeError(
                    "Command {} failed with exit status {}".format(" ".join(step), status)
                )
        model_args.model_name_or_path = local_model_dir

    # Load pretrained model and tokenizer
    config = AutoConfig.from_pretrained(model_args.model_name_or_path, trust_remote_code=True)
    config.pre_seq_len = model_args.pre_seq_len
    config.prefix_projection = model_args.prefix_projection

    tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path, trust_remote_code=True)

    if model_args.ptuning_checkpoint is not None:
        # Evaluation
        # Loading extra state dict of prefix encoder
        model = AutoModel.from_pretrained(model_args.model_name_or_path, config=config, trust_remote_code=True)
        prefix_state_dict = torch.load(os.path.join(model_args.ptuning_checkpoint, "pytorch_model.bin"))
        new_prefix_state_dict = {}
        for k, v in prefix_state_dict.items():
            if k.startswith("transformer.prefix_encoder."):
                new_prefix_state_dict[k[len("transformer.prefix_encoder."):]] = v
        model.transformer.prefix_encoder.load_state_dict(new_prefix_state_dict)
    else:
        model = AutoModel.from_pretrained(model_args.model_name_or_path, config=config, trust_remote_code=True)

    if model_args.quantization_bit is not None:
        print(f"Quantized to {model_args.quantization_bit} bit")
        model = model.quantize(model_args.quantization_bit)
    if model_args.pre_seq_len is not None:
        # P-tuning v2
        model = model.half()
        model.transformer.prefix_encoder.float()
    else:
        # Finetune
        model = model.float()

    prefix = data_args.source_prefix if data_args.source_prefix is not None else ""

    # Preprocessing the datasets.
    # We need to tokenize inputs and targets.
    if training_args.do_train:
        column_names = raw_datasets["train"].column_names
    elif training_args.do_eval:
        column_names = raw_datasets["validation"].column_names
    elif training_args.do_predict:
        column_names = raw_datasets["test"].column_names
    else:
        logger.info("There is nothing to do. Please pass `do_train`, `do_eval` and/or `do_predict`.")
        return

    # Get the column names for input/target.
    prompt_column = data_args.prompt_column
    response_column = data_args.response_column
    history_column = data_args.history_column

    # Temporarily set max_target_length for training.
    max_target_length = data_args.max_target_length

    def preprocess_function_eval(examples):
        """Tokenize prompts and targets separately for evaluation / prediction."""
        inputs, targets = [], []
        for i in range(len(examples[prompt_column])):
            if examples[prompt_column][i] and examples[response_column][i]:
                query = examples[prompt_column][i]
                if history_column is None or len(examples[history_column][i]) == 0:
                    prompt = query
                else:
                    prompt = ""
                    history = examples[history_column][i]
                    for turn_idx, (old_query, response) in enumerate(history):
                        prompt += "[Round {}]\n问：{}\n答：{}\n".format(turn_idx, old_query, response)
                    prompt += "[Round {}]\n问：{}\n答：".format(len(history), query)
                inputs.append(prompt)
                targets.append(examples[response_column][i])

        inputs = [prefix + inp for inp in inputs]
        model_inputs = tokenizer(inputs, max_length=data_args.max_source_length, truncation=True, padding=True)
        labels = tokenizer(text_target=targets, max_length=max_target_length, truncation=True)

        if data_args.ignore_pad_token_for_loss:
            labels["input_ids"] = [
                [(l if l != tokenizer.pad_token_id else -100) for l in label] for label in labels["input_ids"]
            ]
        model_inputs["labels"] = labels["input_ids"]

        return model_inputs

    def preprocess_function_train(examples):
        """Build fixed-length ``input_ids`` / ``labels`` pairs (prompt + answer) for training."""
        max_seq_length = data_args.max_source_length + data_args.max_target_length

        model_inputs = {
            "input_ids": [],
            "labels": [],
        }
        for i in range(len(examples[prompt_column])):
            if examples[prompt_column][i] and examples[response_column][i]:
                query, answer = examples[prompt_column][i], examples[response_column][i]

                if history_column is None:
                    prompt = query
                else:
                    prompt = ""
                    history = examples[history_column][i]
                    for turn_idx, (old_query, response) in enumerate(history):
                        prompt += "[Round {}]\n问：{}\n答：{}\n".format(turn_idx, old_query, response)
                    prompt += "[Round {}]\n问：{}\n答：".format(len(history), query)

                prompt = prefix + prompt
                a_ids = tokenizer.encode(text=prompt, add_special_tokens=False)
                b_ids = tokenizer.encode(text=answer, add_special_tokens=False)

                if len(a_ids) > data_args.max_source_length - 1:
                    a_ids = a_ids[: data_args.max_source_length - 1]

                if len(b_ids) > data_args.max_target_length - 2:
                    b_ids = b_ids[: data_args.max_target_length - 2]

                input_ids = tokenizer.build_inputs_with_special_tokens(a_ids, b_ids)

                # Everything before the BOS token is masked out of the labels with -100.
                context_length = input_ids.index(tokenizer.bos_token_id)
                mask_position = context_length - 1
                labels = [-100] * context_length + input_ids[mask_position+1:]

                pad_len = max_seq_length - len(input_ids)
                input_ids = input_ids + [tokenizer.pad_token_id] * pad_len
                labels = labels + [tokenizer.pad_token_id] * pad_len
                if data_args.ignore_pad_token_for_loss:
                    labels = [(l if l != tokenizer.pad_token_id else -100) for l in labels]

                model_inputs["input_ids"].append(input_ids)
                model_inputs["labels"].append(labels)

        return model_inputs

    def print_dataset_example(example):
        """Print one processed example as raw ids and as decoded text."""
        print("input_ids",example["input_ids"])
        print("inputs", tokenizer.decode(example["input_ids"]))
        print("label_ids", example["labels"])
        print("labels", tokenizer.decode(example["labels"]))

    if training_args.do_train:
        if "train" not in raw_datasets:
            raise ValueError("--do_train requires a train dataset")
        train_dataset = raw_datasets["train"]
        if data_args.max_train_samples is not None:
            max_train_samples = min(len(train_dataset), data_args.max_train_samples)
            train_dataset = train_dataset.select(range(max_train_samples))
        with training_args.main_process_first(desc="train dataset map pre-processing"):
            train_dataset = train_dataset.map(
                preprocess_function_train,
                batched=True,
                num_proc=data_args.preprocessing_num_workers,
                remove_columns=column_names,
                load_from_cache_file=not data_args.overwrite_cache,
                desc="Running tokenizer on train dataset",
            )
        print_dataset_example(train_dataset[0])

    if training_args.do_eval:
        max_target_length = data_args.val_max_target_length
        if "validation" not in raw_datasets:
            raise ValueError("--do_eval requires a validation dataset")
        eval_dataset = raw_datasets["validation"]
        if data_args.max_eval_samples is not None:
            max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples)
            eval_dataset = eval_dataset.select(range(max_eval_samples))
        with training_args.main_process_first(desc="validation dataset map pre-processing"):
            eval_dataset = eval_dataset.map(
                preprocess_function_eval,
                batched=True,
                num_proc=data_args.preprocessing_num_workers,
                remove_columns=column_names,
                load_from_cache_file=not data_args.overwrite_cache,
                desc="Running tokenizer on validation dataset",
            )
        print_dataset_example(eval_dataset[0])

    if training_args.do_predict:
        max_target_length = data_args.val_max_target_length
        if "test" not in raw_datasets:
            raise ValueError("--do_predict requires a test dataset")
        predict_dataset = raw_datasets["test"]
        if data_args.max_predict_samples is not None:
            max_predict_samples = min(len(predict_dataset), data_args.max_predict_samples)
            predict_dataset = predict_dataset.select(range(max_predict_samples))
        with training_args.main_process_first(desc="prediction dataset map pre-processing"):
            predict_dataset = predict_dataset.map(
                preprocess_function_eval,
                batched=True,
                num_proc=data_args.preprocessing_num_workers,
                remove_columns=column_names,
                load_from_cache_file=not data_args.overwrite_cache,
                desc="Running tokenizer on prediction dataset",
            )
        print_dataset_example(predict_dataset[0])

    # Data collator
    label_pad_token_id = -100 if data_args.ignore_pad_token_for_loss else tokenizer.pad_token_id
    data_collator = DataCollatorForSeq2Seq(
        tokenizer,
        model=model,
        label_pad_token_id=label_pad_token_id,
        pad_to_multiple_of=None,
        padding=False
    )

    # Metric
    def compute_metrics(eval_preds):
        """Return mean ROUGE-1/2/L F-scores and BLEU-4 (all scaled by 100) over the batch."""
        preds, labels = eval_preds
        if isinstance(preds, tuple):
            preds = preds[0]
        decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
        if data_args.ignore_pad_token_for_loss:
            # Replace -100 in the labels as we can't decode them.
            labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
        decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

        score_dict = {
            "rouge-1": [],
            "rouge-2": [],
            "rouge-l": [],
            "bleu-4": []
        }
        for pred, label in zip(decoded_preds, decoded_labels):
            hypothesis = list(jieba.cut(pred))
            reference = list(jieba.cut(label))
            rouge = Rouge()
            scores = rouge.get_scores(' '.join(hypothesis) , ' '.join(reference))
            result = scores[0]

            for k, v in result.items():
                score_dict[k].append(round(v["f"] * 100, 4))
            bleu_score = sentence_bleu([list(label)], list(pred), smoothing_function=SmoothingFunction().method3)
            score_dict["bleu-4"].append(round(bleu_score * 100, 4))

        for k, v in score_dict.items():
            score_dict[k] = float(np.mean(v))
        return score_dict

    # Override the decoding parameters of Seq2SeqTrainer
    training_args.generation_max_length = (
        training_args.generation_max_length
        if training_args.generation_max_length is not None
        else data_args.val_max_target_length
    )
    training_args.generation_num_beams = (
        data_args.num_beams if data_args.num_beams is not None else training_args.generation_num_beams
    )
    # Initialize our Trainer
    trainer = Seq2SeqTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset if training_args.do_train else None,
        eval_dataset=eval_dataset if training_args.do_eval else None,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics if training_args.predict_with_generate else None,
        save_prefixencoder=model_args.pre_seq_len is not None
    )

    # Training
    if training_args.do_train:
        checkpoint = None
        if training_args.resume_from_checkpoint is not None:
            checkpoint = training_args.resume_from_checkpoint
        # elif last_checkpoint is not None:
        #     checkpoint = last_checkpoint
        model.gradient_checkpointing_enable()
        model.enable_input_require_grads()
        train_result = trainer.train(resume_from_checkpoint=checkpoint)
        # trainer.save_model()  # Saves the tokenizer too for easy upload

        metrics = train_result.metrics
        max_train_samples = (
            data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset)
        )
        metrics["train_samples"] = min(max_train_samples, len(train_dataset))

        trainer.log_metrics("train", metrics)
        trainer.save_metrics("train", metrics)
        trainer.save_state()

        print("------saving model!-----")

        # save_model_dir = os.environ['OUTPUT_DIR']
        save_model_dir = training_args.output_dir
        tokenizer.save_pretrained(save_model_dir)
        trainer.save_model(save_model_dir)
        print("save_model_dir : {}".format(save_model_dir))
        print("------model is saved!-----")

        # Upload the output directory to S3. A failure is logged rather than
        # raised so that any requested eval/predict stage still runs.
        upload_status = _run_command(["./s5cmd", "sync", save_model_dir, model_args.model_output_s3_path])
        if upload_status != 0:
            logger.error(
                "Uploading %s to %s failed with exit status %s",
                save_model_dir,
                model_args.model_output_s3_path,
                upload_status,
            )

        # os.system("./s5cmd sync {0} {1}".format(save_model_dir, os.environ['MODEL_OUTPUT_S3_PATH']))

    # Evaluation
    results = {}
    max_seq_length = data_args.max_source_length + data_args.max_target_length + 1
    if training_args.do_eval:
        logger.info("*** Evaluate ***")
        metrics = trainer.evaluate(metric_key_prefix="eval", do_sample=True, top_p=0.7, max_length=max_seq_length, temperature=0.95)
        max_eval_samples = data_args.max_eval_samples if data_args.max_eval_samples is not None else len(eval_dataset)
        metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset))

        trainer.log_metrics("eval", metrics)
        trainer.save_metrics("eval", metrics)

    if training_args.do_predict:
        logger.info("*** Predict ***")
        predict_results = trainer.predict(predict_dataset, metric_key_prefix="predict", max_length=max_seq_length, do_sample=True, top_p=0.7, temperature=0.95)
        metrics = predict_results.metrics
        max_predict_samples = (
            data_args.max_predict_samples if data_args.max_predict_samples is not None else len(predict_dataset)
        )
        metrics["predict_samples"] = min(max_predict_samples, len(predict_dataset))

        trainer.log_metrics("predict", metrics)
        trainer.save_metrics("predict", metrics)

        if trainer.is_world_process_zero():
            if training_args.predict_with_generate:
                predictions = tokenizer.batch_decode(
                    predict_results.predictions, skip_special_tokens=True, clean_up_tokenization_spaces=True
                )
                predictions = [pred.strip() for pred in predictions]
                labels = tokenizer.batch_decode(
                    predict_results.label_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True
                )
                labels = [label.strip() for label in labels]
                # One JSON object per line: the reference label and the generated prediction.
                output_prediction_file = os.path.join(training_args.output_dir, "generated_predictions.txt")
                with open(output_prediction_file, "w", encoding="utf-8") as writer:
                    for p, l in zip(predictions, labels):
                        res = json.dumps({"labels": l, "predict": p}, ensure_ascii=False)
                        writer.write(f"{res}\n")
    return results


def _mp_fn(index):
    """Entry point used when the script is launched through ``xla_spawn`` (TPUs).

    Args:
        index: Positional argument supplied by the launcher (presumably the
            process index); it is not used here.
    """
    # For xla_spawn (TPUs)
    main()


# Invoke main() when this file is executed directly as a script.
if __name__ == "__main__":
    main()


In [None]:
%%writefile ChatGLM-6B/ptuning/requirements.txt

# Python dependencies for the code under ChatGLM-6B/ptuning.
protobuf
# Alternative: install transformers from a pinned upstream commit instead of the release below.
#git+https://github.com/huggingface/transformers.git@68d640f7c368bcaaaecfc678f11908ebbd3d6176
transformers==4.28.0
cpm_kernels
torch>=1.10
gradio
mdtex2html
sentencepiece
accelerate
datasets
# NOTE(review): the Hugging Face Hub client is published as "huggingface_hub";
# confirm that the "huggingface" distribution below is really what is wanted.
huggingface
jieba
rouge_chinese
nltk
deepspeed==0.8.3

In [None]:
# Mark the s5cmd file in the working directory as executable and copy it into
# the ptuning source directory.
! chmod +x s5cmd
! cp s5cmd ChatGLM-6B/ptuning

## 3.2 准备微调镜像

In [None]:
!chmod +x build_push.sh

In [None]:
! ./build_push.sh

## 3.3 训练微调模型

In [None]:
# Run /opt/ml/code/sm_ptune_train.sh inside the chatglm-finetune-ptuning:latest
# image, using the NVIDIA container runtime.
! docker run --runtime=nvidia -ti chatglm-finetune-ptuning:latest /bin/bash -c /opt/ml/code/sm_ptune_train.sh

# 4. 部署模型


In [None]:
! pip install -r ./code/requirements.txt

## 4.1 部署原始模型

In [None]:
import sys
import json
import traceback
from transformers import AutoTokenizer, AutoModel

# Hub identifier of the base (not fine-tuned) ChatGLM-6B checkpoint.
original_model_id = "THUDM/chatglm-6b"

tokenizer_original = AutoTokenizer.from_pretrained(original_model_id, trust_remote_code=True)

# Load the weights, then convert to half precision and move to the GPU.
model_original = AutoModel.from_pretrained(original_model_id, trust_remote_code=True)
model_original = model_original.half().cuda()

In [None]:
# ADGEN-style prompt: "key#value" attributes joined by plain "*" separators
# (no backslashes, which would otherwise end up verbatim in the model input).
text = "类型#上衣*材质#牛仔布*颜色#白色*风格#简约*图案#刺绣*衣样式#外套*衣款式#破洞"

# Query the original model loaded in this section (model_original / tokenizer_original).
response_original, history = model_original.chat(tokenizer_original, text, history=[])

print(response_original)

In [None]:
# Simple greeting check against the original model; `model` and `tokenizer`
# are only created later (section 4.2), so use the *_original objects here.
text = "你好"
response, history = model_original.chat(tokenizer_original, text, history=[])

print(response)

## 4.2 部署微调模型

In [None]:
# Common S3 prefix under the SageMaker default bucket for all ChatGLM artifacts.
chatglm_s3_prefix = f"s3://{sagemaker_default_bucket}/llm/models/chatglm"

# Original model location and the fine-tuned prefix-encoder checkpoint.
model_name_or_path = f"{chatglm_s3_prefix}/original-6B/"
finetune_model_name_or_path = (
    f"{chatglm_s3_prefix}/finetune-ptuning-adgen-notebook"
    "/adgen-chatglm-6b-ft/checkpoint-50/pytorch_model.bin"
)

In [None]:
import os
import subprocess

# (S3 source pattern, local destination directory) pairs to download.
sync_jobs = [
    (model_name_or_path + "*", "./models/chatglm-orignal/"),
    (finetune_model_name_or_path + "*", "./models/chatglm-finetune/"),
]

# check=True raises CalledProcessError on a non-zero exit status, so a failed
# download stops the notebook here instead of surfacing later as a confusing
# "file not found" when the model is loaded.
subprocess.run(["chmod", "+x", "s5cmd"], check=True)
for s3_source, local_dir in sync_jobs:
    subprocess.run(["./s5cmd", "sync", s3_source, local_dir], check=True)



In [None]:
import sys
import json
import traceback
import torch
from transformers import AutoTokenizer, AutoModel, AutoConfig

# Prefix length passed to the model config; must match the value used for training.
pre_seq_len=128
model_name_or_path = "./models/chatglm-orignal/"
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
config = AutoConfig.from_pretrained(model_name_or_path, trust_remote_code=True, pre_seq_len=pre_seq_len)
model = AutoModel.from_pretrained(model_name_or_path, config=config, trust_remote_code=True)

prefix_state_dict = torch.load("./models/chatglm-finetune/pytorch_model.bin")

# Keep only the prefix-encoder weights and strip their module prefix so the
# keys match model.transformer.prefix_encoder. Entries that do not carry the
# prefix are skipped rather than having their names truncated.
prefix_key = "transformer.prefix_encoder."
new_prefix_state_dict = {
    k[len(prefix_key):]: v
    for k, v in prefix_state_dict.items()
    if k.startswith(prefix_key)
}
model.transformer.prefix_encoder.load_state_dict(new_prefix_state_dict)

model = model.quantize(4)
# Assign the result so the cell does not print the full module repr.
model = model.half().cuda()

In [None]:
# ADGEN-style prompt: "key#value" attributes joined by plain "*" separators
# (no backslashes, which would otherwise end up verbatim in the model input).
text = "类型#上衣*材质#牛仔布*颜色#白色*风格#简约*图案#刺绣*衣样式#外套*衣款式#破洞"
response, history = model.chat(tokenizer, text, history=[])

print(response)

In [None]:
import os

# Shell "export" lines are not valid in a Python cell; set the same variables
# through os.environ so later "!" commands started from this kernel inherit them.
# The bucket comes from sagemaker_default_bucket instead of a hardcoded,
# account-specific bucket name.
chatglm_s3_root = f"s3://{sagemaker_default_bucket}/llm/models/chatglm"

os.environ.update({
    "PYTORCH_CUDA_ALLOC_CONF": "max_split_size_mb:32",
    "TRAIN_DATASET": "/opt/ml/input/data/AdvertiseGen/train.json",
    "TEST_DATASET": "/opt/ml/input/data/AdvertiseGen/dev.json",
    "PROMPT_COLUMN": "content",
    "RESPONSE_COLUMN": "summary",
    "MODEL_NAME_OR_PATH": f"{chatglm_s3_root}/original-6B/",
    "OUTPUT_DIR": "/opt/ml/model/adgen-chatglm-6b-ft",
    "MODEL_OUTPUT_S3_PATH": f"{chatglm_s3_root}/finetune-ptuning-adgen-notebook/",
    "TRAIN_STEPS": "50",
})

In [None]:
from pathlib import Path

# %%writefile accepts a single filename and does not substitute Python
# variables in the cell body, so the script is rendered from Python instead.
# The bucket name is filled in when the script is written, which lets the
# script run unchanged inside a container that knows nothing about this notebook.
# Raw f-string: "\" line continuations are kept verbatim, "{{" / "}}" yield
# literal shell braces, and only {sagemaker_default_bucket} is substituted.
train_script = rf"""#!/bin/bash

# Exported so that the python process actually receives it.
export PYTORCH_CUDA_ALLOC_CONF='max_split_size_mb:32'
TRAIN_DATASET='/opt/ml/input/data/AdvertiseGen/train.json'
TEST_DATASET='/opt/ml/input/data/AdvertiseGen/dev.json'
RESPONSE_COLUMN='summary'
PROMPT_COLUMN='content'
MODEL_NAME_OR_PATH='s3://{sagemaker_default_bucket}/llm/models/chatglm/original-6B/'
OUTPUT_DIR='/opt/ml/model/adgen-chatglm-6b-ft'
MODEL_OUTPUT_S3_PATH='s3://{sagemaker_default_bucket}/llm/models/chatglm/finetune-ptuning-adgen-notebook/'
# Use TRAIN_STEPS from the environment when provided, otherwise 50 steps.
TRAIN_STEPS="${{TRAIN_STEPS:-50}}"


PRE_SEQ_LEN=128 && LR=2e-2 && CUDA_VISIBLE_DEVICES=0 python3 main.py \
    --do_train \
    --train_file ${{TRAIN_DATASET}} \
    --validation_file ${{TEST_DATASET}} \
    --prompt_column ${{PROMPT_COLUMN}} \
    --response_column ${{RESPONSE_COLUMN}}  \
    --overwrite_cache \
    --model_name_or_path ${{MODEL_NAME_OR_PATH}} \
    --model_output_s3_path ${{MODEL_OUTPUT_S3_PATH}} \
    --output_dir ${{OUTPUT_DIR}} \
    --overwrite_output_dir \
    --max_source_length 64 \
    --max_target_length 64 \
    --per_device_train_batch_size 4 \
    --per_device_eval_batch_size 1 \
    --gradient_accumulation_steps 4 \
    --predict_with_generate \
    --max_steps ${{TRAIN_STEPS}} \
    --logging_steps 10 \
    --save_steps ${{TRAIN_STEPS}} \
    --learning_rate ${{LR}} \
    --pre_seq_len ${{PRE_SEQ_LEN}} \
    --quantization_bit 4
"""

script_path = Path("ChatGLM-6B/ptuning/sm_ptune_train.sh")
script_path.write_text(train_script, encoding="utf-8")
# The script is invoked directly by path, so it must be executable.
script_path.chmod(0o755)
print(f"Wrote {script_path}")


In [None]:
! pip3 install -r ./code/requirements.txt

In [None]:
import sys
import json
import traceback
from transformers import AutoTokenizer, AutoModel

In [None]:
tokenizer = AutoTokenizer.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True)


In [None]:
model = AutoModel.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True).half().cuda()

In [None]:
# Generate ad copy for an ADGEN-style prompt ("key#value" attributes joined by
# plain "*" separators; no backslashes, which would end up in the model input).
response, history = model.chat(tokenizer, "类型#上衣*材质#牛仔布*颜色#白色*风格#简约*图案#刺绣*衣样式#外套*衣款式#破洞", history=[])

In [None]:
response

## 3.2 定义微调参数

In [None]:
# Training job name, instance settings and container environment.
import time
from sagemaker.huggingface import HuggingFace

launch_stamp = time.strftime("%Y-%m-%d-%H-%M-%S", time.localtime())
job_name = f"huggingface-chatglm-finetune-ptuning-{launch_stamp}"

instance_type = "ml.g4dn.2xlarge"
instance_count = 1

llm_s3_root = f"s3://{sagemaker_default_bucket}/llm"

# Location of the base model.
# (Alternative kept from the original cell: the Hub id 'THUDM/chatglm-6b'.)
model_name_or_path = f"{llm_s3_root}/models/chatglm/original-6B/"

# Where the fine-tuned model is written: local output directory and S3 prefix.
output_dir = "/opt/ml/model/adgen-chatglm-6b-ft"
model_s3_path = f"{llm_s3_root}/models/chatglm/finetune-ptuning-adgen/"

# Environment variables handed to the training job.
environment = dict(
    PYTORCH_CUDA_ALLOC_CONF="max_split_size_mb:32",
    TRAIN_DATASET="/opt/ml/input/data/AdvertiseGen/train.json",
    TEST_DATASET="/opt/ml/input/data/AdvertiseGen/dev.json",
    PROMPT_COLUMN="content",
    RESPONSE_COLUMN="summary",
    MODEL_NAME_OR_PATH=model_name_or_path,
    OUTPUT_DIR=output_dir,
    MODEL_OUTPUT_S3_PATH=model_s3_path,
    TRAIN_STEPS="50",
)

# Input channel name -> S3 prefix holding the dataset.
inputs = {"AdvertiseGen": f"{llm_s3_root}/datasets/chatglm/AdvertiseGen/"}

In [None]:
environment

## 3.3 启动微调训练

In [None]:
# Create the SageMaker HuggingFace estimator for the fine-tuning job.
# It takes its instance settings, IAM role, job-name prefix and environment
# variables from the variables defined in the previous cells.
# NOTE(review): entry_point names 'sm_ptune_train.py', while another cell in this
# notebook writes 'sm_ptune_train.sh' — confirm the .py file exists in source_dir.
huggingface_estimator = HuggingFace(
    entry_point          = 'sm_ptune_train.py',
    source_dir           = './ChatGLM-6B/ptuning',
    instance_type        = instance_type,
    instance_count       = instance_count,
    base_job_name        = job_name,
    role                 = role,
    script_mode          = True,
    transformers_version = '4.26',
    pytorch_version      = '1.13',
    py_version           = 'py39',
    environment          = environment
)

In [None]:
huggingface_estimator.fit(inputs=inputs)

# 4. 模型部署

## 4.1 获取Runtime资源配置

In [None]:
import boto3
import sagemaker
from sagemaker import get_execution_role

# SageMaker session, execution role and the session's default bucket.
sess = sagemaker.Session()
role = get_execution_role()
sagemaker_default_bucket = sess.default_bucket()

# Account id (via STS) and region of the underlying boto3 session.
boto_session = sess.boto_session
caller_identity = boto_session.client("sts").get_caller_identity()
account = caller_identity["Account"]
region = boto_session.region_name

## 4.2 准备Dummy模型

In [None]:
# Build a model.tar.gz that contains only an empty placeholder file and upload
# it to the assets prefix of the default bucket; model_data points at that archive.
!touch dummy
!tar czvf model.tar.gz dummy
assets_dir = 's3://{0}/{1}/assets/'.format(sagemaker_default_bucket, 'chatglm')
model_data = 's3://{0}/{1}/assets/model.tar.gz'.format(sagemaker_default_bucket, 'chatglm')
!aws s3 cp model.tar.gz $assets_dir
# Remove the local temporary files once the archive has been uploaded.
!rm -f dummy model.tar.gz

## 4.3 配置模型参数

In [None]:
# Model definition: name (None here), inference script and framework versions.
model_name = None
entry_point = "chatglm-inference-finetune.py"
framework_version = "1.13.1"
py_version = "py39"

# S3 locations of the base model and of the fine-tuned checkpoint file.
chatglm_models_root = f"s3://{sagemaker_default_bucket}/llm/models/chatglm"
base_model_name_or_path = f"{chatglm_models_root}/original-6B/"
finetune_model_name_or_path = (
    f"{chatglm_models_root}/finetune-ptuning-adgen"
    "/adgen-chatglm-6b-ft/checkpoint-50/pytorch_model.bin"
)

# Environment variables passed to the model container.
model_environment = dict(
    SAGEMAKER_MODEL_SERVER_TIMEOUT="600",
    SAGEMAKER_MODEL_SERVER_WORKERS="1",
    MODEL_NAME_OR_PATH=base_model_name_or_path,
    PRE_SEQ_LEN="128",
    FINETUNE_MODEL_NAME_OR_PATH=finetune_model_name_or_path,
)


In [None]:
from sagemaker.pytorch.model import PyTorchModel

# Define the SageMaker PyTorch model from the placeholder archive (model_data),
# the inference script in ./code and the environment variables set above.
# NOTE(review): this rebinds the name `model`, which earlier cells use for the
# in-notebook ChatGLM model — confirm no later cell expects the old object.
model = PyTorchModel(
    name              = model_name,
    model_data        = model_data,
    entry_point       = entry_point,
    source_dir        = './code',
    role              = role,
    framework_version = framework_version, 
    py_version        = py_version,
    env               = model_environment
)

In [None]:
! chmod +x s5cmd
! cp s5cmd code/

## 4.4 部署微调模型

In [None]:
from sagemaker.serializers import JSONSerializer
from sagemaker.deserializers import JSONDeserializer

# Endpoint settings: endpoint_name is left as None; one ml.g4dn.2xlarge instance.
endpoint_name         = None
instance_type         = 'ml.g4dn.2xlarge'
instance_count        = 1

# Deploy the model to an endpoint; requests and responses are exchanged as JSON.
predictor = model.deploy(
    endpoint_name          = endpoint_name,
    instance_type          = instance_type, 
    initial_instance_count = instance_count,
    serializer             = JSONSerializer(),
    deserializer           = JSONDeserializer()
)

## 4.5 测试微调模型

In [None]:
# Pause for 300 seconds before sending the first request, to give the endpoint
# time to finish loading the model.
# NOTE(review): fixed delay — presumably the inference script loads the model
# after the endpoint starts; confirm 300 s is sufficient.

import time

time.sleep(300)

In [None]:
# Request payload: the prompt goes under the "ask" key. ADGEN prompts join
# "key#value" attributes with plain "*" separators (no backslashes, which
# would otherwise be sent verbatim to the endpoint).
inputs = {
    "ask": "类型#上衣*材质#牛仔布*颜色#白色*风格#简约*图案#刺绣*衣样式#外套*衣款式#破洞"

}

# The generated text is read from the "answer" key of the JSON response.
response = predictor.predict(inputs)
print(response["answer"])
