In [None]:
import os,torch,transformers,datasets,peft,trl

print("torch version:",torch.__version__)
print("transformers version:",transformers.__version__)
print("datasets version:",datasets.__version__)
print("peft version:",peft.__version__)
print("trl version:",trl.__version__)

In [1]:
import os
import torch
import torch.nn as nn
from datasets import load_dataset, DatasetDict
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    AutoModel,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import LoraConfig, PeftModel
from trl import SFTTrainer
from accelerate import Accelerator

  warn(f"Failed to load image Python extension: {e}")


In [2]:
# The model that you want to train from the Hugging Face hub
model_name = "../autodl-tmp/glm-4-9b-chat"

# The instruction dataset to use
dataset_name = "wedoctor_data_350.json"

# Fine-tuned model name
new_model = "glm-4-9b-chat-diabetes-finetune"

################################################################################
# QLoRA parameters
################################################################################

# LoRA attention dimension
lora_r = 64

# Alpha parameter for LoRA scaling
lora_alpha = 16

# Dropout probability for LoRA layers
lora_dropout = 0.6

################################################################################
# bitsandbytes parameters
################################################################################

# Activate 4-bit precision base model loading
use_4bit = True

# Compute dtype for 4-bit base models
bnb_4bit_compute_dtype = "float16"

# Quantization type (fp4 or nf4)
bnb_4bit_quant_type = "nf4"

# Activate nested quantization for 4-bit base models (double quantization)
use_nested_quant = False

################################################################################
# TrainingArguments parameters
################################################################################

# Output directory where the model predictions and checkpoints will be stored
output_dir = "./results"

# Number of training epochs
num_train_epochs = 3

# Enable fp16/bf16 training (set bf16 to True with an A100)
fp16 = False
bf16 = False

# Batch size per GPU for training 原来是4
per_device_train_batch_size = 1

# Batch size per GPU for evaluation 原来是4
per_device_eval_batch_size = 1

# Number of update steps to accumulate the gradients for 原来是1
gradient_accumulation_steps = 2

# Enable gradient checkpointing
gradient_checkpointing = True

# Maximum gradient normal (gradient clipping)
max_grad_norm = 0.3

# Initial learning rate (AdamW optimizer) 原来是2e-4
learning_rate = 3e-4

# Weight decay to apply to all layers except bias/LayerNorm weights
weight_decay = 0.001

# Optimizer to use
optim = "paged_adamw_32bit"

# Learning rate schedule
lr_scheduler_type = "cosine"

# Number of training steps (overrides num_train_epochs)
max_steps = -1

# Ratio of steps for a linear warmup (from 0 to learning rate)
warmup_ratio = 0.03

# Group sequences into batches with same length
# Saves memory and speeds up training considerably
group_by_length = True

# Save checkpoint every X updates steps
save_steps = 0

# Log every X updates steps
logging_steps = 10

################################################################################
# SFT parameters
################################################################################

# Maximum sequence length to use
max_seq_length = None

# Pack multiple short examples in the same input sequence to increase efficiency
packing = False

# Load the entire model on the GPU 0
device_map = {"": 0}

In [3]:
# Load LLaMA tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
#tokenizer.pad_token = tokenizer.eos_token
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
tokenizer.padding_side = "left" # Fix weird overflow issue with fp16 training


# Load dataset (you can process it here)
dataset = load_dataset("json",data_files=dataset_name,split="train")

def format_chat_template(row):
    row_json = [{"role": row["messages"][0]["role"], "content": row["messages"][0]["content"]},
                {"role": row["messages"][1]["role"], "content": row["messages"][1]["content"]},
                {"role": row["messages"][2]["role"], "content": row["messages"][2]["content"]},
               ]
    row["text"] = tokenizer.apply_chat_template(row_json, tokenize=False)
    return row

dataset = dataset.map(
    format_chat_template,
    num_proc= 4,
)



Generating train split: 0 examples [00:00, ? examples/s]

Map (num_proc=4):   0%|          | 0/350 [00:00<?, ? examples/s]

In [4]:
dataset.remove_columns(["messages"])

Dataset({
    features: ['text'],
    num_rows: 350
})

In [5]:
# Split the dataset into a training and validation set
# You can specify the ratio or the number of validation examples
dataset = dataset.train_test_split(test_size=0.2)  # For example, 20% for validation

train_dataset = dataset['train']
val_dataset = dataset['test']

# Load tokenizer and model with QLoRA configuration
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=use_nested_quant,
)

# Check GPU compatibility with bfloat16
if compute_dtype == torch.float16 and use_4bit:
    major, _ = torch.cuda.get_device_capability()
    if major >= 8:
        print("=" * 80)
        print("Your GPU supports bfloat16: accelerate training with bf16=True")
        print("=" * 80)

# Load base model
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    trust_remote_code=True,
    quantization_config=bnb_config,
    device_map=device_map
)
model.config.use_cache = False
model.config.pretraining_tp = 1



# Load LoRA configuration
peft_config = LoraConfig(
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    r=lora_r,
    bias="none",
    task_type="CAUSAL_LM",
)

# Set training parameters
training_arguments = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim=optim,
    save_steps=save_steps,
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    fp16=fp16,
    bf16=bf16,
    max_grad_norm=max_grad_norm,
    max_steps=max_steps,
    warmup_ratio=warmup_ratio,
    group_by_length=group_by_length,
    lr_scheduler_type=lr_scheduler_type,
    report_to="tensorboard"
)

# Modify the trainer initialization to include the train_dataset and eval_dataset
trainer = SFTTrainer(
    model=model,
    train_dataset=train_dataset,  # Use the split training dataset
    eval_dataset=val_dataset,  # Use the split validation dataset
    peft_config=peft_config,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    tokenizer=tokenizer,
    args=training_arguments,
    packing=packing,
)

# Train model
trainer.train()

Loading checkpoint shards:   0%|          | 0/10 [00:00<?, ?it/s]

You are calling `save_pretrained` to a 4-bit converted model, but your `bitsandbytes` version doesn't support it. If you want to save 4-bit models, make sure to have `bitsandbytes>=0.41.3` installed.

Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


Map:   0%|          | 0/280 [00:00<?, ? examples/s]

Map:   0%|          | 0/70 [00:00<?, ? examples/s]

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
You are using 8-bit optimizers with a version of `bitsandbytes` < 0.41.1. It is recommended to update your version as a major bug has been fixed in 8-bit optimizers.


[2024-11-20 22:43:30,351] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)


/root/miniconda3/compiler_compat/ld: cannot find -laio
collect2: error: ld returned 1 exit status




  def forward(ctx, input, weight, bias=None):
  def backward(ctx, grad_output):


Step,Training Loss
10,1.9414
20,2.5378
30,2.0429
40,1.1003
50,0.7903
60,0.4801
70,0.4805
80,0.4602
90,0.309
100,0.3325


TrainOutput(global_step=420, training_loss=0.457231719153268, metrics={'train_runtime': 723.8192, 'train_samples_per_second': 1.161, 'train_steps_per_second': 0.58, 'total_flos': 3.141757487038464e+16, 'train_loss': 0.457231719153268, 'epoch': 3.0})

In [6]:
# Save trained model
trainer.model.save_pretrained(new_model)

In [7]:
# Reload model in FP16 and merge it with LoRA weights
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    low_cpu_mem_usage=True,
    return_dict=True, 
# bf16
    torch_dtype=torch.float16,
    device_map=device_map,
    trust_remote_code=True
)
model = PeftModel.from_pretrained(base_model, new_model)
model = model.merge_and_unload()

# Reload tokenizer to save it
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
#tokenizer.pad_token = tokenizer.eos_token
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
tokenizer.padding_side = "left"

Loading checkpoint shards:   0%|          | 0/10 [00:00<?, ?it/s]

  adapters_weights = torch.load(


In [8]:
save_directory = "../autodl-tmp/fine_tuned_chatglm_for_diabetes"

# Save the model and tokenizer
model.save_pretrained(save_directory)
tokenizer.save_pretrained(save_directory)

('../autodl-tmp/fine_tuned_chatglm_for_diabetes/tokenizer_config.json',
 '../autodl-tmp/fine_tuned_chatglm_for_diabetes/special_tokens_map.json',
 '../autodl-tmp/fine_tuned_chatglm_for_diabetes/tokenizer.model',
 '../autodl-tmp/fine_tuned_chatglm_for_diabetes/added_tokens.json')

In [10]:
# Empty VRAM
del model
del trainer
import gc
gc.collect()
gc.collect()

0

In [11]:
del model

NameError: name 'model' is not defined

In [2]:
from transformers import AutoModel, AutoTokenizer

load_directory="../autodl-tmp/fine_tuned_chatglm_for_diabetes"

model = AutoModel.from_pretrained(load_directory,trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(load_directory,trust_remote_code=True)
tokenizer.padding_side="left"
#tokenizer.pad_token="[PAD]"
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Move model to DataParallel for multi-GPU usage
if torch.cuda.device_count() > 1:
    print(f"Using {torch.cuda.device_count()} GPUs!")
    model = nn.DataParallel(model)

model = model.to(device)
model.eval()  # Ensure evaluation mode



Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

ChatGLMForConditionalGeneration(
  (transformer): ChatGLMModel(
    (embedding): Embedding(
      (word_embeddings): Embedding(151552, 4096)
    )
    (rotary_pos_emb): RotaryEmbedding()
    (encoder): GLMTransformer(
      (layers): ModuleList(
        (0-39): 40 x GLMBlock(
          (input_layernorm): RMSNorm()
          (self_attention): SelfAttention(
            (query_key_value): Linear(in_features=4096, out_features=4608, bias=True)
            (core_attention): SdpaAttention(
              (attention_dropout): Dropout(p=0.0, inplace=False)
            )
            (dense): Linear(in_features=4096, out_features=4096, bias=False)
          )
          (post_attention_layernorm): RMSNorm()
          (mlp): MLP(
            (dense_h_to_4h): Linear(in_features=4096, out_features=27392, bias=False)
            (dense_4h_to_h): Linear(in_features=13696, out_features=4096, bias=False)
          )
        )
      )
      (final_layernorm): RMSNorm()
    )
    (output_layer): Linear(in

In [15]:

# 4006
messages = [{
                "role": "system",
                "content": "你是一个专业的医生，请基于诊疗指南，为以下患者提供综合的管理意见:"
            },
            {
                "role": "user",
                "content":"基本情况：年龄：76；性别：女；类型：未分标。监测情况：第1次回访：监测内容：血糖；空腹血糖为7.2mmol/L；第2次回访：监测内容：血糖；晚餐后非空腹血糖为15.0mmol/L；辅助检查的餐后血糖为15.0mmol/L；辅助检查的空腹血糖为7.2mmol/L；无低血糖反应；糖尿病症状为4；本次监测描述为来院调药，餐后控制不好，建议调药后继续检测血糖，观察效果，预防低血糖。"

            }
           ]


prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    
inputs = tokenizer(prompt, return_tensors='pt', padding= True, truncation=True).to(device)

outputs = model.generate(**inputs, max_length=2000, num_return_sequences=1)

#outputs = model.module.generate(**inputs, max_length=1000, num_return_sequences=1)

# temperature=0.1,top_p = 0.95,top_k= 5) if isinstance(model, nn.DataParallel) else model.generate(**inputs, max_length=1000, num_return_sequences=1,temperature=0.1,top_p = 0.95,top_k= 5)

text = tokenizer.decode(outputs[0], skip_special_tokens=False)

print("Output:")
print(text.split("<|assistant|>")[1].split("<|user|>")[0])

Output:
 
根据提供的信息，以下是该患者综合管理建议:您的第1次反馈结果如下：
您当前的血糖异常情况为：较高
建议监测时间为：空腹
对您提出的健康建议如下：1.结合您的年龄，近期血糖控制欠佳，餐后血糖值高于正常控制范围，建议继续监测空腹血糖和餐后2小时血糖，尽快检测糖化血红蛋白，了解近3个月的血糖水平，并及时联系专科医师复诊，调整治疗方案；
2.BMI值较高，建议适当减重；
3.坚持规律用药，注意监测血压；
4.注意糖尿病饮食，低脂饮食，适量运动（参考随后为您出具的糖尿病饮食方案和运动方案）； 
5.保持心情舒畅，定期复查，如有不适及时就医。
建议您每日总卡路里摄入在1800左右，
每日碳水化合物摄入225至293克，
每日蛋白质摄入68至90克，
每日脂肪摄入40至60克，
可参考的饮食结构：总热卡：1800千卡
肉:280克(360千卡，蛋白质36克)；
水:1500毫升以上；
奶: 250/袋(165千卡，蛋白质7.2克)；
油:25毫升(225千卡)；
水果:200克(109千卡)；
蔬菜:1斤（90千卡）；
盐:6克；
鸡蛋:40-50克/个（70千卡，蛋白质6.5克)；
主食（大米、面粉、荞麦面、薏米）:235克(846千卡，蛋白质18.8克)。
对您提出的运动建议如下：如何选择适合自己的运动：
在选择运动方式时，要考虑到年龄、性别、体质、生活方式等的不同，因人而异、因时制宜，选择个体化的运动方式。
总体来说，糖尿病患者的运动应具备三个特点: 适量、全身性、有节奏。有氧运动就同时具备这三个特点，适合大多数糖尿病患者。 
一般来说，有氧运动的安全阈值是最大心率的70%-75%，简单地说，就是运动时的心率应保持在（180-年龄）次/分钟，超过这个心率可能引起血压升高、血糖下降等不适。
运动时间：每周至少5次，每次30分钟以上。 
运动强度：达到最大心率的40%-60%，或达到自我感觉强度的50%-60%。
对您提出的健康指导如下：糖尿病饮食：糖尿病患者饮食治疗的原则是高碳水化合物、低脂肪、适量蛋白质、高纤维。
1.每日总热卡摄入1800千卡左右，其中：蛋白质65-90g，碳水化合物180-245g，脂肪40-60g。
2.三餐分配：早餐：总热卡528千卡，碳水化合物61g，蛋白质20g，脂肪10g，鸡蛋1个；午餐：总热卡528千卡，碳水化合物68g，