## **1. 基本训练代码模板** ##

In [8]:
# 定义训练参数
training_args = TrainingArguments(
    output_dir="./lora_finetuned_model",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    eval_strategy="epoch",  # 已将 'evaluation_strategy' 更改为 'eval_strategy'
    save_strategy="epoch",  # 已将 'save_strategy' 更改为 'save_strategy'
    load_best_model_at_end=True,
)

# 创建Trainer实例
trainer = Trainer(
    model=lora_model,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
)

# 开始训练
print("--- 开始训练 ---")
trainer.train()
print("--- 训练完成 ---")

--- 开始训练 ---


Epoch,Training Loss,Validation Loss
1,No log,0.672177
2,No log,0.653839
3,No log,0.645104


--- 训练完成 ---


## **2. 模型打包** ##

In [20]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# The path to the folder where you saved the model files
local_model_path = "/home/tywin/my_jupyter_project/gemma-2b"

# Load the tokenizer from the local folder
print(f"Loading tokenizer from {local_model_path}...")
tokenizer = AutoTokenizer.from_pretrained(local_model_path)

# Load the model from the local folder
print(f"Loading model from {local_model_path}...")
model = AutoModelForCausalLM.from_pretrained(local_model_path)

print("Model and tokenizer loaded successfully.")

Loading tokenizer from /home/tywin/my_jupyter_project/gemma-2b...
Loading model from /home/tywin/my_jupyter_project/gemma-2b...


Loading checkpoint shards: 100%|██████████████████████████████████████████████████████████| 2/2 [00:02<00:00,  1.27s/it]

Model and tokenizer loaded successfully.





# **3. 训练模型** #

In [24]:
# 导入必要的库
import torch
import pandas as pd
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, TrainingArguments, Trainer
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from datasets import Dataset

# ---
# 1. 加载和准备数据
# ---

csv_file_path = "./output_data/Fact_Sales.csv"
print(f"Loading data from {csv_file_path}...")

try:
    df = pd.read_csv(csv_file_path, on_bad_lines='skip')
    
    if df.empty:
        raise ValueError("The CSV file is empty. Please check the file content.")
    
    # --- 只取前1000行 ---
    df = df.head(1000)
    print(f"Sampling the first {len(df)} rows for training.")

    # --- 格式化训练数据 ---
    if 'ProductKey' in df.columns and 'SalesAmount' in df.columns:
        df['text'] = "### 用户: 产品ID " + df['ProductKey'].astype(str) + " 的销售额是多少？\n### 助手: " + df['SalesAmount'].astype(str)
        print("Data formatted using 'ProductKey' and 'SalesAmount' columns.")
    else:
        print("Warning: 'ProductKey' or 'SalesAmount' column not found. Using generic formatting.")
        df['text'] = "### 用户: 告诉我关于这行数据的信息。\n### 助手: " + df.astype(str).agg(' '.join, axis=1)

    train_dataset = Dataset.from_pandas(df)
    print("Data loaded and formatted successfully.")
    print(f"Total training examples: {len(train_dataset)}")

except FileNotFoundError:
    print(f"Error: The file {csv_file_path} was not found. Please check the path.")
    exit()

except Exception as e:
    print(f"An error occurred while loading the data: {e}")
    exit()

# ---
# 2. 加载模型和分词器
# ---

if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device}")

local_model_path = "/home/tywin/my_jupyter_project/gemma-2b"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

print(f"Loading tokenizer from {local_model_path}...")
tokenizer = AutoTokenizer.from_pretrained(local_model_path)

print(f"Loading model from {local_model_path} with 4-bit quantization...")
model = AutoModelForCausalLM.from_pretrained(
    local_model_path,
    quantization_config=bnb_config,
    device_map="auto"
)

model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)


# ---
# 3. 准备 LoRA 配置和分词数据
# ---

lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "o_proj", "k_proj", "v_proj", "gate_proj", "down_proj", "up_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)

model = get_peft_model(model, lora_config)
print("--- LoRA 模型配置完成 ---")
model.print_trainable_parameters()

def tokenize_function(examples):
    tokenized_output = tokenizer(examples['text'], truncation=True, padding="max_length", max_length=128)
    tokenized_output['labels'] = tokenized_output['input_ids'].copy()
    return tokenized_output

tokenized_dataset = train_dataset.map(tokenize_function, batched=True)


# ---
# 4. 训练模型
# ---

training_args = TrainingArguments(
    output_dir="./gemma_finetuned",
    learning_rate=2e-4,
    # 将批量大小减小到2
    per_device_train_batch_size=2, 
    per_device_eval_batch_size=2,
    num_train_epochs=3,
    weight_decay=0.01,
    fp16=True,
    eval_strategy="no",
    save_strategy="epoch",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
)

print("--- 开始训练 Gemma 模型 ---")
trainer.train()
print("--- 训练完成 ---")

trainer.save_model("./my_gemma_lora_adapter")
print("LoRA 适配器已保存到 ./my_gemma_lora_adapter 目录。")

Loading data from ./output_data/Fact_Sales.csv...
Sampling the first 1000 rows for training.
Data loaded and formatted successfully.
Total training examples: 1000
Using device: cuda
Loading tokenizer from /home/tywin/my_jupyter_project/gemma-2b...
Loading model from /home/tywin/my_jupyter_project/gemma-2b with 4-bit quantization...


Loading checkpoint shards: 100%|██████████████████████████████████████████████████████████| 2/2 [00:03<00:00,  1.83s/it]


--- LoRA 模型配置完成 ---
trainable params: 19,611,648 || all params: 2,525,784,064 || trainable%: 0.7765


Map: 100%|████████████████████████████████████████████████████████████████| 1000/1000 [00:00<00:00, 22227.37 examples/s]

--- 开始训练 Gemma 模型 ---



  return fn(*args, **kwargs)


Step,Training Loss
500,2.8453
1000,2.3168
1500,2.1969


  return fn(*args, **kwargs)
  return fn(*args, **kwargs)


--- 训练完成 ---
LoRA 适配器已保存到 ./my_gemma_lora_adapter 目录。


# **4. 清空显存** #

In [3]:
import torch

# 1. 删除不再需要的变量
# 例如：
# del old_model
# del old_optimizer
# del old_dataset

# 2. 调用 PyTorch 的显存缓存清空函数
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    print("GPU显存已清空。")
else:
    print("没有可用的GPU。")

GPU显存已清空。


# **5. 训练所有表的前200行数据** #

# 导入必要的库
import torch
import pandas as pd
import glob
import os
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, TrainingArguments, Trainer
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from datasets import Dataset, DatasetDict

# ---
# 1. 从所有 CSV 文件中加载数据并分割
# ---

data_dir = "./output_data"
all_df = []

print(f"Loading first 200 rows from all CSV files in {data_dir}...")

for file in glob.glob(os.path.join(data_dir, '*.csv')):
    try:
        df = pd.read_csv(file, nrows=500)
        all_df.append(df)
        print(f"  Loaded {file} with {len(df)} rows.")
    except Exception as e:
        print(f"  Could not load {file}: {e}")

if all_df:
    merged_df = pd.concat(all_df, ignore_index=True)
    print(f"\nAll files merged into a single DataFrame with {len(merged_df)} rows.")
else:
    print("No CSV files found. Please check your data directory. Exiting.")
    exit()

# 格式化训练数据
merged_df['text'] = "### 用户: 告诉我关于这行数据的信息。\n### 助手: " + merged_df.astype(str).agg(' '.join, axis=1)

# 将 DataFrame 转换为 Hugging Face Dataset
full_dataset = Dataset.from_pandas(merged_df)

# 将数据集分割为训练集和验证集（90/10）
split_datasets = full_dataset.train_test_split(test_size=0.1, seed=42)
train_dataset = split_datasets['train']
eval_dataset = split_datasets['test']
print(f"Dataset split: {len(train_dataset)} training examples, {len(eval_dataset)} validation examples.")

# ---
# 2. 加载模型和分词器
# ---

if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device}")

local_model_path = "/home/tywin/my_jupyter_project/gemma-2b"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

print(f"Loading tokenizer from {local_model_path}...")
tokenizer = AutoTokenizer.from_pretrained(local_model_path)

print(f"Loading model from {local_model_path} with 4-bit quantization...")
model = AutoModelForCausalLM.from_pretrained(
    local_model_path,
    quantization_config=bnb_config,
    device_map="auto"
)

model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)


# ---
# 3. 准备 LoRA 配置和分词数据
# ---

lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "o_proj", "k_proj", "v_proj", "gate_proj", "down_proj", "up_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)

model = get_peft_model(model, lora_config)
print("--- LoRA 模型配置完成 ---")
model.print_trainable_parameters()

def tokenize_function(examples):
    tokenized_output = tokenizer(examples['text'], truncation=True, padding="max_length", max_length=128)
    tokenized_output['labels'] = tokenized_output['input_ids'].copy()
    return tokenized_output

# 对训练集和验证集进行分词
tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True)
tokenized_eval_dataset = eval_dataset.map(tokenize_function, batched=True)


# ---
# 4. 训练模型
# ---

training_args = TrainingArguments(
    output_dir="./gemma_finetuned",
    learning_rate=2e-4,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
    fp16=True,
    # 启用评估策略
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_steps=100, # 每隔100步打印一次日志
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_eval_dataset, # 传入验证集
)

print("--- 开始训练 Gemma 模型 ---")
trainer.train()
print("--- 训练完成 ---")

trainer.save_model("./my_gemma_lora_adapter")
print("LoRA 适配器已保存到 ./my_gemma_lora_adapter 目录。")

# **6. 训练模型打包** #

In [1]:
import torch
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# 1. 加载基础模型和分词器
base_model_path = "/home/tywin/my_jupyter_project/gemma-2b"
tokenizer = AutoTokenizer.from_pretrained(base_model_path)
base_model = AutoModelForCausalLM.from_pretrained(base_model_path)

# 2. 加载 LoRA 适配器
lora_adapter_path = "./my_gemma_lora_adapter"
model = PeftModel.from_pretrained(base_model, lora_adapter_path)

# 3. 将适配器合并到基础模型中
# `merge_and_unload()` 方法会创建一个新的、完整的模型
merged_model = model.merge_and_unload(progressbar=True)

# 4. 保存合并后的模型和分词器
merged_model_dir = "../merged_gemma_model" #
merged_model.save_pretrained(merged_model_dir)
tokenizer.save_pretrained(merged_model_dir)

print(f"微调后的模型已合并并保存到 {merged_model_dir} 目录。")
print("现在你可以像使用普通模型一样加载这个目录进行推理。")

  from .autonotebook import tqdm as notebook_tqdm
Loading checkpoint shards: 100%|██████████████████████████████████████████████████████████| 2/2 [00:01<00:00,  1.04it/s]
Unloading and merging model: 100%|█████████████████████████████████████████████████| 367/367 [00:00<00:00, -2162.86it/s]


微调后的模型已合并并保存到 ../merged_gemma_model 目录。
现在你可以像使用普通模型一样加载这个目录进行推理。


# **7. 训练模型打包** #

In [16]:
import subprocess
import os

# Get the absolute path to your home directory
home_dir = os.path.expanduser("~")

# Define the absolute paths to the script and input model
script_path = os.path.join(home_dir, "my_jupyter_project", "venv", "llama.cpp", "convert_hf_to_gguf.py")
input_path = os.path.join(home_dir, "my_jupyter_project", "merged_gemma_model")

# Define the command and arguments
cmd = [
    "python",
    script_path,
    input_path,
    "--outfile",
    "gemma-2b-finetuned.gguf",
    "--outtype",
    "f16",
]

# Run the command from your home directory to avoid any relative path issues
try:
    print("Starting model conversion...")
    subprocess.run(cmd, check=True, cwd=os.path.join(home_dir, "my_jupyter_project"))
    print("Conversion successful!")
except subprocess.CalledProcessError as e:
    print(f"An error occurred: {e}")
except FileNotFoundError as e:
    print(f"An error occurred. Make sure your paths are correct. {e}")

Starting model conversion...


INFO:hf-to-gguf:Loading model: merged_gemma_model
INFO:hf-to-gguf:Model architecture: GemmaForCausalLM
INFO:gguf.gguf_writer:gguf: This GGUF file is for Little Endian only
INFO:hf-to-gguf:Exporting model...
INFO:hf-to-gguf:gguf: loading model weight map from 'model.safetensors.index.json'
INFO:hf-to-gguf:gguf: loading model part 'model-00001-of-00003.safetensors'
INFO:hf-to-gguf:token_embd.weight,         torch.float32 --> F16, shape = {2048, 256000}
INFO:hf-to-gguf:blk.0.attn_norm.weight,    torch.float32 --> F32, shape = {2048}
INFO:hf-to-gguf:blk.0.ffn_down.weight,     torch.float32 --> F16, shape = {16384, 2048}
INFO:hf-to-gguf:blk.0.ffn_gate.weight,     torch.float32 --> F16, shape = {2048, 16384}
INFO:hf-to-gguf:blk.0.ffn_up.weight,       torch.float32 --> F16, shape = {2048, 16384}
INFO:hf-to-gguf:blk.0.ffn_norm.weight,     torch.float32 --> F32, shape = {2048}
INFO:hf-to-gguf:blk.0.attn_k.weight,       torch.float32 --> F16, shape = {2048, 256}
INFO:hf-to-gguf:blk.0.attn_output

Conversion successful!
