In [1]:
# %% [markdown]
# # 1. 环境导入与依赖检查
# 本 cell 导入必要的库，并打印各库版本，确保环境配置正确。

# %%
import os
import torch

# Fraction of GPU 0's total memory this process is allowed to allocate.
GPU_MEMORY_FRACTION = 0.5

# Only touch the CUDA API when a GPU is actually present; calling
# set_per_process_memory_fraction without CUDA raises and would abort the cell.
if torch.cuda.is_available():
    torch.cuda.set_per_process_memory_fraction(GPU_MEMORY_FRACTION, device=0)
else:
    print("CUDA not available; skipping per-process GPU memory limit.")
print("PyTorch version:", torch.__version__)

from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling
import transformers
print("Transformers version:", transformers.__version__)

from datasets import load_dataset
print("Datasets library loaded successfully!")

PyTorch version: 2.4.1+cu121


  from .autonotebook import tqdm as notebook_tqdm


Transformers version: 4.46.3
Datasets library loaded successfully!


In [2]:
import os
# Ask PyTorch's CUDA allocator to use expandable segments, the setting PyTorch's own
# out-of-memory message suggests to avoid fragmentation.
# NOTE(review): the first cell already calls into torch.cuda before this runs; confirm
# the allocator still picks this up, or move it ahead of the first CUDA call.
os.environ.update({"PYTORCH_CUDA_ALLOC_CONF": "expandable_segments:True"})

In [3]:
# %% [markdown]
# # 2. 数据加载：从 dataset 文件夹读取 jssp_3m3j.json 文件
# 请确保在当前工作目录下存在 dataset/jssp_3m3j.json 文件，该文件采用 JSON 列表格式，
# 每个样例包含 "input" 和 "output" 字段。

# %%
# Path of the JSSP samples, relative to the current working directory.
data_file = "dataset/jssp_3m3j.json"

# Read the JSON file; the loaded examples are accessed through the "train" split below.
dataset = load_dataset("json", data_files=data_file)

# Report where the data came from, how many examples were read, and show the first one.
print("Dataset loaded from", data_file)
print("Total examples:", len(dataset["train"]))
print("First example:", dataset["train"][0], sep="\n")

Dataset loaded from dataset/jssp_3m3j.json
Total examples: 2
First example:
{'input': 'JSSP Problem:\nOptimize schedule for 3 Jobs across 3 Machines to minimize makespan.\nJob 0: Operation 0 on Machine 0 for 105 minutes, Operation 1 on Machine 1 for 29 minutes, Operation 2 on Machine 2 for 213 minutes.\nJob 1: Operation 0 on Machine 2 for 193 minutes, Operation 1 on Machine 1 for 18 minutes, Operation 2 on Machine 0 for 213 minutes.\nJob 2: Operation 0 on Machine 0 for 78 minutes, Operation 1 on Machine 2 for 74 minutes, Operation 2 on Machine 1 for 221 minutes.\nSolution:', 'output': 'Step-by-step solution:\n1. Identify operation sequences:\n   - Job 0: M0 (105) -> M1 (29) -> M2 (213)\n   - Job 1: M2 (193) -> M1 (18) -> M0 (213)\n   - Job 2: M0 (78) -> M2 (74) -> M1 (221)\n2. At time 0, schedule the first operations:\n   - Job 0 on M0 from 0 to 105.\n   - Job 1 on M2 from 0 to 193.\n   - Job 2 must wait for M0; start at 105 and finish at 183.\n3. Schedule subsequent operations based o

In [None]:
# %% [markdown]
# # 3. 划分训练集和测试集
# 为了观察训练进度和评估效果，我们将加载的数据集划分为训练集和测试集，
# 这里使用 80% 的数据用于训练，20% 用于评估。

# %%
# Hold out 20% of the loaded examples for evaluation; seed=42 is passed so the
# split can be repeated.
split_dataset = dataset["train"].train_test_split(test_size=0.2, seed=42)
train_dataset, eval_dataset = split_dataset["train"], split_dataset["test"]

# Show how many examples landed in each part.
print("Training set size:", len(train_dataset))
print("Evaluation set size:", len(eval_dataset))

Training set size: 1
Evaluation set size: 1


In [None]:
# %% [markdown]
# # 4. 加载模型和 Tokenizer
# 加载 "microsoft/Phi-3-mini-4k-instruct" 模型及其对应的 tokenizer，
# 并将模型设置为训练模式，同时打印加载状态。

# %%
model_name = "microsoft/Phi-3-mini-4k-instruct"
print("Loading model and tokenizer:", model_name)

tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load the weights as float16 and let device_map="auto" choose the device placement.
# NOTE(review): a later cell upcasts trainable parameters to float32 and the training
# arguments enable fp16 — confirm the intended precision and that it fits in GPU memory.
model_load_kwargs = {"torch_dtype": torch.float16, "device_map": "auto"}
model = AutoModelForCausalLM.from_pretrained(model_name, **model_load_kwargs)

# Switch the module to training mode.
model.train()
print("Model and tokenizer loaded successfully!")

Loading model and tokenizer: microsoft/Phi-3-mini-4k-instruct


Downloading shards:  50%|█████     | 1/2 [17:58<17:58, 1078.37s/it]

In [None]:
# %% [markdown]
# # 5. 数据预处理：分词
# 定义分词函数，将每个样例的 input 和 output 拼接为一个完整的文本后进行编码，
# 设置最大长度为 512。分词结果将用于后续微调。

# %%
def tokenize_function(example, max_length=512):
    """Tokenize the concatenation of an example's "input" and "output" text.

    Args:
        example: Mapping with "input" and "output" entries. Each entry is either a
            single string (one example, as passed by ``Dataset.map`` with
            ``batched=False``) or a sequence of strings of equal length (a batch,
            as passed with ``batched=True``).
        max_length: Maximum number of tokens kept per text; longer texts are
            truncated. Defaults to 512, the value that used to be hard-coded.

    Returns:
        The result of calling the module-level ``tokenizer`` on the concatenated
        text (or list of texts) with truncation enabled.
    """
    prompts, answers = example["input"], example["output"]
    if isinstance(prompts, str):
        # Single example: plain string concatenation, no separator added.
        text = prompts + answers
    else:
        # Batch of examples: pair each prompt with its own answer.
        text = [prompt + answer for prompt, answer in zip(prompts, answers)]
    return tokenizer(text, truncation=True, max_length=max_length)

# Run the tokenizer over the training split, then the evaluation split,
# one example at a time (batched=False).
train_dataset, eval_dataset = (
    split.map(tokenize_function, batched=False)
    for split in (train_dataset, eval_dataset)
)

print(
    "Tokenization complete. Example from training set:",
    train_dataset[0],
    sep="\n",
)

Tokenization complete. Example from training set:
{'input': 'JSSP Problem:\nOptimize schedule for 3 Jobs across 3 Machines to minimize makespan.\nJob 0: Operation 0 on Machine 0 for 105 minutes, Operation 1 on Machine 1 for 29 minutes, Operation 2 on Machine 2 for 213 minutes.\nJob 1: Operation 0 on Machine 2 for 193 minutes, Operation 1 on Machine 1 for 18 minutes, Operation 2 on Machine 0 for 213 minutes.\nJob 2: Operation 0 on Machine 0 for 78 minutes, Operation 1 on Machine 2 for 74 minutes, Operation 2 on Machine 1 for 221 minutes.\nSolution:', 'output': 'Step-by-step solution:\n1. Identify operation sequences:\n   - Job 0: M0 (105) -> M1 (29) -> M2 (213)\n   - Job 1: M2 (193) -> M1 (18) -> M0 (213)\n   - Job 2: M0 (78) -> M2 (74) -> M1 (221)\n2. At time 0, schedule the first operations:\n   - Job 0 on M0 from 0 to 105.\n   - Job 1 on M2 from 0 to 193.\n   - Job 2 must wait for M0; start at 105 and finish at 183.\n3. Schedule subsequent operations based on machine availability:\n 

In [None]:
# %% [markdown]
# # 6. 构造 Data Collator
# 使用 DataCollatorForLanguageModeling 为因果语言模型构建数据整理器，
# 注意设置 mlm=False，因为我们不使用掩码任务。

# %%
# Batch builder for causal language modeling: mlm=False turns off masked-token training.
data_collator = DataCollatorForLanguageModeling(
    mlm=False,
    tokenizer=tokenizer,
)
print("Data Collator created successfully.")

Data Collator created successfully.


In [None]:
# Convert every trainable parameter's storage to float32; parameters that do not
# require gradients keep their current dtype.
# NOTE(review): presumably needed because the model is loaded in float16 while training
# uses fp16 mixed precision — confirm, and check the extra memory this costs.
for param in filter(lambda tensor: tensor.requires_grad, model.parameters()):
    param.data = param.data.float()

In [None]:
# %% [markdown]
# # 7. 配置训练参数与初始化 Trainer
# 设置训练参数，如输出目录、每设备 batch size、训练轮数、日志打印频率、评估策略等，
# 并初始化 Trainer。日志和评估配置将帮助你在训练过程中观察进度与性能。

# %%
# Hyper-parameters for the fine-tuning run.
training_args = TrainingArguments(
    output_dir="./finetuned_model",       # checkpoints and the final model are written here
    per_device_train_batch_size=1,        # adjust to the available hardware
    num_train_epochs=3,                   # number of passes over the training set
    logging_steps=10,                     # log every 10 steps
    eval_strategy="steps",                # evaluate on a step schedule; current name of the
                                          # deprecated `evaluation_strategy` argument
    eval_steps=50,                        # run evaluation every 50 steps
    save_steps=100,                       # write a checkpoint every 100 steps
    fp16=True,                            # enable fp16 mixed-precision training
    save_total_limit=2,                   # keep at most 2 checkpoints
)

print("Training parameters:")
print(training_args)

# Wire the model, both dataset splits and the collator into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator,
)
print("Trainer initialized successfully.")

Training parameters:
TrainingArguments(
_n_gpu=1,
accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False},
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
average_tokens_across_devices=False,
batch_eval_metrics=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_persistent_workers=False,
dataloader_pin_memory=True,
dataloader_prefetch_factor=None,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
dispatch_batches=None,
do_eval=True,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_delay=0,
eval_do_concat_batches=True,
eval_on_start=False,
eval_steps=50,
eval_strategy=steps,
eva



In [None]:
import torch
# Ask PyTorch's CUDA caching allocator to release cached GPU memory it is not
# currently using. Memory still referenced by live objects (for example the loaded
# model) is not freed by this call.
torch.cuda.empty_cache()

In [None]:
# %% [markdown]
# # 8. Start Training and Monitor Progress
# This cell starts the fine-tuning process. Training logs (including loss, etc.) will be output during the process.
# After training, the fine-tuned model will be saved to the specified directory.

# %%
print("Starting training...")
train_result = trainer.train()  # the Trainer prints its own progress/loss logs
print("Training complete!")

# Save the fine-tuned model into the configured output directory. The Trainer was
# created without the tokenizer, so save the tokenizer explicitly next to the model;
# otherwise the directory cannot be reloaded on its own.
trainer.save_model()
tokenizer.save_pretrained(training_args.output_dir)
print("Finetuned model saved to", training_args.output_dir)

Starting training...


OutOfMemoryError: CUDA out of memory. Tried to allocate 20.00 MiB. GPU 0 has a total capacity of 23.64 GiB of which 17.75 MiB is free. Including non-PyTorch memory, this process has 23.49 GiB memory in use. Of the allocated memory 23.00 GiB is allocated by PyTorch, and 33.77 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)