In [1]:
import torch
import transformers
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, Trainer, TrainingArguments, DataCollatorForLanguageModeling

from transformers import BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, PeftModel, PeftConfig

import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


bin D:\.conda_env\yothalia\lib\site-packages\bitsandbytes\libbitsandbytes_cuda117.dll


## Load Model and Adapter

In [2]:
model = AutoModelForCausalLM.from_pretrained("D:/Projects/project_yothalia/yothalia/server/model_weights/internlm/internlm-chat-7b-finetune-int8", 
                                                load_in_4bit=True,
                                                #peft_config=config,
                                                trust_remote_code=True)


In [3]:
tokenizer = AutoTokenizer.from_pretrained("D:/Projects/project_yothalia/yothalia/server/model_weights/internlm/internlm-chat-7b-finetune-int8",
                                            trust_remote_code=True)

In [4]:
peft_model_id = "D:/Projects/project_yothalia/yothalia/server/model_weights/internlm/internlm-chat-7b-finetune-lora"
config = PeftConfig.from_pretrained(peft_model_id)
model = PeftModel.from_pretrained(model, peft_model_id)
#model.add_adapter(config)

In [7]:
for name, param in model.named_parameters():
    if 'lora' in name:
        param.requires_grad = True

In [9]:
model.print_trainable_parameters()

trainable params: 41,693,248 || all params: 7,363,674,176 || trainable%: 0.5662016950164418


## Set Trainer

#### Load dataset

In [10]:
from torch.utils.data import DataLoader
import pandas as pd

In [11]:
df = pd.read_csv('../train_sample/csv/train.csv',index_col=0)

In [12]:
df = df.map(lambda x: tokenizer(x, padding='max_length', truncation=True, max_length=512))

In [13]:
df.head()

Unnamed: 0,train
0,"[input_ids, attention_mask]"
1,"[input_ids, attention_mask]"
2,"[input_ids, attention_mask]"
3,"[input_ids, attention_mask]"
4,"[input_ids, attention_mask]"


In [26]:
training_args = TrainingArguments(

  # Learning rate
  learning_rate=1.0e-5,

  # Number of training epochs
  num_train_epochs=3,

  # Max steps to train for (each step is a batch of data)
  # Overrides num_train_epochs, if not -1
  #max_steps=max_steps,

  # Batch size for training
  per_device_train_batch_size=4,

  # Directory to save model checkpoints
  output_dir='./ckp',

  # Other arguments
  overwrite_output_dir=False, # Overwrite the content of the output directory
  disable_tqdm=False, # Disable progress bars
  eval_steps=1, # Number of update steps between two evaluations
  save_steps=1, # After # steps model is saved
  warmup_steps=1, # Number of warmup steps for learning rate scheduler
  per_device_eval_batch_size=4, # Batch size for evaluation
  evaluation_strategy="steps",
  logging_strategy="steps",
  logging_steps=1,
  optim="adamw_torch",
  gradient_accumulation_steps = 4,
  gradient_checkpointing=False,

  # Parameters for early stopping
  load_best_model_at_end=True,
  save_total_limit=1,
  greater_is_better=False
)
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

In [27]:
from transformers import Trainer
trainer = Trainer(
    model,
    training_args,
    train_dataset=df["train"][0:100],
    eval_dataset=df["train"][101:120].reset_index(drop=True),
    data_collator=data_collator,
)

In [28]:
trainer.train()

Step,Training Loss,Validation Loss
1,12.5938,13.421875
2,12.5488,13.117188
3,11.0645,12.835938
4,11.002,12.625
5,10.7695,12.554688
6,10.6875,12.429688
7,9.666,12.414062
8,11.1465,12.414062
9,10.5098,12.382812
10,10.7559,12.34375


TrainOutput(global_step=18, training_loss=10.986436631944445, metrics={'train_runtime': 5204.4489, 'train_samples_per_second': 0.058, 'train_steps_per_second': 0.003, 'total_flos': 6141024827080704.0, 'train_loss': 10.986436631944445, 'epoch': 2.88})

In [25]:
df["train"][100:120].reset_index(drop=True)

0     [input_ids, attention_mask]
1     [input_ids, attention_mask]
2     [input_ids, attention_mask]
3     [input_ids, attention_mask]
4     [input_ids, attention_mask]
5     [input_ids, attention_mask]
6     [input_ids, attention_mask]
7     [input_ids, attention_mask]
8     [input_ids, attention_mask]
9     [input_ids, attention_mask]
10    [input_ids, attention_mask]
11    [input_ids, attention_mask]
12    [input_ids, attention_mask]
13    [input_ids, attention_mask]
14    [input_ids, attention_mask]
15    [input_ids, attention_mask]
16    [input_ids, attention_mask]
17    [input_ids, attention_mask]
18    [input_ids, attention_mask]
19    [input_ids, attention_mask]
Name: train, dtype: object