In [13]:
import os
import sys
import warnings; warnings.filterwarnings("ignore")
from tqdm import (tqdm, trange)
import random as rd
import numpy as np
import pandas as pd
import torch as th
from datasets import (load_dataset, load_from_disk, Dataset)
from sentence_transformers import (SentenceTransformer, util, CrossEncoder)
from transformers import (AutoTokenizer, AutoModel, AutoModelForCausalLM, AutoModelForSequenceClassification,
                          BitsAndBytesConfig,
                          TrainingArguments, DataCollatorWithPadding, DataCollatorForLanguageModeling,
                          DataCollatorForSeq2Seq, DataCollatorForTokenClassification,
                          Trainer)
from peft import (LoraConfig, get_peft_model, PeftModel, TaskType, get_peft_model_state_dict)
from trl import SFTTrainer
from vllm import (LLM, SamplingParams)

In [2]:
# Pick the compute device: CUDA when PyTorch can see a GPU, otherwise the CPU.
if th.cuda.is_available():
    device = th.device("cuda")
else:
    device = th.device("cpu")

# Number of visible CUDA devices. The variable name (sic) is kept unchanged
# because it is a notebook-level global and appears in the printed report.
devive_cnt = th.cuda.device_count()

# Report the selected device together with the PyTorch and CUDA build versions.
print(f"device = {device}; devive_cnt = {devive_cnt}")
print(th.__version__)
print(th.version.cuda)

device = cuda; devive_cnt = 1
2.5.1+cu121
12.1


In [3]:
# Filesystem layout used by the notebook. The data/ and output/ folders are
# siblings of the project directory (both hang off its parent folder).
# NOTE(review): these are machine-specific absolute paths - adjust before reuse.
path_project = "C:/my_project/MyGit/Machine-Learning-Column/hugging_face"
path_model = "F:/LLM"
path_data, path_output = (
    os.path.join(os.path.dirname(path_project), sub_dir)
    for sub_dir in ("data", "output")
)

## step-1: 载入数据源

In [4]:
# Location of the sentence-pair CSV, relative to path_data.
filename = "Rogendo/English-Swahili-Sentence-Pairs/ensw.csv"

In [5]:
# Read the sentence-pair CSV found at path_data/filename into a DataFrame.
df_csv = pd.read_csv(os.path.join(path_data, filename))

In [6]:
# Preview the first three rows of the raw data.
df_csv.head(3)

Unnamed: 0,English sentence,Swahili Translation
0,I am,mimi ni
1,U,wewe
2,him,yeye


In [7]:
# Summarize the columns: dtypes and non-null counts.
df_csv.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 210471 entries, 0 to 210470
Data columns (total 2 columns):
 #   Column               Non-Null Count   Dtype 
---  ------               --------------   ----- 
 0   English sentence     210470 non-null  object
 1   Swahili Translation  210470 non-null  object
dtypes: object(2)
memory usage: 3.2+ MB


In [8]:
# Build a small pair-classification set from the first `n_anchors` rows:
#   * each row as-is is a positive pair (label 1.0);
#   * `n_negatives` rows drawn at random from the *other* rows supply Swahili
#     sentences that are paired with the anchor's English sentence (label 0.0).
n_anchors = 100
n_negatives = 3

# Dedicated, seeded generator so the sampled negatives are identical on every run
# and the global `random` state is left untouched.
rng_pairs = rd.Random(0)

df_csv["label"] = 1.0

# Resolve column positions by name instead of relying on magic indices 0 and 2.
col_english = df_csv.columns.get_loc("English sentence")
col_label = df_csv.columns.get_loc("label")
n_rows = len(df_csv)

# Collect the pieces and concatenate once; growing a DataFrame with pd.concat
# inside the loop re-copies all accumulated rows on every iteration.
frames = []
for i in trange(n_anchors):
    positive = df_csv.iloc[[i]]

    # Draw one spare candidate so that row i itself can be discarded if sampled;
    # otherwise a correct translation could end up labelled 0.0.
    candidates = rng_pairs.sample(range(n_rows), k=n_negatives + 1)
    idx_list = [j for j in candidates if j != i][:n_negatives]

    # Work on an explicit copy so the assignments below never target a view of df_csv.
    negative = df_csv.iloc[idx_list].copy()
    negative.iloc[:, col_english] = positive.iloc[0, col_english]
    negative.iloc[:, col_label] = 0.0

    frames.extend([positive, negative])

df_new = pd.concat(frames, axis=0, ignore_index=True)
    

100%|██████████| 100/100 [00:00<00:00, 175.60it/s]


In [9]:
# Convert the pair table to a Hugging Face Dataset and hold out 20% for evaluation.
# The split is shuffled with a fixed seed.
dataset = (
    Dataset.from_pandas(df_new)
    .train_test_split(test_size=0.2, shuffle=True, seed=0)
)
dataset_train = dataset["train"]
dataset_test = dataset["test"]

In [10]:
# Inspect the first training example.
dataset_train[0]

{'English sentence': 'She comes from California.',
 'Swahili Translation': 'Anatoka California.',
 'label': 1}

## step-2: tokenizer

In [11]:
# Checkpoint directory, relative to path_model; joined with path_model when the
# tokenizer and the model are loaded.
checkpoint = "sentence-transformers/BAAI/bge-reranker-large"

In [12]:
# Load the tokenizer from the local checkpoint directory only (no download attempt).
tokenizer = AutoTokenizer.from_pretrained(
    os.path.join(path_model, checkpoint),
    local_files_only=True,
    force_download=False,
    cache_dir=path_model,
)

## step-3: 配置量化参数

## step-4: 载入基模

In [33]:
# model_base = CrossEncoder(
#     model_name=os.path.join(path_model, checkpoint),
#     device=device,
#     local_files_only=True,
#     # trust_remote_code=False,
#     # automodel_args={"torch_dtype": th.float32},
# )

In [35]:
# Load the sequence-classification model from the same local checkpoint directory
# as the tokenizer, letting device_map="auto" decide the device placement.
model_base = AutoModelForSequenceClassification.from_pretrained(
    os.path.join(path_model, checkpoint),
    local_files_only=True,
    force_download=False,
    cache_dir=path_model,
    device_map="auto",
    # NOTE(review): trust_remote_code=True permits code shipped with the checkpoint
    # to be executed - confirm it is actually required for this model.
    trust_remote_code=True,
    # torch_dtype=th.float16,
    # quantization_config=config_bnb
)

In [16]:
# Prepare the base model for fine-tuning: turn on gradient checkpointing, make the
# inputs require gradients, and switch off the config's use_cache flag.
model_base.gradient_checkpointing_enable()
model_base.enable_input_require_grads()
model_base.config.use_cache = False

# With more than one visible GPU, flag the model as model-parallel.
if th.cuda.device_count() > 1:
    model_base.is_parallelizable = model_base.model_parallel = True

In [18]:
# Make sure the input embedding matrix has a row for every tokenizer entry;
# grow it when the tokenizer vocabulary is larger.
tokenizer_size, embedding_size = (
    len(tokenizer),
    model_base.get_input_embeddings().weight.shape[0],
)
if embedding_size < tokenizer_size:
    model_base.resize_token_embeddings(tokenizer_size)

## step-5: 配置模型参数

In [19]:
# Hyper-parameters shared by the LoRA configuration and the training arguments.
config_model = dict(
    # LoRA adapter settings
    rank=8,
    lora_alpha=32,
    lora_dropout=0.1,
    use_rslora=True,
    # optimisation settings
    epochs=2,
    batch_size=4,
    gradient_steps=1,
    learning_rate=5e-5,
    weight_decay=0.01,
    # tokenisation settings
    max_seq_length=512,
)

## step-6: 配置LoRA模型

In [20]:
# Display the module structure of the base model.
model_base

XLMRobertaForSequenceClassification(
  (roberta): XLMRobertaModel(
    (embeddings): XLMRobertaEmbeddings(
      (word_embeddings): Embedding(250002, 1024, padding_idx=1)
      (position_embeddings): Embedding(514, 1024, padding_idx=1)
      (token_type_embeddings): Embedding(1, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): XLMRobertaEncoder(
      (layer): ModuleList(
        (0-23): 24 x XLMRobertaLayer(
          (attention): XLMRobertaAttention(
            (self): XLMRobertaSdpaSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): XLMRobertaSelfOutput(
              (dense): Linear(in_features=1024, ou

In [21]:
# LoRA adapter configuration for sequence classification; the numeric settings
# are read from config_model.
config_lora = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    # Names of the sub-modules that receive adapters.
    # NOTE(review): "dense" presumably matches every sub-module with that name,
    # not only the attention output projection - confirm this is intended.
    target_modules=["query", "key", "value", "dense"],
    r=config_model.get("rank"),
    lora_alpha=config_model.get("lora_alpha"),
    lora_dropout=config_model.get("lora_dropout"),
    use_rslora=config_model.get("use_rslora"),
    bias="none",
)

In [22]:
# Wrap the base model with the adapters described by config_lora.
model_lora = get_peft_model(model=model_base, peft_config=config_lora)

In [23]:
# print_trainable_parameters - 1
# The method prints the summary itself and returns None, so it is called directly;
# wrapping it in print() only appended a stray "None" line to the cell output.
model_lora.print_trainable_parameters()

# print_trainable_parameters - 2
# trainable_params = 0
# all_params = 0

# for param in model_lora.parameters():
#     if param.requires_grad:
#         trainable_params += param.numel()
#     all_params += param.numel()

# print(f"trainable params: {trainable_params} || all params: {all_params} || trainable%: {100 * trainable_params / all_params:.4f}")

trainable params: 4,605,953 || all params: 564,513,794 || trainable%: 0.8159
None


In [24]:
# Display the module structure after the LoRA wrapping.
model_lora

PeftModelForSequenceClassification(
  (base_model): LoraModel(
    (model): XLMRobertaForSequenceClassification(
      (roberta): XLMRobertaModel(
        (embeddings): XLMRobertaEmbeddings(
          (word_embeddings): Embedding(250002, 1024, padding_idx=1)
          (position_embeddings): Embedding(514, 1024, padding_idx=1)
          (token_type_embeddings): Embedding(1, 1024)
          (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (encoder): XLMRobertaEncoder(
          (layer): ModuleList(
            (0-23): 24 x XLMRobertaLayer(
              (attention): XLMRobertaAttention(
                (self): XLMRobertaSdpaSelfAttention(
                  (query): lora.Linear(
                    (base_layer): Linear(in_features=1024, out_features=1024, bias=True)
                    (lora_dropout): ModuleDict(
                      (default): Dropout(p=0.1, inplace=False)
                    )
     

## step-6b: 定义分词函数与整理函数

In [26]:
# task: text classification
def tokenize_function(sample, max_length=None):
    """Tokenize English/Swahili sentence pairs and attach their labels.

    Args:
        sample: Mapping with the keys "English sentence", "Swahili Translation"
            and "label" (a single example or a batch, as supplied by
            ``Dataset.map``).
        max_length: Truncation limit passed to the tokenizer. When omitted, the
            value of ``config_model["max_seq_length"]`` is used (512 if that key
            is missing), so the limit is configured in one place.

    Returns:
        The tokenizer output with an additional "labels" entry copied from
        ``sample["label"]``.
    """
    if max_length is None:
        max_length = config_model.get("max_seq_length", 512)

    inputs = tokenizer(
        text=sample["English sentence"],
        text_pair=sample["Swahili Translation"],
        max_length=max_length,
        truncation=True,
    )
    inputs["labels"] = sample["label"]
    return inputs

In [27]:
# Tokenize both splits in batches; the original columns are removed so that only
# the entries returned by tokenize_function remain.
dataset_tokenized = dataset.map(tokenize_function, batched=True, remove_columns=dataset["train"].column_names)

Map:   0%|          | 0/320 [00:00<?, ? examples/s]

Map:   0%|          | 0/80 [00:00<?, ? examples/s]

In [28]:
# Convenience handles for the tokenized train and test splits.
dataset_train_tokenized, dataset_test_tokenized = (
    dataset_tokenized[split_name] for split_name in ("train", "test")
)

In [29]:
# Batch collator built from the tokenizer; it pads the tokenized examples of a batch.
collate_fn = DataCollatorWithPadding(tokenizer)

## step-7: 模型训练

In [30]:
# Training configuration; the numeric hyper-parameters come from config_model.
args_train = TrainingArguments(
    output_dir=os.path.join(path_output, "model_sft"),
    # --- schedule and batching ---
    num_train_epochs=config_model.get("epochs"),
    per_device_train_batch_size=config_model.get("batch_size"),
    per_device_eval_batch_size=config_model.get("batch_size"),
    gradient_accumulation_steps=config_model.get("gradient_steps"),
    gradient_checkpointing=True,
    # --- optimizer ---
    optim="adamw_torch",
    learning_rate=config_model.get("learning_rate"),
    weight_decay=config_model.get("weight_decay"),
    # --- logging, evaluation and checkpointing, all once per epoch ---
    # NOTE(review): newer transformers releases name this argument `eval_strategy`;
    # confirm against the installed version before upgrading.
    logging_strategy="epoch",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=1,
    # --- best-checkpoint selection by evaluation loss ---
    metric_for_best_model="eval_loss",
    load_best_model_at_end=True,
)

In [31]:
# Assemble the Trainer from the LoRA model, the training arguments, the tokenized
# splits and the padding collator.
# NOTE(review): no compute_metrics is supplied (it is commented out below), so
# presumably only the loss is reported at evaluation time - confirm.
trainer = Trainer(
    model=model_lora,
    args=args_train,
    train_dataset=dataset_train_tokenized,
    eval_dataset=dataset_test_tokenized,
    data_collator=collate_fn,
    tokenizer=tokenizer,
    # compute_metrics=compute_metrics
)

In [None]:
# Launch fine-tuning; the value returned by trainer.train() is kept in res_train.
res_train = trainer.train()
# https://blog.csdn.net/m0_48716917/article/details/134895148

## step-8: 模型评估

## step-9: 模型保存

## step-10: 模型加载

## step-11: 模型推理