In [1]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
import copy
from dataclasses import dataclass

import numpy as np
import torch
from datasets import Dataset
from transformers import (
    BitsAndBytesConfig,
    Gemma2ForSequenceClassification,
    GemmaTokenizerFast,
    Gemma2Config,
    PreTrainedTokenizerBase,
    EvalPrediction,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding,
)

from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, TaskType
from sklearn.metrics import log_loss, accuracy_score
from peft import PeftModel
from sklearn.model_selection import GroupKFold, StratifiedKFold
import pandas as pd
from transformers import set_seed
set_seed(42)
import joblib



@dataclass
class Config:
    """Central knobs for this notebook: paths, sequence length, batch sizes and LoRA/optimizer settings.

    Most of the optimizer/schedule fields are only forwarded to ``TrainingArguments``;
    the cells visible in this notebook only call ``Trainer.predict``.
    """
    # Base name for output directories; also embedded in `exp_name`.
    # NOTE(review): "ouput" looks like a typo, but the adapter directory loaded later
    # ('./ouput_gemma_9b_0') uses the same spelling, so it must not be "fixed" in isolation.
    output_dir: str = "ouput_gemma_9b"
    # Local path of the base checkpoint. The files at this path are loaded with 4-bit
    # quantization requested at load time; the path itself is the plain gemma-2-9b-it directory.
    checkpoint: str = "/nfs/share/gemma-2-9b-it"
    # Maximum number of tokens per encoded example (passed to the tokenizer with truncation=True).
    max_length: int = 3000
    # Not read by any active code in this notebook (the K-fold split cell is commented out).
    n_splits: int = 2
    # Only used as a suffix of the TrainingArguments output directory.
    fold_idx: int = 100
    optim_type: str = "paged_adamw_8bit"
    per_device_train_batch_size: int = 2
    gradient_accumulation_steps: int = 4  # 2 samples x 4 accumulation steps = 8 samples per optimizer step per device
    per_device_eval_batch_size: int = 8
    n_epochs: int = 1
    # Not read by any active code: the LoRA `layers_to_transform` line that used it is commented out.
    # With the value 0 no layer would be skipped anyway.
    freeze_layers: int = 0
    lr: float = 2e-4
    warmup_steps: int = 0
    lora_r: int = 64
    # Annotated as float but defaults to the int 4; `exp_name` therefore contains "lora_alpha_4".
    lora_alpha: float = 4
    lora_dropout: float = 0.0
    lora_bias: str = "none"
    # Forwarded to TrainingArguments.max_grad_norm.
    # NOTE(review): 0.0 presumably turns gradient clipping off - confirm this is intended.
    max_grad_norm: float = 0.0
    lr_scheduler_type: str = "cosine"
    weight_decay: float = 0.01
    # Disabled mixed-precision switch, kept for reference:
    #fp16: bool = True

config = Config()

# Human-readable run identifier built from the main hyper-parameters.
exp_name = f'model_{config.output_dir}_max_length_{config.max_length}_batch_size_{config.per_device_train_batch_size}_accumulation_steps_{config.gradient_accumulation_steps}_lora_r_{config.lora_r}_lora_alpha_{config.lora_alpha}'

# Arguments handed to the Hugging Face Trainer. The keyword arguments are grouped
# by concern; every value comes from `config` or is a fixed literal.
training_args = TrainingArguments(
    # --- output directory and checkpointing ---
    output_dir=f"{config.output_dir}_{config.fold_idx}",
    overwrite_output_dir=True,
    save_strategy="epoch",
    save_total_limit=1,
    load_best_model_at_end=True,
    metric_for_best_model='acc',
    greater_is_better=True,
    # --- logging ---
    run_name=exp_name,
    report_to='none',  # no external experiment tracker (e.g. wandb) is used
    logging_steps=2,
    # --- evaluation ---
    eval_strategy="epoch",
    per_device_eval_batch_size=config.per_device_eval_batch_size,
    # --- batching ---
    num_train_epochs=config.n_epochs,
    per_device_train_batch_size=config.per_device_train_batch_size,
    gradient_accumulation_steps=config.gradient_accumulation_steps,
    gradient_checkpointing=True,
    # --- optimizer and schedule ---
    optim=config.optim_type,
    learning_rate=config.lr,
    lr_scheduler_type=config.lr_scheduler_type,
    warmup_steps=config.warmup_steps,
    weight_decay=config.weight_decay,
    max_grad_norm=config.max_grad_norm,
    # --- distributed ---
    ddp_find_unused_parameters=False,
    # Options that are currently switched off, kept for reference:
    #   bf16=config.fp16,
    #   deepspeed='zero_stage2_config.json',
)

# Tokenizer that belongs to the base checkpoint.
tokenizer = GemmaTokenizerFast.from_pretrained(config.checkpoint)
tokenizer.add_eos_token = True  # We'll add <eos> at the end
tokenizer.padding_side = "left"  # padding tokens go in front of the text
tokenizer.truncation_side = "left"  # over-long inputs are cut at the front, so the end of the text is kept



# Rows to score. The encoder classes in this notebook read the columns
# "prompt", "response_a" and "response_b" from this dataset.
ds = Dataset.from_parquet('hf-open-models-v1.parquet')


class CustomTokenizer:
    """Batch encoder that joins prompt and both responses into one text and tokenizes it.

    Instances are meant to be passed to ``Dataset.map(..., batched=True)``.

    Args:
        tokenizer: Callable tokenizer; invoked as
            ``tokenizer(texts, max_length=..., truncation=True)``.
        max_length: Maximum number of tokens per example.
        swap_responses: When True, the text of ``response_b`` is placed in the
            ``<response_a>`` slot and vice versa. This yields the order-swapped
            view of every example without needing a second, copy-pasted class.
            Defaults to False, which keeps the original behavior.
    """

    def __init__(
            self,
            tokenizer: PreTrainedTokenizerBase,
            max_length: int,
            swap_responses: bool = False,
    ) -> None:
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.swap_responses = swap_responses

    def __call__(self, batch: dict) -> dict:
        """Encode one batch.

        Args:
            batch: Mapping with equally long lists of strings under the keys
                ``"prompt"``, ``"response_a"`` and ``"response_b"``.

        Returns:
            A plain dict holding whatever the wrapped tokenizer returned.
        """
        first, second = batch["response_a"], batch["response_b"]
        if self.swap_responses:
            # Only the contents trade places; the slot tags stay in a/b order.
            first, second = second, first
        # Plain concatenation on purpose: a non-string (e.g. None) field fails
        # loudly here instead of being silently rendered into the text.
        texts = [
            "<prompt>: " + p + "\n\n<response_a>: " + r_a + "\n\n<response_b>: " + r_b
            for p, r_a, r_b in zip(batch["prompt"], first, second)
        ]
        tokenized = self.tokenizer(texts, max_length=self.max_length, truncation=True)
        # No label column is produced; this encoder is used for inference only.
        return {**tokenized}



# Tokenize every row with responses in their original A/B order.
# The mapped dataset replaces `ds`; the tokenizer's output columns are added to it.
encode = CustomTokenizer(tokenizer, max_length=config.max_length)
ds = ds.map(encode, batched=True)  # for a quick smoke test, chain .select([i for i in range(500)])

def compute_metrics(eval_preds: EvalPrediction) -> dict:
    """Compute accuracy and log-loss from raw classifier outputs.

    Args:
        eval_preds: Object exposing ``predictions`` (a NumPy array of per-class
            scores with the class axis last) and ``label_ids`` (integer labels).

    Returns:
        ``{"acc": <accuracy>, "log_loss": <log loss>}``.
    """
    logits = eval_preds.predictions
    labels = eval_preds.label_ids
    # Cast to float32 before the softmax; the model in this notebook emits float16 scores.
    probs = torch.from_numpy(logits).float().softmax(-1).numpy()
    # Name the classes explicitly. Without `labels=`, log_loss infers them from
    # y_true and raises ValueError when the evaluated rows contain a single class.
    class_ids = list(range(probs.shape[-1]))
    loss = log_loss(y_true=labels, y_pred=probs, labels=class_ids)
    acc = accuracy_score(y_true=labels, y_pred=logits.argmax(-1))
    return {"acc": acc, "log_loss": loss}




In [2]:
# df = pd.read_parquet('train.parquet')#.head(100)
# sgkf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
# folds = []
# for fold, (train_idx, test_idx) in enumerate(sgkf.split(df, df['language'])):
#     folds.append((train_idx, test_idx))

In [3]:
# Load the base checkpoint with a 2-class sequence-classification head in 4-bit precision.
# Quantization is requested through `quantization_config`: the bare `load_in_4bit=True`
# keyword is deprecated and triggers a warning on every load.
model = Gemma2ForSequenceClassification.from_pretrained(
    config.checkpoint,
    quantization_config=BitsAndBytesConfig(load_in_4bit=True),
    num_labels=2,
#    torch_dtype=torch.bfloat16,
#    device_map="auto",
)
# Switch off the generation key/value cache on the model config.
model.config.use_cache = False
# LoRA settings handed to PEFT when the trained adapter is attached.
# NOTE(review): they presumably have to match the settings the adapter was trained with - confirm.
lora_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    inference_mode=True,
    r=config.lora_r,
    lora_alpha=config.lora_alpha,
    lora_dropout=config.lora_dropout,
    use_rslora=True,
    bias=config.lora_bias,
    # Adapters sit on the attention projections (q/k/v/o) and on the MLP
    # projections (gate/up/down), not on self-attention alone.
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "down_proj",
        "up_proj",
        "o_proj",
        "gate_proj",
    ],
    # Disabled option: restrict the adapters to the upper layers.
    #layers_to_transform=[i for i in range(42) if i >= config.freeze_layers],
    # Non-LoRA modules whose full weights are stored with the adapter.
    # NOTE(review): 'lstm' presumably has no match in this model - confirm.
    modules_to_save=['score', 'lstm'],
)

# Attach the adapter stored in the fold-0 output directory, then switch to eval mode.
model = PeftModel.from_pretrained(
    model,
    f'./ouput_gemma_9b_0',
    config=lora_config,
)
model = model.eval()

# In this notebook the Trainer is only used to run batched prediction:
# no train/eval datasets and no metric function are attached.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    # Training-time arguments, disabled here:
    #train_dataset=ds.select(train_idx),
    #eval_dataset=ds.select(eval_idx),
    #compute_metrics=compute_metrics,
    # Pads every batch with the tokenizer's padding settings.
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
)
# Model outputs for the rows encoded in their original A/B order.
predictions = trainer.predict(ds).predictions

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.
`low_cpu_mem_usage` was None, now set to True since model is quantized.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Some weights of Gemma2ForSequenceClassification were not initialized from the model checkpoint at /nfs/share/gemma-2-9b-it and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Detected kernel version 5.4.119, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


[2024-12-27 17:08:08,815] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)


/data/guanghan/miniconda3/envs/kaggle/compiler_compat/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status


In [4]:
# Reload the same parquet file so the swapped-order encoding starts from the raw columns
# rather than from the already tokenized `ds`.
ds_reverse = Dataset.from_parquet('hf-open-models-v1.parquet')


class RCustomTokenizer:
    """Order-swapped batch encoder.

    Builds the same prompt/response text layout as ``CustomTokenizer`` but puts
    the content of ``response_b`` into the ``<response_a>`` slot and the content
    of ``response_a`` into the ``<response_b>`` slot before tokenizing.
    """

    def __init__(
            self,
            tokenizer: PreTrainedTokenizerBase,
            max_length: int
    ) -> None:
        self.max_length = max_length
        self.tokenizer = tokenizer

    @staticmethod
    def _tagged(tag: str, values) -> list:
        """Return a list with ``tag`` prepended to every string in ``values``."""
        return [tag + value for value in values]

    def __call__(self, batch: dict) -> dict:
        """Encode one batch of rows with the two responses exchanged.

        Args:
            batch: Mapping with lists of strings under the keys ``"prompt"``,
                ``"response_a"`` and ``"response_b"``.

        Returns:
            A plain dict holding whatever the wrapped tokenizer returned.
        """
        head = self._tagged("<prompt>: ", batch["prompt"])
        # Deliberate cross-over: each slot tag is paired with the *other* response column.
        slot_a = self._tagged("\n\n<response_a>: ", batch["response_b"])
        slot_b = self._tagged("\n\n<response_b>: ", batch["response_a"])
        merged = ["".join(parts) for parts in zip(head, slot_a, slot_b)]
        encoded = self.tokenizer(merged, max_length=self.max_length, truncation=True)
        # No label column is produced here.
        return {**encoded}



# Tokenize every row with the two responses exchanged (see RCustomTokenizer).
r_encode = RCustomTokenizer(tokenizer, max_length=config.max_length)
ds_reverse = ds_reverse.map(r_encode, batched=True)  # for a quick smoke test, chain .select([i for i in range(100)])

In [5]:
# Model outputs for the swapped-order encoding, produced with the same trainer and model.
predictions_reverse = trainer.predict(ds_reverse).predictions

In [6]:
# Show the swapped-order outputs: one row per example, two float16 scores per row.
predictions_reverse

array([[-0.2203 , -0.5903 ],
       [-1.248  ,  0.3547 ],
       [-1.037  , -0.284  ],
       ...,
       [-0.7236 , -0.48   ],
       [-0.927  , -0.05383],
       [ 0.5557 , -1.654  ]], dtype=float16)