In [None]:
# ! pip install accelerate bitsandbytes peft datasets scikit-learn pandas transformers hf_transfer

In [2]:
import numpy as np
import pandas as pd
import os
import torch
from transformers import (
    AutoModelForSequenceClassification, 
    BitsAndBytesConfig, 
    AutoTokenizer,
    DataCollatorWithPadding,
    Trainer, 
    TrainingArguments
)
from sklearn.metrics import cohen_kappa_score
from peft import prepare_model_for_kbit_training, LoraConfig, TaskType, get_peft_model
from sklearn.model_selection import StratifiedKFold
from datasets import Dataset

  from .autonotebook import tqdm as notebook_tqdm
2024-04-25 18:08:25.822427: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-04-25 18:08:27.510417: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-25 18:08:27.510459: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-25 18:08:27.511437: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-04-25 18:08:27.5

In [3]:
# Ask huggingface_hub to use the hf_transfer download backend.
# NOTE(review): this is set after `transformers` has already been imported; the
# hub library may read the flag at import time, in which case it has to be set
# before the import cell to take effect — confirm. MODEL_ID is a local path, so
# no download is expected in this notebook either way.
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"

MODEL_ID = "/gemini/pretrain/Meta-Llama-3-8B"  # checkpoint passed to from_pretrained
MAX_LENGTH = 1024  # token limit used for truncation and for padding
SPLIT = 5          # number of stratified folds
FOLD_NUM = 0       # index of the fold held out for evaluation

# SECURITY: credentials must never be written into the notebook. The token is
# read from the HF_TOKEN environment variable; it is None when the variable is
# unset. Any token that was previously committed in this cell should be treated
# as leaked and revoked in the Hugging Face account settings.
ACCESS_TOKEN = os.environ.get("HF_TOKEN")

In [4]:
# Load the tokenizer that belongs to the checkpoint.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, token=ACCESS_TOKEN)

# Padding configuration as shipped with the checkpoint (the recorded run shows no pad token).
print(f"{tokenizer.padding_side} {tokenizer.pad_token}")
# Reuse the end-of-sequence token for padding so that batches can be padded at all.
tokenizer.pad_token = tokenizer.eos_token
# Padding configuration after the override.
print(f"{tokenizer.padding_side} {tokenizer.pad_token}")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


right None
right <|end_of_text|>


In [5]:
# Load the training essays. Later cells read the `essay_id`, `full_text` and
# `score` columns of this frame.
df = pd.read_csv("../../dataset/30k_train.csv")

In [6]:
# Class index used by the classifier: the essay score shifted down by one.
df["labels"] = df["score"] - 1

# Inputs (identifier, raw text, original score) and the stratification target.
X = df.loc[:, ["essay_id", "full_text", "score"]]
y = df.loc[:, ["labels"]]

In [7]:
skf = StratifiedKFold(n_splits=SPLIT, random_state=3047, shuffle=True)

# Columns that are only needed up to tokenization and are dropped afterwards.
RAW_COLUMNS = ["essay_id", "full_text", "score"]


def tokenize(sample):
    """Tokenize the ``full_text`` field, truncating to ``MAX_LENGTH`` tokens.

    Args:
        sample: a mapping with a ``"full_text"`` entry. It is called below with
            batches (``batched=True``), i.e. ``full_text`` is a list of strings.

    Returns:
        The tokenizer output for the given text(s).
    """
    return tokenizer(sample["full_text"], max_length=MAX_LENGTH, truncation=True)


# Fail fast: with an out-of-range fold index the loop below would never match
# and `ds_train` / `ds_eval` would silently stay undefined.
if not 0 <= FOLD_NUM < SPLIT:
    raise ValueError(f"FOLD_NUM must be in [0, {SPLIT}), got {FOLD_NUM}")

for fold_id, (train_index, val_index) in enumerate(skf.split(X, y)):
    if fold_id != FOLD_NUM:
        continue

    print(f"... Fold {fold_id} ...")
    X_train, X_eval = X.iloc[train_index], X.iloc[val_index]
    y_train, y_eval = y.iloc[train_index], y.iloc[val_index]

    # Re-attach the labels to the features and renumber the rows from 0.
    df_train = pd.concat([X_train, y_train], axis=1).reset_index(drop=True)
    print(df_train["labels"].value_counts())

    df_eval = pd.concat([X_eval, y_eval], axis=1).reset_index(drop=True)
    print(df_eval["labels"].value_counts())

    ds_train = Dataset.from_pandas(df_train)
    print(ds_train)
    ds_eval = Dataset.from_pandas(df_eval)
    print(ds_eval)

    # Batched mapping hands the tokenizer many texts per call instead of one;
    # the raw columns are dropped in the same pass.
    ds_train = ds_train.map(tokenize, batched=True, remove_columns=RAW_COLUMNS)
    ds_eval = ds_eval.map(tokenize, batched=True, remove_columns=RAW_COLUMNS)

    # Only one fold is used per run, so there is no need to generate the rest.
    break

... Fold 0 ...
2    8032
3    6424
1    5368
4    2743
0    1068
5     710
Name: labels, dtype: int64
2    2009
3    1606
1    1342
4     685
0     267
5     178
Name: labels, dtype: int64
Dataset({
    features: ['essay_id', 'full_text', 'score', 'labels'],
    num_rows: 24345
})
Dataset({
    features: ['essay_id', 'full_text', 'score', 'labels'],
    num_rows: 6087
})


Map: 100%|██████████| 24345/24345 [00:26<00:00, 907.14 examples/s] 
Map: 100%|██████████| 6087/6087 [00:06<00:00, 896.75 examples/s] 


In [8]:
# Use the same 16-bit type for the 4-bit matmul computations as the trainer uses
# for mixed precision (bf16 when the GPU supports it, fp16 otherwise). A fixed
# float16 here would disagree with bf16 training on bf16-capable hardware.
compute_dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16

# 4-bit NF4 quantization with double quantization for the frozen base weights.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=compute_dtype,
)

# Load the checkpoint with a 6-way classification head.
# `trust_remote_code` is intentionally not enabled: the recorded run resolves to
# the library's own LlamaForSequenceClassification, so no custom code from the
# checkpoint needs to be executed.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_ID,
    token=ACCESS_TOKEN,
    quantization_config=bnb_config,
    num_labels=6,
    device_map="auto",
    low_cpu_mem_usage=True,
)

# The config has no pad token id (prints None in the recorded run); reuse the
# end-of-sequence id so padded batches can be processed by the model.
print(model.config.pad_token_id)
model.config.pad_token_id = model.config.eos_token_id
print(model.config.pad_token_id)

Loading checkpoint shards: 100%|██████████| 4/4 [03:56<00:00, 59.23s/it]
Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at /gemini/pretrain/Meta-Llama-3-8B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You are calling `save_pretrained` to a 4-bit converted model, but your `bitsandbytes` version doesn't support it. If you want to save 4-bit models, make sure to have `bitsandbytes>=0.41.3` installed.


None
128001


In [9]:
# Show the module tree of the quantized model (the projection layers appear as Linear4bit).
print(model)

LlamaForSequenceClassification(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear4bit(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    

In [10]:
# Let PEFT prepare the quantized model for k-bit training.
# NOTE(review): the later training log reports gradient checkpointing being
# active, which presumably is switched on by this call — confirm against the
# PEFT documentation.
model = prepare_model_for_kbit_training(model)

# Display the prepared model's module tree.
model

LlamaForSequenceClassification(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear4bit(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    

In [11]:
# Modules that receive LoRA adapters: the four attention projections and the
# three MLP projections of each decoder layer.
lora_target_modules = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj",
]

# Rank-64 adapters for sequence classification.
lora_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    target_modules=lora_target_modules,
    r=64,
    lora_alpha=16,
    lora_dropout=0.05,
)

# Wrap the prepared base model with the adapters and display the result.
lora_model = get_peft_model(model, lora_config)
lora_model

PeftModelForSequenceClassification(
  (base_model): LoraModel(
    (model): LlamaForSequenceClassification(
      (model): LlamaModel(
        (embed_tokens): Embedding(128256, 4096)
        (layers): ModuleList(
          (0-31): 32 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): Linear4bit(
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=64, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=64, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)
              )
              (k_proj): Linear4bit(
                (lora_

In [12]:
# Report how many parameters are trainable compared to the total parameter count.
lora_model.print_trainable_parameters()

trainable params: 167,796,736 || all params: 7,672,745,984 || trainable%: 2.1869189511800213


In [13]:
# Check bf16 support on the current GPU; the training arguments pick bf16 vs. fp16 from the same call.
print(torch.cuda.is_bf16_supported())

True


In [14]:
class DataCollator:
    """Pad a list of tokenized examples into a single batch.

    Only ``input_ids``, ``attention_mask`` and ``labels`` are taken from each
    feature; any other keys are ignored.

    Args:
        tokenizer: object exposing ``pad(...)``. When ``None`` (default) the
            notebook-level ``tokenizer`` is looked up at call time.
        max_length: padding length. When ``None`` (default) the notebook-level
            ``MAX_LENGTH`` is looked up at call time.
        padding: padding strategy forwarded to ``tokenizer.pad``. The default
            ``"max_length"`` pads every batch to ``max_length``.
        pad_to_multiple_of: forwarded to ``tokenizer.pad``.

    ``DataCollator()`` without arguments behaves exactly like the previous
    argument-less version of this class.
    """

    def __init__(self, tokenizer=None, max_length=None, padding="max_length", pad_to_multiple_of=16):
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.padding = padding
        self.pad_to_multiple_of = pad_to_multiple_of

    def __call__(self, features):
        """Return the padded batch for ``features`` (a list of per-example mappings)."""
        # Resolve the fallbacks lazily so the class can be defined before the
        # notebook-level objects exist and can be used without them in tests.
        active_tokenizer = self.tokenizer if self.tokenizer is not None else tokenizer
        target_length = self.max_length if self.max_length is not None else MAX_LENGTH

        model_inputs = [
            {
                "input_ids": feature["input_ids"],
                "attention_mask": feature["attention_mask"],
                "labels": feature["labels"],
            }
            for feature in features
        ]
        return active_tokenizer.pad(
            model_inputs,
            padding=self.padding,
            max_length=target_length,
            return_tensors="pt",
            pad_to_multiple_of=self.pad_to_multiple_of,
        )

def compute_metrics(p):
    """Compute the quadratic weighted kappa for an evaluation pass.

    Args:
        p: a pair ``(predictions, labels)``; the predicted class is the argmax
            over the last axis of ``predictions``.

    Returns:
        A dict with the single key ``"qwk"``.
    """
    logits, y_true = p
    y_pred = logits.argmax(-1)
    qwk = cohen_kappa_score(y_true, y_pred, weights="quadratic")
    return {"qwk": qwk}

# Mixed precision: bf16 when the GPU supports it, fp16 otherwise.
use_bf16 = torch.cuda.is_bf16_supported()

# Batch layout. The previous values (batch 64 x 256 accumulation steps) gave an
# effective batch of 16384 on a ~24k-row training fold, i.e. roughly one
# optimizer step per epoch, so the step-100 evaluation/checkpoint schedule below
# could never fire and no "best" model could be selected. An effective batch of
# 64 yields a few hundred optimizer steps per epoch.
# NOTE(review): the micro-batch sizes are conservative guesses for 1024-token
# inputs on an 8B model — tune them to the available GPU memory.
train_micro_batch = 4
grad_accum_steps = 16  # effective train batch = 4 * 16 = 64
eval_micro_batch = 8

training_args = TrainingArguments(
    output_dir="output",
    bf16=use_bf16,
    fp16=not use_bf16,
    # The former 2e-4 * 64 (= 1.28e-2) is far above the usual range for LoRA
    # fine-tuning with AdamW; use the unscaled base rate.
    learning_rate=2e-4,
    per_device_train_batch_size=train_micro_batch,
    per_device_eval_batch_size=eval_micro_batch,
    gradient_accumulation_steps=grad_accum_steps,
    optim="paged_adamw_8bit",
    num_train_epochs=3,
    weight_decay=0.001,
    do_eval=True,
    # Evaluate, checkpoint and log on the same 100-step cadence so that the
    # best-model selection always has a checkpoint for every evaluated step.
    evaluation_strategy="steps",
    eval_steps=100,
    save_total_limit=10,
    save_strategy="steps",
    save_steps=100,
    logging_steps=100,
    # Keep the checkpoint with the highest quadratic weighted kappa.
    load_best_model_at_end=True,
    metric_for_best_model="qwk",
    greater_is_better=True,
    save_only_model=True,
    lr_scheduler_type="cosine",
    report_to="none",
)

# Assemble the trainer from the LoRA-wrapped model, the tokenized fold datasets,
# the custom padding collator and the QWK metric function.
trainer = Trainer(
    model=lora_model,
    args=training_args,
    train_dataset=ds_train,
    eval_dataset=ds_eval,
    tokenizer=tokenizer,
    # Alternative collator that was tried: DataCollatorWithPadding(tokenizer=tokenizer).
    data_collator=DataCollator(),
    compute_metrics=compute_metrics
)

Detected kernel version 4.15.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [15]:
# print("Evaluating the Model Before Training!")
# trainer.evaluate()

In [16]:
# Run fine-tuning.
# NOTE(review): the recorded run failed in this cell with an NVML assertion
# raised from the CUDA caching allocator; this is often a symptom of the GPU
# running out of memory — confirm before re-running with the same batch sizes.
print("Training the Model")
trainer.train()

Training the Model


You're using a PreTrainedTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


RuntimeError: NVML_SUCCESS == DriverAPI::get()->nvmlDeviceGetHandleByPciBusId_v2_( pci_id, &nvml_device) INTERNAL ASSERT FAILED at "/opt/pytorch/pytorch/c10/cuda/CUDACachingAllocator.cpp":1135, please report a bug to PyTorch. 

In [None]:
# Evaluate on the held-out fold (`ds_eval`) using `compute_metrics`.
# This cell has no execution count in the saved notebook, i.e. it was not run.
print("Evaluating the Trained Model")
trainer.evaluate()