In [None]:
# ! pip install accelerate bitsandbytes peft datasets scikit-learn pandas transformers hf_transfer

In [None]:
import numpy as np
import pandas as pd
import os
import torch
from transformers import (
    AutoModelForSequenceClassification, 
    BitsAndBytesConfig, 
    AutoTokenizer,
    DataCollatorWithPadding,
    Trainer, 
    TrainingArguments
)
from sklearn.metrics import cohen_kappa_score
from peft import prepare_model_for_kbit_training, LoraConfig, TaskType, get_peft_model
from sklearn.model_selection import StratifiedKFold
from datasets import Dataset

In [None]:
# Ask huggingface_hub to use the hf_transfer download backend.
# NOTE(review): this is set after `transformers` has already been imported in
# the import cell; huggingface_hub may read this flag at import time, in which
# case it has no effect here — confirm, and move it above the imports if so.
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"

# Path of the base checkpoint to fine-tune.
MODEL_ID = "/gemini/pretrain/Meta-Llama-3-8B"
# Maximum number of tokens kept per essay (longer inputs are truncated).
MAX_LENGTH = 1024
# Number of stratified cross-validation folds.
SPLIT = 5
# Index (0-based) of the fold used as the evaluation split in this run.
FOLD_NUM = 0
# SECURITY: credentials must never be committed to a notebook. The token is
# read from the standard HF_TOKEN environment variable; it is None when the
# variable is unset. Any token that was previously hardcoded here should be
# treated as leaked and revoked.
ACCESS_TOKEN = os.environ.get("HF_TOKEN")

In [None]:
# Load the tokenizer stored alongside the base checkpoint.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, token=ACCESS_TOKEN)

# Show the padding side and pad token before and after the override below.
print(tokenizer.padding_side, tokenizer.pad_token)
# Reuse the end-of-sequence token as the padding token.
# NOTE(review): presumably the checkpoint ships without a pad token — the
# first print above shows the actual value; confirm.
tokenizer.pad_token = tokenizer.eos_token
print(tokenizer.padding_side, tokenizer.pad_token)

In [None]:
# Load the training essays; later cells read the `essay_id`, `full_text` and
# `score` columns from this frame.
df = pd.read_csv("../../dataset/30k_train.csv")

In [None]:
# Class ids are the scores shifted down by one.
# NOTE(review): presumably scores start at 1 so labels start at 0 — confirm.
df["labels"] = df["score"] - 1

# Feature frame and single-column target frame used for the stratified split.
X = df.loc[:, ["essay_id", "full_text", "score"]]
y = df.loc[:, ["labels"]]

In [None]:
skf = StratifiedKFold(n_splits=SPLIT, random_state=3047, shuffle=True)


def tokenize(sample):
    """Tokenize the ``full_text`` field, truncating to ``MAX_LENGTH`` tokens.

    ``sample`` is a mapping with a ``"full_text"`` entry; it may hold a single
    string or a list of strings (batched ``Dataset.map``). No padding is
    requested here. Returns whatever the notebook-level ``tokenizer`` returns.
    """
    return tokenizer(sample["full_text"], max_length=MAX_LENGTH, truncation=True)


# Fail loudly instead of silently leaving ds_train / ds_eval undefined.
if not 0 <= FOLD_NUM < SPLIT:
    raise ValueError(f"FOLD_NUM must be in [0, {SPLIT - 1}], got {FOLD_NUM}")

for fold_id, (train_index, val_index) in enumerate(skf.split(X, y)):
    if fold_id != FOLD_NUM:
        continue

    print(f"... Fold {fold_id} ...")
    X_train, X_eval = X.iloc[train_index], X.iloc[val_index]
    y_train, y_eval = y.iloc[train_index], y.iloc[val_index]

    # Re-attach the labels to the features and renumber the rows from 0.
    df_train = pd.concat([X_train, y_train], axis=1).reset_index(drop=True)
    print(df_train["labels"].value_counts())

    df_eval = pd.concat([X_eval, y_eval], axis=1).reset_index(drop=True)
    print(df_eval["labels"].value_counts())

    ds_train = Dataset.from_pandas(df_train)
    print(ds_train)
    ds_eval = Dataset.from_pandas(df_eval)
    print(ds_eval)

    # Tokenize in batches, then drop the raw columns so that only the
    # tokenizer outputs and `labels` remain.
    raw_columns = ["essay_id", "full_text", "score"]
    ds_train = ds_train.map(tokenize, batched=True).remove_columns(raw_columns)
    ds_eval = ds_eval.map(tokenize, batched=True).remove_columns(raw_columns)

    # Only one fold is used per run; skip the remaining splits.
    break

In [None]:
# Use the same half-precision format for the 4-bit matmul compute dtype as the
# Trainer's mixed precision, which this notebook also selects via
# torch.cuda.is_bf16_supported(): bf16 when available, fp16 otherwise.
compute_dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16

# 4-bit NF4 quantization with double quantization.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=compute_dtype,
)

# NOTE(review): trust_remote_code=True allows code shipped with the checkpoint
# to be executed; it is kept to preserve behavior, but drop it if the
# checkpoint does not need custom modelling code.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_ID,
    token=ACCESS_TOKEN,
    quantization_config=bnb_config,
    num_labels=6,  # must match the number of distinct values in df["labels"]
    device_map="auto",
    trust_remote_code=True,
    low_cpu_mem_usage=True,
)

# Make the model's pad id agree with the token the tokenizer actually pads
# with (tokenizer.pad_token was set to its EOS token earlier). Taking it from
# the tokenizer guarantees a single integer id that matches the padded input.
print(model.config.pad_token_id)
model.config.pad_token_id = tokenizer.pad_token_id
print(model.config.pad_token_id)

In [None]:
# Print the loaded (quantized) model for inspection.
print(model)

In [None]:
# Run PEFT's k-bit training preparation on the quantized model and rebind
# `model` to the result. Re-running this cell applies it to the already
# prepared model.
model = prepare_model_for_kbit_training(model)

# Bare expression: displayed as the cell output.
model

In [None]:
# LoRA adapter settings for the sequence-classification task.
lora_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    r=64,
    lora_alpha=16,
    lora_dropout=0.05,
    target_modules=[
        # Names suggest the attention projections.
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        # Names suggest the feed-forward projections.
        "gate_proj",
        "up_proj",
        "down_proj",
    ],
)

# Wrap the prepared base model with the adapters; the bare name below is
# displayed as the cell output.
lora_model = get_peft_model(model, lora_config)
lora_model

In [None]:
# Report the trainable-parameter summary of the adapter-wrapped model.
lora_model.print_trainable_parameters()

In [None]:
# Show whether bf16 is supported; the same check selects bf16 vs fp16 in the
# TrainingArguments of this notebook.
print(torch.cuda.is_bf16_supported())

In [None]:
class DataCollator:
    """Collate tokenized features into a padded tensor batch.

    Only the ``input_ids``, ``attention_mask`` and ``labels`` entries of each
    feature are forwarded to ``tokenizer.pad``; any other keys are dropped.

    Parameters
    ----------
    tokenizer : optional
        Object exposing ``pad(...)``. When omitted, the notebook-level
        ``tokenizer`` is looked up at call time.
    max_length : int, optional
        Target padded length. When omitted, the notebook-level ``MAX_LENGTH``
        is looked up at call time.
    padding : str or bool
        Padding strategy passed to ``tokenizer.pad`` (default ``"max_length"``).
    pad_to_multiple_of : int or None
        Passed through to ``tokenizer.pad`` (default ``16``).
    return_tensors : str
        Tensor format passed to ``tokenizer.pad`` (default ``"pt"``).

    With no arguments the collator behaves exactly like the original
    hard-coded version.
    """

    # Keys copied from every feature into the batch.
    _FIELDS = ("input_ids", "attention_mask", "labels")

    def __init__(
        self,
        tokenizer=None,
        max_length=None,
        padding="max_length",
        pad_to_multiple_of=16,
        return_tensors="pt",
    ):
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.padding = padding
        self.pad_to_multiple_of = pad_to_multiple_of
        self.return_tensors = return_tensors

    def __call__(self, features):
        """Return the padded batch produced by ``tokenizer.pad`` for ``features``."""
        # Fall back to the notebook globals lazily so that ``DataCollator()``
        # keeps working and the class can still be defined before they exist.
        active_tokenizer = self.tokenizer if self.tokenizer is not None else tokenizer
        target_length = self.max_length if self.max_length is not None else MAX_LENGTH

        model_inputs = [
            {key: feature[key] for key in self._FIELDS} for feature in features
        ]
        return active_tokenizer.pad(
            model_inputs,
            padding=self.padding,
            max_length=target_length,
            return_tensors=self.return_tensors,
            pad_to_multiple_of=self.pad_to_multiple_of,
        )

def compute_metrics(p):
    """Compute the quadratic-weighted Cohen's kappa for one evaluation pass.

    ``p`` must unpack into ``(predictions, labels)``. The predicted class is
    the arg-max over the last axis of ``predictions``.

    Returns a dict with the single key ``"qwk"``.
    """
    logits, y_true = p
    y_pred = logits.argmax(-1)
    return {"qwk": cohen_kappa_score(y_true, y_pred, weights="quadratic")}

training_args = TrainingArguments(
    output_dir="output",
    # Mixed precision: bf16 when the GPU supports it, fp16 otherwise.
    bf16=bool(torch.cuda.is_bf16_supported()),
    fp16=not torch.cuda.is_bf16_supported(),
    # Optimisation schedule.
    learning_rate=2e-4,
    lr_scheduler_type="cosine",
    weight_decay=0.001,
    optim="paged_adamw_8bit",
    num_train_epochs=3,
    # Per-device batch of 4 with 4 accumulation steps.
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=4,
    # Evaluate, checkpoint and log on the same 100-step cadence.
    do_eval=True,
    evaluation_strategy="steps",
    eval_steps=100,
    save_strategy="steps",
    save_steps=100,
    save_total_limit=10,
    save_only_model=True,
    logging_steps=100,
    # Keep the checkpoint with the highest "qwk" metric.
    load_best_model_at_end=True,
    metric_for_best_model="qwk",
    greater_is_better=True,
    report_to="none",
)

# Assemble the Trainer from the adapter-wrapped model, the fold datasets and
# the metric / collation helpers defined in this notebook.
trainer = Trainer(
    model=lora_model,
    args=training_args,
    tokenizer=tokenizer,
    train_dataset=ds_train,
    eval_dataset=ds_eval,
    # The custom collator is used; the stock alternative
    # DataCollatorWithPadding(tokenizer=tokenizer) was left disabled here.
    data_collator=DataCollator(),
    compute_metrics=compute_metrics,
)

In [None]:
# print("Evaluating the Model Before Training!")
# trainer.evaluate()

In [None]:
# Run fine-tuning with the Trainer configured in the previous cells.
print("Training the Model")
trainer.train()

In [None]:
# Evaluate on the Trainer's eval dataset; the returned metrics are displayed
# as the cell output.
print("Evaluating the Trained Model")
trainer.evaluate()