In [1]:
!pip install accelerate bitsandbytes peft datasets scikit-learn pandas transformers hf_transfer

[0m

In [2]:
import numpy as np
import pandas as pd
import os
import torch
from transformers import (
    AutoModelForSequenceClassification, 
    BitsAndBytesConfig, 
    AutoTokenizer,
    DataCollatorWithPadding,
    Trainer, 
    TrainingArguments
)
from sklearn.metrics import cohen_kappa_score
from peft import prepare_model_for_kbit_training, LoraConfig, TaskType, get_peft_model
from sklearn.model_selection import StratifiedKFold
from datasets import Dataset

In [3]:
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
MODEL_ID = "google/gemma-2b"
MAX_LENGTH = 1536
SPLIT = 10
FOLD_NUM = 0
ACCESS_TOKEN = "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"

In [4]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, token=ACCESS_TOKEN)

print(tokenizer.padding_side, tokenizer.pad_token)
tokenizer.pad_token = tokenizer.eos_token
print(tokenizer.padding_side, tokenizer.pad_token)

tokenizer_config.json:   0%|          | 0.00/33.6k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

left <pad>
left <eos>


In [5]:
df = pd.read_csv("train.csv")

In [6]:
df["labels"] = df.score.map(lambda x: x - 1)

X = df[["essay_id", "full_text", "score"]]
y = df[["labels"]]

In [7]:
skf = StratifiedKFold(n_splits=SPLIT, random_state=3047, shuffle=True)

def tokenize(sample):
    return tokenizer(sample["full_text"], max_length=MAX_LENGTH, truncation=True)

global ds_train
global ds_eval

for fold_id, (train_index, val_index) in enumerate(skf.split(X, y)):
    if fold_id == FOLD_NUM:
        print(f"... Fold {fold_id} ...")
        X_train, X_eval = X.iloc[train_index], X.iloc[val_index]
        y_train, y_eval = y.iloc[train_index], y.iloc[val_index]

        df_train = pd.concat([X_train, y_train], axis=1)
        df_train.reset_index(drop=True, inplace=True)
        print(df_train["labels"].value_counts())

        df_eval = pd.concat([X_eval, y_eval], axis=1)
        df_eval.reset_index(drop=True, inplace=True)
        print(df_eval["labels"].value_counts())

        ds_train = Dataset.from_pandas(df_train)
        print(ds_train)
        ds_eval = Dataset.from_pandas(df_eval)
        print(ds_eval)

        ds_train = ds_train.map(tokenize).remove_columns(["essay_id", "full_text", "score"])
        ds_eval = ds_eval.map(tokenize).remove_columns(["essay_id", "full_text", "score"])

... Fold 0 ...
labels
2    5652
1    4251
3    3533
0    1126
4     873
5     141
Name: count, dtype: int64
labels
2    628
1    472
3    393
0    126
4     97
5     15
Name: count, dtype: int64
Dataset({
    features: ['essay_id', 'full_text', 'score', 'labels'],
    num_rows: 15576
})
Dataset({
    features: ['essay_id', 'full_text', 'score', 'labels'],
    num_rows: 1731
})


Map:   0%|          | 0/15576 [00:00<?, ? examples/s]

Map:   0%|          | 0/1731 [00:00<?, ? examples/s]

In [8]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16
)

model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_ID,
    token=ACCESS_TOKEN,
    quantization_config=bnb_config,
    num_labels=6,
    device_map="auto",
    trust_remote_code=True,
    low_cpu_mem_usage=True
)
print(model.config.pad_token_id)
model.config.pad_token_id = model.config.eos_token_id
print(model.config.pad_token_id)

config.json:   0%|          | 0.00/627 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/13.5k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

Gemma's activation function should be approximate GeLU and not exact GeLU.
Changing the activation function to `gelu_pytorch_tanh`.if you want to use the legacy `gelu`, edit the `model.config` to set `hidden_activation=gelu`   instead of `hidden_act`. See https://github.com/huggingface/transformers/pull/29402 for more details.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of GemmaForSequenceClassification were not initialized from the model checkpoint at google/gemma-2b and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


0
1


In [9]:
print(model)

GemmaForSequenceClassification(
  (model): GemmaModel(
    (embed_tokens): Embedding(256000, 2048, padding_idx=0)
    (layers): ModuleList(
      (0-17): 18 x GemmaDecoderLayer(
        (self_attn): GemmaAttention(
          (q_proj): Linear4bit(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear4bit(in_features=2048, out_features=256, bias=False)
          (v_proj): Linear4bit(in_features=2048, out_features=256, bias=False)
          (o_proj): Linear4bit(in_features=2048, out_features=2048, bias=False)
          (rotary_emb): GemmaRotaryEmbedding()
        )
        (mlp): GemmaMLP(
          (gate_proj): Linear4bit(in_features=2048, out_features=16384, bias=False)
          (up_proj): Linear4bit(in_features=2048, out_features=16384, bias=False)
          (down_proj): Linear4bit(in_features=16384, out_features=2048, bias=False)
          (act_fn): PytorchGELUTanh()
        )
        (input_layernorm): GemmaRMSNorm()
        (post_attention_layernorm): GemmaRMSN

In [10]:
model = prepare_model_for_kbit_training(model)

model

GemmaForSequenceClassification(
  (model): GemmaModel(
    (embed_tokens): Embedding(256000, 2048, padding_idx=0)
    (layers): ModuleList(
      (0-17): 18 x GemmaDecoderLayer(
        (self_attn): GemmaAttention(
          (q_proj): Linear4bit(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear4bit(in_features=2048, out_features=256, bias=False)
          (v_proj): Linear4bit(in_features=2048, out_features=256, bias=False)
          (o_proj): Linear4bit(in_features=2048, out_features=2048, bias=False)
          (rotary_emb): GemmaRotaryEmbedding()
        )
        (mlp): GemmaMLP(
          (gate_proj): Linear4bit(in_features=2048, out_features=16384, bias=False)
          (up_proj): Linear4bit(in_features=2048, out_features=16384, bias=False)
          (down_proj): Linear4bit(in_features=16384, out_features=2048, bias=False)
          (act_fn): PytorchGELUTanh()
        )
        (input_layernorm): GemmaRMSNorm()
        (post_attention_layernorm): GemmaRMSN

In [11]:
lora_config = LoraConfig(
    r=64,
    lora_alpha=16,
    lora_dropout=0.05,
    task_type=TaskType.SEQ_CLS,
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj"
    ]
)

lora_model = get_peft_model(model, lora_config)
lora_model

PeftModelForSequenceClassification(
  (base_model): LoraModel(
    (model): GemmaForSequenceClassification(
      (model): GemmaModel(
        (embed_tokens): Embedding(256000, 2048, padding_idx=0)
        (layers): ModuleList(
          (0-17): 18 x GemmaDecoderLayer(
            (self_attn): GemmaAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=2048, out_features=2048, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=2048, out_features=64, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=64, out_features=2048, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
              )
              (k_proj): lora.Linear4bi

In [12]:
lora_model.print_trainable_parameters()

trainable params: 78,458,880 || all params: 2,584,643,584 || trainable%: 3.0355783089665644


In [13]:
print(torch.cuda.is_bf16_supported())

True


In [14]:
class DataCollator:
    def __call__(self, features):
        model_inputs = [
            {
                "input_ids": feature["input_ids"],
                "attention_mask": feature["attention_mask"],
                "labels": feature["labels"]
            } for feature in features
        ]
        batch = tokenizer.pad(
            model_inputs,
            padding="max_length",
            max_length=MAX_LENGTH,
            return_tensors="pt",
            pad_to_multiple_of=16
        )
        return batch

def compute_metrics(p):
    preds, labels = p
    score = cohen_kappa_score(
        labels,
        preds.argmax(-1),
        weights="quadratic"
    )
    return {"qwk": score}

training_args=TrainingArguments(
    output_dir="output",
    bf16=True if torch.cuda.is_bf16_supported() else False,
    fp16=False if torch.cuda.is_bf16_supported() else True,
    learning_rate=2e-4,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=4,
    optim="paged_adamw_8bit",
    num_train_epochs=2,
    weight_decay=0.001,
    do_eval=True,
    evaluation_strategy="steps",
    eval_steps=100,
    save_total_limit=10,
    save_strategy="steps",
    save_steps=100,
    logging_steps=100,
    load_best_model_at_end=True,
    metric_for_best_model="qwk",
    greater_is_better=True,
    save_only_model=True,
    lr_scheduler_type="cosine",
    report_to="none"
)

trainer = Trainer(
    model=lora_model,
    args=training_args,
    train_dataset=ds_train,
    eval_dataset=ds_eval,
    tokenizer=tokenizer,
#     data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    data_collator=DataCollator(),
    compute_metrics=compute_metrics
)

In [15]:
# print("Evaluating the Model Before Training!")
# trainer.evaluate()

In [16]:
print("Training the Model")
trainer.train()

Training the Model


You're using a GemmaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Step,Training Loss,Validation Loss,Qwk
100,1.2102,0.9229,0.769296
200,0.8798,0.827005,0.767235
300,0.8296,0.786983,0.812785
400,0.7891,0.72657,0.827487
500,0.7593,0.736562,0.840671
600,0.6637,0.720653,0.841516
700,0.669,0.70542,0.841107
800,0.653,0.686509,0.851742
900,0.6371,0.677846,0.854224



Cannot access gated repo for url https://huggingface.co/google/gemma-2b/resolve/main/config.json.
Access to model google/gemma-2b is restricted. You must be authenticated to access it. - silently ignoring the lookup for the file config.json in google/gemma-2b.

Cannot access gated repo for url https://huggingface.co/google/gemma-2b/resolve/main/config.json.
Access to model google/gemma-2b is restricted. You must be authenticated to access it. - silently ignoring the lookup for the file config.json in google/gemma-2b.

Cannot access gated repo for url https://huggingface.co/google/gemma-2b/resolve/main/config.json.
Access to model google/gemma-2b is restricted. You must be authenticated to access it. - silently ignoring the lookup for the file config.json in google/gemma-2b.

Cannot access gated repo for url https://huggingface.co/google/gemma-2b/resolve/main/config.json.
Access to model google/gemma-2b is restricted. You must be authenticated to access it. - silently ignoring the look

TrainOutput(global_step=972, training_loss=0.7756836659623763, metrics={'train_runtime': 19747.0613, 'train_samples_per_second': 1.578, 'train_steps_per_second': 0.049, 'total_flos': 5.90610125580927e+17, 'train_loss': 0.7756836659623763, 'epoch': 1.9969183359013867})

In [None]:
print("Evaluating the Trained Model")
trainer.evaluate()

Evaluating the Trained Model
