In [16]:
import functools
import warnings

import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
import wandb
from datasets import Dataset, DatasetDict
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model
from sklearn.metrics import f1_score, accuracy_score
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from transformers import AutoModelForSequenceClassification, AutoTokenizer, BitsAndBytesConfig, TrainingArguments, \
    Trainer

# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation notices from the ML libraries used below.
warnings.filterwarnings("ignore")
# Initialise Weights & Biases in "disabled" mode.
wandb.init(mode="disabled")
# Enable tqdm's pandas integration (presumably for progress_apply-style calls; none are visible in this section).
tqdm.pandas()

In [17]:
# Read the three per-category parquet files (food, beverage, other) in one pass.
df_food, df_beverage, df_other = map(
    pd.read_parquet,
    (
        "../dataset/cisbox_food_gpt4.parquet",
        "../dataset/cisbox_beverage_gpt4.parquet",
        "../dataset/cisbox_other_gpt4.parquet",
    ),
)

In [18]:
# Downsample the "other" frame to (at most) the number of food rows.
# - random_state makes the drawn subset identical on every run (same seed as the train/test split).
# - min(...) keeps the cell from raising ValueError when df_other has fewer rows than df_food.
df_other = df_other.sample(
    n=min(len(df_food), len(df_other)),
    random_state=19,
).reset_index(drop=True)

In [19]:
# Attach the integer class id to each category frame: 0 = beverage, 1 = food, 2 = other.
for frame, class_id in ((df_beverage, 0), (df_food, 1), (df_other, 2)):
    frame["labels"] = class_id

# Stack the labelled frames into one table (original indices are kept here).
df = pd.concat([df_food, df_beverage, df_other])

In [20]:
# Drop every row that has a missing value in any column, then shuffle the rows.
# The shuffle is seeded (same seed as the train/test split) so that the row order,
# and therefore the split derived from it, is reproducible across runs.
# The result is reassigned instead of mutating the frame in place.
df = (
    df.dropna()
    .sample(frac=1, random_state=19)
    .reset_index(drop=True)
)

In [21]:
# Hold out 20% of the rows for validation; the article name is the model input, the class id the target.
article_names = df["artikelbezeichnung1"]
label_ids = df["labels"]
X_train, X_test, y_train, y_test = train_test_split(
    article_names,
    label_ids,
    random_state=19,
    test_size=0.2,
)

In [22]:
# Wrap both splits as datasets with a 'text' column (article names) and a 'labels' column (class ids).
splits = {'train': (X_train, y_train), 'val': (X_test, y_test)}
ds = DatasetDict({
    split_name: Dataset.from_dict({'text': texts, 'labels': targets})
    for split_name, (texts, targets) in splits.items()
})

In [23]:
# Model identifier of the base checkpoint; both the tokenizer and the
# sequence-classification model further down are loaded from this name.
model_name = 'mistralai/Mistral-7B-v0.1'


# preprocess dataset with tokenizer
def tokenize_examples(examples, tokenizer):
    """Tokenize a batch of examples and carry the labels along.

    Args:
        examples: mapping with a 'text' entry (passed to the tokenizer as-is)
            and a 'labels' entry.
        tokenizer: callable applied to ``examples['text']``; its return value
            must support item assignment.

    Returns:
        The tokenizer output with an added 'labels' entry copied from
        ``examples``.
    """
    encoded = tokenizer(examples['text'])
    encoded['labels'] = examples['labels']
    return encoded


# Load the tokenizer belonging to the base checkpoint and reuse its EOS token as the padding token.
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

# Tokenize every split in batches, then have the dataset return torch tensors.
tokenize_with_tokenizer = functools.partial(tokenize_examples, tokenizer=tokenizer)
tokenized_ds = ds.map(tokenize_with_tokenizer, batched=True).with_format('torch')

# quantization config: load the base weights in 4-bit
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,  # turn 4-bit quantization on
    bnb_4bit_compute_dtype=torch.bfloat16,  # dtype used for the computations
    bnb_4bit_quant_type='nf4',  # NormalFloat4 quantization data type
    bnb_4bit_use_double_quant=True,  # additionally quantize the quantization constants
)

# lora config: low-rank adapters on the attention projections, sequence-classification task
lora_config = LoraConfig(
    task_type='SEQ_CLS',
    target_modules=['q_proj', 'k_proj', 'v_proj', 'o_proj'],  # modules that receive an adapter
    r=16,  # rank of the low-rank update matrices
    lora_alpha=8,  # scaling factor applied to the adapter output
    lora_dropout=0.05,  # dropout probability inside the LoRA layers
    bias='none',  # whether to train bias weights; 'none' trains no bias terms
)

# load model: quantized base checkpoint with a 3-class sequence-classification head
base_model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=3,
    quantization_config=quantization_config,
)
# prepare the quantized model for training, then attach the LoRA adapters
model = get_peft_model(prepare_model_for_kbit_training(base_model), lora_config)
# tell the model which token id is padding (matches the tokenizer's pad token)
model.config.pad_token_id = tokenizer.pad_token_id

tokenizer_config.json:   0%|          | 0.00/996 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

Map:   0%|          | 0/539110 [00:00<?, ? examples/s]

Map:   0%|          | 0/134778 [00:00<?, ? examples/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

`low_cpu_mem_usage` was None, now set to True since model is quantized.


model.safetensors.index.json:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.94G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of MistralForSequenceClassification were not initialized from the model checkpoint at mistralai/Mistral-7B-v0.1 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You are calling `save_pretrained` to a 4-bit converted model, but your `bitsandbytes` version doesn't support it. If you want to save 4-bit models, make sure to have `bitsandbytes>=0.41.3` installed.


In [24]:
# define custom batch preprocessor
def collate_fn(batch, tokenizer):
    """Collate a list of tokenized examples into one padded batch.

    Args:
        batch: list of dicts, each holding tensors under 'input_ids',
            'attention_mask' and 'labels'.
        tokenizer: object whose ``pad_token_id`` is used to fill
            'input_ids' up to the longest sequence in the batch.

    Returns:
        Dict with 'input_ids' and 'attention_mask' padded on the right to a
        common length (mask padded with 0) and 'labels' stacked into a single
        tensor.
    """
    pad = torch.nn.utils.rnn.pad_sequence
    input_ids = pad(
        [example['input_ids'] for example in batch],
        batch_first=True,
        padding_value=tokenizer.pad_token_id,
    )
    attention_mask = pad(
        [example['attention_mask'] for example in batch],
        batch_first=True,
        padding_value=0,
    )
    labels = torch.stack([example['labels'] for example in batch])
    return {'input_ids': input_ids, 'attention_mask': attention_mask, 'labels': labels}


# define which metrics to compute for evaluation
def compute_metrics(p):
    """Compute accuracy and weighted F1 from an evaluation result.

    Args:
        p: pair that unpacks into ``(predictions, labels)``; the predicted
            class is taken as the argmax over the last axis of predictions.

    Returns:
        Dict with 'accuracy' and 'f1' (F1 averaged with ``average='weighted'``).
    """
    scores, labels = p
    predicted_classes = np.argmax(scores, axis=-1)
    return {
        'accuracy': accuracy_score(labels, predicted_classes),
        'f1': f1_score(labels, predicted_classes, average='weighted'),
    }

In [25]:
class CustomTrainer(Trainer):
    """Trainer that computes the classification loss explicitly.

    The loss is the mean cross-entropy between the model's logits and the
    integer labels of the batch. The constructor is inherited unchanged from
    ``Trainer`` (the previous pass-through ``__init__`` added nothing).
    """

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Return the mean cross-entropy loss for one batch.

        Args:
            model: the model to run the forward pass with.
            inputs: batch dict; its "labels" entry is removed (popped) and the
                remaining entries are passed to the model as keyword arguments.
            return_outputs: when true, return ``(loss, outputs)`` instead of
                only the loss.
            num_items_in_batch: accepted only so the override stays callable
                from transformers releases that pass this keyword to
                ``compute_loss``; it is not used, the loss is always the
                per-batch mean.

        Returns:
            The loss tensor, or a ``(loss, outputs)`` tuple when
            ``return_outputs`` is true.
        """
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.get("logits")
        loss = F.cross_entropy(logits, labels, reduction='mean')
        return (loss, outputs) if return_outputs else loss

In [26]:
# Training hyperparameters: per-device batch size, number of epochs, weight decay.
# NOTE(review): 100 epochs is far more than the recorded run completed (interrupted after 2) — confirm this is intended.
batch_size, epochs, weight_decay = 16, 100, 0.01

In [27]:
# define training args
training_args = TrainingArguments(
    output_dir='../outputs/llm/mistral',
    # optimisation settings
    learning_rate=1e-4,
    weight_decay=weight_decay,
    num_train_epochs=epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # evaluate, checkpoint and log once per epoch
    # NOTE(review): newer transformers releases rename `evaluation_strategy` to `eval_strategy` — confirm against the installed version
    evaluation_strategy='epoch',
    save_strategy='epoch',
    logging_strategy='epoch',
    # after training, reload the checkpoint selected by the eval F1 metric
    load_best_model_at_end=True,
    metric_for_best_model='eval_f1',
)

# build the trainer; batches are padded by collate_fn using the tokenizer's pad token id
padding_collator = functools.partial(collate_fn, tokenizer=tokenizer)
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_ds['train'],
    eval_dataset=tokenized_ds['val'],
    data_collator=padding_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)

In [28]:
# Start fine-tuning with the trainer configured above; evaluation, logging and
# checkpointing are set to run once per epoch in training_args.
trainer.train()

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.2061,0.180215,0.946935,0.94684
2,0.1687,0.182787,0.947966,0.947863


We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)

KeyboardInterrupt

