# Fine-tuning the language part of the model

### Installing requirements

In [None]:
!pip install -q --upgrade \
    pip \
    setuptools \
    wheel
!pip install -q \
    torch \
    numpy \
    tqdm \
    "transformers>=4.41.0" \
    "huggingface_hub>=0.23.2" \
    peft \
    accelerate \
    matplotlib \
    git+https://github.com/tingofurro/summac \
    "datasets<=2.14.6"
!pip install -q --upgrade bitsandbytes

### Necessary imports

In [None]:
import torch
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, AutoModel, get_scheduler, BitsAndBytesConfig
from datasets import load_dataset
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model, TaskType
from accelerate import Accelerator
import matplotlib.pyplot as plt
import numpy as np
from tqdm import tqdm

### Defining variables

In [None]:
# Hugging Face identifiers of the base checkpoint and the training corpus.
MODEL_NAME = "OpenGVLab/InternVL2_5-4B"
DATASET_NAME = "RussianNLP/Mixed-Summarization-Dataset"

# Directory the fine-tuned model and tokenizer are saved to.
OUTPUT_DIR = "./finetuned_model"

# Training hyperparameters.
EPOCHS = 3
BATCH_SIZE = 2
LR = 2e-4  # learning rate
MAX_LENGTH = 128  # token budget used when tokenizing texts and summaries

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = torch.device("cuda")
else:
    DEVICE = torch.device("cpu")

### Loading the model and tokenizer

In [None]:
# bitsandbytes settings: 4-bit NF4 weights with double quantization and
# float16 as the compute dtype.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
)

tokenizer = AutoTokenizer.from_pretrained(
    MODEL_NAME,
    use_fast=True,
    trust_remote_code=True,
)

# Load the full checkpoint as a 4-bit model with bnb.
full_model = AutoModel.from_pretrained(
    MODEL_NAME,
    trust_remote_code=True,
    quantization_config=quantization_config,
    device_map='auto',
)

# Take only the language part of the model and prepare it for QLoRA training.
model = prepare_model_for_kbit_training(full_model.language_model)

### LoRA configuration

In [None]:
# LoRA adapter settings: rank 8, alpha 16, dropout 0.1, causal-LM task type.
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=8,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
    # Module names the adapters are attached to (for LLMs).
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ],
)

# Wrap the prepared model with the adapters and print the trainable-parameter
# summary.
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

### Preparing the dataset

In [None]:
# Load the training split of the dataset.
train_data = load_dataset(DATASET_NAME, split='train')

# Keep at most the first 10,000 examples. The range is capped at the split
# length so select() is never asked for indices that do not exist.
train_data = train_data.select(range(min(10000, len(train_data))))

### Preprocessing

In [None]:
def preprocess_function(examples):
    """Tokenize a batch of texts and summaries into fixed-length features.

    Args:
        examples: Batch mapping with a "text" column (model inputs) and a
            "summary" column (targets).

    Returns:
        The tokenizer output for the texts, truncated/padded to MAX_LENGTH,
        with an added "labels" entry holding the summary token ids in which
        every padding position is replaced by -100 (ignored by the loss).
    """
    # NOTE(review): the labels are tokenized independently of the inputs, so
    # label position i does not correspond to input position i. The model is
    # configured as CAUSAL_LM elsewhere in this notebook; the usual recipe
    # for decoder-only models is to concatenate prompt + summary and mask
    # the prompt part. Confirm this layout is intended.
    model_inputs = tokenizer(
        examples["text"],
        max_length=MAX_LENGTH,
        padding="max_length",
        truncation=True,
    )  # no return_tensors="pt"
    target_encoding = tokenizer(
        examples["summary"],
        max_length=MAX_LENGTH,
        padding="max_length",
        truncation=True,
        return_attention_mask=True,
    )
    # Mask padding via the attention mask rather than by comparing ids with
    # pad_token_id: a genuine token that shares the pad id (e.g. when the pad
    # token doubles as another special token) must stay in the loss.
    model_inputs["labels"] = [
        [token if keep else -100 for token, keep in zip(ids, mask)]
        for ids, mask in zip(
            target_encoding["input_ids"], target_encoding["attention_mask"]
        )
    ]
    return model_inputs

In [None]:
# Tokenize the subset in batches of 16. The raw columns are dropped so only
# the features returned by preprocess_function remain; the on-disk cache is
# bypassed.
processed_dataset = train_data.map(
    preprocess_function,
    remove_columns=train_data.column_names,
    batched=True,
    batch_size=16,
    load_from_cache_file=False,
)

# Hold out 10% of the processed examples for validation.
split = processed_dataset.train_test_split(test_size=0.1)
train_dataset, val_dataset = split["train"], split["test"]

### Collating

In [None]:
def collate_fn(batch):
    """Stack per-sample token lists into batched ``torch.long`` tensors.

    Args:
        batch: Sequence of samples, each a mapping with "input_ids",
            "attention_mask" and "labels" lists (rows must share a length
            so they form a rectangular tensor).

    Returns:
        Dict with the same three keys, each mapped to a tensor whose rows
        follow the order of the samples in ``batch``.
    """
    fields = ('input_ids', 'attention_mask', 'labels')
    return {
        field: torch.tensor([row[field] for row in batch], dtype=torch.long)
        for field in fields
    }

### Batching

In [None]:
# Both loaders share the batch size and the collate function; only the
# training loader shuffles its examples.
train_loader = DataLoader(
    train_dataset,
    collate_fn=collate_fn,
    batch_size=BATCH_SIZE,
    shuffle=True,
)
val_loader = DataLoader(
    val_dataset,
    collate_fn=collate_fn,
    batch_size=BATCH_SIZE,
    shuffle=False,
)

### Defining metrics

In [None]:
from summac.model_summac import SummaCConv

In [None]:
# SummaC scorer used to evaluate generated summaries against their sources
# during validation.
summac_model = SummaCConv(
    models=["vitc"],
    granularity="sentence",
    start_file=None,
    use_con=False,
    device=DEVICE,
)

In [None]:
# Per-epoch history; the training loop appends one value to each list per epoch.
train_losses, val_losses, summac_scores = [], [], []

### Optimizer, scheduler, accelerator

In [None]:
# The scheduler is stepped once per training batch, so the schedule spans
# every batch of every epoch.
num_training_steps = len(train_loader) * EPOCHS

# AdamW with a linearly scheduled learning rate and no warm-up.
optimizer = torch.optim.AdamW(model.parameters(), lr=LR)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

# Accelerator configured for fp16 mixed precision; it takes over the model,
# optimizer, both data loaders and the scheduler.
accelerator = Accelerator(mixed_precision="fp16")
(
    model,
    optimizer,
    train_loader,
    val_loader,
    lr_scheduler,
) = accelerator.prepare(model, optimizer, train_loader, val_loader, lr_scheduler)

### Final preparations

### Generating text for validation

In [None]:
def generate_text(input_ids, attention_mask):
    """Generate up to 32 new tokens per prompt and decode only the continuation.

    Args:
        input_ids: Batch of prompt token ids passed to ``model.generate``.
        attention_mask: Attention mask matching ``input_ids``.

    Returns:
        List of decoded strings, one per prompt row, with special tokens
        removed. Only the newly generated tokens are decoded.
    """
    # NOTE(review): prompts are padded to a fixed length upstream
    # (padding="max_length"); if the tokenizer pads on the right, generation
    # continues after the pad tokens - confirm the padding side.
    outputs = model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        max_new_tokens=32,
        num_beams=1,
    )
    # When called with input_ids, generate() returns prompt + continuation for
    # decoder-only models. Drop the prompt so the decoded text is the model's
    # own output rather than a copy of the source followed by a few tokens.
    prompt_length = input_ids.shape[1]
    new_tokens = outputs[:, prompt_length:]
    return tokenizer.batch_decode(new_tokens, skip_special_tokens=True)

### Training loop

In [None]:
# Each epoch runs one training pass, then one validation pass that records the
# average loss and a SummaC score over the generated texts.
for epoch in range(EPOCHS):
    print(f"Epoch {epoch+1}/{EPOCHS}")

    # ---- Training pass ----
    model.train()
    total_train_loss = 0
    for batch in tqdm(train_loader):
        batch = {k: v.to(model.device) for k, v in batch.items()}

        outputs = model(
            input_ids=batch["input_ids"],
            attention_mask=batch["attention_mask"],
            labels=batch["labels"]
        )
        loss = outputs.loss
        # The backward pass goes through the accelerator, which is configured
        # for fp16 mixed precision.
        accelerator.backward(loss)
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        total_train_loss += loss.item()

    avg_train_loss = total_train_loss / len(train_loader)
    train_losses.append(avg_train_loss)

    # ---- Validation pass ----
    model.eval()
    total_val_loss = 0
    preds = []
    sources = []
    with torch.no_grad():
        for batch in tqdm(val_loader):
            batch = {k: v.to(model.device) for k, v in batch.items()}

            outputs = model(
                input_ids=batch["input_ids"],
                attention_mask=batch["attention_mask"],
                labels=batch["labels"]
            )
            loss = outputs.loss
            total_val_loss += loss.item()

            # Collect generated texts and the decoded source texts in the same
            # order so they can be scored pairwise.
            generated_texts = generate_text(batch["input_ids"], batch["attention_mask"])
            preds.extend(generated_texts)
            sources.extend(
                tokenizer.batch_decode(batch["input_ids"], skip_special_tokens=True)
            )

    avg_val_loss = total_val_loss / len(val_loader)
    val_losses.append(avg_val_loss)

    # Mean SummaC score of the generated texts against their sources.
    summac_result = summac_model.score(sources, preds, batch_size=8)
    summac_score = np.mean(summac_result["scores"])
    summac_scores.append(summac_score)

    # The original line was missing the closing quote of the f-string, which
    # made the whole cell a syntax error.
    print(f"Train Loss: {avg_train_loss:.4f} | Val Loss: {avg_val_loss:.4f} | SummaC: {summac_score:.4f}")

### Saving model

In [None]:
# Write the fine-tuned model first, then its tokenizer, into OUTPUT_DIR.
for artifact in (model, tokenizer):
    artifact.save_pretrained(OUTPUT_DIR)
print(f"Model is saved: {OUTPUT_DIR}")

### Graphical visualization

In [None]:
# X axis: 1-based epoch numbers.
epochs = np.arange(1, EPOCHS + 1)

plt.figure(figsize=(12, 8))

# First cell of the 2x2 grid: training and validation loss per epoch.
loss_ax = plt.subplot(2, 2, 1)
loss_ax.plot(epochs, train_losses, label="Train Loss")
loss_ax.plot(epochs, val_losses, label="Val Loss")
loss_ax.set_xlabel("Epoch")
loss_ax.set_ylabel("Loss")
loss_ax.legend()
loss_ax.set_title("Loss")

# Second cell of the grid: SummaC score per epoch.
summac_ax = plt.subplot(2, 2, 2)
summac_ax.plot(epochs, summac_scores, label="SummaC", color="orange")
summac_ax.set_xlabel("Epoch")
summac_ax.set_ylabel("Score")
summac_ax.legend()
summac_ax.set_title("SummaC")

plt.tight_layout()
plt.show()