In [None]:
import os
import time
import pandas as pd

import bitsandbytes as bnb
import torch
import torch.nn as nn

from transformers import (
    AutoConfig,
    BitsAndBytesConfig,
    AutoModelForCausalLM,
    AutoTokenizer,
)
from transformers import (
    pipeline, 
    TrainingArguments, 
    Trainer
)
from peft import ( 
		LoraConfig, 
		PeftConfig, 
		PeftModel, 
		get_peft_model, 
		prepare_model_for_kbit_training
)

import transformers
from datasets import Dataset

In [None]:
# Local path of the base checkpoint (the path suggests Llama 3 8B in HF format).
MODEL_NAME = "/kaggle/input/llama-3/transformers/8b-hf/1"

# 4-bit NF4 quantization settings passed to `from_pretrained` when the model is loaded.
# NOTE(review): the compute dtype is bfloat16 while the TrainingArguments later in
# this notebook set fp16=True — confirm the mixed precision setup is intentional.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

In [None]:
# Tokenizer for the same checkpoint. The EOS token is reused as the padding token.
# NOTE(review): with pad == EOS, a collator that masks padding positions in the
# labels will also mask genuine EOS tokens — confirm this is acceptable.
tokenizer1 = AutoTokenizer.from_pretrained(MODEL_NAME)
tokenizer1.pad_token = tokenizer1.eos_token

# Base causal LM, quantized on load with `bnb_config` and placed by device_map="auto".
# NOTE(review): trust_remote_code=True permits code bundled with the checkpoint to
# run — confirm it is actually required for this local checkpoint.
model1 = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
)

In [None]:
# Load only the first 1000 rows of the filtered CSV.
df = pd.read_csv(
    "/kaggle/input/data-k10-new-filter/new_filtered_df_10k.csv",
    nrows=1000,
)
# Coerce every header to a string and trim surrounding whitespace so the
# columns can be addressed by clean names.
df.columns = df.columns.map(lambda name: str(name).strip())

# Wrap the frame as a Hugging Face Dataset for the later `.shuffle()` / `.map()` calls.
data = Dataset.from_pandas(df)

In [None]:
def generate_prompt(data_point):
    """Build the summarization training prompt for a single record.

    Args:
        data_point: Mapping with a "text" entry (the source document) and a
            "summary" entry (the reference summary).

    Returns:
        A three-line string: the instruction, the text followed by ". ", and
        "Резюме: " followed by the summary. Leading and trailing whitespace of
        the whole prompt is stripped.
    """
    # The second and third lines start with a 12-space indent; it is part of
    # the prompt format, so prompts used at inference time must reproduce it.
    indent = " " * 12
    prompt_lines = (
        "Напиши резюме тексту: ",
        f'{indent}{data_point["text"]}. ',
        f'{indent}Резюме: {data_point["summary"]}',
    )
    return "\n".join(prompt_lines).strip()

def generate_and_tokenize_prompt(data_point):
    """Render one record as a prompt and tokenize it with `tokenizer1`.

    Args:
        data_point: Record accepted by `generate_prompt` (needs "text" and
            "summary" entries).

    Returns:
        Whatever `tokenizer1` returns for the prompt string when called with
        padding=True and truncation=True.
    """
    # NOTE(review): no max_length is passed, so truncation falls back to the
    # tokenizer's own limit — confirm long documents are bounded as intended.
    return tokenizer1(
        generate_prompt(data_point),
        truncation=True,
        padding=True,
    )

# Shuffle with a fixed seed so the example order is identical on every
# Restart & Run All, then tokenize each record into model inputs.
data = data.shuffle(seed=42).map(generate_and_tokenize_prompt)

In [None]:
# Rebind model1 to the result of PEFT's prepare_model_for_kbit_training, the
# preparation step applied to the quantized model before adapters are attached.
# NOTE(review): this cell overwrites its own input (model1 = f(model1)); re-running
# it without reloading the model applies the preparation again — confirm that is harmless.
model1 = prepare_model_for_kbit_training(model1)

In [None]:
# LoRA adapter settings: rank 128, alpha 32, 5% dropout, no bias training.
# Adapters are attached to the seven projection modules listed below.
config = LoraConfig(
    task_type="CAUSAL_LM",
    r=128,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    target_modules=[
        'down_proj',
        'o_proj',
        'k_proj',
        'q_proj',
        'gate_proj',
        'up_proj',
        'v_proj',
    ],
)

# Wrap the prepared base model with the adapters described by `config`.
model1 = get_peft_model(model1, config)

In [None]:
# Adjust the model's own generation config in place (same object, not a copy).
generation_config = model1.generation_config

# Output length, nucleus-sampling threshold and number of returned sequences.
generation_config.max_new_tokens = 120
generation_config.top_p = 0.7
generation_config.num_return_sequences = 1

# Use the tokenizer's EOS id both for padding and as the stop token.
generation_config.pad_token_id = generation_config.eos_token_id = tokenizer1.eos_token_id

In [None]:
# Hyper-parameters of the fine-tuning run; checkpoints are written to
# "finetune_summary2" at the end of every epoch.
training_args = TrainingArguments(
    output_dir="finetune_summary2",
    # batch size and run length
    per_device_train_batch_size=1,
    gradient_accumulation_steps=1,
    num_train_epochs=2,
    # optimizer and learning-rate schedule
    optim="paged_adamw_8bit",
    learning_rate=1e-4,
    lr_scheduler_type="cosine",
    warmup_ratio=0.01,
    fp16=True,
    # logging and checkpointing
    logging_steps=100,
    # NOTE(review): eval_steps is set but the Trainer below receives no
    # evaluation dataset — confirm whether evaluation was intended.
    eval_steps=100,
    save_strategy="epoch",
    report_to="none",
)

# Collator in causal-LM mode (mlm=False) built around the notebook's tokenizer.
causal_lm_collator = transformers.DataCollatorForLanguageModeling(tokenizer1, mlm=False)

trainer = Trainer(
    args=training_args,
    model=model1,
    train_dataset=data,
    data_collator=causal_lm_collator,
)

# Turn off the KV cache on the model config before training starts.
model1.config.use_cache = False
trainer.train()