In [None]:
import os
# Process-wide environment settings. They are written before torch is imported
# in a later cell.
os.environ.update(
    {
        # Option string handed to the PyTorch CUDA caching allocator.
        "PYTORCH_CUDA_ALLOC_CONF": "expandable_segments:True",
        # Disable Weights & Biases logging.
        "WANDB_MODE": "disabled",
        # Only GPUs 0 and 1 are visible to this process.
        "CUDA_VISIBLE_DEVICES": "0,1",
        # Synchronous CUDA kernel launches.
        # NOTE(review): this is normally a debugging aid and slows training;
        # consider dropping it once the run is stable.
        "CUDA_LAUNCH_BLOCKING": "1",
    }
)

In [None]:
!grep -Ev '^(torch|numpy|pandas)' /kaggle/input/besstie/requirements.txt > /kaggle/working/requirements_no_torch_numpy_pandas.txt
!pip install -r /kaggle/working/requirements_no_torch_numpy_pandas.txt

In [None]:
from tqdm import tqdm

import torch
from torch.nn import functional as F

import pandas as pd
from datasets import Dataset

import bitsandbytes as bnb
from peft import LoraConfig, PeftModel, get_peft_model, prepare_model_for_kbit_training
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, BitsAndBytesConfig, EarlyStoppingCallback
from trl import SFTTrainer
import warnings

# Suppress everything emitted through the `warnings` module for the rest of the
# session.
# NOTE(review): this is a blanket filter, so deprecation and dtype warnings from
# the libraries imported above are hidden as well.
warnings.filterwarnings("ignore")
# Use CUDA when a GPU is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
def find_all_linear_names(model):
    """Return the leaf names of all 4-bit linear layers in ``model``.

    Walks ``model.named_modules()`` and, for every module that is an instance
    of ``bnb.nn.Linear4bit``, keeps the last dot-separated component of its
    qualified name (``"a.b.q_proj"`` -> ``"q_proj"``). The name ``"lm_head"``
    is always excluded (original note: needed for 16-bit).

    Args:
        model: any object exposing ``named_modules()`` that yields
            ``(qualified_name, module)`` pairs.

    Returns:
        A sorted list of unique leaf names. Sorting makes the result
        reproducible from run to run (plain set iteration order over strings
        is not).
    """
    linear_cls = bnb.nn.Linear4bit
    # name.split('.')[-1] also covers un-dotted names, so no special case is needed.
    leaf_names = {
        name.split('.')[-1]
        for name, module in model.named_modules()
        if isinstance(module, linear_cls)
    }
    # Done once after the scan instead of on every loop iteration.
    leaf_names.discard('lm_head')
    return sorted(leaf_names)

In [None]:
def get_prompt(data, task):
    """Build one instruction-formatted training prompt per example.

    Each prompt has the form
    ``<s>[INST] {instruction}\\nText:{text}[/INST] {label} </s>``, where the
    instruction depends on ``task``.

    Args:
        data: mapping-like object whose ``"text"`` and ``"label"`` entries are
            equally long iterables (e.g. a DataFrame or a dict of lists).
        task: ``"Sentiment"`` or ``"Sarcasm"``.

    Returns:
        List of prompt strings, in the same order as the input rows.

    Raises:
        ValueError: if ``task`` is not one of the supported tasks. Previously
            this case failed inside the loop with an unbound ``prompt``
            variable.
    """
    instructions = {
        "Sentiment": "Generate the sentiment of the given text. 1 for positive sentiment, and 0 for negative sentiment. Do not give an explanation.",
        "Sarcasm": "Predict if the given text is sarcastic. 1 if the text is sarcastic, and 0 if the text is not sarcastic. Do not give an explanation.",
    }
    if task not in instructions:
        raise ValueError(
            f"Unsupported task {task!r}; expected one of {sorted(instructions)}"
        )
    instruction = instructions[task]
    # The label is part of the prompt because the whole string is the SFT target.
    return [
        f"<s>[INST] {instruction}\nText:{text}[/INST] {label} </s>"
        for text, label in zip(data["text"], data["label"])
    ]

In [None]:
# Slice of the corpus to fine-tune on: one variety, one source, one task.
data_section = {"variety": "en-UK", "source": "Google", "task": "Sentiment"}

train_raw = pd.read_csv("/kaggle/input/besstie/train.csv")

# Discard rows missing any field used below, then keep the rows that match
# every entry of data_section.
train_complete = train_raw.dropna(subset=['text', 'label', 'variety', 'source', 'task'])
in_section = (
    (train_complete['variety'] == data_section["variety"]) &
    (train_complete['source'] == data_section["source"]) &
    (train_complete['task'] == data_section["task"])
)
# .copy() gives an independent frame, so adding columns below is not a
# chained assignment on a slice; reset_index drops the leftover row numbers so
# they are not carried into the Dataset.
train_df = train_complete[in_section].copy().reset_index(drop=True)
assert len(train_df) > 0, f"no training rows match {data_section}"

# A column that contained NaN before dropna is read as float, which would put
# "1.0" / "0.0" into the prompts although the instruction promises 1 / 0.
# Assumes labels are the integers 0 and 1 -- TODO confirm against the CSV.
train_df["label"] = train_df["label"].astype(int)
train_df["prompt"] = get_prompt(train_df, data_section["task"])

# Only the rendered prompt is handed to the trainer.
train = Dataset.from_pandas(train_df[["prompt"]], split="train")

In [None]:
# Directory that receives the training checkpoints and the final saved model,
# and the Hub id of the base model.
models_dir = "/kaggle/working/mistral"
model_id = "mistralai/Mistral-7B-Instruct-v0.3"

# bitsandbytes settings: 4-bit NF4 weights with double quantisation, bfloat16
# as the compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

tokenizer = AutoTokenizer.from_pretrained(model_id)

# Quantised base model; device_map="auto" leaves device placement to the
# library.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    torch_dtype=torch.bfloat16,
    attn_implementation="eager",
    device_map="auto",
)

In [None]:
# Pad on the right for training. Left padding is meant for batched generation,
# and TRL's SFTTrainer warns about it when training in half precision.
tokenizer.padding_side = "right"
# Use a pad token that differs from EOS when one is available. The
# language-modelling collator ignores every label equal to the pad token id, so
# with pad == EOS the closing "</s>" of each prompt would never be trained on.
# Falls back to the previous behaviour (EOS) if the tokenizer has no unk token.
tokenizer.pad_token = (
    tokenizer.unk_token if tokenizer.unk_token is not None else tokenizer.eos_token
)

In [None]:
# Leaf names of the model's 4-bit linear layers; passed to LoraConfig as
# target_modules further down.
modules = find_all_linear_names(model)
print(modules)

In [None]:
# Turn on gradient checkpointing, then let PEFT prepare the quantised model
# for k-bit training.
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

# LoRA adapter settings: rank 8, alpha 16, 10% dropout, no bias terms, applied
# to every module name collected in `modules`.
peft_config = LoraConfig(
    r=8,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=modules,
)

# Wrap the base model with the LoRA adapters.
model = get_peft_model(model, peft_config)

In [None]:
# Print the model's trainable-parameter summary as a sanity check on the LoRA setup.
model.print_trainable_parameters()

In [None]:
# Hyper-parameters for the fine-tuning run, grouped by concern.
args = TrainingArguments(
    # --- output / checkpointing ---
    output_dir=models_dir+"/cache",
    overwrite_output_dir=True,
    save_strategy="epoch",
    # NOTE(review): presumably unused while save_strategy is "epoch" -- confirm
    # and remove if so.
    save_steps=0.3,
    # --- schedule ---
    num_train_epochs=3,
    learning_rate=1e-5,
    lr_scheduler_type="cosine",
    warmup_ratio=0.03,
    # --- batching: 4 samples x 4 accumulation steps per device per update ---
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=4,
    # --- memory / numerics ---
    gradient_checkpointing=True,
    optim="paged_adamw_8bit",
    bf16=True,
    max_grad_norm=0.3,
    # --- logging / evaluation (no evaluation is run) ---
    logging_strategy="epoch",
    evaluation_strategy="no",
)

# Supervised fine-tuning on the "prompt" column of the training set.
# NOTE(review): `model` is already wrapped by get_peft_model in an earlier
# cell, so passing peft_config again looks redundant -- confirm for the
# installed TRL version.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    args=args,
    train_dataset=train,
    dataset_text_field="prompt",
    max_seq_length=2048,
    peft_config=peft_config,
)

In [None]:
# Run the fine-tuning loop with the trainer built in the previous cell.
trainer.train()

In [None]:
# Write the trained model held by the trainer to models_dir.
trainer.model.save_pretrained(models_dir)