In [None]:
import torch, os, pandas as pd, numpy as np
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from unsloth import FastLanguageModel, is_bfloat16_supported, tokenizer_utils
from transformers import AutoTokenizer, AutoModelForCausalLM, DataCollatorForSeq2Seq, Trainer, TrainingArguments, DataCollatorForLanguageModeling, DataCollatorWithPadding, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from datasets import load_dataset, Dataset, DatasetDict
from trl import SFTTrainer, DataCollatorForCompletionOnlyLM
from typing import List, Union, Any, Dict
from huggingface_hub import create_repo, upload_folder, notebook_login, login
from utils_dl import set_global_seed
from sklearn.model_selection import GroupShuffleSplit, StratifiedGroupKFold
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report, confusion_matrix


# Ask PyTorch to release cached GPU memory it is not currently using,
# before the model is loaded further down in this notebook.
torch.cuda.empty_cache()

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


In [None]:
# helper funct. needed as this function doesn't like it when the lm_head has its size changed
def do_nothing(*args, **kwargs):
    """Accept any arguments, do nothing and return None (no-op stand-in)."""
    return None


# Replace Unsloth's untrained-token fix-up with the no-op above (see the comment on do_nothing).
tokenizer_utils.fix_untrained_tokens = do_nothing

SEED = 42
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# Join the components instead of writing 'new_data\subtask2': "\s" is an invalid
# escape sequence and a backslash is only a path separator on Windows.
SUBTASK2_PATH = os.path.join('new_data', 'subtask2')

set_global_seed(SEED)
notebook_login()

In [None]:
# Configuration parameters: they select which pre-processed CSV variant is loaded.
language = 'spa'
model_type = 'dl'
stemming = False
lemmatization = False
remove_duplicates = False
cased = True
description = False

# Optional file-name suffix for the variant that includes descriptions.
desc = '_with_description' if description else ''

data_config = f"lang_{language}_model_{model_type}_stem_{stemming}_lem_{lemmatization}_dup_{remove_duplicates}_cased_{cased}{desc}"
file_name = 'subtask2_all_aug'
db_file_name = f"{file_name}_{data_config}.csv"
file_path = os.path.join(SUBTASK2_PATH, db_file_name)

# Fail early with a clear message when the expected file is missing.
if not os.path.exists(file_path):
    raise FileNotFoundError(f"File not found at {file_path}")

full_data = pd.read_csv(file_path, encoding='utf-8')
print("File found:")
# DataFrame.info() prints its summary itself and returns None, so it must not be
# wrapped in print() (that would emit an extra "None" line).
full_data.info()

# Rows not flagged as augmented; kept as an independent copy so that later
# column assignments do not touch full_data.
original_data = full_data[full_data['is_augmented'] != True].copy()

In [None]:
# Drop the augmented rows of the 'NR' and 'S' classes; augmented rows of the
# other classes and all non-augmented rows are kept.
is_aug = full_data['is_augmented'] == True
cond_aug_nr = is_aug & (full_data['label'] == 'NR')
cond_aug_s = is_aug & (full_data['label'] == 'S')

# .copy() makes the result an independent frame, so assigning columns to it later
# does not raise pandas' SettingWithCopyWarning.
full_data = full_data[~(cond_aug_nr | cond_aug_s)].copy()

In [None]:
text_column  = "lyrics_clean"
label_column = "label"
group_column = "id"             # all augmented variants share this id
aug_col      = "is_augmented"   # bool
final_training = False
val_split_size = 0.1 if final_training else 0.2
inner_val_splits = 8            # one of 8 folds of the train+val pool becomes validation

# --- label <-> integer id mappings -------------------------------------------
# String labels are enumerated in sorted order; numeric labels map to themselves.
unique_labels = sorted(full_data[label_column].unique())
if full_data[label_column].dtype == object:
    label2id = {lbl: idx for idx, lbl in enumerate(unique_labels)}
else:
    label2id = {int(lbl): int(lbl) for lbl in unique_labels}
id2label = {v: k for k, v in label2id.items()}

full_data[label_column] = full_data[label_column].map(label2id)
original_data[label_column] = original_data[label_column].map(label2id)
# .map() yields NaN for any label missing from label2id; catch that here rather
# than inside the splitter.
assert full_data[label_column].notna().all(), "unmapped label in full_data"
assert original_data[label_column].notna().all(), "unmapped label in original_data"

# Shuffle both frames reproducibly.
full_data = full_data.sample(frac=1, random_state=SEED).reset_index(drop=True)
original_data = original_data.sample(frac=1, random_state=SEED).reset_index(drop=True)

# --- hold-out split on the non-augmented rows --------------------------------
# Taking the first of k folds holds out roughly 1/k of the rows; round() avoids
# the float truncation int() would apply to 1 / val_split_size.
n_splits = round(1 / val_split_size)
sgkf = StratifiedGroupKFold(n_splits=n_splits, shuffle=True, random_state=SEED)
train_val_idx, test_idx = next(
    sgkf.split(
        original_data,
        original_data[label_column],   # stratify on true labels
        original_data[group_column],   # keep groups intact
    )
)
train_val_df = original_data.iloc[train_val_idx].reset_index(drop=True)
test_df      = original_data.iloc[test_idx].reset_index(drop=True)

if not final_training:
    # Carve a validation set out of the train+val pool in the same way.
    sgkf_val = StratifiedGroupKFold(n_splits=inner_val_splits, shuffle=True, random_state=SEED)
    trn_idx, val_idx = next(
        sgkf_val.split(
            train_val_df,
            train_val_df[label_column],
            train_val_df[group_column],
        )
    )
    train_df = train_val_df.iloc[trn_idx].reset_index(drop=True)
    val_df   = train_val_df.iloc[val_idx].reset_index(drop=True)
else:
    # Final training: train on the whole pool and validate on the held-out fold.
    print('Dataset for final model training')
    train_df = train_val_df.copy()
    val_df   = test_df.copy()

# No group may appear both in training and in an evaluation split.
assert set(train_df[group_column]).isdisjoint(val_df[group_column]), "group leak: train/val"
assert set(train_df[group_column]).isdisjoint(test_df[group_column]), "group leak: train/test"

In [None]:
# Expand the training ids to every row of full_data carrying one of those ids
# (full_data still holds the augmented rows that were not dropped earlier).
# The ids are de-duplicated first: an inner join on a repeated id multiplies rows,
# which would happen if an id occurs twice or if this cell is run a second time
# on the already expanded train_df.
train_ids = train_df[['id']].drop_duplicates()
train_df = pd.merge(train_ids, full_data, on="id", how="inner")
train_df = train_df.sample(frac=1, random_state=SEED).reset_index(drop=True)
train_df.head()

In [None]:
# **Train / Val / Test**
save_datasets = True


def _save_split(split_df, folder, stem):
    """Write one split to `folder` as two CSV files.

    `<stem>.csv` holds every row of `split_df`; `<stem>_ids.csv` holds a single
    'id' column with the unique ids of that split.
    """
    split_df.to_csv(os.path.join(folder, f"{stem}.csv"), index=False, encoding='utf-8')
    unique_ids_df = pd.DataFrame(split_df['id'].unique(), columns=['id'])
    unique_ids_df.to_csv(os.path.join(folder, f"{stem}_ids.csv"), index=False, encoding='utf-8')


if not final_training:
    # Train / Val / Test
    split_frames = {'train': train_df, 'val': val_df, 'test': test_df}
    splits_folder_path = f"{file_name}_{data_config}"
    file_suffix = ''
else:
    # Train / Val only (competition run)
    split_frames = {'train': train_df, 'val': val_df}
    splits_folder_path = f"{file_name}_{data_config}_competition"
    file_suffix = '_competition'

ds = DatasetDict({
    split_name: Dataset.from_pandas(frame.reset_index(drop=True))
    for split_name, frame in split_frames.items()
})
# Keep the per-split names the rest of the notebook may refer to.
train_dataset = ds['train']
val_dataset = ds['val']
if 'test' in ds:
    test_dataset = ds['test']

if save_datasets:
    # exist_ok=True replaces the separate existence check.
    os.makedirs(splits_folder_path, exist_ok=True)
    for split_name, frame in split_frames.items():
        _save_split(frame, splits_folder_path, f"{split_name}{file_suffix}")

## Instruction Dataset Construction

**Template**

In [None]:
# Instruction template. `{lyrics_clean}` receives the lyric text and `{label}` the
# class name (or an empty string when the model has to produce the answer).
# The category description for H previously read "hoffensive"; the typo is fixed
# because this text is part of what the model sees.
PROMPT = """\
### Instruction: 
Classify the misogyny subtype of the following lyric.
Categories:
- S: describe or suggest sexual acts, sexual language, or insinuations
- V: physical or verbal aggression, threats, or violent actions
- H: offensive or discriminatory language, expressions of contempt, or hostility towards a group or individual
- NR  : none of the above
Return only one label from: S, V, H, NR
### Input:
{lyrics_clean}
### Response:
{label}"""

In [None]:
# Hyper-parameters for the LoRA adapter and options for loading the base model.
lora_parameters = dict(
    lora_r=16,
    target_modules=[
        "lm_head",
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
    lora_alpha=16,
    lora_dropout=0,  # available also another implementations, but 0 is optimized in unsloth
    lora_bias="none",
    lora_use_gradient_checkpointing="unsloth",
    lora_random_state=3407,
    lora_use_rslora=True,
    # lora_loftq_config=None
)
model_parameters = dict(
    model_name='unsloth/Qwen3-14B-Base-unsloth-bnb-4bit',
    model_max_seq_length=4096,
    model_dtype=None,
    model_load_in_4bit=True,
)


In [None]:
# Load the base model and its tokenizer with the options from model_parameters.
# NOTE(review): force_download=True presumably re-fetches the weights on every
# run instead of using the local cache — confirm this is intended.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = model_parameters['model_name'],
    max_seq_length = model_parameters['model_max_seq_length'],
    dtype = model_parameters['model_dtype'],
    load_in_4bit = model_parameters['model_load_in_4bit'],
    force_download=True, 
)
# Replace lm_head with 4 token head torch.Size([4, 5120])
# label_token_ids maps the tokenizer id of each label's token to its class index.
label_token_ids: Dict[int,int] = {}
for i, spaced_lbl in id2label.items():
    toks = tokenizer.encode(spaced_lbl, add_special_tokens=False)
    # Every label name must encode to exactly one token for this scheme to work.
    assert len(toks)==1, f"{spaced_lbl!r} is still {toks}"
    label_token_ids[toks[0]] = i

orig_head  = model.lm_head                   # [vocab × hidden]
# old_shape / old_size record the original head's shape; they are not used again in this cell.
old_shape = model.lm_head.weight.shape
old_size = old_shape[0]
hidden_dim = orig_head.weight.shape[1]
# New bias-free head with one output row per class.
classifier = nn.Linear(hidden_dim, len(label2id), bias=False)

# copy rows in class‐order:
# row_ids lists the label token ids sorted by class index, so row k of the new
# head is the original head's row for the token of class k.
row_ids = [tok_id for tok_id, cls in sorted(label_token_ids.items(), key=lambda kv: kv[1])]
classifier.weight.data = orig_head.weight.data[row_ids]
model.lm_head = classifier      


# Show the shape of the replaced head as a sanity check.
print(model.lm_head.weight.shape)

In [None]:
# Attach the LoRA adapters. The entries of lora_parameters carry a "lora_" prefix,
# so they are translated here to the keyword names get_peft_model expects.
peft_kwargs = {
    'r': lora_parameters['lora_r'],
    'target_modules': lora_parameters['target_modules'],
    'lora_alpha': lora_parameters['lora_alpha'],
    'lora_dropout': lora_parameters['lora_dropout'],
    'bias': lora_parameters['lora_bias'],
    'use_gradient_checkpointing': lora_parameters['lora_use_gradient_checkpointing'],
    'random_state': lora_parameters['lora_random_state'],
    'use_rslora': lora_parameters['lora_use_rslora'],
}
model = FastLanguageModel.get_peft_model(model, **peft_kwargs)

model.print_trainable_parameters()

In [None]:
#  Datacollator for last token 4 class loss
class DataCollatorForLastToken4Way(DataCollatorForLanguageModeling):
    """Collator that supervises only the last non-ignored token of each sequence.

    The parent collator builds the batch and its ``labels`` tensor. For every row
    this class then keeps the last position whose label differs from
    ``ignore_index``, sets all earlier positions to ``ignore_index`` and replaces
    the token id at that position with the class index found in
    ``label_token_ids`` (or ``ignore_index`` when the token is not a label token).

    Args:
        tokenizer: tokenizer forwarded to the parent collator.
        mlm: forwarded to the parent collator.
        ignore_index: label value marking positions excluded from the loss.
        label_token_ids: mapping ``token id -> class index``; ``None`` means empty.
        **kwargs: forwarded unchanged to the parent collator.
    """

    def __init__(
        self,
        tokenizer,
        mlm: bool = False,
        ignore_index: int = -100,
        label_token_ids: Dict[int, int] = None,
        **kwargs,
    ):
        super().__init__(tokenizer=tokenizer, mlm=mlm, **kwargs)
        self.ignore_index = ignore_index
        self.label_token_ids = label_token_ids or {}

    def torch_call(self, examples: List[Union[List[int], Any, Dict[str, Any]]]) -> Dict[str, Any]:
        """Collate ``examples`` and rewrite ``labels`` as described on the class."""
        batch = super().torch_call(examples)
        labels = batch["labels"]  # [B, seq_len]
        for i in range(labels.size(0)):
            seq = labels[i]  # row view: the in-place writes below update `labels`
            kept = (seq != self.ignore_index).nonzero()
            if kept.numel() == 0:
                # Nothing to supervise in this row; indexing kept[-1] would raise.
                continue
            last_idx = kept[-1].item()
            seq[:last_idx] = self.ignore_index
            tok = seq[last_idx].item()
            seq[last_idx] = self.label_token_ids.get(tok, self.ignore_index)
        batch["labels"] = labels
        return batch


# Collator instance used for training; it shares the token-id -> class mapping
# built when the classification head was created.
collator = DataCollatorForLastToken4Way(
    tokenizer=tokenizer,
    mlm=False,
    ignore_index=-100,
    label_token_ids=label_token_ids,
)

In [None]:
def build_prompt(examples):
    """Render training prompts for a batch.

    Args:
        examples: batch dict with the columns "lyrics_clean" and "label"
            (integer class ids, translated to label names through id2label).

    Returns:
        {"text": [...]} with one filled-in PROMPT per example, answer included.
    """
    return {
        "text": [
            PROMPT.format(lyrics_clean=lyric, label=id2label[int(lbl)])
            for lyric, lbl in zip(examples["lyrics_clean"], examples["label"])
        ]
    }


def build_prompt_test(examples):
    """Render evaluation prompts for a batch, leaving the response empty.

    Only the "lyrics_clean" column is read, so unlabeled batches work as well.

    Returns:
        {"text": [...]} with one PROMPT per example ending right after the
        response header.
    """
    return {
        "text": [
            PROMPT.format(lyrics_clean=lyric, label="")
            for lyric in examples["lyrics_clean"]
        ]
    }


# Columns dropped from every split once the prompt text exists.
_PROMPT_DROP_COLUMNS = ["id", "lyrics", "augmentation_type", "is_augmented"]

ds_train = ds["train"].map(build_prompt, batched=True, remove_columns=_PROMPT_DROP_COLUMNS)
ds_val = ds["val"].map(build_prompt_test, batched=True, remove_columns=_PROMPT_DROP_COLUMNS)

# The final-training configuration builds no "test" split; indexing it
# unconditionally would raise KeyError.
ds_test = (
    ds["test"].map(build_prompt_test, batched=True, remove_columns=_PROMPT_DROP_COLUMNS)
    if "test" in ds
    else None
)

## Model Learning (Instruction Tuning)

In [None]:
# Training hyper-parameters. Each key appears once: the original literal repeated
# 'evaluation_strategy', 'logging_strategy' and 'save_strategy' with the same values.
training_arguments = {
    'evaluation_strategy' : "steps",
    'logging_strategy' : "steps",
    'save_strategy' : "epoch",
    'eval_steps' : 10,
    'per_device_train_batch_size' : 4,
    'gradient_accumulation_steps' : 2,
    'warmup_steps' : 10,
    'max_grad_norm':0.3,
    'learning_rate' : 1e-4,
    'fp16' : not is_bfloat16_supported(),
    'bf16' : is_bfloat16_supported(),
    'logging_steps' : 1,
    'optim' : "adamw_8bit",
    'weight_decay' : 0.001,
    'lr_scheduler_type' : "cosine",
    'seed' : 3407,
    'output_dir' : "outputs",
    'num_train_epochs' : 3,
}

# Everything that defines this run, recorded later in the Hub commit message.
params = {**lora_parameters, **model_parameters, **training_arguments}

# Every entry of training_arguments is passed to TrainingArguments; only
# 'evaluation_strategy' needs renaming to the `eval_strategy` keyword.
hf_training_kwargs = dict(training_arguments)
hf_training_kwargs['eval_strategy'] = hf_training_kwargs.pop('evaluation_strategy')

# NOTE(review): ds_val is built with an empty response, so the collator finds no
# label token in it; the eval loss reported during training is presumably not
# informative — confirm.
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = ds_train,
    eval_dataset = ds_val,
    dataset_text_field = "text",
    max_seq_length = model_parameters['model_max_seq_length'],
    dataset_num_proc = 2,
    packing = False,
    args = TrainingArguments(group_by_length = True, **hf_training_kwargs),
    data_collator = collator,
)

trainer.model.print_trainable_parameters()
trainer.train()

In [None]:
# Switch the trained model to Unsloth's inference mode before the evaluation cells below.
FastLanguageModel.for_inference(model) # Enable native 2x faster inference

In [None]:
# Tokenize all validation prompts (they end right after the response header)
# into one padded batch; the length limit comes from the model configuration.
enc = tokenizer(
    ds_val["text"],
    truncation=True,
    padding="longest",
    max_length=model_parameters['model_max_seq_length'],
    return_tensors="pt",
    add_special_tokens=False,
)
eval_ds = TensorDataset(
    enc["input_ids"],
    enc["attention_mask"],
    torch.tensor(ds_val["label"], dtype=torch.long),
)
loader = DataLoader(eval_ds, batch_size=8, shuffle=False)

device = next(model.parameters()).device
all_preds, all_labels = [], []

with torch.no_grad():
    for in_ids, attn, labs in loader:
        in_ids, attn = in_ids.to(device), attn.to(device)
        out = model(input_ids=in_ids, attention_mask=attn).logits
        if out.ndim == 3:
            # Take the logits at the last attended token of each row. Indexing
            # position -1 is only right with left padding; with right padding it
            # would read a padding position. Locating the last 1 in the attention
            # mask works for either padding side.
            last_pos = attn.size(1) - 1 - attn.flip(dims=[1]).argmax(dim=1)
            rows = torch.arange(out.size(0), device=out.device)
            out = out[rows, last_pos.to(out.device)]
        # argmax of the logits equals argmax of their softmax, so no softmax is needed.
        preds = torch.argmax(out, dim=-1).cpu().tolist()
        all_preds.extend(preds)
        all_labels.extend(labs.tolist())

In [None]:
# Class ids in report order, with their display names.
class_ids = list(range(len(id2label)))
target_names = [id2label[i] for i in class_ids]

accuracy = accuracy_score(all_labels, all_preds)
n_correct = sum(p == t for p, t in zip(all_preds, all_labels))
print(f"Validation accuracy: {accuracy*100:.2f}% ({n_correct}/{len(all_labels)})")

# Passing `labels` explicitly keeps the report and the matrix aligned with
# target_names even when a class is missing from both the references and the
# predictions; without it classification_report raises on the length mismatch.
print("\nClassification report:\n")
print(classification_report(all_labels, all_preds, labels=class_ids, target_names=target_names, digits=4))

cm = confusion_matrix(all_labels, all_preds, labels=class_ids)
print("\nConfusion matrix:\n", cm)


Validation accuracy: 69.23% (81/117)

Classification report:

              precision    recall  f1-score   support

           H     0.5714    0.5714    0.5714         7
          NR     0.8776    0.7288    0.7963        59
           S     0.5455    0.8571    0.6667        35
           V     0.6667    0.2500    0.3636        16

    accuracy                         0.6923       117
   macro avg     0.6653    0.6018    0.5995       117
weighted avg     0.7311    0.6923    0.6849       117


Confusion matrix:
 [[ 4  2  1  0]
 [ 1 43 13  2]
 [ 2  3 30  0]
 [ 0  1 11  4]]


## Save Adapter to Hugging Face

In [None]:
# Local folder name for the adapter: the last path component of the base model name.
MODEL_NAME = model_parameters['model_name'].split('/')[-1]
# Placeholder: set this to the target Hub repository before pushing.
REPO_ID = "repo_name"

model.save_pretrained(MODEL_NAME, safe_serialization=True)

model.push_to_hub(
    repo_id     = REPO_ID,   # creates repo if it doesn’t exist
    # Never hard-code an access token in the notebook. Read it from the HF_TOKEN
    # environment variable; when it is unset, None is passed and the credentials
    # stored by notebook_login() / `huggingface-cli login` are used.
    token       = os.environ.get("HF_TOKEN"),
    commit_message = (
        f"Adapter. Classes: {len(id2label)}. "
        f"Hyper‑parameters: {params}"
    ),
    safe_serialization = True,
)