In [2]:
# Prints a fixed string; presumably a quick check that the kernel executes cells.
print("Test!")

Test!


# Load Dataset and Model

In [4]:
from datasets import load_dataset

# Dataset identifier passed to `load_dataset`; only its "validation" split is loaded.
dataset_name = "OneFly7/llama2-sst2-fine-tuning-without-system-info"
# NOTE: the variable name is misspelled ("validataion"). It is kept as-is because
# later cells reference the dataset under this exact name.
validataion_dataset = load_dataset(dataset_name, split="validation")

Found cached dataset parquet (/home/9130/.cache/huggingface/datasets/OneFly7___parquet/OneFly7--llama2-sst2-fine-tuning-without-system-info-ff57b5730490a513/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


In [5]:
import torch
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline

## Base checkpoint (Llama-2 13B) and the local directory holding the adapter weights
## that are attached to it with PeftModel in the next cell.
base_model_name = "meta-llama/Llama-2-13b-hf"
adapters_name = "./models/llama-2-13b-hf-SFT-5000-1ep"

# Load the base weights in 4-bit NF4 quantization, computing in float16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

# The empty-string key addresses the whole model: everything is placed on device 0.
device_map = {"": 0}

# NOTE(review): `trust_remote_code=True` permits running model code shipped in the
# Hub repository; presumably not needed for this checkpoint -- confirm and drop.
# NOTE(review): `use_auth_token` is deprecated in newer transformers releases in
# favor of `token` -- confirm against the installed version before changing.
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    quantization_config=bnb_config,
    device_map=device_map,
    trust_remote_code=True,
    use_auth_token=True
)

# NOTE(review): turning the KV cache off is usually a training-time setting, while
# this notebook only runs generation -- confirm whether it should stay disabled.
base_model.config.use_cache = False

# More info: https://github.com/huggingface/transformers/pull/24906
base_model.config.pretraining_tp = 1 



Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [6]:
# Attach the adapter stored at `adapters_name` to the already loaded base model.
peft_model = PeftModel.from_pretrained(base_model, adapters_name)
# merge_and_unload() merges the LoRA layers into the base model.
# This is needed if someone wants to use the base model as a standalone model.
# It is left disabled here, so inference runs through the PEFT wrapper.
# peft_model = peft_model.merge_and_unload()

# The tokenizer comes from the base checkpoint; EOS is reused as the padding token.
tokenizer = AutoTokenizer.from_pretrained(base_model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token

## Inference

In [7]:
# Display the dataset summary (feature names and number of rows).
validataion_dataset

Dataset({
    features: ['label_text', 'text'],
    num_rows: 872
})

In [8]:
import torch
from torch.utils.data import Dataset, DataLoader

class CustomDataset(Dataset):
    """Map-style dataset that tokenizes one prompt/label pair per item.

    Args:
        dataset: Indexable collection of records, each providing a ``'text'``
            entry (the prompt, used as-is) and a ``'label_text'`` entry (the
            gold label string).
        tokenizer: Callable invoked as ``tokenizer(text, return_tensors="pt")``.
            Its result must support ``.to(device)`` and expose ``'input_ids'``
            and ``'attention_mask'``.
        device: Device the tokenized tensors are moved to. Defaults to
            ``"cuda"``, which is what this class previously hard-coded. Pass
            ``None`` to leave the tensors where the tokenizer created them.
    """

    def __init__(self, dataset, tokenizer, device="cuda"):
        self.dataset = dataset
        self.tokenizer = tokenizer
        self.device = device

    def __len__(self):
        return len(self.dataset)

    def _encode(self, text):
        """Tokenize `text` and move the result to `self.device` if one is set."""
        encoded = self.tokenizer(text, return_tensors="pt")
        if self.device is not None:
            encoded = encoded.to(self.device)
        return encoded

    def __getitem__(self, idx):
        """Return 1-D ``input_ids``, ``attention_mask`` and ``labels`` tensors for item `idx`."""
        record = self.dataset[idx]
        # `text` is already formatted with the Llama prompt template.
        inputs = self._encode(record['text'])
        labels = self._encode(record['label_text'])

        # squeeze(0) removes only the batch dimension added by return_tensors="pt".
        # A bare squeeze() would also collapse a single-token sequence to a 0-d
        # tensor, which cannot be measured with len() or padded downstream.
        return {
            'input_ids': inputs['input_ids'].squeeze(0),
            'attention_mask': inputs['attention_mask'].squeeze(0),
            'labels': labels['input_ids'].squeeze(0)
        }


In [9]:
from torch.nn.utils.rnn import pad_sequence

def collate_fn(batch, pad_token_id=0):
    """Collate per-sample tensors into one batch, left-padding the prompts.

    Args:
        batch: List of dicts holding ``'input_ids'``, ``'attention_mask'`` and
            ``'labels'`` tensors. Each tensor may be 1-D or 0-d (a 0-d tensor is
            treated as a sequence of length one).
        pad_token_id: Id written into the padded ``input_ids`` positions.
            Defaults to 0, the value that was previously hard-coded. The
            matching ``attention_mask`` positions are always 0.

    Returns:
        dict with ``'input_ids'`` and ``'attention_mask'`` tensors of shape
        (batch, longest prompt), padded on the left, and ``'labels'`` of shape
        (batch, longest label), padded on the right with -100.
    """
    # reshape(-1) guarantees 1-D sequences. Without it a 0-d tensor turns into a
    # plain int in tolist() (so len() fails) and is rejected by pad_sequence.
    input_ids = [item['input_ids'].reshape(-1).tolist() for item in batch]
    attention_mask = [item['attention_mask'].reshape(-1).tolist() for item in batch]
    labels = [item['labels'].reshape(-1) for item in batch]

    # Left padding: every prompt ends at the last position of its row.
    max_length = max(len(ids) for ids in input_ids)
    input_ids = [[pad_token_id] * (max_length - len(ids)) + ids for ids in input_ids]
    attention_mask = [[0] * (max_length - len(mask)) + mask for mask in attention_mask]

    return {
        'input_ids': torch.tensor(input_ids),
        'attention_mask': torch.tensor(attention_mask),
        # Labels are only padded so that they can be stacked; -100 marks the filler.
        'labels': pad_sequence(labels, batch_first=True, padding_value=-100)
    }

In [14]:
import re
from tqdm import tqdm

def evaluate_SFT(dataset, tokenizer, batch_size=16):
    """Debugging variant of the evaluation loop: print one generation, then abort.

    As written, the function generates for the first batch, prints the first
    decoded output and stops at ``assert 1 == 0``. While assertions are
    enabled, the scoring code below that line never runs. A later cell defines
    ``evaluate_SFT`` again without the print/assert, and that definition
    replaces this one as soon as it is executed.

    Relies on the notebook-level ``peft_model``, ``CustomDataset`` and
    ``collate_fn``.

    Args:
        dataset: Records wrapped in ``CustomDataset``.
        tokenizer: Tokenizer used for encoding (via ``CustomDataset``) and decoding.
        batch_size: Number of samples per generated batch.

    Returns:
        ``(compared_result, invalid_label)``; only reached when the data loader
        yields no batch or assertions are disabled.

    Raises:
        AssertionError: After the first batch has been generated and printed.
    """
    chatDataset = CustomDataset(dataset, tokenizer)
    data_loader = DataLoader(chatDataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

    peft_model.eval()

    compared_result = []
    invalid_label = []

    for i, batch in enumerate(tqdm(data_loader)):
        # Move batch to GPU
        input_ids = batch["input_ids"].to("cuda")
        attention_mask = batch["attention_mask"].to("cuda")
        labels = batch["labels"].to("cuda")

        # Generate for the entire batch
        outputs = peft_model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_new_tokens=80,
            pad_token_id=tokenizer.eos_token_id
        )

        # Decode the generated text and labels
        outputs_text = tokenizer.batch_decode(outputs, skip_special_tokens=True)
        label_decoded = tokenizer.batch_decode(labels, skip_special_tokens=True)

        # Debug hook: show the first decoded output of the batch, then stop on
        # purpose. Everything after the assert is unreachable while it is in place.
        print(outputs_text[0])
        assert 1 == 0

        # Evaluate the generated text
        for idx in range(len(outputs_text)):
            # Take the text between the first and second "Sentiment: " marker ...
            selected_sentiment = outputs_text[idx].split("Sentiment: ")[1].strip()
            # ... and keep its first space-separated token (the later definition
            # of this function takes index 1 instead).
            selected_sentiment = selected_sentiment.split(" ")[0].lower()

            # Anything other than the two expected labels is recorded and scored as wrong.
            if selected_sentiment not in ['positive', 'negative']:
                invalid_label.append(selected_sentiment)
                compared_result.append(0)
                continue

            if selected_sentiment == label_decoded[idx]:
                compared_result.append(1)
            else:
                compared_result.append(0)

    return compared_result, invalid_label


In [20]:
import re
from tqdm import tqdm

def _parse_sentiment(generated_text):
    """Return the lower-cased token that is scored, or None if it cannot be located.

    The token is the second space-separated piece of the text found between the
    first and second occurrence of the "Sentiment: " marker.
    """
    parts = generated_text.split("Sentiment: ")
    if len(parts) < 2:
        return None
    tokens = parts[1].strip().split(" ")
    if len(tokens) < 2:
        return None
    return tokens[1].lower()


def evaluate_SFT(dataset, tokenizer, batch_size=16, model=None, device="cuda", **generate_kwargs):
    """Generate a sentiment for every sample and compare it with the gold label.

    Args:
        dataset: Records wrapped in ``CustomDataset`` (prompt and label text).
        tokenizer: Tokenizer used for encoding (via ``CustomDataset``) and decoding.
        batch_size: Number of prompts generated together.
        model: Model exposing ``eval()`` and ``generate()``. Defaults to the
            notebook-level ``peft_model``.
        device: Device the prompt tensors are moved to before generation.
        **generate_kwargs: Extra arguments for ``model.generate``; they override
            the defaults set below.

    Returns:
        ``(compared_result, invalid_label)``: one 1/0 entry per sample
        (1 = prediction equals the gold label), and the list of predictions that
        were neither 'positive' nor 'negative'. Outputs from which no prediction
        could be extracted are recorded as ``"<unparsed>"``.
    """
    if model is None:
        model = peft_model

    chatDataset = CustomDataset(dataset, tokenizer)
    data_loader = DataLoader(chatDataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

    model.eval()

    generation_args = {
        "max_new_tokens": 80,
        # Greedy decoding keeps the metric reproducible even if the checkpoint's
        # generation defaults enable sampling.
        "do_sample": False,
        "pad_token_id": tokenizer.eos_token_id,
    }
    generation_args.update(generate_kwargs)

    # Id used to replace the -100 label padding before decoding.
    label_fill_id = tokenizer.pad_token_id
    if label_fill_id is None:
        label_fill_id = tokenizer.eos_token_id

    compared_result = []
    invalid_label = []

    for batch in tqdm(data_loader):
        # Generate for the entire batch
        outputs = model.generate(
            input_ids=batch["input_ids"].to(device),
            attention_mask=batch["attention_mask"].to(device),
            **generation_args
        )

        # collate_fn pads labels with -100, which is not a token id; swap it for a
        # special token so that decoding drops it instead of failing.
        labels = batch["labels"]
        if label_fill_id is not None:
            labels = labels.masked_fill(labels == -100, label_fill_id)

        # Decode the generated text and labels
        outputs_text = tokenizer.batch_decode(outputs, skip_special_tokens=True)
        label_decoded = tokenizer.batch_decode(labels, skip_special_tokens=True)

        # Evaluate the generated text
        for generated, gold in zip(outputs_text, label_decoded):
            selected_sentiment = _parse_sentiment(generated)

            if selected_sentiment not in ('positive', 'negative'):
                # Missing marker/token used to raise IndexError; count it as invalid.
                invalid_label.append("<unparsed>" if selected_sentiment is None else selected_sentiment)
                compared_result.append(0)
                continue

            compared_result.append(1 if selected_sentiment == gold.strip().lower() else 0)

    return compared_result, invalid_label


In [21]:
# Run the batched evaluation over the validation split with a batch size of 16.
comp_res, invalid_label = evaluate_SFT(validataion_dataset, tokenizer, 16)

 16%|█▋        | 9/55 [09:17<47:55, 62.51s/it]

In [27]:
from collections import Counter
def showEvalResults(compare_results, invalid_label):
    """Print the accuracy and a breakdown of the invalid predictions.

    Args:
        compare_results: Sequence of per-sample outcomes; entries equal to 1
            count as correct, everything else as wrong.
        invalid_label: Sequence of predictions that were not a valid label.

    Prints three lines: the accuracy, the number of invalid labels out of all
    samples, and a ``Counter`` of the invalid labels. With no samples the
    accuracy is reported as ``nan`` instead of raising ``ZeroDivisionError``.
    """
    total = len(compare_results)
    correct = sum(1 for outcome in compare_results if outcome == 1)
    accuracy = correct / total if total else float("nan")
    counted_elements = Counter(invalid_label)

    print("Accuracy:", accuracy)
    print("# of Invalid labels:", len(invalid_label), "out of", total, "samples")
    print("Invalid labels:", counted_elements)


In [28]:
# Print the accuracy and the invalid predictions gathered by the evaluation run.
showEvalResults(comp_res, invalid_label)

Accuracy: 0.9128440366972477
# of Invalid labels: 3 out of 872 samples
Invalid labels: Counter({'[/sent]': 2, '[/sent]\n\nsentence:': 1})
