In [2]:
import pandas as pd
from transformers import LlamaTokenizer, AutoTokenizer, TrainingArguments, Trainer, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, PeftModel, TaskType
from datasets import Dataset
from peft import LoraConfig, get_peft_model, TaskType, PeftModel, PeftConfig, prepare_model_for_kbit_training, AutoPeftModelForCausalLM
from trl import SFTTrainer
import torch
from accelerate import Accelerator
from transformers import LlamaForSequenceClassification
from datetime import datetime


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import os
def evaluate_model(base_model, save_path, tokenizer, test_dataset):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    compute_dtype = getattr(torch, "float16")
    use_4bit = True
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=use_4bit, # Load model in 4bit
        bnb_4bit_quant_type="nf4", # Use 4bit quantization. NormalFloat4 
        bnb_4bit_compute_dtype=compute_dtype, # Use float16 for computation
        bnb_4bit_use_double_quant=False, # Use double quantization
    )
    
    base_model = LlamaForSequenceClassification.from_pretrained(base_model, num_labels=7, device_map=None)
    base_model.eval()
    
    base_model.config.pad_token_id = base_model.config.eos_token_id
    # base_model.config.num_labels = 7
    # Load the fine-tuned model
    # model = LlamaForSequenceClassification.from_pretrained(save_path, num_labels=3, quantization_config=bnb_config)
    # model.eval()
    # model.resize_token_embeddings(len(tokenizer))
    # model.config.pad_token_id = model.config.eos_token_id
    # model.config.num_labels = 3
    
    model = PeftModel.from_pretrained(base_model, save_path)
    model = model.merge_and_unload()
    model.to(device)
    model.eval()

    # Tokenize the test dataset
    def tokenize_function(examples):
      return tokenizer(examples['text'], padding="max_length", truncation=True, max_length=128)
    
    tokenized_test_dataset = test_dataset.map(tokenize_function, batched=True)
    tokenized_test_dataset = tokenized_test_dataset.rename_column("label", "labels") 
    tokenized_test_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

    # Create DataLoader for evaluation
    from torch.utils.data import DataLoader
    eval_dataloader = DataLoader(tokenized_test_dataset, batch_size=16)

    # Move model to device
    # device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # model.to(device)

    all_preds = []
    all_labels = []
        
    test_dataset_arr = []
    # Evaluation loop
    with torch.no_grad():
        for batch in eval_dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            preds = torch.argmax(logits, axis=1)
            

            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())
            
            # save the text and label
            for i in range(len(preds)):
                test_dataset_arr.append({
                    'text': tokenizer.decode(input_ids[i], skip_special_tokens=True),
                    'pred': preds[i].item(),
                    'label': labels[i].item()
                })

    # Calculate accuracy
    accuracy = accuracy_score(all_labels, all_preds)
    report = classification_report(all_labels, all_preds)#, target_names=["Negative", "Neutral", "Positive"])
    confusion = confusion_matrix(all_labels, all_preds)
    
    print(f"Accuracy: {accuracy:.4f}")
    print("Classification Report:")
    print(report)
    
    print("Confusion Matrix:")
    print(confusion)
    
    # kill model
    # del model
    # del base_model
    # torch.cuda.empty_cache()

    return accuracy, report, test_dataset_arr

In [3]:
test_df = pd.read_csv('/home/sd3528/hetav-2/data/eval_30_each.csv')
test_df["text"] = test_df["text"].fillna("").astype(str)
test_dataset = Dataset.from_pandas(test_df)

In [4]:
# tokenizer = AutoTokenizer.from_pretrained("/home/sd3528/hetav-2/experiments/llama3-8b-qlora-v2-majority-sampling/model")
# tokenizer.pad_token = tokenizer.eos_token

In [5]:
# Base
test_df = pd.read_csv('/home/sd3528/hetav-2/data/eval_30_each.csv')
test_df["text"] = test_df["text"].fillna("").astype(str)
test_dataset = Dataset.from_pandas(test_df)
tokenizer = AutoTokenizer.from_pretrained("/home/sd3528/hetav-2/experiments/llama3-8b-qlora-org-vfinal/model")
tokenizer.pad_token = tokenizer.eos_token
evaluate_model(
    base_model="meta-llama/Meta-Llama-3-8B",
    save_path="/home/sd3528/hetav-2/experiments/llama3-8b-qlora-org-vfinal/model",
    tokenizer=tokenizer,
    test_dataset=test_dataset
)

Loading checkpoint shards: 100%|██████████| 4/4 [00:01<00:00,  2.53it/s]
Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Meta-Llama-3-8B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 210/210 [00:00<00:00, 14547.44 examples/s]


Accuracy: 0.6286
Classification Report:
              precision    recall  f1-score   support

           0       0.68      0.70      0.69        30
           1       0.91      0.33      0.49        30
           2       0.85      0.73      0.79        30
           3       1.00      0.70      0.82        30
           4       0.38      0.67      0.48        30
           5       0.39      0.60      0.47        30
           6       0.91      0.67      0.77        30

    accuracy                           0.63       210
   macro avg       0.73      0.63      0.64       210
weighted avg       0.73      0.63      0.64       210

Confusion Matrix:
[[21  0  0  0  7  2  0]
 [ 2 10  0  0 10  8  0]
 [ 0  0 22  0  4  4  0]
 [ 1  0  1 21  3  4  0]
 [ 6  1  1  0 20  2  0]
 [ 1  0  2  0  7 18  2]
 [ 0  0  0  0  2  8 20]]


(0.6285714285714286,
 '              precision    recall  f1-score   support\n\n           0       0.68      0.70      0.69        30\n           1       0.91      0.33      0.49        30\n           2       0.85      0.73      0.79        30\n           3       1.00      0.70      0.82        30\n           4       0.38      0.67      0.48        30\n           5       0.39      0.60      0.47        30\n           6       0.91      0.67      0.77        30\n\n    accuracy                           0.63       210\n   macro avg       0.73      0.63      0.64       210\nweighted avg       0.73      0.63      0.64       210\n')

In [6]:
# Majority Sample
test_df = pd.read_csv('/home/sd3528/hetav-2/data/eval_30_each.csv')
test_df["text"] = test_df["text"].fillna("").astype(str)
test_dataset = Dataset.from_pandas(test_df)
tokenizer = AutoTokenizer.from_pretrained("/home/sd3528/hetav-2/experiments/llama3-8b-qlora-v2-certainty-vfinal/model")
tokenizer.pad_token = tokenizer.eos_token
evaluate_model(
    base_model="meta-llama/Meta-Llama-3-8B",
    save_path="/home/sd3528/hetav-2/experiments/llama3-8b-qlora-v2-certainty-vfinal/model",
    tokenizer=tokenizer,
    test_dataset=test_dataset
)

Loading checkpoint shards: 100%|██████████| 4/4 [00:01<00:00,  2.46it/s]
Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Meta-Llama-3-8B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 210/210 [00:00<00:00, 20761.92 examples/s]


Accuracy: 0.7048
Classification Report:
              precision    recall  f1-score   support

           0       0.67      0.53      0.59        30
           1       0.85      0.73      0.79        30
           2       0.81      0.73      0.77        30
           3       1.00      0.70      0.82        30
           4       0.44      0.70      0.54        30
           5       0.59      0.80      0.68        30
           6       0.96      0.73      0.83        30

    accuracy                           0.70       210
   macro avg       0.76      0.70      0.72       210
weighted avg       0.76      0.70      0.72       210

Confusion Matrix:
[[16  0  1  0 12  1  0]
 [ 1 22  1  0  3  3  0]
 [ 1  1 22  0  3  3  0]
 [ 1  1  0 21  4  3  0]
 [ 5  2  1  0 21  1  0]
 [ 0  0  2  0  3 24  1]
 [ 0  0  0  0  2  6 22]]


(0.7047619047619048,
 '              precision    recall  f1-score   support\n\n           0       0.67      0.53      0.59        30\n           1       0.85      0.73      0.79        30\n           2       0.81      0.73      0.77        30\n           3       1.00      0.70      0.82        30\n           4       0.44      0.70      0.54        30\n           5       0.59      0.80      0.68        30\n           6       0.96      0.73      0.83        30\n\n    accuracy                           0.70       210\n   macro avg       0.76      0.70      0.72       210\nweighted avg       0.76      0.70      0.72       210\n')

In [7]:
# # minority sample

# test_df = pd.read_csv('/home/sd3528/hetav-2/data/eval_30_each.csv')
# test_df["text"] = test_df["text"].fillna("").astype(str)
# test_dataset = Dataset.from_pandas(test_df)

# tokenizer = AutoTokenizer.from_pretrained("/home/sd3528/hetav-2/experiments/llama3-8b-qlora-v2-minority-sampling/model")
# tokenizer.pad_token = tokenizer.eos_token
# evaluate_model(
#     base_model="meta-llama/Meta-Llama-3-8B",
#     save_path="/home/sd3528/hetav-2/experiments/llama3-8b-qlora-v2-minority-sampling/model",
#     tokenizer=tokenizer,
#     test_dataset=test_dataset
# )

In [5]:
# minority sample

test_df = pd.read_csv('/home/sd3528/hetav-2/data/margin/valid_minority_sampling_w_topic.csv')
test_df["text"] = test_df["text"].fillna("").astype(str)
test_dataset = Dataset.from_pandas(test_df)
tokenizer = AutoTokenizer.from_pretrained("/home/sd3528/hetav-2/experiments/llama3-8b-qlora-v2-minority-sampling-v2/model")
tokenizer.pad_token = tokenizer.eos_token
_ , _ ,test_dataset_arr = evaluate_model(
    base_model="meta-llama/Meta-Llama-3-8B",
    save_path="/home/sd3528/hetav-2/experiments/llama3-8b-qlora-v2-minority-sampling-v2/model",
    tokenizer=tokenizer,
    test_dataset=test_dataset
)
pd.DataFrame(test_dataset_arr).to_csv('/home/sd3528/hetav-2/data/margin/margin_inference.csv', index=False)

Loading checkpoint shards: 100%|██████████| 4/4 [00:01<00:00,  2.48it/s]
Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Meta-Llama-3-8B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 402/402 [00:00<00:00, 23599.45 examples/s]


Accuracy: 0.7139
Classification Report:
              precision    recall  f1-score   support

           0       0.71      0.45      0.56        33
           1       0.67      0.83      0.74        12
           2       0.67      0.85      0.75        47
           3       0.75      0.82      0.78        33
           4       0.62      0.70      0.66       105
           5       0.80      0.68      0.74       133
           6       0.83      0.77      0.80        39

    accuracy                           0.71       402
   macro avg       0.72      0.73      0.72       402
weighted avg       0.72      0.71      0.71       402

Confusion Matrix:
[[15  0  0  0 16  2  0]
 [ 0 10  0  0  2  0  0]
 [ 0  0 40  0  5  2  0]
 [ 0  0  0 27  5  1  0]
 [ 3  2  6  6 74 11  3]
 [ 3  3 13  3 17 91  3]
 [ 0  0  1  0  1  7 30]]
