# 3. Evaluation

In [1]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig
from peft import PeftModel
import pandas as pd
from sklearn.metrics import classification_report, confusion_matrix
from tqdm import tqdm

## Evaluation on the finetuned model

In [2]:
base_model = "unsloth/llama-3.2-1b-instruct"
device = "cuda" if torch.cuda.is_available() else "cpu"

In [4]:
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    device_map="auto",
    torch_dtype=torch.float16,
)
tokenizer = AutoTokenizer.from_pretrained(base_model)

model = PeftModel.from_pretrained(
    model,
    "../finetuned_model_weights/run2",
    is_trainable=False,
    local_files_only=True   
)
model.to(device)
model.eval()

print("Merging PEFT weights into base model…")
model = model.merge_and_unload()
print("Merge complete.")

gen_config = GenerationConfig(
    max_new_tokens=50,
    temperature=0.1,
    top_p=0.9,
)

Merging PEFT weights into base model…


The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Merge complete.


In [15]:
eval_df = pd.read_csv("../data/combined_val_data.csv")

### Evaluation on entire dataset

In [8]:
sentiment_words = ["negative", "positive", "neutral"]
preds, actuals = [], []
count = 0

# loop over entire val set
for _, row in tqdm(eval_df.iterrows(), total=len(eval_df)):
      messages = [{"role": "user", "content": row["user_msg"]}]
      prompt = tokenizer.apply_chat_template(
          messages, tokenize=False, add_generation_prompt=True
      )
      inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
      outputs = model.generate(**inputs, generation_config=gen_config)
      response = tokenizer.decode(outputs[0], skip_special_tokens=True).lower()

      last_word = response.split()[-1].strip(".,!?;:")
      pred = last_word
      preds.append(pred)
      actuals.append(row["output"])
      if pred == row["output"]:
          count += 1

print(f"\nExact‐match count: {count}/{len(preds)}\n")

print("CLASSIFICATION REPORT:")
print(classification_report(actuals, preds, labels=sentiment_words))

print("CONFUSION MATRIX:")
print(confusion_matrix(actuals, preds, labels=sentiment_words))

100%|██████████| 500/500 [06:31<00:00,  1.28it/s]


Exact‐match count: 427/500

CLASSIFICATION REPORT:
              precision    recall  f1-score   support

    negative       0.92      0.86      0.89       166
    positive       0.87      0.85      0.86       167
     neutral       0.79      0.85      0.82       167

    accuracy                           0.85       500
   macro avg       0.86      0.85      0.85       500
weighted avg       0.86      0.85      0.85       500

CONFUSION MATRIX:
[[143   6  17]
 [  4 142  21]
 [  9  16 142]]





### Evaluation on english and vietnamese respectively

In [16]:
eval_df_eng = eval_df[eval_df["language"] == "eng"]
print(f"\nEnglish examples: {len(eval_df_eng)}")




English examples: 250


In [17]:
sentiment_words = ["negative", "positive", "neutral"]
preds, actuals = [], []
count = 0

# loop over eng val set
for _, row in tqdm(eval_df_eng.iterrows(), total=len(eval_df_eng)):
      messages = [{"role": "user", "content": row["user_msg"]}]
      prompt = tokenizer.apply_chat_template(
          messages, tokenize=False, add_generation_prompt=True
      )
      inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
      outputs = model.generate(**inputs, generation_config=gen_config)
      response = tokenizer.decode(outputs[0], skip_special_tokens=True).lower()

      last_word = response.split()[-1].strip(".,!?;:")
      pred = last_word
      preds.append(pred)
      actuals.append(row["output"])
      if pred == row["output"]:
          count += 1

print(f"\nExact‐match count: {count}/{len(preds)}\n")

print("CLASSIFICATION REPORT:")
print(classification_report(actuals, preds, labels=sentiment_words))

print("CONFUSION MATRIX:")
print(confusion_matrix(actuals, preds, labels=sentiment_words))

100%|██████████| 250/250 [03:51<00:00,  1.08it/s]


Exact‐match count: 204/250

CLASSIFICATION REPORT:
              precision    recall  f1-score   support

    negative       0.90      0.84      0.87        83
    positive       0.81      0.77      0.79        84
     neutral       0.75      0.83      0.79        83

    accuracy                           0.82       250
   macro avg       0.82      0.82      0.82       250
weighted avg       0.82      0.82      0.82       250

CONFUSION MATRIX:
[[70  6  7]
 [ 3 65 16]
 [ 5  9 69]]





In [18]:
eval_df_viet = eval_df[eval_df["language"] == "vi"]
print(f"\nVietnamese examples: {len(eval_df_viet)}")


Vietnamese examples: 250


In [19]:
sentiment_words = ["negative", "positive", "neutral"]
preds, actuals = [], []
count = 0

# loop over entire val set
for _, row in tqdm(eval_df_viet.iterrows(), total=len(eval_df_viet)):
      messages = [{"role": "user", "content": row["user_msg"]}]
      prompt = tokenizer.apply_chat_template(
          messages, tokenize=False, add_generation_prompt=True
      )
      inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
      outputs = model.generate(**inputs, generation_config=gen_config)
      response = tokenizer.decode(outputs[0], skip_special_tokens=True).lower()

      last_word = response.split()[-1].strip(".,!?;:")
      pred = last_word
      preds.append(pred)
      actuals.append(row["output"])
      if pred == row["output"]:
          count += 1

print(f"\nExact‐match count: {count}/{len(preds)}\n")

print("CLASSIFICATION REPORT:")
print(classification_report(actuals, preds, labels=sentiment_words))

print("CONFUSION MATRIX:")
print(confusion_matrix(actuals, preds, labels=sentiment_words))

100%|██████████| 250/250 [02:57<00:00,  1.41it/s]


Exact‐match count: 222/250

CLASSIFICATION REPORT:
              precision    recall  f1-score   support

    negative       0.92      0.88      0.90        83
    positive       0.92      0.93      0.92        83
     neutral       0.83      0.86      0.84        84

    accuracy                           0.89       250
   macro avg       0.89      0.89      0.89       250
weighted avg       0.89      0.89      0.89       250

CONFUSION MATRIX:
[[73  0 10]
 [ 1 77  5]
 [ 5  7 72]]





## Evaluation on Base Model

In [9]:
model_base = AutoModelForCausalLM.from_pretrained(
    base_model,
    device_map="auto",
    torch_dtype=torch.float16,
).to(device)
tokenizer = AutoTokenizer.from_pretrained(base_model)

# 4) Prepare generation config
gen_config = GenerationConfig(
    max_new_tokens=200,
    temperature=0.1,
    top_p=0.9,
)

The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


In [10]:
import re

In [11]:
def extract_sentiment_from_response(response: str) -> str:
    """
    1) Drop everything up to and including the first 'assistant'
    2) Find and return the first sentiment token (negative|neutral|positive)
    """
    # 1) Split on 'assistant' and keep the tail
    parts = response.split("assistant", 1)
    tail = parts[1] if len(parts) > 1 else response

    # 2) Search for sentiment words
    match = re.search(r'\b(negative|neutral|positive)\b', tail.lower())
    return match.group(1) if match else ""

In [12]:
sentiment_words = ["negative", "positive", "neutral"]
preds, actuals = [], []
count = 0

# loop over entire val set
for _, row in tqdm(eval_df.iterrows(), total=len(eval_df)):
      messages = [{"role": "user", "content": row["user_msg"]}]
      prompt = tokenizer.apply_chat_template(
          messages, tokenize=False, add_generation_prompt=True
      )
      inputs = tokenizer(prompt, return_tensors="pt").to(device)
      outputs = model_base.generate(**inputs, generation_config=gen_config)
      response = tokenizer.decode(outputs[0], skip_special_tokens=True).lower()

      pred = extract_sentiment_from_response(response)
      preds.append(pred)
      actuals.append(row["output"])
      if pred == row["output"]:
          count += 1

print(f"\nExact‐match count: {count}/{len(preds)}\n")

print("CLASSIFICATION REPORT:")
print(classification_report(actuals, preds, labels=sentiment_words))

print("CONFUSION MATRIX:")
print(confusion_matrix(actuals, preds, labels=sentiment_words))

100%|██████████| 500/500 [18:52<00:00,  2.26s/it]


Exact‐match count: 212/500

CLASSIFICATION REPORT:
              precision    recall  f1-score   support

    negative       0.56      0.42      0.48       166
    positive       0.47      0.41      0.44       167
     neutral       0.36      0.44      0.40       167

   micro avg       0.45      0.42      0.44       500
   macro avg       0.47      0.42      0.44       500
weighted avg       0.47      0.42      0.44       500

CONFUSION MATRIX:
[[70 27 60]
 [23 68 69]
 [32 49 74]]





### Evcaluate on English and Vietnamese Dataset

In [20]:
sentiment_words = ["negative", "positive", "neutral"]
preds, actuals = [], []
count = 0

# loop over eng val set
for _, row in tqdm(eval_df_eng.iterrows(), total=len(eval_df_eng)):
      messages = [{"role": "user", "content": row["user_msg"]}]
      prompt = tokenizer.apply_chat_template(
          messages, tokenize=False, add_generation_prompt=True
      )
      inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
      outputs = model_base.generate(**inputs, generation_config=gen_config)
      response = tokenizer.decode(outputs[0], skip_special_tokens=True).lower()

      last_word = response.split()[-1].strip(".,!?;:")
      pred = last_word
      preds.append(pred)
      actuals.append(row["output"])
      if pred == row["output"]:
          count += 1

print(f"\nExact‐match count: {count}/{len(preds)}\n")

print("CLASSIFICATION REPORT:")
print(classification_report(actuals, preds, labels=sentiment_words))

print("CONFUSION MATRIX:")
print(confusion_matrix(actuals, preds, labels=sentiment_words))

100%|██████████| 250/250 [14:29<00:00,  3.48s/it]


Exact‐match count: 52/250

CLASSIFICATION REPORT:
              precision    recall  f1-score   support

    negative       0.82      0.22      0.34        83
    positive       0.52      0.32      0.40        84
     neutral       1.00      0.08      0.16        83

   micro avg       0.64      0.21      0.31       250
   macro avg       0.78      0.21      0.30       250
weighted avg       0.78      0.21      0.30       250

CONFUSION MATRIX:
[[18 11  0]
 [ 1 27  0]
 [ 3 14  7]]





In [21]:
sentiment_words = ["negative", "positive", "neutral"]
preds, actuals = [], []
count = 0

# loop over entire val set
for _, row in tqdm(eval_df_viet.iterrows(), total=len(eval_df_viet)):
      messages = [{"role": "user", "content": row["user_msg"]}]
      prompt = tokenizer.apply_chat_template(
          messages, tokenize=False, add_generation_prompt=True
      )
      inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
      outputs = model_base.generate(**inputs, generation_config=gen_config)
      response = tokenizer.decode(outputs[0], skip_special_tokens=True).lower()

      last_word = response.split()[-1].strip(".,!?;:")
      pred = last_word
      preds.append(pred)
      actuals.append(row["output"])
      if pred == row["output"]:
          count += 1

print(f"\nExact‐match count: {count}/{len(preds)}\n")

print("CLASSIFICATION REPORT:")
print(classification_report(actuals, preds, labels=sentiment_words))

print("CONFUSION MATRIX:")
print(confusion_matrix(actuals, preds, labels=sentiment_words))

100%|██████████| 250/250 [06:00<00:00,  1.44s/it]


Exact‐match count: 77/250

CLASSIFICATION REPORT:
              precision    recall  f1-score   support

    negative       0.52      0.16      0.24        83
    positive       0.80      0.05      0.09        83
     neutral       0.33      0.71      0.45        84

   micro avg       0.36      0.31      0.33       250
   macro avg       0.55      0.31      0.26       250
weighted avg       0.55      0.31      0.26       250

CONFUSION MATRIX:
[[13  0 57]
 [ 2  4 67]
 [10  1 60]]



