# evaluate the fine tuned model

In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import AutoPeftModelForCausalLM
import torch
from datasets import load_dataset, Dataset
import multiprocessing
import json
import time

import transformers
transformers.logging.set_verbosity_info()

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# load the base model tokenizer
checkpoint = "HuggingFaceTB/SmolLM-360M"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

loading file vocab.json from cache at /Users/harishl/.cache/huggingface/hub/models--HuggingFaceTB--SmolLM-360M/snapshots/59f7ef243ee09a72cbc14cb054393a3e3b771d41/vocab.json
loading file merges.txt from cache at /Users/harishl/.cache/huggingface/hub/models--HuggingFaceTB--SmolLM-360M/snapshots/59f7ef243ee09a72cbc14cb054393a3e3b771d41/merges.txt
loading file tokenizer.json from cache at /Users/harishl/.cache/huggingface/hub/models--HuggingFaceTB--SmolLM-360M/snapshots/59f7ef243ee09a72cbc14cb054393a3e3b771d41/tokenizer.json
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at /Users/harishl/.cache/huggingface/hub/models--HuggingFaceTB--SmolLM-360M/snapshots/59f7ef243ee09a72cbc14cb054393a3e3b771d41/special_tokens_map.json
loading file tokenizer_config.json from cache at /Users/harishl/.cache/huggingface/hub/models--HuggingFaceTB--SmolLM-360M/snapshots/59f7ef243ee09a72cbc14cb054393a3e3b771d41/tokenizer_config.json
loading file chat_template.ji

In [3]:
print(tokenizer.pad_token, tokenizer.eos_token, tokenizer.unk_token)
print(tokenizer.pad_token_id, tokenizer.eos_token_id, tokenizer.unk_token_id)

None <|endoftext|> <|endoftext|>
None 0 0


In [4]:
# config for smollm 360m
# set the pad token as eos token so that the finetuned model knows when to stop generating tokens
tokenizer.padding_side = "left"
tokenizer.pad_token = tokenizer.eos_token
tokenizer.pad_token_id = tokenizer.eos_token_id

In [5]:
print(tokenizer.pad_token, tokenizer.eos_token, tokenizer.unk_token)
print(tokenizer.pad_token_id, tokenizer.eos_token_id, tokenizer.unk_token_id)

<|endoftext|> <|endoftext|> <|endoftext|>
0 0 0


# load the base model and the fine tuned adapter

In [None]:
model_path = "lora_weights/checkpoint-500"
peft_model = AutoPeftModelForCausalLM.from_pretrained(model_path, device_map="cpu", torch_dtype=torch.float16)

print(f"Total No of params (Model + Adapter) : {peft_model.num_parameters() / 1e6} M Params")

# * merge the adapter onto the base model itself
merged_model = peft_model.merge_and_unload()

# * The adapters are merged now and it is transformers class again
print(type(merged_model))

print(merged_model.num_parameters() / 1e6, " M Params")

loading configuration file config.json from cache at /Users/harishl/.cache/huggingface/hub/models--HuggingFaceTB--SmolLM-360M/snapshots/59f7ef243ee09a72cbc14cb054393a3e3b771d41/config.json
Model config LlamaConfig {
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 0,
  "eos_token_id": 0,
  "head_dim": 64,
  "hidden_act": "silu",
  "hidden_size": 960,
  "initializer_range": 0.02,
  "intermediate_size": 2560,
  "max_position_embeddings": 2048,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 15,
  "num_hidden_layers": 32,
  "num_key_value_heads": 5,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-05,
  "rope_scaling": null,
  "rope_theta": 10000.0,
  "tie_word_embeddings": true,
  "torch_dtype": "float16",
  "transformers_version": "4.53.2",
  "use_cache": true,
  "vocab_size": 49152
}

loading weights file model.safetensors from cache at /Users/harishl/.cache/huggingface/hub/models--HuggingFaceTB-

'NoneType' object has no attribute 'cadam32bit_grad_fp32'
Total No of params (Model + Adapter) : 379.18816 M Params
<class 'transformers.models.llama.modeling_llama.LlamaForCausalLM'>
361.82112  M Params


### There are two models to experiment, merged model and adapter added model

# load the validation dataset

In [8]:
# read the valid dataset
valid_data = []
with open('data/valid.jsonl', 'r') as file:
    for line in file:
        valid_data.append(json.loads(line))


In [9]:
valid_data[1]["text"]

'<user>December 5, 2024 5pm holiday planning with Lily, Ryan, Mason on Zoom</user><output>{"action": "holiday planning", "attendees": ["Lily", "Ryan", "Mason"], "date": "05/12/2024", "duration": null, "location": "Zoom", "notes": null, "recurrence": null, "time": "5:00 PM"}</output><|endoftext|>'

# batch inference

In [10]:
count = 0
missed = 0
field_wise_acc = {key: 0 for key in valid_data[0]["output"].keys()}

for ind in range(len(valid_data)):

    input_text = f"<user>{valid_data[ind]['event_text']}</user><output>"

    inputs = tokenizer(input_text, return_tensors="pt")
    # print(f"\nInput Text : {input_text}")
    # print(f"Length of input tokens processed : {len(inputs['input_ids'][0])}")
    
    output_tokens = merged_model.generate(**inputs, max_new_tokens=200, do_sample=False, pad_token_id=tokenizer.eos_token_id)

    # print(f"Length of output tokens generated : {len(output_tokens[0]) - len(inputs['input_ids'][0])}")

    truncated_output = tokenizer.decode(output_tokens[0][len(inputs[0]):])
    output = tokenizer.decode(output_tokens[0])
    valid_data[ind]["predicted"] = output

    print(f"Output text : {output}")
    resp = output.split("</output>")[0].split("<output>")[1]

    try:
        pred = json.loads(resp)
        valid_data[ind]["predicted"] = json.dumps(pred)

    except:
        missed += 1

Output text : <user>Teams meeting to review plans March 4, 2024, 2:30 pm, 1 hr.</user><output>{"action": "Teams meeting", "attendees": null, "date": "04/03/2024", "duration": "1 hr", "location": null, "notes": null, "recurrence": null, "time": "2:30 PM"}</output><|endoftext|>
Output text : <user>December 5, 2024 5pm holiday planning with Lily, Ryan, Mason on Zoom</user><output>{"action": "holiday planning", "attendees": ["Lily", "Ryan", "Mason"], "date": "05/12/2024", "duration": null, "location": "Zoom", "notes": null, "recurrence": null, "time": "5:00 PM"}</output><|endoftext|>
Output text : <user>Review plans w/ Harry, Isla, and Zoe on June 15, 2024, 3pm</user><output>{"action": "Review plans", "attendees": ["Harry", "Isla", "Zoe"], "date": "15/06/2024", "duration": null, "location": null, "notes": null, "recurrence": null, "time": "3:00 PM"}</output><|endoftext|>
Output text : <user>2023-12-19 1 PM Presentation session with Mia at Central YMCA, 2 hours.</user><output>{"action": "Pr

In [None]:

# * save the predicted values
with open("smollm_360m_r32_la64_ckpt500_merged.jsonl", "w") as f:
    for item in valid_data:
        f.write(json.dumps(item) + "\n")

In [11]:
missed

1

# evaluation function

In [None]:
# function to calculate acc, prec and recall
def validate_char_wise(data):

    def get_values(datapoint):
        entities = set()
        # Extract the entities
        for value in datapoint.values():
            if isinstance(value, list):
                entities.update(value)
            else:
                entities.add(value)

        return entities

    count = 0
    missed = 0
    field_wise_acc = {key: 0 for key in valid_data[0]["output"].keys()}

    true_positives = 0
    false_positives = 0
    false_negatives = 0

    for ind in range(len(data)):
        try:
            pred = json.loads(data[ind]["predicted"])
        except:
            missed += 1
            pass
        gt = data[ind]["output"]

        correct = True
        for key in pred.keys():
            # * if the key is name we check for list
            if key =="attendees" and (type(pred[key]) == list) and (type(gt[key]) == list):
                # * if list is found in both,  then we sort and compare the strings.
                # * the sorting reduces error when correct names get shuffled
                gt_names = str(sorted(gt[key])).lower().strip()
                pred_names = str(sorted(pred[key])).lower().strip()
                
                if gt_names == pred_names:
                    field_wise_acc[key] += 1
                else:
                    print("Found an incorrect prediction on a name level")
                    print(f"User Input : {data[ind]['event_text']}")
                    print(f"Ground Truth : \n{gt}\n")
                    print(f"Predicted Value : \n{pred}\n\n")
                    count += 1

            else:
                # * all the other keys gets processed in this block
                # * if the attendees are none for both, then this block is executed
                if str(pred[key]).lower().strip() == str(gt[key]).lower().strip():
                    field_wise_acc[key] += 1
                else:
                    print("Found an incorrect prediction")
                    print(f"User Input : {data[ind]['event_text']}")
                    print(f"Ground Truth : \n{gt}\n")
                    print(f"Predicted Value : \n{pred}\n\n")
                    correct = False
        
        # Extract ground truths
        gt_entities = get_values(gt)
        
        # Extract predictions
        pred_entities = get_values(pred)
    
        # Calculate TP, FP, FN for each sample in test data
        true_positives += len(gt_entities & pred_entities)
        false_positives += len(pred_entities - gt_entities)
        false_negatives += len(gt_entities - pred_entities)

        if correct:
            count += 1

    acc = (count / len(data)) * 100

    if (true_positives + false_positives) > 0:
        precision = true_positives / (true_positives + false_positives)
    else:
        precision = 0
    
    if (true_positives + false_negatives) > 0:
        recall = true_positives / (true_positives + false_negatives)
    else:
        recall = 0

    return acc, precision * 100, recall * 100, field_wise_acc, missed

In [12]:
# read the valid dataset
test_data = []
path = "smollm_360m_old_ckpt300_merged.jsonl"
path = "smollm_360m_old_ckpt450_merged.jsonl"
path = "smollm_360m_r32_la64_ckpt500_peft.jsonl"
path = "smollm_360m_r32_la64_ckpt500_merged.jsonl"
with open(path, 'r') as file:
    for line in file:
        test_data.append(json.loads(line))


In [15]:
acc, precision, recall, field_wise_acc, missed = validate_char_wise(test_data)

Found an incorrect prediction
User Input : Teams meeting to review plans March 4, 2024, 2:30 pm, 1 hr.
Ground Truth : 
{'action': 'Teams meeting to review plans', 'attendees': None, 'date': '04/03/2024', 'duration': '1 hr', 'location': None, 'notes': None, 'recurrence': None, 'time': '2:30 PM'}

Predicted Value : 
{'action': 'Teams meeting', 'attendees': None, 'date': '04/03/2024', 'duration': '1 hr', 'location': None, 'notes': None, 'recurrence': None, 'time': '2:30 PM'}


Found an incorrect prediction
User Input : Organize client meeting 2024-02-12 3pm with Alex for 1 hour.
Ground Truth : 
{'action': 'Organize client meeting', 'attendees': ['Alex'], 'date': '12/02/2024', 'duration': '1 hour', 'location': None, 'notes': None, 'recurrence': None, 'time': '3:00 PM'}

Predicted Value : 
{'action': 'Organize client meeting', 'attendees': ['Alex'], 'date': '2024-02-12', 'duration': '1 hour', 'location': None, 'notes': None, 'recurrence': None, 'time': '3:00 PM'}


Found an incorrect predic

In [16]:
missed

1

In [17]:
print(f"Sample wise validation Accuracy : {acc} %")
print(f"Sample wise validation Precision : {precision} %")
print(f"Sample wise validation Recall : {recall} %")

Sample wise validation Accuracy : 73.58490566037736 %
Sample wise validation Precision : 96.0 %
Sample wise validation Recall : 95.47920433996383 %


In [18]:
print("Field wise accuracy calculation")
for key in field_wise_acc.keys():
    print(f"{key} : {(field_wise_acc[key] / len(valid_data)) * 100} %")

Field wise accuracy calculation
action : 91.19496855345912 %
attendees : 97.48427672955975 %
date : 93.71069182389937 %
duration : 96.85534591194968 %
location : 94.33962264150944 %
notes : 99.37106918238993 %
recurrence : 97.48427672955975 %
time : 96.85534591194968 %
