In [44]:
!pip install -q evaluate bert_score

In [45]:
import torch
from PIL import Image
import torch

from transformers.modeling_outputs import BaseModelOutput
import matplotlib.pyplot as plt
import numpy as np
from functools import partial
from huggingface_hub import PyTorchModelHubMixin

# Load Test Predictions

In [46]:
from datasets import load_dataset

# Load the "train" split of the three prediction datasets, one per model variant.
raw_results, fine_tune_results, smol_driver_results = (
    load_dataset(repo_id, split="train")
    for repo_id in (
        "MehdiJmlkh/SmolVLM-Results",
        "MehdiJmlkh/SmolVLM-FT-Results",
        "MehdiJmlkh/SmolDriver-Results",
    )
)

In [65]:
from datasets import load_dataset
import re


def no_tuple_pattern(example):
    """Tell whether an example's question is free of tuple-like text.

    A question counts as "tuple-like" when it contains an opening parenthesis,
    a comma, and a closing parenthesis with no other ``)`` in between
    (presumably coordinate tuples — confirm against the dataset).

    Args:
        example: Mapping with a ``"question"`` string entry.

    Returns:
        ``True`` when no such parenthesised, comma-containing group is found,
        ``False`` otherwise.
    """
    tuple_match = re.search(r"\([^)]*,[^)]*\)", example["question"])
    return tuple_match is None

# Drop every example whose question matches the tuple pattern, applying the
# same predicate to each of the three result sets.
raw_results, smol_driver_results, fine_tune_results = (
    result_set.filter(no_tuple_pattern)
    for result_set in (raw_results, smol_driver_results, fine_tune_results)
)

Filter:   0%|          | 0/1305 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1305 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1305 [00:00<?, ? examples/s]

# Load Metrics

In [59]:
import evaluate
# Metric objects shared by every Evaluate instance below.
bleu, bertscore = (evaluate.load(metric_id) for metric_id in ("bleu", "bertscore"))

In [60]:
class Evaluate:
    """Score predictions against reference answers with BLEU and BERTScore.

    Both metrics are computed once, at construction time, with the
    module-level ``bleu`` and ``bertscore`` metric objects. The results are
    cached on the instance and consumed by the reporting helpers.
    """

    def __init__(self, results, lang="en") -> None:
        """Compute and cache the metrics for ``results``.

        Args:
            results: Object indexable by the column names ``"prediction"``,
                ``"answer"`` and ``"question"``; each column is copied into a
                list.
            lang: Language code forwarded to ``bertscore.compute``. Defaults
                to ``"en"``, the value that was previously hard-coded.
        """
        self.preds = list(results["prediction"])
        self.refs = list(results["answer"])
        self.questions = list(results["question"])

        self.bleu_result = bleu.compute(predictions=self.preds, references=self.refs)
        self.bert_result = bertscore.compute(predictions=self.preds, references=self.refs, lang=lang)

    def get_avg_scores(self):
        """Return the BLEU score and the mean BERTScore precision/recall/F1.

        Returns:
            dict with the keys ``"BLEU"``, ``"BERTScore_P"``,
            ``"BERTScore_R"`` and ``"BERTScore_F1"``.
        """
        scores = {
            "BLEU": self.bleu_result["bleu"],
            "BERTScore_P": sum(self.bert_result["precision"]) / len(self.bert_result["precision"]),
            "BERTScore_R": sum(self.bert_result["recall"]) / len(self.bert_result["recall"]),
            "BERTScore_F1": sum(self.bert_result["f1"]) / len(self.bert_result["f1"]),
        }
        return scores

    def print_lowest_bert(self, score_type, k=5):
        """Print the ``k`` examples with the lowest BERTScore of one kind.

        Args:
            score_type: Key into the cached BERTScore result (the keys read
                elsewhere in this class are ``"precision"``, ``"recall"`` and
                ``"f1"``).
            k: Maximum number of examples to print.

        Raises:
            ValueError: If ``k`` is negative.
        """
        lowest_scores, lowest_indexes = self.__get_lowest_k(self.bert_result[score_type], k)

        for idx, score in zip(lowest_indexes, lowest_scores):
            print(f"{score_type} score:", score)
            print("Index:", idx)
            print("Question:", self.questions[idx])
            print("Label:", self.refs[idx])
            print("Answer:", self.preds[idx])
            print("-" * 50)

    def __get_lowest_k(self, scores, k=5):
        """Return the ``k`` smallest scores and their positions, ascending.

        Args:
            scores: Sequence of numeric scores.
            k: Maximum number of entries to return; fewer are returned when
                ``scores`` is shorter than ``k``.

        Returns:
            ``(lowest_scores, lowest_indices)`` as two lists of native Python
            numbers. Tied scores keep their original relative order.

        Raises:
            ValueError: If ``k`` is negative (a negative slice bound would
                silently drop the highest scores instead of limiting the count).
        """
        if k < 0:
            raise ValueError(f"k must be non-negative, got {k}")
        scores = np.asarray(scores)
        # A stable sort makes the ordering of tied scores deterministic.
        lowest_indices = scores.argsort(kind="stable")[:k]
        lowest_scores = scores[lowest_indices]
        # .tolist() converts NumPy scalars to plain int/float.
        return lowest_scores.tolist(), lowest_indices.tolist()

In [66]:
# Score the predictions held in raw_results and display the averaged metrics.
raw_eval = Evaluate(results=raw_results)
raw_eval.get_avg_scores()

{'BLEU': 0.08588538709157112,
 'BERTScore_P': 0.8704354751155751,
 'BERTScore_R': 0.9188978484307212,
 'BERTScore_F1': 0.8935234272617033}

In [67]:
# Score the predictions held in fine_tune_results and display the averaged metrics.
fine_tune_eval = Evaluate(results=fine_tune_results)
fine_tune_eval.get_avg_scores()

{'BLEU': 0.2143340725645085,
 'BERTScore_P': 0.9451123196959952,
 'BERTScore_R': 0.9409183467484982,
 'BERTScore_F1': 0.9427071601951716}

In [68]:
# Score the predictions held in smol_driver_results and display the averaged metrics.
smol_driver_eval = Evaluate(results=smol_driver_results)
smol_driver_eval.get_avg_scores()

{'BLEU': 0.40599144319905867,
 'BERTScore_P': 0.9723249873895755,
 'BERTScore_R': 0.9615717893816046,
 'BERTScore_F1': 0.9667517475241445}

In [69]:
# Inspect the smol_driver_eval examples with the lowest BERTScore precision.
smol_driver_eval.print_lowest_bert(score_type="precision")

precision score: 0.7873367667198181
Index: 703
Question: What is the traffic signal that the ego vehicle should pay attention to?
Label: None.
Answer: There is one pedestrian to the ego vehicle to the front left of the ego vehicle to the front left of the ego vehicle to the front left of the ego vehicle
--------------------------------------------------
precision score: 0.8320485949516296
Index: 682
Question: In this scenario, what are dangerous actions to take for the ego vehicle?
Label: Back up, brake suddenly.
Answer: Accelerate and go ahead, turn left, and turn right are dangerous actions to take for the ego vehicle in this scenario.
--------------------------------------------------
precision score: 0.8441763520240784
Index: 702
Question: Are there motorcycles without riders to the front left of the ego car?
Label: No.
Answer: There are many pedestrians to the front left of the ego car.
--------------------------------------------------
precision score: 0.846842348575592
Index: 78

In [70]:
# Inspect the smol_driver_eval examples with the lowest BERTScore recall.
smol_driver_eval.print_lowest_bert(score_type="recall")

recall score: 0.8247115612030029
Index: 882
Question: What is the target action of the ego vehicle?
Label: Brake gently to a stop.
Answer: Go straight.
--------------------------------------------------
recall score: 0.8328759670257568
Index: 757
Question: Please describe the current scene.
Label: There is one moving car to the back of the ego car, one parked car to the front of the ego car, one parked car to the front right of the ego car, many parked cars to the back of the ego car, one parked truck to the front of the ego car, one parked truck to the back right of the ego car, two moving trucks to the back of the ego car, and one moving pedestrian to the front of the ego car.
Answer: There is one moving car in front of the ego car.
--------------------------------------------------
recall score: 0.8345067501068115
Index: 920
Question: Please describe the current scene.
Label: There is one parked car to the front right of the ego car, one bicycle without a rider to the front right of

In [71]:
# Inspect the smol_driver_eval examples with the lowest BERTScore F1.
smol_driver_eval.print_lowest_bert(score_type="f1")

f1 score: 0.8466175198554993
Index: 682
Question: In this scenario, what are dangerous actions to take for the ego vehicle?
Label: Back up, brake suddenly.
Answer: Accelerate and go ahead, turn left, and turn right are dangerous actions to take for the ego vehicle in this scenario.
--------------------------------------------------
f1 score: 0.8501881957054138
Index: 1163
Question: In this scenario, what are safe actions to take for the ego vehicle?
Label: Brake suddenly, brake gently to a stop, slightly offset to the left, change to the left lane, turn left, and back up.
Answer: Keep going at the same speed, decelerate gradually without braking.
--------------------------------------------------
f1 score: 0.8514255285263062
Index: 154
Question: In this scenario, what are safe actions to take for the ego vehicle?
Label: Slightly offset to the left, and slightly offset to the right are safe actions to take for the ego vehicle in this scenario.
Answer: Keep going at the same speed, decel

---

In [72]:
# Inspect the fine_tune_eval examples with the lowest BERTScore precision.
fine_tune_eval.print_lowest_bert(score_type="precision")

precision score: 0.8336473703384399
Index: 555
Question: In this scenario, what are safe actions to take for the ego vehicle?
Label: Brake gently to a stop.
Answer: The safe actions for the ego vehicle in this scenario are to wait for other vehicles to pass and then proceed when it is safe to do so.
--------------------------------------------------
precision score: 0.8356881141662598
Index: 48
Question: Are there motorcycles without riders to the front left of the ego car?
Label: Yes.
Answer: No, there are no motorcycles without riders to the front left of the ego car.
--------------------------------------------------
precision score: 0.8358253240585327
Index: 232
Question: In this scenario, what are safe actions to take for the ego vehicle?
Label: Keep going at the same speed, decelerate gradually without braking.
Answer: The safe actions for the ego vehicle would be to proceed straight through the current traffic light and then proceed straight through the subsequent intersection c

In [73]:
# Inspect the fine_tune_eval examples with the lowest BERTScore recall.
fine_tune_eval.print_lowest_bert(score_type="recall")

recall score: 0.7843673229217529
Index: 998
Question: In this scenario, what are safe actions to take for the ego vehicle?
Label: Keep going at the same speed, accelerate and proceed, brake gently to a stop, decelerate gradually without braking, slightly offset to the left, change to the left lane, turn left.
Answer: The ego vehicle should proceed forward.
--------------------------------------------------
recall score: 0.790134072303772
Index: 1210
Question: In this scenario, what are dangerous actions to take for the ego vehicle?
Label: Brake suddenly, brake gently to a stop, back up, change to the right lane, change to the left lane, turn right, turn left.
Answer: The ego vehicle should not take any dangerous actions.
--------------------------------------------------
recall score: 0.8142334222793579
Index: 882
Question: What is the target action of the ego vehicle?
Label: Brake gently to a stop.
Answer: Go.
--------------------------------------------------
recall score: 0.81581550

In [74]:
# Inspect the fine_tune_eval examples with the lowest BERTScore F1.
fine_tune_eval.print_lowest_bert(score_type="f1")

f1 score: 0.8189147710800171
Index: 1210
Question: In this scenario, what are dangerous actions to take for the ego vehicle?
Label: Brake suddenly, brake gently to a stop, back up, change to the right lane, change to the left lane, turn right, turn left.
Answer: The ego vehicle should not take any dangerous actions.
--------------------------------------------------
f1 score: 0.8251954913139343
Index: 998
Question: In this scenario, what are safe actions to take for the ego vehicle?
Label: Keep going at the same speed, accelerate and proceed, brake gently to a stop, decelerate gradually without braking, slightly offset to the left, change to the left lane, turn left.
Answer: The ego vehicle should proceed forward.
--------------------------------------------------
f1 score: 0.8312506675720215
Index: 555
Question: In this scenario, what are safe actions to take for the ego vehicle?
Label: Brake gently to a stop.
Answer: The safe actions for the ego vehicle in this scenario are to wait f