In [None]:
!pip install -q evaluate bert_score

In [None]:
import torch
from transformers import AutoProcessor, AutoModelForImageTextToText
from PIL import Image
import torch

from transformers.modeling_outputs import BaseModelOutput
import matplotlib.pyplot as plt
import numpy as np
from functools import partial
from huggingface_hub import PyTorchModelHubMixin

# Load Test Predictions

In [None]:
from datasets import load_dataset

raw_results = load_dataset("MehdiJmlkh/SmolVLM-Results")
fine_tune_results = load_dataset("MehdiJmlkh/SmolVLM-FT-Results")
smol_driver_results = load_dataset("MehdiJmlkh/SmolDriver-Results")

In [None]:
import matplotlib.pyplot as plt

index = 0

def display_sample_and_output(index, model):
    sample = test_dataset["test"][index]

    def display_image(image, title):
        plt.imshow(image)
        plt.axis('off')
        plt.title(title)

    plt.figure(figsize=(20, 8))
    cameras = ['CAM_FRONT_LEFT', 'CAM_FRONT', 'CAM_FRONT_RIGHT', 'CAM_BACK_LEFT', 'CAM_BACK', 'CAM_BACK_RIGHT']
    for i, camera in enumerate(cameras):
        plt.subplot(2, 3, i + 1)
        display_image(sample['images'][camera], camera)
    plt.tight_layout()
    plt.show()

    for key, value in sample.items():
        if type(value) is str:
            print(f"{key}: {value}")

    output = model.generate(**test_ds[index].to("cuda", dtype=torch.bfloat16), max_new_tokens=32)
    generated_texts = processor.batch_decode(output, skip_special_tokens=True)
    print("model answer: " + generated_texts[0].split("Assistant: ")[-1])

In [None]:
import evaluate

def get_scores(results):
    bleu = evaluate.load("bleu")
    bertscore = evaluate.load("bertscore")
    
    preds = list(results["prediction"])
    refs = list(results["answer"])
    
    bleu_result = bleu.compute(predictions=preds, references=refs)
    
    bert_result = bertscore.compute(predictions=preds, references=refs, lang="en")
    
    scores = {
          "BLEU": bleu_result["bleu"],
          "BERTScore_P": sum(bert_result["precision"]) / len(bert_result["precision"]),
          "BERTScore_R": sum(bert_result["recall"]) / len(bert_result["recall"]),
          "BERTScore_F1": sum(bert_result["f1"]) / len(bert_result["f1"])
    }
    
    return scores 

In [None]:
bert_result

{'precision': [0.878101646900177,
  0.889178991317749,
  0.8323870897293091,
  0.8557569980621338,
  0.8509165048599243,
  0.8319003582000732,
  0.9160628318786621,
  0.8792918920516968,
  0.9305887818336487,
  0.8836015462875366,
  0.8922060132026672,
  0.8323870897293091,
  0.8716104030609131,
  0.8462576866149902,
  0.8437557220458984,
  0.8531808257102966,
  0.8417874574661255,
  0.9014779329299927,
  0.8403003811836243,
  0.8247056007385254,
  0.8766394853591919,
  0.8511621952056885,
  0.8526315689086914,
  0.8862093687057495,
  0.8505718111991882,
  0.8567560911178589,
  0.9469602108001709,
  0.8992578983306885,
  0.8377057909965515,
  0.8515875339508057,
  0.8985310792922974,
  0.9178350567817688,
  0.8381560444831848,
  0.8540467619895935,
  0.8459174633026123,
  0.8873710632324219,
  0.9116246700286865,
  0.8510081171989441,
  0.8349460363388062,
  0.9079267978668213,
  0.9226725697517395,
  0.840327799320221,
  0.8559938073158264,
  0.8928244113922119,
  0.849031925201416,
 

In [None]:
bleu_result

{'bleu': 0.09006756129883932,
 'precisions': [0.2525930445393533,
  0.09616634178037686,
  0.06045865184155664,
  0.04480955937266617],
 'brevity_penalty': 1.0,
 'length_ratio': 1.5959104186952289,
 'translation_length': 1639,
 'reference_length': 1027}

In [None]:
scores

{'BLEU': 0.09006756129883932,
 'BERTScore_P': 0.8692312473058701,
 'BERTScore_R': 0.9168985331058502,
 'BERTScore_F1': 0.8917563009262085}

In [None]:
import numpy as np

def get_lowest_k(scores, k=5):
    scores = np.array(scores)
    lowest_indices = scores.argsort()[:k]
    lowest_scores = scores[lowest_indices]
    return list(lowest_scores), list(lowest_indices)

low_prec, low_prec_idx = get_lowest_k(bert_result["precision"])
low_rec, low_rec_idx   = get_lowest_k(bert_result["recall"])
low_f1, low_f1_idx     = get_lowest_k(bert_result["f1"])

print(f"Lowest 5 BERTScore Precision: {low_prec} at indices {low_prec_idx}")
print(f"Lowest 5 BERTScore Recall: {low_rec} at indices {low_rec_idx}")
print(f"Lowest 5 BERTScore F1: {low_f1} at indices {low_f1_idx}")
print(f"Lowest 5 BELU F1: {low_f1} at indices {low_f1_idx}")

Lowest 5 BERTScore Precision: [np.float64(0.8247056007385254), np.float64(0.8268681764602661), np.float64(0.8319003582000732), np.float64(0.8323870897293091), np.float64(0.8323870897293091)] at indices [np.int64(19), np.int64(50), np.int64(5), np.int64(11), np.int64(2)]
Lowest 5 BERTScore Recall: [np.float64(0.7762110233306885), np.float64(0.7776010036468506), np.float64(0.8244717121124268), np.float64(0.8257663249969482), np.float64(0.8266342282295227)] at indices [np.int64(9), np.int64(7), np.int64(94), np.int64(38), np.int64(25)]
Lowest 5 BERTScore F1: [np.float64(0.8253257870674133), np.float64(0.8264322280883789), np.float64(0.8303307890892029), np.float64(0.8312388062477112), np.float64(0.8414256572723389)] at indices [np.int64(7), np.int64(9), np.int64(38), np.int64(19), np.int64(25)]
Lowest 5 BELU F1: [np.float64(0.8253257870674133), np.float64(0.8264322280883789), np.float64(0.8303307890892029), np.float64(0.8312388062477112), np.float64(0.8414256572723389)] at indices [np.int

In [None]:
for idx in low_prec_idx:
    print(f"Lowest precision label {idx}: " + refs[idx] + "\nLowest precision answer: " + preds[idx])

Lowest precision label 19: Keep going at the same speed, slightly offset to the left.
Lowest precision answer: In this scenario, the safe actions for the ego vehicle would be to follow the traffic rules, such as stopping at the designated stop line, waiting
Lowest precision label 50: Yes.
Lowest precision answer: There are no motorcycles without riders to the front left of the ego car.
Lowest precision label 5: None.
Lowest precision answer: The traffic signal that the ego vehicle should pay attention to is the one on the right side of the road.
Lowest precision label 11: Yes.
Lowest precision answer: There are no moving pedestrians to the back left of the ego car.
Lowest precision label 2: Yes.
Lowest precision answer: There are no moving pedestrians to the back left of the ego car.


In [None]:
for idx in low_rec_idx:
    print(f"Lowest recall label {idx}: " + refs[idx] + "\nLowest recall answer: " + preds[idx])

Lowest recall label 9: There are many traffic elements in the front view. The information of these traffic elements is [(road sign, go straight, 1010.76, 618.58, 1186.13, 720.46), (road sign, turn right, 1010.76, 618.58, 1186.13, 720.46), (road sign, go straight, 733.46, 511.74, 764.7, 519.99), (road sign, turn left, 733.46, 511.74, 764.7, 519.99), (road sign, turn right, 733.46, 511.74, 764.7, 519.99)].
Lowest recall answer: The traffic elements in the front view are:
1. A black car on the left side of the road.
2. A blue car
Lowest recall label 7: There are many traffic elements in the front view. The information of these traffic elements is [(road sign, go straight, 940.32, 543.31, 985.43, 563.8), (road sign, turn right, 940.32, 543.31, 985.43, 563.8), (road sign, go straight, 879.14, 506.74, 893.64, 513.52), (road sign, turn right, 879.14, 506.74, 893.64, 513.52)].
Lowest recall answer: The traffic elements in the front view are:
1. A car on the road, categorized as a vehicle, with

In [None]:
for idx in low_f1_idx:
    print(f"Lowest F1 label {idx}: " + refs[idx] + "\nLowest F1 answer: " + preds[idx])

Lowest F1 label 7: There are many traffic elements in the front view. The information of these traffic elements is [(road sign, go straight, 940.32, 543.31, 985.43, 563.8), (road sign, turn right, 940.32, 543.31, 985.43, 563.8), (road sign, go straight, 879.14, 506.74, 893.64, 513.52), (road sign, turn right, 879.14, 506.74, 893.64, 513.52)].
Lowest F1 answer: The traffic elements in the front view are:
1. A car on the road, categorized as a vehicle, with a status of moving,
Lowest F1 label 9: There are many traffic elements in the front view. The information of these traffic elements is [(road sign, go straight, 1010.76, 618.58, 1186.13, 720.46), (road sign, turn right, 1010.76, 618.58, 1186.13, 720.46), (road sign, go straight, 733.46, 511.74, 764.7, 519.99), (road sign, turn left, 733.46, 511.74, 764.7, 519.99), (road sign, turn right, 733.46, 511.74, 764.7, 519.99)].
Lowest F1 answer: The traffic elements in the front view are:
1. A black car on the left side of the road.
2. A blue

In [21]:
bert_result

{'precision': [0.8747718334197998,
  0.9641767144203186,
  1.0,
  0.8708701729774475,
  0.9999995231628418,
  1.000000238418579,
  0.9563087224960327,
  0.8079574108123779,
  0.9830548167228699,
  0.8036064505577087,
  1.0,
  1.0,
  0.9999998807907104,
  0.9945030808448792,
  0.8430725336074829,
  0.9945030808448792,
  0.8852244019508362,
  0.9455838203430176,
  0.9999995231628418,
  0.8296487331390381,
  0.9471440315246582,
  0.9945030808448792,
  1.0,
  0.9612855911254883,
  0.9999995231628418,
  0.9124995470046997,
  0.9865326881408691,
  0.9352561831474304,
  0.9999995231628418,
  1.0,
  0.9552502632141113,
  1.0000001192092896,
  0.9999995231628418,
  1.0,
  0.8734886050224304,
  0.9820294380187988,
  0.9479755163192749,
  1.0,
  0.8326952457427979,
  1.0000001192092896,
  0.9999998807907104,
  1.0,
  0.8867825865745544,
  0.973015546798706,
  1.0,
  1.0,
  0.9308980107307434,
  0.9177933931350708,
  0.9999995231628418,
  1.0,
  0.9945030808448792,
  0.9469122886657715,
  0.929520

In [22]:
bleu_result

{'bleu': 0.20444531587037948,
 'precisions': [0.6416184971098265,
  0.39864864864864863,
  0.25609756097560976,
  0.18493150684931506],
 'brevity_penalty': 0.616249080175129,
 'length_ratio': 0.6738072054527751,
 'translation_length': 692,
 'reference_length': 1027}

In [23]:
scores

{'BLEU': 0.20444531587037948,
 'BERTScore_P': 0.9595827841758728,
 'BERTScore_R': 0.9487088853120804,
 'BERTScore_F1': 0.9539469361305237}

In [24]:
import numpy as np

def get_lowest_k(scores, k=5):
    scores = np.array(scores)
    lowest_indices = scores.argsort()[:k]
    lowest_scores = scores[lowest_indices]
    return list(lowest_scores), list(lowest_indices)

low_prec, low_prec_idx = get_lowest_k(bert_result["precision"])
low_rec, low_rec_idx   = get_lowest_k(bert_result["recall"])
low_f1, low_f1_idx     = get_lowest_k(bert_result["f1"])

print(f"Lowest 5 BERTScore Precision: {low_prec} at indices {low_prec_idx}")
print(f"Lowest 5 BERTScore Recall: {low_rec} at indices {low_rec_idx}")
print(f"Lowest 5 BERTScore F1: {low_f1} at indices {low_f1_idx}")
print(f"Lowest 5 BELU F1: {low_f1} at indices {low_f1_idx}")

Lowest 5 BERTScore Precision: [np.float64(0.7797519564628601), np.float64(0.8028346300125122), np.float64(0.8036064505577087), np.float64(0.8079574108123779), np.float64(0.8296487331390381)] at indices [np.int64(65), np.int64(94), np.int64(9), np.int64(7), np.int64(19)]
Lowest 5 BERTScore Recall: [np.float64(0.771761417388916), np.float64(0.7768005132675171), np.float64(0.8044871687889099), np.float64(0.8302687406539917), np.float64(0.8433520197868347)] at indices [np.int64(9), np.int64(7), np.int64(94), np.int64(38), np.int64(65)]
Lowest 5 BERTScore F1: [np.float64(0.7873620390892029), np.float64(0.7920726537704468), np.float64(0.8036600947380066), np.float64(0.8103059530258179), np.float64(0.8314802050590515)] at indices [np.int64(9), np.int64(7), np.int64(94), np.int64(65), np.int64(38)]
Lowest 5 BELU F1: [np.float64(0.7873620390892029), np.float64(0.7920726537704468), np.float64(0.8036600947380066), np.float64(0.8103059530258179), np.float64(0.8314802050590515)] at indices [np.int6

In [25]:
for idx in low_prec_idx:
    print(f"Lowest precision label {idx}: " + refs[idx] + "\nLowest precision answer: " + preds[idx])

Lowest precision label 65: Keep going at the same speed, decelerate gradually without braking.
Lowest precision answer: Keep going straight, go left, go right, go straight, go right, go straight, go right, go straight, go straight, go
Lowest precision label 94: There are two traffic elements in the front view. The information of these traffic elements is [(road sign, go straight, 980.11, 559.58, 1026.43, 578.19), (road sign, turn left, 980.11, 559.58, 1026.43, 578.19)].
Lowest precision answer: [('CAM_FRONT_LEFT', 'S', 100.0, 100.0, 10
Lowest precision label 9: There are many traffic elements in the front view. The information of these traffic elements is [(road sign, go straight, 1010.76, 618.58, 1186.13, 720.46), (road sign, turn right, 1010.76, 618.58, 1186.13, 720.46), (road sign, go straight, 733.46, 511.74, 764.7, 519.99), (road sign, turn left, 733.46, 511.74, 764.7, 519.99), (road sign, turn right, 733.46, 511.74, 764.7, 519.99)].
Lowest precision answer: [('car', 'd', 100, 100

In [26]:
for idx in low_rec_idx:
    print(f"Lowest recall label {idx}: " + refs[idx] + "\nLowest recall answer: " + preds[idx])

Lowest recall label 9: There are many traffic elements in the front view. The information of these traffic elements is [(road sign, go straight, 1010.76, 618.58, 1186.13, 720.46), (road sign, turn right, 1010.76, 618.58, 1186.13, 720.46), (road sign, go straight, 733.46, 511.74, 764.7, 519.99), (road sign, turn left, 733.46, 511.74, 764.7, 519.99), (road sign, turn right, 733.46, 511.74, 764.7, 519.99)].
Lowest recall answer: [('car', 'd', 100, 100, 100, 100), ('car
Lowest recall label 7: There are many traffic elements in the front view. The information of these traffic elements is [(road sign, go straight, 940.32, 543.31, 985.43, 563.8), (road sign, turn right, 940.32, 543.31, 985.43, 563.8), (road sign, go straight, 879.14, 506.74, 893.64, 513.52), (road sign, turn right, 879.14, 506.74, 893.64, 513.52)].
Lowest recall answer: [('car', 'd', 100, 100, 100, 100), ('car
Lowest recall label 94: There are two traffic elements in the front view. The information of these traffic elements i

In [27]:
for idx in low_f1_idx:
    print(f"Lowest F1 label {idx}: " + refs[idx] + "\nLowest F1 answer: " + preds[idx])

Lowest F1 label 9: There are many traffic elements in the front view. The information of these traffic elements is [(road sign, go straight, 1010.76, 618.58, 1186.13, 720.46), (road sign, turn right, 1010.76, 618.58, 1186.13, 720.46), (road sign, go straight, 733.46, 511.74, 764.7, 519.99), (road sign, turn left, 733.46, 511.74, 764.7, 519.99), (road sign, turn right, 733.46, 511.74, 764.7, 519.99)].
Lowest F1 answer: [('car', 'd', 100, 100, 100, 100), ('car
Lowest F1 label 7: There are many traffic elements in the front view. The information of these traffic elements is [(road sign, go straight, 940.32, 543.31, 985.43, 563.8), (road sign, turn right, 940.32, 543.31, 985.43, 563.8), (road sign, go straight, 879.14, 506.74, 893.64, 513.52), (road sign, turn right, 879.14, 506.74, 893.64, 513.52)].
Lowest F1 answer: [('car', 'd', 100, 100, 100, 100), ('car
Lowest F1 label 94: There are two traffic elements in the front view. The information of these traffic elements is [(road sign, go st

In [28]:
refs

['There is one moving car in front of the ego car, one moving car behind the ego car, and three parked cars in front of the ego car.',
 'There are many cars and one pedestrian in front of the ego car.',
 'Yes.',
 'The ego vehicle is going straight. The ego vehicle is not moving.',
 'No.',
 'None.',
 'The pedestrian to the front left of the ego car is moving.',
 'There are many traffic elements in the front view. The information of these traffic elements is [(road sign, go straight, 940.32, 543.31, 985.43, 563.8), (road sign, turn right, 940.32, 543.31, 985.43, 563.8), (road sign, go straight, 879.14, 506.74, 893.64, 513.52), (road sign, turn right, 879.14, 506.74, 893.64, 513.52)].',
 'The pedestrian in front of the ego car is standing.',
 'There are many traffic elements in the front view. The information of these traffic elements is [(road sign, go straight, 1010.76, 618.58, 1186.13, 720.46), (road sign, turn right, 1010.76, 618.58, 1186.13, 720.46), (road sign, go straight, 733.46, 

In [29]:
preds

['There are two pedestrians, one car, and one fire hydrant in front of the ego vehicle. There are two cars, one pedestrian, and',
 'There are two cars in front of the ego car.',
 'Yes.',
 'The ego vehicle will stop and wait for the traffic ahead to move.',
 'No.',
 'None.',
 'One pedestrian is standing.',
 "[('car', 'd', 100, 100, 100, 100), ('car",
 'One pedestrian is standing.',
 "[('car', 'd', 100, 100, 100, 100), ('car",
 'One pedestrian is standing.',
 'Yes.',
 'One car is moving.',
 'Yes.',
 'Two cars are parked on the side of the road. One car is in front of the other. There are two pedestrians. One pedestrian is in',
 'Yes.',
 'Keep going straight, go right, and go left.',
 'There are many pedestrians to the back of the ego car.',
 'No.',
 'Keep the ego vehicle at a safe distance from the following vehicle, and keep the ego vehicle at a safe distance from the left and right lane boundaries',
 'Many pedestrians are moving.',
 'Yes.',
 'Yes.',
 'One bus is moving, and one bus is 