In [7]:
import os, sys

PROJECT_ROOT = os.path.abspath('..')
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

from llm_providers.models.Answer import *
from llm_providers.views import ollama as provider
from llm_providers.models.ollama import OllamaOptions

In [2]:
import pandas as pd
import numpy as np
from pathlib import Path

In [3]:
eval_df = pd.read_csv('../../data/dataset-1/data_eval.csv')
eval_df.head()

Unnamed: 0,question,answer,image_id
0,what is the colour of the bag on the chair,pink,image399
1,what is at the right bottom,table,image1341
2,what are found on the rack,toy,image1320
3,what is left of printer,mirror,image529
4,what is the colour of television,black,image201


In [4]:
from tqdm import tqdm

predictions, references = list(), list()
data_gen = eval_df.iterrows()

for idx, row in tqdm(data_gen, desc="LLM answering", total=len(eval_df)):

    answer = provider.answer(
        query = ImageAnswer(
            query=row['question'],
            paths=[Path('../../data/dataset-1/images').resolve() / f"{row['image_id']}.png"],
            other_dict=[{
                'role': 'system',
                'content': "\n".join([
                    "# Role",
                    "You are an AI assitant, that answer on user question for provided image",
                    "",
                    "# Instructions",
                    "1. Answer short and clear.",
                    f"2. Answer in {len(row['answer'].split(' '))} word, that exactly answer on user question",
                    "",
                ])
            }]
        ),
        model="ministral-3:14b",
        options=OllamaOptions(
            temperature=0
        )
    )
    
    predictions.append(answer.answer)
    references.append(row['answer'])

LLM answering: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2494/2494 [08:32<00:00,  4.87it/s]


In [5]:
from evaluate import load 

bertscore = load('bertscore')

results = bertscore.compute(
    predictions=predictions, 
    references=references, 
    model_type="distilbert-base-uncased", 
    rescale_with_baseline=True, 
    lang='en'
)


  from .autonotebook import tqdm as notebook_tqdm


In [6]:
metrics = {
    "Precision": np.mean(results['precision']).item(),
    "Reccal": np.mean(results['recall']).item(),
    "F1": np.mean(results['f1']).item(),
}
metrics

{'Precision': 0.3031761201779303,
 'Reccal': 0.3751325358685759,
 'F1': 0.3364317751703809}