All model evaluations use the 'gemini-2.5-flash-preview-04-17' model as the LLM evaluator.

Evaluating Baseline qwen-2.5 3B model output







In [None]:
import json
from typing import Any, Dict, List
import ntpath
import evals
import csv
import xml.etree.ElementTree as ET
import tqdm
import statistics

# Banner printed at the top of this cell's output to label the run (3B baseline).
print("Baseline 3B")
def parse_jsonl(
    file_path: str,
    image_dir: str = 'Food Images/Food Images/',
) -> List[Dict[str, Any]]:
    """
    Parse a JSONL file where each non-blank line is a JSON object containing:
      - image_path: str
      - recipe_xml: str
      - ing_vecs: List[List[float]]
      - step_vecs: List[List[float]]

    Args:
        file_path: Path of the UTF-8 encoded JSONL file to read.
        image_dir: Prefix prepended to the base name of each ``image_path``.
            Defaults to the directory the notebook has always used.

    Returns:
        One dictionary per non-blank line with the keys above. ``image_path``
        is rewritten to ``image_dir`` + the file's base name; a missing key
        falls back to ``''`` (strings) or ``[]`` (vectors).

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    entries: List[Dict[str, Any]] = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                # Tolerate blank separator / trailing lines.
                continue
            record = json.loads(line)
            # ntpath.basename splits on both '/' and '\\', so the base name is
            # extracted regardless of which separator the stored path uses.
            image_name = ntpath.basename(record.get('image_path', ''))
            entries.append({
                'image_path': image_dir + image_name,
                'recipe_xml': record.get('recipe_xml', ''),
                'ing_vecs': record.get('ing_vecs', []),
                'step_vecs': record.get('step_vecs', []),
            })
    return entries

# Reference dev-set examples, one dict per JSONL line.
data = parse_jsonl('/Users/BenChung/Desktop/CSCI 467/vision-r1/data/dev_stage/dev_pairs_vec.jsonl')

# Raw model responses, taken from the "response" column of the generated CSV
# in row order.
csv_path = '/Users/BenChung/Desktop/CSCI 467/Generated Recipes Baseline 3B.csv'
with open(csv_path, newline='', encoding='utf-8') as csvfile:
    results = [row.get("response", "") for row in csv.DictReader(csvfile)]

model_xml_outputs = results

# Responses are paired with examples purely by position, so fail fast (before
# any evaluation work is done) if there are not enough responses to go around.
if len(model_xml_outputs) < len(data):
    raise ValueError(
        f"{csv_path!r} has {len(model_xml_outputs)} responses "
        f"but the dataset has {len(data)} examples"
    )

aggregate_evals = []

# total= gives tqdm a length, so it renders a real progress bar with an ETA
# instead of a bare iteration counter.
for model_xml, example in tqdm.tqdm(zip(model_xml_outputs, data), total=len(data)):
    temp = evals.compute_evals(model_xml, example)
    if temp is None:
        # A None result is skipped and left out of the aggregates.
        continue
    aggregate_evals.append(temp)
    print("cosine_similarity: ")
    print(temp['cosine_similarity'])
    print("bleu_scores: ")
    print(temp["bleu_scores"])
    print("rouge_scores: ")
    print(temp['rouge_scores'])
    print("llm_evaluation: ")
    print(temp['llm_evaluation'])

# Flatten the per-example result dicts into one list per metric.
cos_steps       = [e['cosine_similarity']['steps']       for e in aggregate_evals]
cos_ing         = [e['cosine_similarity']['ingredients'] for e in aggregate_evals]
bleu_steps      = [e['bleu_scores']['steps']             for e in aggregate_evals]
bleu_ing        = [e['bleu_scores']['ingredients']       for e in aggregate_evals]

rouge_s1_steps  = [e['rouge_scores']['steps']['rouge1']       for e in aggregate_evals]
rougeL_steps    = [e['rouge_scores']['steps']['rougeL']       for e in aggregate_evals]
rouge_s1_ing    = [e['rouge_scores']['ingredients']['rouge1'] for e in aggregate_evals]
rougeL_ing      = [e['rouge_scores']['ingredients']['rougeL'] for e in aggregate_evals]

feas_scores     = [e['llm_evaluation']['feasibility_score']   for e in aggregate_evals]

def stats(vals: List[float]) -> Dict[str, float]:
    """Summarise *vals* as its mean and population standard deviation.

    Returns a dict with the keys ``'mean'`` and ``'std'``.
    """
    average = statistics.mean(vals)
    spread = statistics.pstdev(vals)
    return {'mean': average, 'std': spread}

# Dump every per-example result, followed by two blank lines.
print(aggregate_evals, end="\n\n\n")
print("Final averages and std deviations:")

# Mean / std (as computed by stats) of each metric across the evaluated examples.
new_dict = {
    'cosine_similarity': {
        'steps':       stats(cos_steps),
        'ingredients': stats(cos_ing),
    },
    'bleu_scores': {
        'steps':       stats(bleu_steps),
        'ingredients': stats(bleu_ing)
    },
    'rouge_scores': {
        'steps': {
            'rouge1': stats(rouge_s1_steps),
            'rougeL': stats(rougeL_steps),
        },
        'ingredients': {
            'rouge1': stats(rouge_s1_ing),
            'rougeL': stats(rougeL_ing),
        }
    },
    'llm_feasibility_score': stats(feas_scores)
}

# Each line is "<label><value>"; sep="" keeps the label and value adjacent.
print("cosine_similarity:\n\tsteps: ", new_dict['cosine_similarity']['steps'], sep="")
print("\tingredients: ", new_dict['cosine_similarity']['ingredients'], sep="")
print("bleu_scores:\n\tsteps: ", new_dict['bleu_scores']['steps'], sep="")
print("\tingredients: ", new_dict['bleu_scores']['ingredients'], sep="")
print("rouge_scores:\n\tsteps: ", new_dict['rouge_scores']['steps'], sep="")
print("\tingredients: ", new_dict['rouge_scores']['ingredients'], sep="")
print("llm_feasibility_score: ", new_dict['llm_feasibility_score'], sep="")



Baseline 3B


0it [00:00, ?it/s]

Calculating cosine similarity...
Calculating BLEU scores...
Calculating ROUGE scores...
Calculating LLM evaluation...


1it [00:12, 12.96s/it]

cosine_similarity: 
{'steps': np.float64(0.5906911656512902), 'ingredients': np.float64(0.5511275781087667)}
bleu_scores: 
{'steps': 1.1970014442835451e-05, 'ingredients': 0.04539285423716415}
rouge_scores: 
{'steps': {'rouge1': 0.10783855384135889, 'rougeL': 0.08991740688795387}, 'ingredients': {'rouge1': 0.4327283827283827, 'rougeL': 0.41487123987123986}}
llm_evaluation: 
{'feasibility_score': np.float64(0.3), 'comment': 'The predicted recipe is a plausible method for making a simple cheesy pasta with tomato sauce, but it significantly deviates from the ground truth recipe and the dish depicted in the image. Crucially, it omits the lentils, which are a central component visible in the photo and included in the ground truth. The cooking method is also too short to develop a rich sauce like a bolognese, whether meat-based or lentil-based. It would not produce the dish shown.', 'full': '{\n  "evaluation": {\n    "feasibility_score": "0.3",\n    "comment": "The predicted recipe is a plau

2it [00:20, 10.04s/it]

cosine_similarity: 
{'steps': np.float64(0.36140391475406075), 'ingredients': np.float64(0.6867100856673856)}
bleu_scores: 
{'steps': 0.0002497639499834126, 'ingredients': 0.14447203951651358}
rouge_scores: 
{'steps': {'rouge1': 0.09511847736713955, 'rougeL': 0.07920213353052034}, 'ingredients': {'rouge1': 0.5859410430839002, 'rougeL': 0.5859410430839002}}
llm_evaluation: 
{'feasibility_score': np.float64(0.0), 'comment': 'The predicted recipe for Tomato and Onion Salad describes a completely different dish from the one shown in the image and the ground-truth recipe. The image clearly shows sliced beets, likely Chioggia beets given the internal rings, with scallions and mint, while the predicted recipe lists cherry tomatoes, red onions, and basil. The preparation methods also differ significantly, requiring cooking the beets in the ground truth versus assembling raw ingredients in the prediction. As a result, the predicted recipe is entirely infeasible for creating the dish pictured.',

3it [00:29,  9.24s/it]

cosine_similarity: 
{'steps': np.float64(0.4850524208011245), 'ingredients': np.float64(0.5742135179575927)}
bleu_scores: 
{'steps': 0.0080304356005434, 'ingredients': 0.0443742811749824}
rouge_scores: 
{'steps': {'rouge1': 0.18349198596186547, 'rougeL': 0.11887566281140578}, 'ingredients': {'rouge1': 0.5068542568542569, 'rougeL': 0.5068542568542569}}
llm_evaluation: 
{'feasibility_score': np.float64(0.1), 'comment': 'The predicted recipe is for a pastry tart with a baked crust topped with berries. This fundamentally differs from the image and ground truth, which depict a rectangular slice with a creamy, baked ricotta/cream cheese filling. The ingredients, base type, shape, and overall structure of the resulting dish would be entirely different. While the predicted recipe is practical for making a tart, it is highly infeasible for producing the specific dish shown in the image.', 'full': '{\n "evaluation": {\n  "feasibility_score": "0.1",\n  "comment": "The predicted recipe is for a pa

4it [00:38,  9.15s/it]

cosine_similarity: 
{'steps': np.float64(0.5066600267324357), 'ingredients': np.float64(0.521930743070376)}
bleu_scores: 
{'steps': 0.0006061530378332202, 'ingredients': 0.028711828241620333}
rouge_scores: 
{'steps': {'rouge1': 0.08582779483148983, 'rougeL': 0.06956994787854422}, 'ingredients': {'rouge1': 0.3178921932768087, 'rougeL': 0.3025075778921933}}
llm_evaluation: 
{'feasibility_score': np.float64(0.9), 'comment': "The predicted recipe for slow-cooked beef short ribs aligns remarkably well with the visual cues in the image, showing tender, braised meat in a rich sauce served over a creamy base. The cooking method, ingredients like beef ribs, broth, and aromatics, and the resulting texture are consistent with the picture. The ground-truth recipe, for seared steak, is completely unrelated to the image. While the specific horseradish cream flavor isn't visible, the overall process would yield a dish that looks like the one shown.", 'full': '{\n "evaluation": {\n  "feasibility_score

5it [00:47,  9.21s/it]

cosine_similarity: 
{'steps': np.float64(0.51783706951212), 'ingredients': np.float64(0.6542937480264422)}
bleu_scores: 
{'steps': 0.0011466893136224913, 'ingredients': 0.0917684438374025}
rouge_scores: 
{'steps': {'rouge1': 0.14855851336987808, 'rougeL': 0.12450089371374885}, 'ingredients': {'rouge1': 0.5849816849816849, 'rougeL': 0.5849816849816849}}
llm_evaluation: 
{'feasibility_score': np.float64(0.4), 'comment': 'The predicted recipe is a functional recipe for a basic creamy potato salad, but it significantly deviates from the image and ground truth recipe. It misses key ingredients clearly visible in the image and present in the ground truth, such as peas, celery, and the unpeeled red potato skins. The dressing ingredients and herbs (dill and green onions vs. chives, mustard, garlic) also differ substantially. While feasible as a standalone recipe, it would not produce the specific dish shown.', 'full': '{\n "evaluation": {\n  "feasibility_score": "0.4",\n  "comment": "The predi

6it [00:54,  8.53s/it]

cosine_similarity: 
{'steps': np.float64(0.49452328114811267), 'ingredients': np.float64(0.4647877634680921)}
bleu_scores: 
{'steps': 0.007515458254027316, 'ingredients': 0.028051332165836856}
rouge_scores: 
{'steps': {'rouge1': 0.08057029177718833, 'rougeL': 0.07194960212201593}, 'ingredients': {'rouge1': 0.27889928698752225, 'rougeL': 0.27889928698752225}}
llm_evaluation: 
{'feasibility_score': np.float64(0.1), 'comment': 'The predicted recipe is completely different from the ground-truth recipe and the image. It lists ingredients like mixed greens, tomatoes, feta, and almonds, none of which are visible in the image or present in the correct recipe. The image clearly shows jicama matchsticks and specific types of leaves like parsley and celery. While the predicted recipe describes a feasible salad in isolation, it would produce a dish entirely different from the one pictured, rendering it infeasible for this specific target.', 'full': '{\n "evaluation": {\n  "feasibility_score": "0.1

8it [01:01,  5.90s/it]

cosine_similarity: 
{'steps': np.float64(0.46592400723845945), 'ingredients': np.float64(0.5563828082275293)}
bleu_scores: 
{'steps': 5.535426726445417e-08, 'ingredients': 0.04402798630829202}
rouge_scores: 
{'steps': {'rouge1': 0.05610288895506898, 'rougeL': 0.04258067546280199}, 'ingredients': {'rouge1': 0.4781144781144781, 'rougeL': 0.4781144781144781}}
llm_evaluation: 
{'feasibility_score': np.float64(0.0), 'comment': 'The predicted recipe describes a chocolate mousse tart, which is completely different from the sour cream apple pie with crumble topping shown in the image and detailed in the ground truth recipe. The ingredients and cooking methods bear no resemblance. The predicted recipe is entirely incorrect for the given visual and ground truth context. The described baking method for the mousse/ganache mixture is also questionable for producing a standard tart texture.', 'full': '{\n  "evaluation": {\n    "feasibility_score": "0.0",\n    "comment": "The predicted recipe describ

9it [01:06,  5.73s/it]

cosine_similarity: 
{'steps': np.float64(0.4485385688668613), 'ingredients': np.float64(0.4142722552416016)}
bleu_scores: 
{'steps': 0.0010537073956837998, 'ingredients': 0.03624273864023976}
rouge_scores: 
{'steps': {'rouge1': 0.14051545668863147, 'rougeL': 0.09048235786443395}, 'ingredients': {'rouge1': 0.2760989010989011, 'rougeL': 0.2760989010989011}}
llm_evaluation: 
{'feasibility_score': np.float64(0.0), 'comment': 'The predicted recipe is for Chicken Stir-Fry, which is completely different from the Crab Cakes shown in the image and described in the ground truth recipe. The ingredients, cooking method, and final dish appearance bear no resemblance. Therefore, the predicted recipe would not realistically produce the dish pictured. This significant mismatch results in a feasibility score of 0.0.', 'full': '{\n  "evaluation": {\n    "feasibility_score": "0.0",\n    "comment": "The predicted recipe is for Chicken Stir-Fry, which is completely different from the Crab Cakes shown in th

10it [01:15,  6.64s/it]

cosine_similarity: 
{'steps': np.float64(0.47231994549680156), 'ingredients': np.float64(0.5222306831588549)}
bleu_scores: 
{'steps': 0.0008750873633471262, 'ingredients': 0.048623383699987754}
rouge_scores: 
{'steps': {'rouge1': 0.13061563605952217, 'rougeL': 0.0839217775307374}, 'ingredients': {'rouge1': 0.37979560122417266, 'rougeL': 0.3593874379588665}}
llm_evaluation: 
{'feasibility_score': np.float64(0.35), 'comment': "The predicted recipe provides a feasible method for making braised pork chops with a lentil base. However, the cooking method (braising) and ingredients (herbs, vinegar) are fundamentally different from the ground truth (marinating, broiling with dark sauces like hoisin). This difference means the predicted recipe will not produce the dark, glazed, and potentially charred exterior of the pork chop seen in the image. While the lentil base might align somewhat, the main protein's appearance is mismatched, making it infeasible for replicating the pictured dish.", 'ful

11it [01:27,  8.16s/it]

cosine_similarity: 
{'steps': np.float64(0.5104701980883879), 'ingredients': np.float64(0.5547673690559293)}
bleu_scores: 
{'steps': 4.993261286762248e-12, 'ingredients': 0.07055323632400982}
rouge_scores: 
{'steps': {'rouge1': 0.04084440981399883, 'rougeL': 0.033477488193310895}, 'ingredients': {'rouge1': 0.38905677655677656, 'rougeL': 0.38905677655677656}}
llm_evaluation: 
{'feasibility_score': np.float64(0.4), 'comment': "The predicted recipe provides a basic structure for chicken enchiladas, but it significantly diverges from both the ground truth and the image. The image shows a rich, red sauce and melted white cheese, whereas the predicted recipe uses cheddar cheese (orange) and lists 'diced tomatoes' and missing 'enchilada sauce'. The instruction to cut corn tortillas into 'half-moon shapes' is impractical for rolling enchiladas as shown in the image, which typically uses whole tortillas. While it's a feasible recipe for *some* baked chicken dish, it would not realistically prod

12it [01:35,  8.03s/it]

cosine_similarity: 
{'steps': np.float64(0.5720687912208332), 'ingredients': np.float64(0.8107030705401455)}
bleu_scores: 
{'steps': 1.1611073351654431e-07, 'ingredients': 0.15299592963828967}
rouge_scores: 
{'steps': {'rouge1': 0.060244907443508584, 'rougeL': 0.04396401715796678}, 'ingredients': {'rouge1': 0.727579365079365, 'rougeL': 0.6859126984126984}}
llm_evaluation: 
{'feasibility_score': np.float64(0.1), 'comment': 'This predicted recipe would not produce the dish shown in the image or described in the ground truth. The image shows a free-form galette with arranged sliced fruit, while the predicted recipe describes a tart baked in a pan with a liquid filling and contains no fruit ingredients. The method and ingredients are fundamentally different from both the visual target and the ground-truth recipe, making it highly impractical for the intended dish.', 'full': '{\n "evaluation": {\n  "feasibility_score": "0.1",\n  "comment": "This predicted recipe would not produce the dish s

13it [01:43,  8.11s/it]

cosine_similarity: 
{'steps': np.float64(0.5515232839144343), 'ingredients': np.float64(0.6432431979727075)}
bleu_scores: 
{'steps': 0.0005477229225639828, 'ingredients': 0.05098860776721022}
rouge_scores: 
{'steps': {'rouge1': 0.12872680168388306, 'rougeL': 0.09399613606218206}, 'ingredients': {'rouge1': 0.493104514533086, 'rougeL': 0.493104514533086}}
llm_evaluation: 
{'feasibility_score': np.float64(0.1), 'comment': 'The predicted recipe describes a cranberry bake made from a flour-based batter, which is fundamentally different from the bread-based strata/pudding shown in the image and described in the ground truth. The core ingredients (flour vs. bread) and preparation method (mixing batter vs. soaking bread) do not align at all. While the predicted recipe might produce an edible bake, it would not resemble the visual appearance or texture of the dish presented. This lack of alignment results in a very low feasibility score.', 'full': '{\n  "evaluation": {\n    "feasibility_score":

14it [01:52,  8.43s/it]

cosine_similarity: 
{'steps': np.float64(0.4891379507798324), 'ingredients': np.float64(0.6893801367039929)}
bleu_scores: 
{'steps': 7.230084049043369e-06, 'ingredients': 0.14281667813575974}
rouge_scores: 
{'steps': {'rouge1': 0.06332235455574461, 'rougeL': 0.04484243123157995}, 'ingredients': {'rouge1': 0.5693034238488784, 'rougeL': 0.5693034238488784}}
llm_evaluation: 
{'feasibility_score': np.float64(0.0), 'comment': 'This predicted recipe describes a Pumpkin Cheesecake, which is completely unrelated to the Lemon Custard Pie in the ground truth and the distinct desserts shown in the image. The ingredients listed (cream cheese, pumpkin) and the steps outlined do not correspond to any of the items pictured. The ingredient list is confusingly repetitive, and the instructions are vague regarding the layering process. This recipe is entirely infeasible for reproducing the visual content or aligning with the ground truth.', 'full': '{\n "evaluation": {\n  "feasibility_score": "0.0",\n  "

15it [02:01,  8.46s/it]

cosine_similarity: 
{'steps': np.float64(0.5268101968988936), 'ingredients': np.float64(0.6931299582538545)}
bleu_scores: 
{'steps': 8.170910549755091e-05, 'ingredients': 0.06987383280368291}
rouge_scores: 
{'steps': {'rouge1': 0.11500567532385064, 'rougeL': 0.07934836963423927}, 'ingredients': {'rouge1': 0.5887445887445888, 'rougeL': 0.5887445887445888}}
llm_evaluation: 
{'feasibility_score': np.float64(0.35), 'comment': "The predicted recipe would produce a mustard, but it significantly deviates from the ground truth and the likely product shown in the image. It lacks key ingredients like eggs, brown sugar, and honey, which are crucial for the thickness, specific sweetness, and richness of the target mustard. The simple boil/simmer method is insufficient to achieve the thick, smooth, emulsified texture expected from an egg-based recipe cooked via double boiler, as in the ground truth. While it's not entirely infeasible as a basic mustard, it won't replicate the specific dish pictured

17it [02:07,  5.90s/it]

cosine_similarity: 
{'steps': np.float64(0.579536517301945), 'ingredients': np.float64(0.41306274479236027)}
bleu_scores: 
{'steps': 3.8616373748248454e-10, 'ingredients': 0.06247873011072971}
rouge_scores: 
{'steps': {'rouge1': 0.062086895413743036, 'rougeL': 0.0493808374856841}, 'ingredients': {'rouge1': 0.2814814814814815, 'rougeL': 0.2814814814814815}}
llm_evaluation: 
{'feasibility_score': np.float64(0.0), 'comment': 'The predicted recipe is completely misaligned with the image and ground truth. It describes a Simple Caesar Salad, using ingredients like romaine lettuce, croutons, and Parmesan cheese. The image and ground truth clearly show a Thai Beef Salad with steak, butter lettuce, cucumber, tomatoes, onion, herbs, and peanuts, dressed in a chile-lime sauce. The predicted recipe would produce a entirely different dish, making it completely infeasible for creating the salad pictured.', 'full': '{\n "evaluation": {\n  "feasibility_score": "0.0",\n  "comment": "The predicted recip

18it [02:15,  6.46s/it]

cosine_similarity: 
{'steps': np.float64(0.5613000236520391), 'ingredients': np.float64(0.5664145607110855)}
bleu_scores: 
{'steps': 2.726652657747824e-07, 'ingredients': 0.05211236971927989}
rouge_scores: 
{'steps': {'rouge1': 0.06558047464878089, 'rougeL': 0.050570620631680205}, 'ingredients': {'rouge1': 0.4028860028860029, 'rougeL': 0.4028860028860029}}
llm_evaluation: 
{'feasibility_score': np.float64(0.3), 'comment': "The predicted recipe for Bacon and Tomato Salad does not realistically produce the dish shown in the image. While it includes bacon and tomatoes, it completely misses the lettuce, the creamy horseradish mayonnaise sauce, and the crispy breadcrumbs (or nuts) clearly visible. The predicted recipe's dressing is a vinaigrette with honey, and it adds feta cheese, which are not present in the visual or the ground truth. Furthermore, the image shows components laid out for assembly, whereas the predicted recipe describes a simple mixed salad.", 'full': '{\n "evaluation": {\

20it [02:23,  7.20s/it]

cosine_similarity: 
{'steps': np.float64(0.5014523289865175), 'ingredients': np.float64(0.5461490578487844)}
bleu_scores: 
{'steps': 0.0025732887108971805, 'ingredients': 0.06466914919557828}
rouge_scores: 
{'steps': {'rouge1': 0.16875404239745323, 'rougeL': 0.12632188368622477}, 'ingredients': {'rouge1': 0.42875679875679873, 'rougeL': 0.42875679875679873}}
llm_evaluation: 
{'feasibility_score': np.float64(0.2), 'comment': "The predicted recipe is feasible as a standalone recipe, but it completely fails to match the image or the ground-truth recipe. The image and ground truth feature green and wax beans, while the prediction uses white beans, fundamentally altering the dish's appearance and texture. It also uses a different mix of herbs and a lemon-honey dressing instead of the vinegar-based one with shallots and basil specified in the ground truth. Although the steps are logical for the ingredients listed, the core ingredient discrepancy makes it irrelevant to the intended dish.", 'fu




Evaluating Baseline qwen-2.5 7B model output







In [4]:
import json
from typing import Any, Dict, List
import ntpath
import evals
import csv
import xml.etree.ElementTree as ET
import tqdm
import statistics

# Banner printed at the top of this cell's output; this cell evaluates the
# 7B baseline CSV, so the label must say 7B.
print("Baseline 7B")
def parse_jsonl(file_path: str) -> List[Dict[str, Any]]:
    """
    Read a JSONL file and return one dictionary per non-blank line.

    Each line is expected to be a JSON object with:
      - image_path: str
      - recipe_xml: str
      - ing_vecs: List[List[float]]
      - step_vecs: List[List[float]]

    The returned ``image_path`` is 'Food Images/Food Images/' followed by the
    base name of the stored path; missing keys fall back to '' or [].
    """
    records: List[Dict[str, Any]] = []
    with open(file_path, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            stripped = raw_line.strip()
            if stripped:
                payload = json.loads(stripped)
                # ntpath.basename accepts both '/' and '\\' as separators.
                file_name = ntpath.basename(payload.get('image_path', ''))
                records.append({
                    'image_path': 'Food Images/Food Images/' + file_name,
                    'recipe_xml': payload.get('recipe_xml', ''),
                    'ing_vecs': payload.get('ing_vecs', []),
                    'step_vecs': payload.get('step_vecs', []),
                })
    return records

# Reference dev-set examples, one dict per JSONL line.
data = parse_jsonl('/Users/BenChung/Desktop/CSCI 467/vision-r1/data/dev_stage/dev_pairs_vec.jsonl')

# Raw model responses, taken from the "response" column of the generated CSV
# in row order.
csv_path = '/Users/BenChung/Desktop/CSCI 467/Generated Recipes Baseline 7B.csv'
with open(csv_path, newline='', encoding='utf-8') as csvfile:
    results = [row.get("response", "") for row in csv.DictReader(csvfile)]

model_xml_outputs = results

# Responses are paired with examples purely by position, so fail fast (before
# any evaluation work is done) if there are not enough responses to go around.
if len(model_xml_outputs) < len(data):
    raise ValueError(
        f"{csv_path!r} has {len(model_xml_outputs)} responses "
        f"but the dataset has {len(data)} examples"
    )

aggregate_evals = []

# total= gives tqdm a length, so it renders a real progress bar with an ETA
# instead of a bare iteration counter.
for model_xml, example in tqdm.tqdm(zip(model_xml_outputs, data), total=len(data)):
    temp = evals.compute_evals(model_xml, example)
    if temp is None:
        # A None result is skipped and left out of the aggregates.
        continue
    aggregate_evals.append(temp)
    print("cosine_similarity: ")
    print(temp['cosine_similarity'])
    print("bleu_scores: ")
    print(temp["bleu_scores"])
    print("rouge_scores: ")
    print(temp['rouge_scores'])
    print("llm_evaluation: ")
    print(temp['llm_evaluation'])

# Flatten the per-example result dicts into one list per metric.
cos_steps       = [e['cosine_similarity']['steps']       for e in aggregate_evals]
cos_ing         = [e['cosine_similarity']['ingredients'] for e in aggregate_evals]
bleu_steps      = [e['bleu_scores']['steps']             for e in aggregate_evals]
bleu_ing        = [e['bleu_scores']['ingredients']       for e in aggregate_evals]

rouge_s1_steps  = [e['rouge_scores']['steps']['rouge1']       for e in aggregate_evals]
rougeL_steps    = [e['rouge_scores']['steps']['rougeL']       for e in aggregate_evals]
rouge_s1_ing    = [e['rouge_scores']['ingredients']['rouge1'] for e in aggregate_evals]
rougeL_ing      = [e['rouge_scores']['ingredients']['rougeL'] for e in aggregate_evals]

feas_scores     = [e['llm_evaluation']['feasibility_score']   for e in aggregate_evals]

def stats(vals: List[float]) -> Dict[str, float]:
    """Return the mean and population standard deviation of *vals*.

    The result maps ``'mean'`` and ``'std'`` to the respective values.
    """
    summary: Dict[str, float] = {}
    summary['mean'] = statistics.mean(vals)
    summary['std'] = statistics.pstdev(vals)
    return summary

# Full list of per-example results, then two blank lines before the summary.
print(aggregate_evals, end="\n\n\n")
print("Final averages and std deviations:")

# Aggregate statistics (via stats) for every metric collected above.
new_dict = {
    'cosine_similarity': {
        'steps':       stats(cos_steps),
        'ingredients': stats(cos_ing),
    },
    'bleu_scores': {
        'steps':       stats(bleu_steps),
        'ingredients': stats(bleu_ing)
    },
    'rouge_scores': {
        'steps': {
            'rouge1': stats(rouge_s1_steps),
            'rougeL': stats(rougeL_steps),
        },
        'ingredients': {
            'rouge1': stats(rouge_s1_ing),
            'rougeL': stats(rougeL_ing),
        }
    },
    'llm_feasibility_score': stats(feas_scores)
}

# Label and value are emitted by a single print; sep="" avoids a stray space.
print("cosine_similarity:\n\tsteps: ", new_dict['cosine_similarity']['steps'], sep="")
print("\tingredients: ", new_dict['cosine_similarity']['ingredients'], sep="")
print("bleu_scores:\n\tsteps: ", new_dict['bleu_scores']['steps'], sep="")
print("\tingredients: ", new_dict['bleu_scores']['ingredients'], sep="")
print("rouge_scores:\n\tsteps: ", new_dict['rouge_scores']['steps'], sep="")
print("\tingredients: ", new_dict['rouge_scores']['ingredients'], sep="")
print("llm_feasibility_score: ", new_dict['llm_feasibility_score'], sep="")



Baseline 3B


0it [00:00, ?it/s]

Calculating cosine similarity...
Calculating BLEU scores...
Calculating ROUGE scores...
Calculating LLM evaluation...


1it [00:09,  9.70s/it]

cosine_similarity: 
{'steps': np.float64(0.5041167389064803), 'ingredients': np.float64(0.6091995971775844)}
bleu_scores: 
{'steps': 1.764921602125214e-05, 'ingredients': 0.06920452110926026}
rouge_scores: 
{'steps': {'rouge1': 0.10603896705918883, 'rougeL': 0.08000717791625267}, 'ingredients': {'rouge1': 0.4761327561327562, 'rougeL': 0.4332756132756133}}
llm_evaluation: 
{'feasibility_score': np.float64(0.5), 'comment': 'The predicted recipe is a feasible set of instructions for making a standard Macaroni with Meat Sauce. However, it significantly deviates from the ground-truth recipe which is a Lentil Bolognese, using lentils instead of meat. While the visual result of a chunky sauce with cheese could be similar, the predicted recipe specifies elbow macaroni, which differs from the shaped pasta shown and suggested in the ground truth. The method is practical, but the core ingredients and specific pasta shape are mismatched compared to the provided context.', 'full': '{\n  "evaluation

2it [00:17,  8.33s/it]

cosine_similarity: 
{'steps': np.float64(0.34157418424376984), 'ingredients': np.float64(0.7566647438236264)}
bleu_scores: 
{'steps': 7.098492336939158e-05, 'ingredients': 0.16482224523239655}
rouge_scores: 
{'steps': {'rouge1': 0.07758031605326578, 'rougeL': 0.06210648856907961}, 'ingredients': {'rouge1': 0.6520104895104895, 'rougeL': 0.6520104895104895}}
llm_evaluation: 
{'feasibility_score': np.float64(0.1), 'comment': "The predicted recipe for 'Tomato and Olive Salad' is completely misaligned with the image and ground truth recipe. The image and ground truth clearly depict a beet salad (specifically Chioggia beets with scallions and mint), while the prediction focuses on tomatoes and olives which are not visible. While the predicted recipe is a plausible salad on its own, it would not produce the dish shown. The core ingredients are entirely different, making it highly infeasible as a prediction for this visual.", 'full': '{\n "evaluation": {\n  "feasibility_score": "0.1",\n  "comm

3it [00:23,  7.42s/it]

cosine_similarity: 
{'steps': np.float64(0.434488264617425), 'ingredients': np.float64(0.5647867286602131)}
bleu_scores: 
{'steps': 0.0024165133126875867, 'ingredients': 0.04534788592998151}
rouge_scores: 
{'steps': {'rouge1': 0.17026389986135682, 'rougeL': 0.10929151233812424}, 'ingredients': {'rouge1': 0.4455555555555556, 'rougeL': 0.4455555555555556}}
llm_evaluation: 
{'feasibility_score': np.float64(0.2), 'comment': 'The predicted recipe describes making a puff pastry tart with baked fruit. This is a completely different dish than the one shown in the image and described by the ground truth recipe, which is a baked ricotta and cream cheese slice with fresh berries. The predicted recipe is missing the essential ingredients for the creamy filling seen in the image. While the steps are coherent for making *a* tart, it would not produce the pictured item, making it highly inaccurate.', 'full': '{\n  "evaluation": {\n    "feasibility_score": "0.2",\n    "comment": "The predicted recipe 

4it [00:32,  8.16s/it]

cosine_similarity: 
{'steps': np.float64(0.5614863138232877), 'ingredients': np.float64(0.611314261682544)}
bleu_scores: 
{'steps': 0.002014273776539634, 'ingredients': 0.04188849352804676}
rouge_scores: 
{'steps': {'rouge1': 0.1384866921679066, 'rougeL': 0.09316565096767501}, 'ingredients': {'rouge1': 0.3632100223009314, 'rougeL': 0.3450282041191132}}
llm_evaluation: 
{'feasibility_score': np.float64(0.95), 'comment': 'As a chef, I can say the predicted recipe for Beef Stew is a much better match for the image than the ground-truth recipe. The ingredients and slow-cooking method outlined in the predicted recipe are perfect for achieving the tender, saucy beef cuts shown. The image depicts slow-cooked meat over a creamy base, which aligns well with serving beef stew over mashed potatoes or polenta, even though the recipe includes potatoes in the stew itself. This is a practical and standard recipe that would realistically produce a dish very similar to the one pictured, hence the high 

5it [00:39,  7.78s/it]

cosine_similarity: 
{'steps': np.float64(0.7000409698281352), 'ingredients': np.float64(0.6913137859286291)}
bleu_scores: 
{'steps': 0.0006172451946478969, 'ingredients': 0.06923940623335964}
rouge_scores: 
{'steps': {'rouge1': 0.1684607622362817, 'rougeL': 0.13699411316687427}, 'ingredients': {'rouge1': 0.5277563706135134, 'rougeL': 0.5277563706135134}}
llm_evaluation: 
{'feasibility_score': np.float64(0.6), 'comment': 'The predicted recipe provides a feasible method for making a red potato salad, aligning with the general appearance of the image in terms of potatoes, creamy dressing, and chives. The steps are practical and follow a logical order. However, it deviates significantly from the ground-truth recipe and image by omitting key ingredients like peas, celery, vinegar, Dijon mustard, garlic, and cayenne pepper. The absence of peas and celery, which appear visible in the image, makes it less accurate for this specific dish despite producing a functional potato salad.', 'full': '{

6it [00:45,  7.23s/it]

cosine_similarity: 
{'steps': np.float64(0.5807718451565093), 'ingredients': np.float64(0.57668648329934)}
bleu_scores: 
{'steps': 0.024162862805171253, 'ingredients': 0.02086974602538369}
rouge_scores: 
{'steps': {'rouge1': 0.2414548917095606, 'rougeL': 0.19912684938151828}, 'ingredients': {'rouge1': 0.37460407239819005, 'rougeL': 0.37460407239819005}}
llm_evaluation: 
{'feasibility_score': np.float64(0.1), 'comment': "The predicted recipe is completely unrelated to the image and the ground-truth recipe. The ingredients (bean sprouts, cabbage, cilantro, soy dressing) would produce a very different salad than the one pictured, which features jicama, parsley, celery leaves, and a light vinaigrette. The title 'Asian Noodle Salad' also doesn't match the image or content. While the predicted recipe is executable as a standalone recipe, it fails entirely to represent the target dish, hence the very low score.", 'full': '{\n "evaluation": {\n  "feasibility_score": "0.1",\n  "comment": "The p

7it [00:51,  6.66s/it]

cosine_similarity: 
{'steps': np.float64(0.512392293117682), 'ingredients': np.float64(0.5517665622981096)}
bleu_scores: 
{'steps': 0.00029064620063013407, 'ingredients': 0.12180542114279798}
rouge_scores: 
{'steps': {'rouge1': 0.08237266679734823, 'rougeL': 0.06069004976117803}, 'ingredients': {'rouge1': 0.580643166357452, 'rougeL': 0.580643166357452}}
llm_evaluation: 
{'feasibility_score': np.float64(0.1), 'comment': 'The predicted recipe is for glazed ham, which is entirely different from the prime rib roast shown in the image and described in the ground truth recipe. The protein, ingredients, and cooking method are completely mismatched. While the predicted recipe itself is feasible for making glazed ham, it would not realistically produce the dish depicted. The score reflects this significant discrepancy.', 'full': '{\n "evaluation": {\n  "feasibility_score": "0.1",\n  "comment": "The predicted recipe is for glazed ham, which is entirely different from the prime rib roast shown in

8it [00:56,  6.30s/it]

cosine_similarity: 
{'steps': np.float64(0.5319784275470252), 'ingredients': np.float64(0.7690200166711301)}
bleu_scores: 
{'steps': 1.9005407146006656e-07, 'ingredients': 0.10795181070674881}
rouge_scores: 
{'steps': {'rouge1': 0.0540345864136524, 'rougeL': 0.03912238854393223}, 'ingredients': {'rouge1': 0.6706349206349206, 'rougeL': 0.6706349206349206}}
llm_evaluation: 
{'feasibility_score': np.float64(0.0), 'comment': "The predicted recipe for 'Chocolate Coffee Cake' is completely unrelated to the image and the ground truth recipe, which describes a 'Sour Cream Apple Pie'. The ingredients, cooking steps, and resulting dish are fundamentally different. The predicted recipe would produce a dark chocolate cake, not the light-colored pie slice with crumb topping shown in the image. Therefore, it is completely infeasible to produce the dish in the image using the predicted recipe, resulting in a score of 0.0.", 'full': '{\n  "evaluation": {\n    "feasibility_score": "0.0",\n    "comment"

9it [01:01,  5.63s/it]

cosine_similarity: 
{'steps': np.float64(0.43264199023034405), 'ingredients': np.float64(0.44352407139942807)}
bleu_scores: 
{'steps': 0.0007232165944561758, 'ingredients': 0.043447302703249764}
rouge_scores: 
{'steps': {'rouge1': 0.09233023301013983, 'rougeL': 0.06447256750391746}, 'ingredients': {'rouge1': 0.3254761904761905, 'rougeL': 0.3254761904761905}}
llm_evaluation: 
{'feasibility_score': np.float64(0.0), 'comment': 'As a chef, I can clearly see the predicted recipe describes grilled chicken and roasted vegetables, which is a completely different dish from the pan-fried patties (crab cakes) visible in the image and described in the ground-truth recipe. The ingredients, cooking method, and final appearance described in the prediction bear no resemblance to the image or the correct recipe. While the predicted recipe itself is feasible as a standalone dish, it is entirely infeasible as a representation of the food shown or intended by the ground truth. Therefore, the score is 0.0.

10it [01:10,  6.79s/it]

cosine_similarity: 
{'steps': np.float64(0.5114130718317998), 'ingredients': np.float64(0.5048339839068134)}
bleu_scores: 
{'steps': 0.0002687991230665661, 'ingredients': 0.05216256487826041}
rouge_scores: 
{'steps': {'rouge1': 0.09974263123869424, 'rougeL': 0.07364376655715238}, 'ingredients': {'rouge1': 0.47835497835497837, 'rougeL': 0.47835497835497837}}
llm_evaluation: 
{'feasibility_score': np.float64(0.3), 'comment': "The predicted recipe describes a smoked pork chop, which is a completely different cooking method than the broiling used in the ground truth recipe and suggested by the image's appearance. It lacks the significant marinade present in the ground truth, crucial for the flavor and dark, glazed exterior seen in the image. The side dish components also differ from what is shown. While feasible as a method to cook a *different* pork dish, these instructions would not produce the specific dish pictured or described by the ground truth recipe.", 'full': '{\n "evaluation": {

11it [01:19,  7.34s/it]

cosine_similarity: 
{'steps': np.float64(0.5931877552430282), 'ingredients': np.float64(0.6540957423783647)}
bleu_scores: 
{'steps': 2.7501934613476825e-11, 'ingredients': 0.06911214306664767}
rouge_scores: 
{'steps': {'rouge1': 0.04868944570947897, 'rougeL': 0.03868114004929701}, 'ingredients': {'rouge1': 0.5022079772079772, 'rougeL': 0.46517094017094013}}
llm_evaluation: 
{'feasibility_score': np.float64(0.9), 'comment': "The predicted recipe outlines a standard and feasible method for making chicken enchiladas that would visually resemble the dish in the image. It correctly includes the core components: meat filling, red sauce, melted cheese, and toppings like sour cream and lime. While it differs significantly from the complex homemade sauce and specific cheeses in the ground truth recipe, it's a perfectly valid and common approach to making enchiladas. The main visual discrepancy might be the potential for smaller corn tortillas (implied by the size) versus the larger ones potenti

12it [01:29,  8.21s/it]

cosine_similarity: 
{'steps': np.float64(0.5413646405516991), 'ingredients': np.float64(0.6849335084113308)}
bleu_scores: 
{'steps': 2.3936168588458133e-09, 'ingredients': 0.08386212788125048}
rouge_scores: 
{'steps': {'rouge1': 0.04713327853144912, 'rougeL': 0.037999095943438865}, 'ingredients': {'rouge1': 0.5998128855271713, 'rougeL': 0.5998128855271713}}
llm_evaluation: 
{'feasibility_score': np.float64(0.3), 'comment': "The predicted recipe's ingredients are plausible for an apple filling and it matches the fruit shown in the image, unlike the ground truth. However, it describes making a standard pie in a pre-made crust rather than the galette (freeform tart) shown, which requires shaping and folding the crust edges over the filling. The steps do not align with creating the specific dish form visible in the image. While the ingredients are feasible for an apple dessert, the method doesn't reproduce the visual target.", 'full': '{\n "evaluation": {\n  "feasibility_score": "0.3",\n  

13it [01:40,  8.97s/it]

cosine_similarity: 
{'steps': np.float64(0.6660604858129598), 'ingredients': np.float64(0.6085160626877024)}
bleu_scores: 
{'steps': 0.002631957649914767, 'ingredients': 0.05095617071740641}
rouge_scores: 
{'steps': {'rouge1': 0.171491065200307, 'rougeL': 0.10979582633294509}, 'ingredients': {'rouge1': 0.5102922077922079, 'rougeL': 0.4902922077922078}}
llm_evaluation: 
{'feasibility_score': np.float64(0.25), 'comment': 'While the predicted recipe correctly identifies the core concept of a baked bread dish and uses some similar base ingredients like bread and milk, it has significant inaccuracies. The recipe calls for cranberries instead of the visible raspberries and lacks the prominent lemon flavour seen in the image. Most critically, the predicted recipe uses only one egg for a loaf of bread and milk, which is far too little to create the rich, custardy texture evident in the image. This recipe would likely result in a soggy, poorly set dish very different from the picture.', 'full':

14it [01:45,  7.81s/it]

cosine_similarity: 
{'steps': np.float64(0.833767801096166), 'ingredients': np.float64(0.6428968099676031)}
bleu_scores: 
{'steps': 0.0009504760095586216, 'ingredients': 0.08240339839119883}
rouge_scores: 
{'steps': {'rouge1': 0.1581439393939394, 'rougeL': 0.09753787878787878}, 'ingredients': {'rouge1': 0.5535353535353535, 'rougeL': 0.5535353535353535}}
llm_evaluation: 
{'feasibility_score': np.float64(0.0), 'comment': 'This predicted recipe bears no resemblance to the ground truth lemon custard pie or any of the other desserts shown in the image. The ingredients list seems like a random collection of baking items, including baking powder and flour which are not used in the ground truth recipe. The instructions repeatedly list the same generic steps for making three identical pies, which is not how you create three different desserts. This recipe is completely infeasible for producing the dishes depicted or described in the ground truth.', 'full': '{\n  "evaluation": {\n    "feasibilit

15it [01:51,  7.42s/it]

cosine_similarity: 
{'steps': np.float64(0.6017933703783644), 'ingredients': np.float64(0.6258855452696291)}
bleu_scores: 
{'steps': 9.834348658179556e-05, 'ingredients': 0.042445936820560845}
rouge_scores: 
{'steps': {'rouge1': 0.09984725937581955, 'rougeL': 0.07409200104904275}, 'ingredients': {'rouge1': 0.46226551226551227, 'rougeL': 0.46226551226551227}}
llm_evaluation: 
{'feasibility_score': np.float64(0.35), 'comment': "The predicted recipe uses significantly different ingredients (whole mustard seeds instead of powder and eggs) and a much simpler cooking method (simmering vs. double boiler with eggs). This fundamental difference in ingredients and technique means the predicted recipe would likely produce a grainy or pasty mustard, not the smooth, emulsified texture visible in the image and described by the ground-truth recipe which uses eggs as an emulsifier and thickener. While it's a feasible way to make *a* mustard, it wouldn't replicate the specific dish shown or the ground-

16it [01:58,  7.19s/it]

cosine_similarity: 
{'steps': np.float64(0.4130829981186912), 'ingredients': np.float64(0.5012589854996744)}
bleu_scores: 
{'steps': 0.0018052154458724939, 'ingredients': 0.04777765641015811}
rouge_scores: 
{'steps': {'rouge1': 0.1640366801657124, 'rougeL': 0.11482722773045352}, 'ingredients': {'rouge1': 0.41038961038961036, 'rougeL': 0.34978354978354975}}
llm_evaluation: 
{'feasibility_score': np.float64(0.0), 'comment': "The predicted recipe for 'Grilled Fennel and Apple Salad' would not produce the dish shown in the image or described in the ground truth recipe. The image clearly shows a fresh, uncooked salad with frisée, thinly sliced white vegetable (like kohlrabi or celery root), apple, and chives, dressed with a vinaigrette. The predicted recipe uses different key ingredients (fennel and parsley instead of kohlrabi/celery root and chives) and involves grilling the main components, which would result in a visually and texturally different dish from the crisp, fresh salad depicted

17it [02:04,  6.93s/it]

cosine_similarity: 
{'steps': np.float64(0.6838857662055151), 'ingredients': np.float64(0.5758073499365471)}
bleu_scores: 
{'steps': 5.930146839781371e-07, 'ingredients': 0.06994161931927398}
rouge_scores: 
{'steps': {'rouge1': 0.0836317598119009, 'rougeL': 0.07203231446313378}, 'ingredients': {'rouge1': 0.3801892551892552, 'rougeL': 0.3801892551892552}}
llm_evaluation: 
{'feasibility_score': np.float64(0.3), 'comment': "This predicted recipe describes a functional salad, but it fails significantly to match the dish depicted in the image and the ground truth. The most critical omission is the beef, which is central to a 'Thai Beef Salad' and clearly visible. The inclusion of avocado and the use of romaine lettuce also diverge from the source material. While the dressing shares some Asian flavors, the overall ingredient list and lack of meat preparation steps make it unfeasible for recreating the intended dish.", 'full': '{\n  "evaluation": {\n    "feasibility_score": "0.3",\n    "comme

18it [02:12,  7.13s/it]

cosine_similarity: 
{'steps': np.float64(0.5162384494284241), 'ingredients': np.float64(0.5716005077553936)}
bleu_scores: 
{'steps': 2.3083644219448536e-06, 'ingredients': 0.06326664735685866}
rouge_scores: 
{'steps': {'rouge1': 0.06920681415797984, 'rougeL': 0.04950435076024543}, 'ingredients': {'rouge1': 0.4258494031221304, 'rougeL': 0.4258494031221304}}
llm_evaluation: 
{'feasibility_score': np.float64(0.1), 'comment': 'The predicted recipe is entirely unrelated to the dish shown in the image and the ground-truth recipe. It describes a grilled steak salad with different ingredients (steak, spinach, feta, vinaigrette) and a tossed preparation method, which does not match the deconstructed layout and components visible in the image (bacon strips, lettuce leaves, creamy sauce, breadcrumbs/nuts). While the predicted recipe is technically feasible as a standalone dish, it completely fails to replicate or align with the provided context.', 'full': '{\n "evaluation": {\n  "feasibility_scor

19it [02:21,  7.77s/it]

cosine_similarity: 
{'steps': np.float64(0.7282422788137265), 'ingredients': np.float64(0.6427167822005774)}
bleu_scores: 
{'steps': 6.611363028745495e-05, 'ingredients': 0.06442929602381638}
rouge_scores: 
{'steps': {'rouge1': 0.13606696011482647, 'rougeL': 0.09194080058151566}, 'ingredients': {'rouge1': 0.42954267954267955, 'rougeL': 0.4035686535686536}}
llm_evaluation: 
{'feasibility_score': np.float64(0.65), 'comment': "The predicted recipe is feasible for grilling a steak with herb butter, but it doesn't fully capture the specific method and visual result shown in the image and described in the ground truth. The ground truth uses a 'board dressing' applied to the sliced steak, coating the pieces, whereas the predicted recipe suggests serving the herb butter on the side. The specified grilling time in the predicted recipe also seems short for a thick steak often used in such preparations, potentially not achieving the char or doneness seen in the image. While functional, it's not a

20it [02:29,  7.46s/it]

cosine_similarity: 
{'steps': np.float64(0.5483549516360793), 'ingredients': np.float64(0.621614139660321)}
bleu_scores: 
{'steps': 0.002667752782945187, 'ingredients': 0.06663059344033823}
rouge_scores: 
{'steps': {'rouge1': 0.19132738792690057, 'rougeL': 0.15471997823475628}, 'ingredients': {'rouge1': 0.5056203056203056, 'rougeL': 0.5056203056203056}}
llm_evaluation: 
{'feasibility_score': np.float64(0.3), 'comment': "This predicted recipe deviates significantly from the ground-truth recipe and the image. It omits key ingredients like wax beans and shallots, which are visible and listed in the original. It also adds ingredients like cherry tomatoes, parsley, and Dijon mustard, which are not present. The instruction for preparing the beans is vague and doesn't align with properly cooking fresh beans as described in the ground truth. While it outlines making a salad with a dressing, it would not produce the specific dish shown in the image or described by the original recipe due to the




Evaluating Baseline qwen-2.5 32B model output







In [5]:
import json
from typing import Any, Dict, List
import ntpath
import evals
import csv
import xml.etree.ElementTree as ET
import tqdm
import statistics

# Banner naming the model whose outputs this cell evaluates. The cell reads
# 'Generated Recipes Baseline 32B.csv', so the label must say 32B (it was
# previously copy-pasted from the 3B cell and mislabeled the results).
print("Baseline 32B")
def parse_jsonl(file_path: str) -> List[Dict[str, Any]]:
    """
    Load a JSONL file (one JSON object per line) into a list of entries.

    Blank / whitespace-only lines are skipped. Each returned dictionary has
    exactly these keys:
      - image_path: 'Food Images/Food Images/' + the base name of the
        record's 'image_path' (directories are stripped with ntpath, so both
        '\\' and '/' separators are handled); '' if the key is missing
      - recipe_xml: the record's 'recipe_xml', or '' if missing
      - ing_vecs:   the record's 'ing_vecs', or [] if missing
      - step_vecs:  the record's 'step_vecs', or [] if missing
    """
    image_root = 'Food Images/Food Images/'

    def build_entry(record: Dict[str, Any]) -> Dict[str, Any]:
        # Re-root the image under the local dataset folder, keeping only the file name.
        file_name = ntpath.basename(record.get('image_path', ''))
        return {
            'image_path': image_root + file_name,
            'recipe_xml': record.get('recipe_xml', ''),
            'ing_vecs': record.get('ing_vecs', []),
            'step_vecs': record.get('step_vecs', []),
        }

    with open(file_path, 'r', encoding='utf-8') as handle:
        stripped_lines = (raw.strip() for raw in handle)
        # Single pass: each non-empty line is decoded and converted before the next is read.
        return [build_entry(json.loads(text)) for text in stripped_lines if text]

# Ground-truth dev examples (image path, reference recipe XML, embedding vectors).
data = parse_jsonl('/Users/BenChung/Desktop/CSCI 467/vision-r1/data/dev_stage/dev_pairs_vec.jsonl')

# Generated recipes: one XML string per CSV row, taken from the "response"
# column. Rows are assumed to be in the same order as the dev examples, since
# outputs and examples are paired purely by position below.
csv_path = '/Users/BenChung/Desktop/CSCI 467/Generated Recipes Baseline 32B.csv'
with open(csv_path, newline='', encoding='utf-8') as csvfile:
    results = [row.get("response", "") for row in csv.DictReader(csvfile)]

model_xml_outputs = results

# Positional pairing only makes sense when both sides have the same length;
# surface a mismatch up front instead of failing with an IndexError mid-run.
if len(model_xml_outputs) != len(data):
    print(
        f"Warning: {len(model_xml_outputs)} model outputs vs {len(data)} dev examples; "
        "only the leading aligned pairs will be evaluated."
    )
num_pairs = min(len(model_xml_outputs), len(data))

aggregate_evals = []

# `total` lets tqdm show a real progress bar / ETA rather than a bare counter.
for model_xml, example in tqdm.tqdm(zip(model_xml_outputs, data), total=num_pairs):
    temp = evals.compute_evals(model_xml, example)
    # compute_evals may return None (presumably when an output cannot be
    # scored -- confirm in evals); such examples are left out of the aggregates.
    if temp is None:
        continue
    aggregate_evals.append(temp)
    for metric in ('cosine_similarity', 'bleu_scores', 'rouge_scores', 'llm_evaluation'):
        print(f"{metric}: ")
        print(temp[metric])

# Flatten the per-example results into one list per metric for aggregation.
cos_steps       = [e['cosine_similarity']['steps']       for e in aggregate_evals]
cos_ing         = [e['cosine_similarity']['ingredients'] for e in aggregate_evals]
bleu_steps      = [e['bleu_scores']['steps']             for e in aggregate_evals]
bleu_ing        = [e['bleu_scores']['ingredients']       for e in aggregate_evals]

rouge_s1_steps  = [e['rouge_scores']['steps']['rouge1']       for e in aggregate_evals]
rougeL_steps    = [e['rouge_scores']['steps']['rougeL']       for e in aggregate_evals]
rouge_s1_ing    = [e['rouge_scores']['ingredients']['rouge1'] for e in aggregate_evals]
rougeL_ing      = [e['rouge_scores']['ingredients']['rougeL'] for e in aggregate_evals]

feas_scores     = [e['llm_evaluation']['feasibility_score']   for e in aggregate_evals]

def stats(vals: List[float]) -> Dict[str, float]:
    """
    Summarize a list of scores.

    Returns a dict with:
      - 'mean': arithmetic mean of `vals` (statistics.mean)
      - 'std':  population standard deviation of `vals` (statistics.pstdev)

    Raises statistics.StatisticsError when `vals` is empty.
    """
    average = statistics.mean(vals)
    spread = statistics.pstdev(vals)
    return {'mean': average, 'std': spread}

# Dump the raw per-example results, then two blank lines and a header.
print(aggregate_evals)
print(end="\n\n")
print("Final averages and std deviations:")

# Mean / population-std summary for every metric, mirroring the nesting of the
# per-example result dicts.
new_dict = {
    'cosine_similarity': {
        'steps': stats(cos_steps),
        'ingredients': stats(cos_ing),
    },
    'bleu_scores': {
        'steps': stats(bleu_steps),
        'ingredients': stats(bleu_ing),
    },
    'rouge_scores': {
        'steps': {
            'rouge1': stats(rouge_s1_steps),
            'rougeL': stats(rougeL_steps),
        },
        'ingredients': {
            'rouge1': stats(rouge_s1_ing),
            'rougeL': stats(rougeL_ing),
        },
    },
    'llm_feasibility_score': stats(feas_scores),
}

# The three text-similarity metrics share the same steps / ingredients layout.
for metric_name in ('cosine_similarity', 'bleu_scores', 'rouge_scores'):
    summary = new_dict[metric_name]
    print(metric_name + ":\n\tsteps: ", end="")
    print(summary['steps'])
    print("\tingredients: ", end="")
    print(summary['ingredients'])

# The LLM feasibility score is a single summary, not split by steps / ingredients.
print("llm_feasibility_score: ", end="")
print(new_dict['llm_feasibility_score'])



Baseline 3B


0it [00:00, ?it/s]

Calculating cosine similarity...
Calculating BLEU scores...
Calculating ROUGE scores...
Calculating LLM evaluation...


1it [00:06,  6.84s/it]

cosine_similarity: 
{'steps': np.float64(0.4318893307474136), 'ingredients': np.float64(0.4781369234449445)}
bleu_scores: 
{'steps': 0.005069595493544869, 'ingredients': 0.051222312243778906}
rouge_scores: 
{'steps': {'rouge1': 0.17066815247422207, 'rougeL': 0.12826228792179942}, 'ingredients': {'rouge1': 0.3316738816738817, 'rougeL': 0.3316738816738817}}
llm_evaluation: 
{'feasibility_score': np.float64(0.1), 'comment': "The predicted recipe for 'Tuna Mac and Cheese' is completely different from the dish shown in the image, which is a lentil bolognese. The ingredients (tuna, mozzarella, creamy sauce base) and preparation steps (making a cheese sauce, baking) do not align with the visual appearance (tomato/lentil sauce, grated cheese topping) or the ground truth recipe. The pasta shape is also incorrect. This recipe would produce a wholly different dish.", 'full': '{\n  "evaluation": {\n    "feasibility_score": "0.1",\n    "comment": "The predicted recipe for \'Tuna Mac and Cheese\' is

2it [00:12,  6.08s/it]

cosine_similarity: 
{'steps': np.float64(0.4190025059097334), 'ingredients': np.float64(0.7000321915399098)}
bleu_scores: 
{'steps': 0.0013072996266002897, 'ingredients': 0.11739549703120164}
rouge_scores: 
{'steps': {'rouge1': 0.12106432880242576, 'rougeL': 0.08842230284037436}, 'ingredients': {'rouge1': 0.5896464646464646, 'rougeL': 0.5896464646464646}}
llm_evaluation: 
{'feasibility_score': np.float64(0.05), 'comment': 'The predicted recipe describes a tomato and olive salad, which is completely different from the Chioggia beet salad shown in the image and detailed in the ground truth recipe. The ingredients, visual appearance, and preparation steps are entirely mismatched. While the predicted steps are feasible for making a tomato salad, they are irrelevant to the dish presented. The extremely low score reflects this complete lack of alignment.', 'full': '{\n  "evaluation": {\n    "feasibility_score": "0.05",\n    "comment": "The predicted recipe describes a tomato and olive salad,

3it [00:21,  7.68s/it]

cosine_similarity: 
{'steps': np.float64(0.37566308926864783), 'ingredients': np.float64(0.5735907707630217)}
bleu_scores: 
{'steps': 0.01825486447422265, 'ingredients': 0.043953323166588974}
rouge_scores: 
{'steps': {'rouge1': 0.22684368962466292, 'rougeL': 0.14408779678129888}, 'ingredients': {'rouge1': 0.3345236825628983, 'rougeL': 0.3345236825628983}}
llm_evaluation: 
{'feasibility_score': np.float64(0.3), 'comment': 'The predicted recipe shares common ingredients like berries, cream cheese (as an option), lemon, and vanilla, and the final step of adding berries is correct. However, it fundamentally misunderstands the base of the dish shown in the image and described in the ground truth. The predicted recipe describes a tart with a crust and added filling, whereas the image and ground truth show a baked slice base (ricotta and cream cheese) without a distinct pastry crust. It also misses the key ricotta ingredient. While the predicted recipe is feasible as a general berry tart, it 

4it [00:30,  7.87s/it]

cosine_similarity: 
{'steps': np.float64(0.5141819341176939), 'ingredients': np.float64(0.5107270487799306)}
bleu_scores: 
{'steps': 0.003917256314923258, 'ingredients': 0.03051368972923772}
rouge_scores: 
{'steps': {'rouge1': 0.14601125408668514, 'rougeL': 0.10878454675729712}, 'ingredients': {'rouge1': 0.28442273465156764, 'rougeL': 0.2738964188620939}}
llm_evaluation: 
{'feasibility_score': np.float64(0.9), 'comment': 'The predicted recipe for Beef Bourguignon is highly feasible and would realistically produce a dish very similar to the one pictured. The ingredients and slow-cooking method are appropriate for creating tender stewed beef in a rich, dark sauce, which visually matches the image well, especially served over a base like polenta. While step 6 is a bit unusual, the overall process is sound. This recipe, however, bears no resemblance to the provided ground-truth recipe for quick-cooked steaks.', 'full': '{\n "evaluation": {\n  "feasibility_score": "0.9",\n  "comment": "The 

5it [00:37,  7.55s/it]

cosine_similarity: 
{'steps': np.float64(0.6666299295729998), 'ingredients': np.float64(0.6716736945057873)}
bleu_scores: 
{'steps': 0.003778748949912806, 'ingredients': 0.12049940623665008}
rouge_scores: 
{'steps': {'rouge1': 0.2258950543780265, 'rougeL': 0.16459020855305687}, 'ingredients': {'rouge1': 0.5761303511303512, 'rougeL': 0.5761303511303512}}
llm_evaluation: 
{'feasibility_score': np.float64(0.25), 'comment': 'The predicted recipe outlines a standard potato salad but does not accurately reflect the image or the ground-truth recipe. It specifies peeled potatoes instead of unpeeled red-skinned, and introduces ingredients like hard-boiled egg, red onion, and parsley which are absent in the original and image. Key components like peas, vinegar, garlic, and cayenne from the ground-truth are missing. While the steps are practical for *a* potato salad, this recipe would create a significantly different dish than the one shown.', 'full': '{\n  "evaluation": {\n    "feasibility_score

6it [00:45,  7.67s/it]

cosine_similarity: 
{'steps': np.float64(0.47592799430807337), 'ingredients': np.float64(0.6310759778840348)}
bleu_scores: 
{'steps': 0.012754022115643355, 'ingredients': 0.05877180784776226}
rouge_scores: 
{'steps': {'rouge1': 0.17250691869335935, 'rougeL': 0.1348593810458217}, 'ingredients': {'rouge1': 0.4063277000777001, 'rougeL': 0.4063277000777001}}
llm_evaluation: 
{'feasibility_score': np.float64(0.65), 'comment': "The predicted recipe is a perfectly feasible and practical recipe for making a salad. The steps are logical and easy to follow. However, the ingredients deviate significantly from the ground truth recipe, substituting cilantro and daikon radish for parsley, celery leaves, jicama, and radish sprouts. While the texture profile (julienned root, leafy greens, sliced alliums) is similar to the image, the specific greens in the picture don't strongly resemble cilantro, making it a less accurate visual match to the intended dish shown and described by the ground truth.", 'fu

7it [00:52,  7.60s/it]

cosine_similarity: 
{'steps': np.float64(0.4888380448230292), 'ingredients': np.float64(0.5516611389806017)}
bleu_scores: 
{'steps': 0.0026888585430779924, 'ingredients': 0.051471504227230694}
rouge_scores: 
{'steps': {'rouge1': 0.13052805262169453, 'rougeL': 0.08419079031302797}, 'ingredients': {'rouge1': 0.3978138528138528, 'rougeL': 0.3978138528138528}}
llm_evaluation: 
{'feasibility_score': np.float64(0.0), 'comment': 'The predicted recipe describes how to cook beef brisket, a cut entirely different from the prime rib roast shown in the image and detailed in the ground truth. The cooking method and target temperature are appropriate for brisket but would not yield the medium-rare interior or thick slices visible. Key elements from the ground truth, such as the mustard crust and horseradish sauce, are also completely absent. This recipe is entirely unrelated to the dish pictured.', 'full': '{\n  "evaluation": {\n    "feasibility_score": "0.0",\n    "comment": "The predicted recipe d

8it [00:59,  7.35s/it]

cosine_similarity: 
{'steps': np.float64(0.5582385360560151), 'ingredients': np.float64(0.7294769858908204)}
bleu_scores: 
{'steps': 0.00020514943064168915, 'ingredients': 0.09405640252653924}
rouge_scores: 
{'steps': {'rouge1': 0.1058071717366333, 'rougeL': 0.06716979340521068}, 'ingredients': {'rouge1': 0.6367815055315055, 'rougeL': 0.6367815055315055}}
llm_evaluation: 
{'feasibility_score': np.float64(0.1), 'comment': 'The predicted recipe is for a cake, while the image and ground truth show a pie with a crumbly topping. The ingredients and method are fundamentally different. It lacks pastry dough, apples, and sour cream, including cake-specific ingredients like baking powder and milk instead. While the predicted recipe could theoretically produce a cake, it would not yield the dish pictured or match the ground truth recipe in any meaningful way, aside from the flower garnish. This makes it largely irrelevant to the visual target and ground truth.', 'full': '{\n  "evaluation": {\n  

9it [01:05,  7.15s/it]

cosine_similarity: 
{'steps': np.float64(0.46870335025169513), 'ingredients': np.float64(0.4832010696028833)}
bleu_scores: 
{'steps': 0.0094859640387772, 'ingredients': 0.03683671267769419}
rouge_scores: 
{'steps': {'rouge1': 0.1932437156370886, 'rougeL': 0.12133430999470785}, 'ingredients': {'rouge1': 0.3212698412698413, 'rougeL': 0.3212698412698413}}
llm_evaluation: 
{'feasibility_score': np.float64(0.0), 'comment': 'This predicted recipe describes how to make seafood fettuccine carbonara, which is entirely different from the crab cakes shown in the image and detailed in the ground-truth recipe. The ingredients, cooking method (pasta vs. pan-frying patties), and final appearance are completely mismatched. While the predicted recipe might be feasible for making carbonara, it would not produce the dish pictured, resulting in a feasibility score of 0.0.', 'full': '{\n "evaluation": {\n  "feasibility_score": "0.0",\n  "comment": "This predicted recipe describes how to make seafood fettuc

10it [01:16,  8.07s/it]

cosine_similarity: 
{'steps': np.float64(0.407538817320389), 'ingredients': np.float64(0.5416920988830594)}
bleu_scores: 
{'steps': 0.0019329275471541062, 'ingredients': 0.046151596899083346}
rouge_scores: 
{'steps': {'rouge1': 0.1253639615475209, 'rougeL': 0.07349236354741462}, 'ingredients': {'rouge1': 0.4024416941083608, 'rougeL': 0.4024416941083608}}
llm_evaluation: 
{'feasibility_score': np.float64(0.15), 'comment': 'The predicted recipe for Glazed Pork Belly is significantly different from the ground-truth Hoisin-Marinated Pork Chops and the image. The image shows a thick cut of pork chop or roast, not pork belly, and the ground truth confirms pork chops. The cooking method (baking vs. broiling) and main ingredient are incorrect. While the predicted farro salad visually somewhat matches the side dish in the image, the core pork preparation is completely off. This recipe would not produce the pictured dish.', 'full': '{\n "evaluation": {\n  "feasibility_score": "0.15",\n  "comment

11it [01:22,  7.66s/it]

cosine_similarity: 
{'steps': np.float64(0.43271269065336465), 'ingredients': np.float64(0.47086462020367803)}
bleu_scores: 
{'steps': 1.4411135827174264e-07, 'ingredients': 0.042104155855855425}
rouge_scores: 
{'steps': {'rouge1': 0.05914446698714959, 'rougeL': 0.04191153991161355}, 'ingredients': {'rouge1': 0.33324862272230693, 'rougeL': 0.32795761743130164}}
llm_evaluation: 
{'feasibility_score': np.float64(0.0), 'comment': 'The predicted recipe describes making spicy fried chicken strips with a creamy sauce. This is fundamentally different from the baked enchiladas shown in the image and detailed in the ground-truth recipe, which involves rolled tortillas, a chile-based sauce, cheese, and baking. The ingredients, cooking method (frying vs. baking), and final appearance of the predicted dish would not match the image or the ground truth at all. The predicted recipe is completely infeasible for producing this specific dish.', 'full': '{\n "evaluation": {\n  "feasibility_score": "0.0"

12it [01:33,  8.54s/it]

cosine_similarity: 
{'steps': np.float64(0.5772920405093968), 'ingredients': np.float64(0.643032248491381)}
bleu_scores: 
{'steps': 0.00043575759414312835, 'ingredients': 0.04435733363011308}
rouge_scores: 
{'steps': {'rouge1': 0.13113603652117792, 'rougeL': 0.08336417564000694}, 'ingredients': {'rouge1': 0.4907407407407407, 'rougeL': 0.4907407407407407}}
llm_evaluation: 
{'feasibility_score': np.float64(0.2), 'comment': 'The predicted recipe describes how to make a standard apple tart baked in a pan with a cooked fruit filling. This fundamentally differs from the image and ground-truth recipe, which depict a rustic, freeform galette with neatly arranged, likely fresh or partially cooked, fruit slices. The crust type and method are also different (pre-made sheet vs. homemade dough, pan-baked vs. freeform). While the predicted recipe is feasible for making an apple tart, it would not produce the dish shown in the image.', 'full': '{\n "evaluation": {\n  "feasibility_score": "0.2",\n  "c

13it [01:42,  8.75s/it]

cosine_similarity: 
{'steps': np.float64(0.5647646580561294), 'ingredients': np.float64(0.5860707941218386)}
bleu_scores: 
{'steps': 0.0036985237763991607, 'ingredients': 0.04996109817123122}
rouge_scores: 
{'steps': {'rouge1': 0.16697299026906237, 'rougeL': 0.13062309412833326}, 'ingredients': {'rouge1': 0.4696464646464646, 'rougeL': 0.45631313131313134}}
llm_evaluation: 
{'feasibility_score': np.float64(0.3), 'comment': 'The predicted recipe outlines a standard and feasible method for making bread pudding. However, it deviates significantly from the ground truth and the image by specifying dried cranberries and walnuts instead of fresh raspberries. The flavoring (vanilla vs lemon) and fat/sweetener profile (butter/cream/sugar vs maple syrup) are also different. While the cooking technique is sound, these ingredient changes mean the recipe would produce a different dish, especially visually due to the dried fruit and presence of nuts not seen in the image. The score reflects its pract

14it [01:51,  8.80s/it]

cosine_similarity: 
{'steps': np.float64(0.6696310638023689), 'ingredients': np.float64(0.6127229336397447)}
bleu_scores: 
{'steps': 0.010060316519393344, 'ingredients': 0.06677920448994769}
rouge_scores: 
{'steps': {'rouge1': 0.28318423092980255, 'rougeL': 0.1411439949684716}, 'ingredients': {'rouge1': 0.4366943747195848, 'rougeL': 0.4366943747195848}}
llm_evaluation: 
{'feasibility_score': np.float64(0.85), 'comment': "The predicted recipe correctly identifies and provides instructions for all three distinct desserts shown in the image. The ingredients and steps are generally feasible and would produce visually similar results, including a custard-like pie with whipped cream, a crumble-topped pie, and a chocolate cake. While the 'Whipped Cream Pie' recipe is a simplified version compared to the ground-truth 'Lemon Custard Pie', omitting the lemon and using a less refined technique, it is still a plausible recipe for a cream pie that matches the image visually. The crumble topping ins

15it [01:59,  8.43s/it]

cosine_similarity: 
{'steps': np.float64(0.6130445391153023), 'ingredients': np.float64(0.49416461901625586)}
bleu_scores: 
{'steps': 0.002778477578372198, 'ingredients': 0.020454958325443664}
rouge_scores: 
{'steps': {'rouge1': 0.13580169567180225, 'rougeL': 0.10157837245302231}, 'ingredients': {'rouge1': 0.19854497354497355, 'rougeL': 0.19854497354497355}}
llm_evaluation: 
{'feasibility_score': np.float64(0.4), 'comment': "The predicted recipe provides a plausible method for a simple honey mustard, but it significantly deviates from the ground truth and the visual characteristics of the image. It's a cold mix using pre-made Dijon, whereas the ground truth uses mustard powder and eggs cooked via a double boiler, which creates the smooth, thick, stable consistency seen in the jars in the image. While the predicted recipe makes a type of honey mustard, it would likely be thinner and less emulsified than pictured. The ingredient list and cooking method are fundamentally different.", 'ful

16it [02:05,  7.91s/it]

cosine_similarity: 
{'steps': np.float64(0.6282486393966191), 'ingredients': np.float64(0.6112772878457903)}
bleu_scores: 
{'steps': 0.010278075006570646, 'ingredients': 0.08512522340575113}
rouge_scores: 
{'steps': {'rouge1': 0.20814382702772893, 'rougeL': 0.1526365041940419}, 'ingredients': {'rouge1': 0.45573671497584545, 'rougeL': 0.45573671497584545}}
llm_evaluation: 
{'feasibility_score': np.float64(0.5), 'comment': 'The predicted recipe outlines a feasible method for making a salad, with logical steps. However, it diverges significantly from the image and ground truth recipe regarding key ingredients; it uses radish instead of kohlrabi/celery root and arugula instead of frisée. The dressing components also differ. While it would produce a salad, it would not be the specific dish shown, leading to a moderate feasibility score.', 'full': '{\n "evaluation": {\n  "feasibility_score": "0.5",\n  "comment": "The predicted recipe outlines a feasible method for making a salad, with logica

17it [02:12,  7.61s/it]

cosine_similarity: 
{'steps': np.float64(0.5778318112057463), 'ingredients': np.float64(0.501218632536852)}
bleu_scores: 
{'steps': 3.304397388497743e-05, 'ingredients': 0.04333129892220015}
rouge_scores: 
{'steps': {'rouge1': 0.0922615661990888, 'rougeL': 0.06084600053251232}, 'ingredients': {'rouge1': 0.32630577483518663, 'rougeL': 0.32630577483518663}}
llm_evaluation: 
{'feasibility_score': np.float64(0.1), 'comment': 'The predicted recipe describes an Avocado and Spinach Salad with feta, which is completely different from the Thai Beef Salad shown in the image and detailed in the ground truth recipe. The main ingredients (beef, specific lettuce, cucumber, tomato, herbs, chili-lime dressing vs. spinach, avocado, bell pepper, feta, vinaigrette) are entirely mismatched. While the predicted steps are feasible for the salad it describes, it bears no resemblance to the visual target or the intended dish, hence the very low feasibility score.', 'full': '{\n  "evaluation": {\n    "feasibil

18it [02:21,  8.00s/it]

cosine_similarity: 
{'steps': np.float64(0.5161489570079049), 'ingredients': np.float64(0.5359677148119334)}
bleu_scores: 
{'steps': 4.2303095449574187e-05, 'ingredients': 0.05488369506220401}
rouge_scores: 
{'steps': {'rouge1': 0.0912479362107331, 'rougeL': 0.05956939355578374}, 'ingredients': {'rouge1': 0.37811688311688313, 'rougeL': 0.37811688311688313}}
llm_evaluation: 
{'feasibility_score': np.float64(0.2), 'comment': "The predicted recipe is completely different from the dish shown and described in the ground truth. It misidentifies the main protein as prosciutto instead of bacon and proposes a completely different dressing (Lemon-Dijon vinaigrette vs. creamy horseradish mayonnaise). Key components visible in the image and listed in the ground truth, such as breadcrumbs/croutons and basil, are also missing. While the predicted recipe is a feasible dish in itself, it does not match the image or the ground-truth recipe's ingredients or steps.", 'full': '{\n "evaluation": {\n  "feas

19it [02:30,  8.40s/it]

cosine_similarity: 
{'steps': np.float64(0.5797421530664408), 'ingredients': np.float64(0.5685595175090796)}
bleu_scores: 
{'steps': 0.00045424455120924776, 'ingredients': 0.05271190877675713}
rouge_scores: 
{'steps': {'rouge1': 0.12043325782776303, 'rougeL': 0.07708517339341306}, 'ingredients': {'rouge1': 0.3325876504447933, 'rougeL': 0.3325876504447933}}
llm_evaluation: 
{'feasibility_score': np.float64(0.4), 'comment': "The predicted recipe includes grilling steak with herb butter, which aligns somewhat with the image's sliced steak topped with herbs. However, it deviates significantly by adding seared vegetables and a balsamic dressing not present in the visual or ground truth. The ground truth recipe and image feature a T-bone steak specifically and use a 'board dressing' method which is key to the appearance of the sliced steak. The predicted recipe misses these crucial details, making it only moderately feasible to produce the exact dish pictured.", 'full': '{\n "evaluation": {\

20it [02:36,  7.85s/it]

cosine_similarity: 
{'steps': np.float64(0.45894492026350947), 'ingredients': np.float64(0.5675192502668893)}
bleu_scores: 
{'steps': 0.009178556265743926, 'ingredients': 0.068566864871851}
rouge_scores: 
{'steps': {'rouge1': 0.22866708385481851, 'rougeL': 0.16438429981921845}, 'ingredients': {'rouge1': 0.4013708513708514, 'rougeL': 0.4013708513708514}}
llm_evaluation: 
{'feasibility_score': np.float64(0.1), 'comment': "The predicted recipe shares only the title with the ground truth and image. The ingredients listed (lettuce, butter/kidney beans, stilton cheese, almonds) are completely different from the string beans, shallots, and basil in the ground truth recipe and pictured. Consequently, the cooking steps and the resulting dish are entirely different from what is shown in the image. While it describes a plausible recipe for *a* salad, it does not align with the specific 'Herbed Bean Salad' depicted, hence the very low feasibility score.", 'full': '{\n  "evaluation": {\n    "feasib




Evaluating GRPO fine-tuned qwen-2.5 3B model output







In [6]:
import json
from typing import Any, Dict, List
import ntpath
import evals
import csv
import xml.etree.ElementTree as ET
import tqdm
import statistics

# Label this run in the cell output. This cell scores the GRPO fine-tuned
# model's generations ('Generated Recipes.csv'), not the baseline CSV, so the
# label must not say "Baseline".
print("GRPO fine-tuned 3B")
def parse_jsonl(file_path: str) -> List[Dict[str, Any]]:
    """
    Load a JSONL file into a list of normalized entry dictionaries.

    Every non-blank line is decoded as a JSON object. The resulting entry
    always has exactly these keys:
      - image_path: 'Food Images/Food Images/' + the basename of the source
        'image_path' (basename taken with ntpath, so both '\\' and '/'
        separators are handled)
      - recipe_xml: the source 'recipe_xml', or '' when absent
      - ing_vecs:   the source 'ing_vecs', or [] when absent
      - step_vecs:  the source 'step_vecs', or [] when absent

    Entries are returned in file order; whitespace-only lines are skipped.
    """
    def to_entry(record: Dict[str, Any]) -> Dict[str, Any]:
        # Re-root the image under the local image folder, keeping only the file name.
        file_name = ntpath.basename(record.get('image_path', ''))
        return {
            'image_path': 'Food Images/Food Images/' + file_name,
            'recipe_xml': record.get('recipe_xml', ''),
            'ing_vecs': record.get('ing_vecs', []),
            'step_vecs': record.get('step_vecs', []),
        }

    with open(file_path, 'r', encoding='utf-8') as handle:
        return [
            to_entry(json.loads(text))
            for text in map(str.strip, handle)
            if text
        ]

# Reference entries parsed from the dev-stage JSONL file (image path, recipe
# XML and the ingredient/step vectors).
data = parse_jsonl('/Users/BenChung/Desktop/CSCI 467/vision-r1/data/dev_stage/dev_pairs_vec.jsonl')

# Raw model responses, one per CSV row in file order ("" when the row has no
# "response" column). The strings are kept as-is: the previous per-row XML
# parse only filled a `category` variable that was never stored or read, so
# it has been dropped.
csv_path = '/Users/BenChung/Desktop/CSCI 467/Generated Recipes.csv'
with open(csv_path, newline='', encoding='utf-8') as csvfile:
    reader = csv.DictReader(csvfile)
    results = [row.get("response", "") for row in reader]



model_xml_outputs = results

# Predictions are matched to reference entries purely by position, so fail
# fast -- before any evaluation work is done -- when there are too few of them
# (previously this surfaced as an IndexError part-way through the loop).
if len(model_xml_outputs) < len(data):
    raise ValueError(
        f"Expected at least {len(data)} model outputs, got {len(model_xml_outputs)}"
    )

# Per-example evaluation results; examples for which compute_evals returns
# None are left out.
aggregate_evals = []

# Wrap `data` itself rather than the enumerate object: enumerate has no
# len(), so tqdm could not show a total or an ETA.
for i, example in enumerate(tqdm.tqdm(data)):
    temp = evals.compute_evals(model_xml_outputs[i], example)
    if temp is None:
        continue
    aggregate_evals.append(temp)
    print("cosine_similarity: ")
    print(temp['cosine_similarity'])
    print("bleu_scores: ")
    print(temp["bleu_scores"])
    print("rouge_scores: ")
    print(temp['rouge_scores'])
    print("llm_evaluation: ")
    print(temp['llm_evaluation'])

def _metric_column(*keys):
    """Collect, for every entry of aggregate_evals, the value found by following `keys` in order."""
    column = []
    for result in aggregate_evals:
        value = result
        for key in keys:
            value = value[key]
        column.append(value)
    return column


# One list per metric, each holding one value per evaluated example
# (same order as aggregate_evals).
cos_steps = _metric_column('cosine_similarity', 'steps')
cos_ing = _metric_column('cosine_similarity', 'ingredients')
bleu_steps = _metric_column('bleu_scores', 'steps')
bleu_ing = _metric_column('bleu_scores', 'ingredients')

rouge_s1_steps = _metric_column('rouge_scores', 'steps', 'rouge1')
rougeL_steps = _metric_column('rouge_scores', 'steps', 'rougeL')
rouge_s1_ing = _metric_column('rouge_scores', 'ingredients', 'rouge1')
rougeL_ing = _metric_column('rouge_scores', 'ingredients', 'rougeL')

feas_scores = _metric_column('llm_evaluation', 'feasibility_score')

def stats(vals: List[float]) -> Dict[str, float]:
    """
    Summarize a collection of scores.

    Args:
        vals: Numeric values; any iterable is accepted (it is materialized
            once so that both statistics see the same data).

    Returns:
        A dict with:
          - 'mean': arithmetic mean of the values
          - 'std':  population standard deviation (statistics.pstdev)
        Both are NaN when `vals` is empty, so a run in which every example
        was skipped still produces a printable summary instead of raising
        statistics.StatisticsError.
    """
    values = list(vals)
    if not values:
        nan = float('nan')
        return {'mean': nan, 'std': nan}
    return {
        'mean': statistics.mean(values),
        'std':  statistics.pstdev(values)
    }

# Dump the raw per-example results first, then two blank lines and the header
# of the aggregated summary.
print(aggregate_evals)
print("\n\nFinal averages and std deviations:")

# Mean / population-std summary of every metric, keyed like the per-example
# result dictionaries (plus a flat entry for the LLM feasibility score).
new_dict = {
    'cosine_similarity': {'steps': stats(cos_steps), 'ingredients': stats(cos_ing)},
    'bleu_scores': {'steps': stats(bleu_steps), 'ingredients': stats(bleu_ing)},
    'rouge_scores': {
        'steps': {'rouge1': stats(rouge_s1_steps), 'rougeL': stats(rougeL_steps)},
        'ingredients': {'rouge1': stats(rouge_s1_ing), 'rougeL': stats(rougeL_ing)},
    },
    'llm_feasibility_score': stats(feas_scores),
}

# Each steps/ingredients section is printed as a header line followed by
# tab-indented "steps" and "ingredients" rows.
for section in ('cosine_similarity', 'bleu_scores', 'rouge_scores'):
    summary = new_dict[section]
    print(f"{section}:\n\tsteps: {summary['steps']}")
    print(f"\tingredients: {summary['ingredients']}")
print(f"llm_feasibility_score: {new_dict['llm_feasibility_score']}")



Baseline 3B


0it [00:00, ?it/s]

Calculating cosine similarity...
Calculating BLEU scores...
Calculating ROUGE scores...
Calculating LLM evaluation...


1it [00:07,  7.90s/it]

cosine_similarity: 
{'steps': np.float64(0.5324286358401825), 'ingredients': np.float64(0.6754974909290884)}
bleu_scores: 
{'steps': 0.00012925265683217318, 'ingredients': 0.052170028630432755}
rouge_scores: 
{'steps': {'rouge1': 0.11908838342724229, 'rougeL': 0.09250725954244311}, 'ingredients': {'rouge1': 0.48679306804306804, 'rougeL': 0.45107878232878235}}
llm_evaluation: 
{'feasibility_score': np.float64(0.3), 'comment': "The predicted recipe is fundamentally different from both the image and the ground truth. The image clearly shows a textured sauce with what appear to be small particles (like lentils or ground meat) and a non-creamy base, served with cavatappi or similar pasta shape. The predicted recipe calls for penne pasta and a creamy tomato sauce with cherry tomatoes, which would look completely different. While the steps describe making a pasta dish, the ingredients and method do not match the visual target or the actual recipe. The inclusion of an oven preheat step that is

2it [00:16,  8.43s/it]

cosine_similarity: 
{'steps': np.float64(0.3416149187752506), 'ingredients': np.float64(0.6681123255193008)}
bleu_scores: 
{'steps': 3.197855130671387e-05, 'ingredients': 0.1396662573169596}
rouge_scores: 
{'steps': {'rouge1': 0.07334122791005913, 'rougeL': 0.06261484679313159}, 'ingredients': {'rouge1': 0.5113997113997114, 'rougeL': 0.4891774891774892}}
llm_evaluation: 
{'feasibility_score': np.float64(0.1), 'comment': 'The predicted recipe fundamentally misidentifies the main ingredient, mistaking sliced beets for tomatoes, and the allium component, calling scallions red onions. It also includes an entirely nonsensical oven preheating step for a raw salad. These errors mean the recipe would not produce the dish shown in the image and has minimal alignment with the ground-truth recipe. While some basic salad components like olive oil and salt are present, the core ingredients and method are incorrect. The thinking process states the correct goal but fails completely in execution. The 

3it [00:29, 10.24s/it]

cosine_similarity: 
{'steps': np.float64(0.36937189853152524), 'ingredients': np.float64(0.5194203299622575)}
bleu_scores: 
{'steps': 0.0015492253121489853, 'ingredients': 0.029201855312617948}
rouge_scores: 
{'steps': {'rouge1': 0.1317182367396323, 'rougeL': 0.09421050924346415}, 'ingredients': {'rouge1': 0.41565656565656567, 'rougeL': 0.41565656565656567}}
llm_evaluation: 
{'feasibility_score': np.float64(0.3), 'comment': "The predicted recipe recognizes the presence of berries and a filling, proposing a 'Berry Tart'. However, it fundamentally misinterprets the dish's structure and preparation. The image shows a rectangular slice, not a circular tart, and the ground truth confirms it's a 'Berry and Ricotta Slice'. The predicted recipe also calls for baking the berries on top, which would result in a jammy texture unlike the fresh berries seen. The filling type (cooked custard) and base (pie crust) also differ significantly from the image and ground truth (baked, chilled ricotta/cream

5it [00:39,  7.39s/it]

cosine_similarity: 
{'steps': np.float64(0.6006888171666277), 'ingredients': np.float64(0.5992953117468428)}
bleu_scores: 
{'steps': 0.0005428432021250207, 'ingredients': 0.046412699015906264}
rouge_scores: 
{'steps': {'rouge1': 0.1478770579980588, 'rougeL': 0.12059278908130426}, 'ingredients': {'rouge1': 0.35457251082251084, 'rougeL': 0.35457251082251084}}
llm_evaluation: 
{'feasibility_score': np.float64(0.55), 'comment': "The predicted recipe is a plausible recipe for a generic herb potato salad, aligning somewhat with the visual cues of creamy dressing, potatoes, and herbs in the image. However, it misses several key elements specific to the image and ground truth, such as the use of red-skinned potatoes and the presence of peas. The ingredient list and dressing composition differ significantly from the ground truth recipe. While the steps are logical for making a potato salad, they lack the specific nuance of the ground truth. The model's thinking process correctly identifies gene

6it [00:48,  7.98s/it]

cosine_similarity: 
{'steps': np.float64(0.5810050592728655), 'ingredients': np.float64(0.5842436302996052)}
bleu_scores: 
{'steps': 0.01617408980277489, 'ingredients': 0.03530977494557812}
rouge_scores: 
{'steps': {'rouge1': 0.24904162923030845, 'rougeL': 0.18761605271039236}, 'ingredients': {'rouge1': 0.3821212121212121, 'rougeL': 0.3821212121212121}}
llm_evaluation: 
{'feasibility_score': np.float64(0.1), 'comment': "The predicted recipe is completely different from the ground truth and would not produce the dish shown in the image. The predicted recipe includes ingredients like chicken and cheese that are not present in the image or the ground truth (jicama, celery leaf, radish sprouts). While the steps are simple salad preparation, the ingredients are entirely wrong. The model's thinking process misinterprets the visual components, leading to a completely incorrect conclusion about the dish being a chicken salad. This recipe is functionally infeasible for replicating the visual ta

7it [00:58,  8.49s/it]

cosine_similarity: 
{'steps': np.float64(0.615306388704278), 'ingredients': np.float64(0.4810243595183405)}
bleu_scores: 
{'steps': 0.000166416271537757, 'ingredients': 0.02788827853026744}
rouge_scores: 
{'steps': {'rouge1': 0.10544721059610739, 'rougeL': 0.08087765059185327}, 'ingredients': {'rouge1': 0.3300098891007982, 'rougeL': 0.3300098891007982}}
llm_evaluation: 
{'feasibility_score': np.float64(0.1), 'comment': 'The predicted recipe describes a completely different dish from the image and ground truth. It calls for a small beef tenderloin instead of a large prime rib roast, uses a different cooking method without the crucial crust step, and specifies ingredients for a vegetable pan sauce rather than the creamy horseradish sauce shown. The predicted cooking time for the tenderloin is also likely incorrect. Therefore, this recipe would not produce the depicted dish.', 'full': '{\n "evaluation": {\n  "feasibility_score": "0.1",\n  "comment": "The predicted recipe describes a compl

8it [01:05,  7.98s/it]

cosine_similarity: 
{'steps': np.float64(0.5238120815834606), 'ingredients': np.float64(0.6735244047136234)}
bleu_scores: 
{'steps': 9.216779159993357e-07, 'ingredients': 0.05089212532431692}
rouge_scores: 
{'steps': {'rouge1': 0.05787193889424206, 'rougeL': 0.04654575822456124}, 'ingredients': {'rouge1': 0.48650793650793644, 'rougeL': 0.48650793650793644}}
llm_evaluation: 
{'feasibility_score': np.float64(0.05), 'comment': "The predicted recipe is for a chocolate pie, which bears no resemblance to the apple pie with crumble topping shown in the image. The ingredients and method described would produce a chocolate cake-like dessert baked in a pie dish, not a pie with a distinct crust and filling. The model's reasoning in the `<think>` section is fundamentally flawed, incorrectly identifying the dish as chocolate based on the visual evidence. This prediction is completely infeasible for replicating the image or aligning with the ground truth. The very low score reflects the complete mis

9it [01:11,  7.44s/it]

cosine_similarity: 
{'steps': np.float64(0.4559442962767328), 'ingredients': np.float64(0.3419618483564927)}
bleu_scores: 
{'steps': 0.0005963089237783952, 'ingredients': 0.012621531576115324}
rouge_scores: 
{'steps': {'rouge1': 0.1358034166640173, 'rougeL': 0.09836802117009583}, 'ingredients': {'rouge1': 0.08333333333333333, 'rougeL': 0.08333333333333333}}
llm_evaluation: 
{'feasibility_score': np.float64(0.1), 'comment': "The predicted recipe completely misidentifies the dish shown in the image and described in the ground truth. The image clearly shows pan-fried patties, which the ground truth confirms are crab cakes. The predicted recipe describes chicken tenders or fish fillets. Consequently, the ingredients and cooking steps in the prediction do not match either the image's visual cues or the ground truth recipe's instructions for making crab cakes. The thinking process also starts with the incorrect assumption that the dish is chicken or fish fillets, leading to a completely unre

10it [01:20,  7.77s/it]

cosine_similarity: 
{'steps': np.float64(0.4676020561881923), 'ingredients': np.float64(0.5504776597561708)}
bleu_scores: 
{'steps': 0.00033798113574861943, 'ingredients': 0.04209959831692814}
rouge_scores: 
{'steps': {'rouge1': 0.1019763910897318, 'rougeL': 0.06336841739779718}, 'ingredients': {'rouge1': 0.3978354978354979, 'rougeL': 0.3740259740259741}}
llm_evaluation: 
{'feasibility_score': np.float64(0.4), 'comment': "The predicted recipe is a feasible recipe in isolation, describing a method to cook pork chops with a lentil side. However, it aligns poorly with both the image and the ground truth recipe. The baking method for the pork described would likely not produce the dark, caramelized crust visible in the photo, which suggests broiling or searing. The side dish concept (lentils, onion, garlic, broccoli) somewhat matches the visual components of the base in the image, but the texture looks less like a creamy 'risotto'. The ground truth recipe is completely different, using an 

11it [01:29,  8.34s/it]

cosine_similarity: 
{'steps': np.float64(0.5970083409597812), 'ingredients': np.float64(0.534566574889744)}
bleu_scores: 
{'steps': 2.2699625987279473e-07, 'ingredients': 0.034138471228689814}
rouge_scores: 
{'steps': {'rouge1': 0.06013861392321985, 'rougeL': 0.04614818842612763}, 'ingredients': {'rouge1': 0.29665242165242167, 'rougeL': 0.29665242165242167}}
llm_evaluation: 
{'feasibility_score': np.float64(0.2), 'comment': "The predicted recipe bears little resemblance to the dish shown in the image or the ground-truth recipe. It incorrectly mixes filling ingredients with crushed chips and attempts to 'shape' them into enchiladas rather than filling tortillas. The method of baking on a sheet pan is contrary to the image which shows baked enchiladas covered in sauce. Furthermore, adding sour cream before baking is incorrect. The model's thinking process identified key components like meat, cheese, sauce, and sour cream, but the resulting recipe failed to incorporate these elements (par

12it [01:39,  8.66s/it]

cosine_similarity: 
{'steps': np.float64(0.5814120463215986), 'ingredients': np.float64(0.5812437752328997)}
bleu_scores: 
{'steps': 5.288534261722377e-08, 'ingredients': 0.03949441145078756}
rouge_scores: 
{'steps': {'rouge1': 0.050804882884280256, 'rougeL': 0.04168185320245074}, 'ingredients': {'rouge1': 0.39997964997964996, 'rougeL': 0.39997964997964996}}
llm_evaluation: 
{'feasibility_score': np.float64(0.1), 'comment': "The predicted recipe is largely infeasible for producing the dish shown in the image or matching the ground truth. The image clearly depicts an open-faced galette, not a traditional pie baked in a dish as described by the prediction. The ground truth confirms it's a pear galette with almond components, none of which are present in the predicted apple pie recipe. The ingredients list is sparse and includes questionable items like olive oil. The assembly steps are completely different. The model's thinking process incorrectly identifies the dish as a standard apple p

13it [01:45,  8.03s/it]

cosine_similarity: 
{'steps': np.float64(0.5290193797231013), 'ingredients': np.float64(0.5314567584717459)}
bleu_scores: 
{'steps': 0.0005586366777311289, 'ingredients': 0.03127287651528009}
rouge_scores: 
{'steps': {'rouge1': 0.13157528800687707, 'rougeL': 0.09883383071939912}, 'ingredients': {'rouge1': 0.4293939393939394, 'rougeL': 0.4293939393939394}}
llm_evaluation: 
{'feasibility_score': np.float64(0.0), 'comment': "The predicted recipe for 'Baked Cherry Pie' is completely inaccurate for the dish shown in the image and the ground-truth recipe. The image clearly shows a bread-based baked dish (a strata or bread pudding) with raspberries, not a pie with a pastry crust and cherries. The predicted ingredients (flour, butter, sugar) and steps (making a crust) are entirely different from what is needed to produce the pictured dish or the ground-truth strata. The model's thinking process also incorrectly identifies the dish as a pie. This is a complete mismatch, resulting in a feasibili

14it [01:52,  7.60s/it]

cosine_similarity: 
{'steps': np.float64(0.5888258731428189), 'ingredients': np.float64(0.6184996864864605)}
bleu_scores: 
{'steps': 1.0752636730541744e-05, 'ingredients': 0.050993893640717526}
rouge_scores: 
{'steps': {'rouge1': 0.08694199260127879, 'rougeL': 0.05586951281556371}, 'ingredients': {'rouge1': 0.4161616161616162, 'rougeL': 0.4161616161616162}}
llm_evaluation: 
{'feasibility_score': np.float64(0.1), 'comment': "This predicted recipe is highly infeasible for producing the dish shown. The image displays a custard pie likely with a graham cracker crust, topped with whipped cream, similar to the ground truth Lemon Custard Pie. However, the predicted recipe lists ingredients for a standard pastry crust (not the 'Chocolate Crust' title or the crust in the image) and completely lacks the necessary ingredients for a custard filling (sugar, additional eggs, cream/milk). The described 'filling' of one egg and leftover milk would not create a set custard. The recipe deviates signific

15it [02:02,  8.29s/it]

cosine_similarity: 
{'steps': np.float64(0.5098627345462666), 'ingredients': np.float64(0.6127570228446795)}
bleu_scores: 
{'steps': 0.0010006321594010444, 'ingredients': 0.09352534421607893}
rouge_scores: 
{'steps': {'rouge1': 0.10206989929348126, 'rougeL': 0.06763383139532574}, 'ingredients': {'rouge1': 0.5390331890331891, 'rougeL': 0.5390331890331891}}
llm_evaluation: 
{'feasibility_score': np.float64(0.3), 'comment': "The predicted recipe describes how to make whole-grain mustard by boiling seeds and mixing, which would result in a chunky texture with visible seeds. This fundamentally conflicts with the image, which clearly shows smooth mustard, and the ground-truth recipe, which uses mustard powder and a cooking method to achieve smoothness. The predicted recipe also includes an unnecessary oven preheating step. While the ingredients for 'hot' and 'sweet' elements are present (honey, sugar, cayenne), the method is unsuitable for replicating the pictured dish. The model's thought p

16it [02:08,  7.58s/it]

cosine_similarity: 
{'steps': np.float64(0.4303519642194571), 'ingredients': np.float64(0.5492636607782062)}
bleu_scores: 
{'steps': 0.000260057997542335, 'ingredients': 0.08081019576304738}
rouge_scores: 
{'steps': {'rouge1': 0.08521664798488846, 'rougeL': 0.06322395214779904}, 'ingredients': {'rouge1': 0.3541005291005291, 'rougeL': 0.3541005291005291}}
llm_evaluation: 
{'feasibility_score': np.float64(0.0), 'comment': "The predicted recipe is completely infeasible for producing the dish shown in the image or matching the ground truth recipe. The ingredients listed (butter lettuce, blue cheese, almonds, garlic) are entirely different from those visible (frisée, apple, kohlrabi/celery root, chives) and in the ground truth. The cooking method of roasting lettuce is also completely contrary to the fresh salad depicted. The resulting dish would look nothing like the image. The model's stated thinking is a generic phrase and doesn't reflect any logical steps to infer the *correct* recipe f

17it [02:18,  8.41s/it]

cosine_similarity: 
{'steps': np.float64(0.5997848129246802), 'ingredients': np.float64(0.5954396312319775)}
bleu_scores: 
{'steps': 5.157218964180255e-07, 'ingredients': 0.07598523699474391}
rouge_scores: 
{'steps': {'rouge1': 0.07214367709311893, 'rougeL': 0.06051286269413844}, 'ingredients': {'rouge1': 0.38484848484848483, 'rougeL': 0.38484848484848483}}
llm_evaluation: 
{'feasibility_score': np.float64(0.1), 'comment': "The predicted recipe does not realistically produce the Thai Beef Salad shown in the image or described in the ground truth. While it identifies some basic salad ingredients like greens, tomatoes, and onions, it crucially misses the main protein (beef), cucumber, peanuts, and the specific chili-lime dressing visible. The inclusion of feta cheese is inaccurate for this dish. The instructions are overly simplistic, omitting the necessary steps for cooking the beef and preparing the correct dressing. The model's thinking process correctly identifies basic vegetables an

19it [02:32,  7.70s/it]

cosine_similarity: 
{'steps': np.float64(0.6423547652678897), 'ingredients': np.float64(0.47107454523320774)}
bleu_scores: 
{'steps': 0.0003296566738260276, 'ingredients': 0.019918393983362474}
rouge_scores: 
{'steps': {'rouge1': 0.13137714046284643, 'rougeL': 0.08352328695867307}, 'ingredients': {'rouge1': 0.20105820105820105, 'rougeL': 0.13756613756613756}}
llm_evaluation: 
{'feasibility_score': np.float64(0.35), 'comment': "The predicted recipe correctly identifies that the dish involves grilled steak cooked to medium-rare with a charred exterior. However, it significantly deviates from the ground truth and the visual evidence in the image. It includes roasted potatoes, which are not present. It suggests an 'herb butter' which is a different preparation than the liquid 'board dressing' shown on the sliced steak and described in the ground truth. The crucial technique of slicing the steak directly into the dressing on the board, visible in the image and central to the ground truth, i

20it [02:39,  7.98s/it]

cosine_similarity: 
{'steps': np.float64(0.4502754074629116), 'ingredients': np.float64(0.513716630432621)}
bleu_scores: 
{'steps': 0.0031078871291768937, 'ingredients': 0.054619286899042205}
rouge_scores: 
{'steps': {'rouge1': 0.14322067384947712, 'rougeL': 0.1136034351044493}, 'ingredients': {'rouge1': 0.4933954933954933, 'rougeL': 0.4933954933954933}}
llm_evaluation: 
{'feasibility_score': np.float64(0.1), 'comment': "As a chef, I can say this predicted recipe would not produce the dish shown in the image or described in the ground truth. The primary issue is the use of white beans instead of the green and wax beans depicted and specified. The herb mixture is also completely different from the shallots and basil in the ground truth. The predicted recipe includes an unnecessary oven preheating step and omits the crucial boiling step for the green/wax beans. While it technically generates a recipe for 'Herbed Bean Salad,' it's for a different type of salad entirely. The thinking proce


