In [2]:
import os
from datasets import load_dataset
import json


# Route HTTP(S) traffic through a proxy only when one is configured.
# The proxy URL (which may embed credentials) is read from the VISSIM_PROXY_URL
# environment variable so that no secret is stored in the notebook itself.
# SECURITY: a credential used to be committed here in plain text; it should be
# considered leaked and rotated.
proxy_address = os.environ.get("VISSIM_PROXY_URL")
if proxy_address:
    # Both spellings are set because different HTTP clients look at different ones.
    for proxy_var in ("http_proxy", "https_proxy", "HTTP_PROXY", "HTTPS_PROXY"):
        os.environ[proxy_var] = proxy_address

In [3]:

def calc_folding_nets_f1(model="gpt-4o", task="folding_nets_test",
                         res_dir="/mnt/petrelfs/songmingyang/code/reasoning/others/stare_open/vissim_eval/scripts/results/close_source/evals2",
                         dataset_cache_dir = "/mnt/petrelfs/songmingyang/songmingyang/data/mm/reasoning/vissim"
                         ):
    """Compute weighted-F1 metrics for a folding-nets / tangram-puzzle result file.

    Ground truth is loaded from the ``VisSim/{task}`` dataset ("train" split) and
    predictions from ``{res_dir}/{model}_{task}_{seed}.json.npy``.

    * If ``"vissim"`` occurs in ``task`` only seed 0 is read and its metrics are
      returned as a plain dict.
    * Otherwise seeds 0, 1 and 2 are read (missing files and runs that yield no
      metrics are reported and skipped) and every metric is averaged over the
      runs that actually reported it.

    Args:
        model: Model name used as the prefix of the result file name.
        task: Task name; must contain ``"folding_nets"`` or ``"tangram_puzzle"``.
        res_dir: Directory holding the ``.json.npy`` result files.
        dataset_cache_dir: ``cache_dir`` forwarded to ``datasets.load_dataset``.

    Returns:
        Mapping from metric name to value: ``overall_f1``, ``{variant}_f1`` per
        question variant, and ``{steps}_f1`` / ``{steps}_count`` per step count.
        Empty when no usable prediction was found.

    Raises:
        ValueError: If ``task`` is neither a folding-nets nor a tangram-puzzle task.
    """
    # Imported locally so the function does not depend on notebook-global imports.
    import json
    import os
    import numpy as np
    from sklearn.metrics import f1_score
    from collections import defaultdict
    from datasets import load_dataset

    dataset = f"VisSim/{task}"
    gt_data = load_dataset(dataset, cache_dir=dataset_cache_dir)["train"]

    def calc_f1(preds, gts):
        """Case-insensitive weighted F1 of ``preds`` against ``gts``."""
        preds = [p.lower() for p in preds]
        gts = [g.lower() for g in gts]
        return f1_score(gts, preds, average='weighted')

    def load_predictions(file_path):
        """Load a result file and return a mapping keyed by qid.

        SECURITY: ``allow_pickle=True`` unpickles arbitrary objects; only load
        result files from a trusted source.
        """
        raw = np.load(file_path, allow_pickle=True)
        try:
            raw[0]  # raises IndexError for a 0-d array wrapping a single object
            return {rec['qid']: rec for rec in raw}
        except (IndexError, KeyError, TypeError, ValueError):
            # Not an array of records carrying 'qid': unwrap the single stored object.
            return raw.item()

    def get_metrics(data, gt_data):
        """Compute overall, per-variant and per-step F1 for one prediction mapping."""
        pred_by_type = defaultdict(list)
        gt_by_type = defaultdict(list)

        # Number of steps for each ground-truth question.
        qid2steps = {}
        for sample in gt_data:
            if "folding_nets" in task:
                steps = 5  # folding-nets samples all use this fixed step count
            else:
                steps = len(json.loads(sample["question_info"])["instructions"])
            qid2steps[sample['qid']] = steps
        pred_by_steps = defaultdict(list)
        gt_by_steps = defaultdict(list)

        metrics = {}
        overall_gt = []
        overall_pred = []
        for k, v in data.items():
            pred = v['pred']
            gt_ans = v['gt_ans']
            # The variant name is encoded in the qid after a task-specific prefix.
            if "folding_nets" in task:
                variant = " ".join(k.split("_")[1:])
                if "all vis" in variant:
                    variant = f"all for valid {gt_ans}"
            elif "tangram_puzzle" in task:
                variant = " ".join(k.split("_")[3:])
                if "all for valid" in variant:
                    variant = f"all for valid {gt_ans}"
            else:
                # Previously this fell through and crashed on an unbound `variant`.
                raise ValueError(f"Unsupported task for variant parsing: {task}")
            if len(variant.strip()) == 0:
                return {}
            if k not in qid2steps:
                # Same policy as calc_tranformation_acc: report and skip unknown qids.
                print(f"qid {k} not in gt_data")
                continue

            s_ = qid2steps[k]
            pred_by_steps[s_].append(pred.lower())
            gt_by_steps[s_].append(gt_ans.lower())
            pred_by_type[variant].append(pred.lower())
            gt_by_type[variant].append(gt_ans.lower())
            overall_gt.append(gt_ans.lower())
            overall_pred.append(pred.lower())
        if len(overall_gt) == 0:
            print(f"Empty overall_gt")
            return {}
        metrics["overall_f1"] = calc_f1(overall_pred, overall_gt)
        for k in pred_by_type.keys():
            metrics[f"{k}_f1"] = calc_f1(pred_by_type[k], gt_by_type[k])
        for k in pred_by_steps.keys():
            metrics[f"{k}_f1"] = calc_f1(pred_by_steps[k], gt_by_steps[k])
            metrics[f"{k}_count"] = len(gt_by_steps[k])
        return metrics

    if "vissim" in task:
        data = load_predictions(f"{res_dir}/{model}_{task}_0.json.npy")
        metrics = get_metrics(data, gt_data)
    else:
        metrics = defaultdict(float)
        # How many runs reported each metric, so a key missing from one seed
        # (or a seed with no metrics at all) does not deflate the average.
        runs_per_key = defaultdict(int)
        for seed in range(3):
            file_path = f"{res_dir}/{model}_{task}_{seed}.json.npy"
            if not os.path.exists(file_path):
                print(f"File not found: {file_path}")
                continue
            metrics_ = get_metrics(load_predictions(file_path), gt_data)
            if len(metrics_) == 0:
                print(f"Empty metrics for {model}_{task}_{seed}.json")
                continue
            for k, v in metrics_.items():
                metrics[k] += v
                runs_per_key[k] += 1
        for k in metrics.keys():
            metrics[k] /= runs_per_key[k]
    for k in metrics.keys():
        print(f"{k}: {metrics[k]}")
    return metrics


def calc_tranformation_acc(model="gpt-4o", task="2d", visim=False, debug=False,
                           res_dir="/mnt/petrelfs/songmingyang/code/reasoning/others/stare_open/vissim_eval/scripts/results/close_source/evals2",
                           dataset_cache_dir = "/mnt/petrelfs/songmingyang/songmingyang/data/mm/reasoning/vissim"):
    """Compute accuracy metrics for the transformation tasks ("va" and "text_instruct").

    For each task type the ground truth is loaded from
    ``VisSim/{task}_{task_type}_test`` (or ``..._vissim_test`` when ``visim`` is
    true) and predictions from the matching ``.json.npy`` file in ``res_dir``.

    * ``visim=False``: seeds 0, 1 and 2 are read per task type, averaged per
      task, then averaged across task types.
    * ``visim=True``: only seed 0 is read per task type, and the per-task
      metrics are averaged across task types.

    Every average is taken over the runs / task types that actually reported the
    metric, so a metric present in a single task type (e.g. one that only exists
    for "text_instruct") is returned unchanged instead of being halved.

    Args:
        model: Model name used as the prefix of the result file name.
        task: Task family used in dataset and file names (e.g. ``"2d"``).
        visim: Select the ``vissim_test`` datasets/results; in that mode the
            per-difficulty, per-step and per-transformation breakdowns only use
            samples whose variant is ``"all"``.
        debug: Drop into ``ipdb`` on every wrong prediction.
        res_dir: Directory holding the ``.json.npy`` result files.
        dataset_cache_dir: ``cache_dir`` forwarded to ``datasets.load_dataset``.

    Returns:
        ``defaultdict(float)`` mapping ``{name}_acc`` to the overall (cross-task)
        accuracy. Both modes return the overall metrics that are printed last.
    """
    # Imported locally so the function does not depend on notebook-global imports.
    import os
    import numpy as np
    from collections import defaultdict
    from datasets import load_dataset
    task_type = ["va", "text_instruct"]
    # Order matters only for the order in which metrics are reported.
    known_transformations = ('shear', 'scale', 'rotate', 'translate', 'flip')

    def load_predictions(file_path):
        """Load a result file and return a mapping keyed by qid.

        SECURITY: ``allow_pickle=True`` unpickles arbitrary objects; only load
        result files from a trusted source.
        """
        raw = np.load(file_path, allow_pickle=True)
        try:
            raw[0]  # raises IndexError for a 0-d array wrapping a single object
            return {rec['qid']: rec for rec in raw}
        except (IndexError, KeyError, TypeError, ValueError):
            # Not an array of records carrying 'qid': unwrap the single stored object.
            return raw.item()

    def accuracy(preds, gts):
        """Fraction of positions where prediction and ground truth are equal."""
        return sum([p == g for p, g in zip(preds, gts)])/len(gts)

    def average_runs(runs):
        """Average each metric over the runs that actually reported it."""
        totals = defaultdict(float)
        counts = defaultdict(int)
        for run in runs:
            for k, v in run.items():
                totals[k] += v
                counts[k] += 1
        for k in totals.keys():
            totals[k] /= counts[k]
        return totals

    def get_metrics(data, gt_data):
        """Compute overall and per-group accuracy for one prediction mapping."""
        qid2transformation = {}
        for sample in gt_data:
            transformations = sample['transformations']
            qid2transformation[sample['qid']] = [
                t_ for t_ in known_transformations if t_ in transformations
            ]
        pred_by_type = defaultdict(list)
        gt_by_type = defaultdict(list)

        pred_by_steps = defaultdict(list)
        gt_by_steps = defaultdict(list)

        pred_by_difficulty = defaultdict(list)
        gt_by_difficulty = defaultdict(list)

        pred_by_transformation = defaultdict(list)
        gt_by_transformation = defaultdict(list)

        metrics = {}
        overall_gt = []
        overall_pred = []
        for k, v in data.items():
            pred = v['pred'].lower()
            gt_ans = v['gt_ans'].lower()
            # The qid is split on "_" as {num_steps}_{difficulty}_{variant...}_{last part}.
            parts = k.split("_")
            num_steps = parts[0]
            difficulty = parts[1]
            variant = " ".join(parts[2:-1])
            if k not in qid2transformation:
                print(f"qid {k} not in gt_data")
                continue
            trans = qid2transformation[k]
            pred_by_type[variant].append(pred)
            gt_by_type[variant].append(gt_ans)

            if num_steps == "" or variant == "" or difficulty == "":
                print(f"Empty keys for {k}")
                return {}

            overall_gt.append(gt_ans)
            overall_pred.append(pred)

            # In visim mode the finer breakdowns only consider the "all" variant.
            if visim and variant != "all":
                continue
            pred_by_difficulty[difficulty].append(pred)
            gt_by_difficulty[difficulty].append(gt_ans)
            pred_by_steps[num_steps].append(pred)
            gt_by_steps[num_steps].append(gt_ans)
            for t_ in trans:
                pred_by_transformation[t_].append(pred)
                gt_by_transformation[t_].append(gt_ans)
            if debug and pred != gt_ans:
                import ipdb
                ipdb.set_trace()
        if len(overall_gt) == 0:
            print(f"Empty overall_gt")
            return {}
        metrics["overall_acc"] = accuracy(overall_pred, overall_gt)
        for preds, gts in (
            (pred_by_type, gt_by_type),
            (pred_by_difficulty, gt_by_difficulty),
            (pred_by_steps, gt_by_steps),
            (pred_by_transformation, gt_by_transformation),
        ):
            for k in preds.keys():
                metrics[f"{k}_acc"] = accuracy(preds[k], gts[k])
        return metrics

    if not visim:
        gt_data = {}
        for task_ in task_type:
            dataset = f"VisSim/{task}_{task_}_test"
            gt_data[task_] = load_dataset(dataset, cache_dir=dataset_cache_dir)["train"]
        per_task_metrics = []
        for task_ in task_type:
            seed_metrics = []
            for seed in range(3):
                file_path = f"{res_dir}/{model}_{task}_{task_}_test_{seed}.json.npy"
                if not os.path.exists(file_path):
                    print(f"File not found: {file_path}")
                    continue
                metrics_ = get_metrics(load_predictions(file_path), gt_data[task_])
                if len(metrics_) == 0:
                    print(f"Empty metrics for {model}_{task}_{task_}_test_{seed}.json")
                    continue
                seed_metrics.append(metrics_)
                print(f">>>>>>>>>>{task_} metrics:")
                for k, v in metrics_.items():
                    print(f"\t\t{k}: {v}")
            task_metrics = average_runs(seed_metrics)
            print(f">>>>>>>>>>Task metrics for {task_}:")
            for k in task_metrics.keys():
                print(f"{k}: {task_metrics[k]}")
            per_task_metrics.append(task_metrics)
    else:
        gt_data = {}
        for task_ in task_type:
            dataset = f"VisSim/{task}_{task_}_vissim_test"
            gt_data[task_] = load_dataset(dataset, cache_dir=dataset_cache_dir)["train"]
        per_task_metrics = []
        for task_ in task_type:
            file_path = f"{res_dir}/{model}_{task}_{task_}_vissim_test_0.json.npy"
            if not os.path.exists(file_path):
                print(f"File not found: {file_path}")
                continue
            metrics_ = get_metrics(load_predictions(file_path), gt_data[task_])
            per_task_metrics.append(metrics_)
            print(f">>>>>>>>>>{task_} metrics:")
            for k, v in metrics_.items():
                print(f"\t\t{k}: {v}")

    # Previously the non-visim path returned only the last task's metrics.
    metrics = average_runs(per_task_metrics)
    print(f">>>>>>>>>>Overall metrics:")
    for k in metrics.keys():
        print(f"{k}: {metrics[k]}")

    return metrics

In [4]:
# Evaluate the folding-nets task with every default argument (model "gpt-4o",
# task "folding_nets_test", default res_dir and dataset_cache_dir).
# The function prints each metric and also returns them as a mapping.
calc_folding_nets_f1()

overall_f1: 0.5027343623164179
q only_f1: 0.5015633006964447
q+steps_f1: 0.5043618816945162
5_f1: 0.5027343623164179
5_count: 193.0


In [9]:
# Evaluate claude-3-5-sonnet on the "2d" transformation task using the vissim
# result files (visim=True). Alternative invocations kept for reference:
# calc_tranformation_acc()
# calc_tranformation_acc("claude-3-5-sonnet-20241022", "2d", visim=False, debug=False)
# calc_tranformation_acc("gemini-2.0-flash-thinking-exp-01-21", "2d", visim=False, debug=False)
# metrics = calc_tranformation_acc(visim=True)
calc_tranformation_acc("claude-3-5-sonnet-20241022", "2d", visim=True, debug=False)

>>>>>>>>>>va metrics:
		overall_acc: 0.7647058823529411
		all_acc: 0.7598039215686274
		partial_acc: 0.7745098039215687
		easy_acc: 0.8088235294117647
		medium_acc: 0.7941176470588235
		hard_acc: 0.6764705882352942
		2steps_acc: 0.7843137254901961
		3steps_acc: 0.7352941176470589
		rotate_acc: 0.7281553398058253
		translate_acc: 0.8
		shear_acc: 0.5476190476190477
		scale_acc: 0.9193548387096774
		flip_acc: 0.7692307692307693
>>>>>>>>>>text_instruct metrics:
		overall_acc: 0.8809815950920246
		all w ans_acc: 0.8690095846645367
		all_acc: 0.8939393939393939
		all last_acc: 0.8939393939393939
		partial_acc: 0.8679245283018868
		easy_acc: 0.9154929577464789
		medium_acc: 0.8939393939393939
		hard_acc: 0.8688524590163934
		2steps_acc: 0.8804347826086957
		3steps_acc: 0.9056603773584906
		rotate_acc: 0.8598130841121495
		translate_acc: 0.9151515151515152
		flip_acc: 0.9117647058823529
		scale_acc: 0.9324324324324325
>>>>>>>>>>Overall metrics:
overall_acc: 0.8228437387224828
all_acc: 0.82687

In [10]:
# NOTE(review): no visible cell assigns a module-level `metrics`; this presumably
# relies on an earlier run of `metrics = calc_tranformation_acc(visim=True)`
# (currently commented out) still being in the kernel — confirm, or assign the
# result of the call before displaying it so Restart & Run All works.
metrics