In [1]:
import os
from datasets import load_dataset
import json


proxy_address = "http://songmingyang:dSpydxsxxhKix63HfIFhjwnZLEInXEDawSoMD35G1IT2CygKnHsJqG9ZHbEP@10.1.20.50:23128/"
os.environ["http_proxy"] = proxy_address
os.environ["https_proxy"] = proxy_address
os.environ["HTTP_PROXY"] = proxy_address

os.environ["HTTPS_PROXY"] = proxy_address

  from .autonotebook import tqdm as notebook_tqdm


In [2]:

def calc_folding_nets_f1(model="gpt-4o", task="folding_nets_test", 
                         res_dir="/mnt/petrelfs/songmingyang/code/reasoning/others/stare_open/vissim_eval/scripts/results/close_source/evals2",
                         dataset_cache_dir = "/mnt/petrelfs/songmingyang/songmingyang/data/mm/reasoning/vissim"
                         ):
    import numpy as np
    from sklearn.metrics import f1_score
    from collections import defaultdict
    from datasets import load_dataset
    
    dataset = f"VisSim/{task}"
    gt_data = load_dataset(dataset, cache_dir=dataset_cache_dir)["train"]

    def calc_f1(preds, gts):
        preds = [p.lower() for p in preds]
        gts = [g.lower() for g in gts]
        return f1_score(gts, preds, average='weighted')
    

    def get_metrics(data, gt_data):
        pred_by_type = defaultdict(list)
        gt_by_type = defaultdict(list)

        # if "tangram_puzzle" in task:
        # load gt and get the number of steps
        qid2steps = {}
        for sample in gt_data:
            qid = sample['qid']
            if "folding_nets" in task:
                steps = 5
            else:
                steps = len(json.loads(sample["question_info"])["instructions"])
            qid2steps[qid] = steps
        pred_by_steps = defaultdict(list)
        gt_by_steps = defaultdict(list)

        metrics = {}
        overall_gt = []
        overall_pred = []
        for k, v in data.items():
            pred = v['pred']
            gt_ans = v['gt_ans']
            if "folding_nets" in task:
                variant = " ".join(k.split("_")[1:])
                if "all vis" in variant:
                    variant = f"all for valid {gt_ans}"
            elif "tangram_puzzle" in task:
                variant = " ".join(k.split("_")[3:])
                if "all for valid" in variant:
                    variant = f"all for valid {gt_ans}"
            if len(variant.strip()) == 0:
                return {}
            
            
            
            s_ = qid2steps[k]
            pred_by_steps[s_].append(pred.lower())
            gt_by_steps[s_].append(gt_ans.lower())
            pred_by_type[variant].append(pred.lower())
            gt_by_type[variant].append(gt_ans.lower())
            overall_gt.append(gt_ans.lower())
            overall_pred.append(pred.lower())
        metrics["overall_f1"] = calc_f1(overall_pred, overall_gt)
        for k in pred_by_type.keys():
            metrics[f"{k}_f1"] = calc_f1(pred_by_type[k], gt_by_type[k])
        for k in pred_by_steps.keys():
            metrics[f"{k}_f1"] = calc_f1(pred_by_steps[k], gt_by_steps[k])
            metrics[f"{k}_count"] = len(gt_by_steps[k])
        return metrics


    if "vissim" in task:
        data = np.load(f"{res_dir}/{model}_{task}_0.json.npy", allow_pickle=True)
        try:
            d_ = data[0]
            new_data = {d_['qid']: d_ for d_ in data}
            data = new_data
        except:
            data = data.item()
        metrics = get_metrics(data, gt_data)
    else:
        metrics = defaultdict(float)
        valid_samples = 0
        for seed in range(3):
            file_path = f"{res_dir}/{model}_{task}_{seed}.json.npy"
            if not os.path.exists(file_path):
                print(f"File not found: {file_path}")
                continue
            data  = np.load(file_path, allow_pickle=True)
            # convert numpy object to dict
            try:
                d_ = data[0]
                new_data = {d_['qid']: d_ for d_ in data}
                data = new_data
            except:
                data = data.item()
            metrics_ = get_metrics(data, gt_data)
            if len(metrics_) == 0:
                print(f"Empty metrics for {model}_{task}_{seed}.json")
            for k, v in metrics_.items():
                metrics[k] += v
            valid_samples += 1
        for k in metrics.keys():
            metrics[k] /= valid_samples
    for k in metrics.keys():
        print(f"{k}: {metrics[k]}")
    return metrics


def calc_tranformation_acc(model="gpt-4o", task="2d", visim=False, debug=False, 
                           res_dir="/mnt/petrelfs/songmingyang/code/reasoning/others/stare_open/vissim_eval/scripts/results/close_source/evals2",
                           dataset_cache_dir = "/mnt/petrelfs/songmingyang/songmingyang/data/mm/reasoning/vissim"):
    import numpy as np
    from collections import defaultdict
    from datasets import load_dataset
    task_type = ["va", "text_instruct"]
    # task_type = ["va"]
    
    def get_metrics(data, gt_data):
        qid2transformation = {}
        for sample in gt_data:
            qid = sample['qid']
            transformations = sample['transformations']
            trans = []
            if 'shear' in transformations:
                trans.append('shear')
            if 'scale' in transformations:
                trans.append('scale')
            if 'rotate' in transformations:
                trans.append('rotate')
            if 'translate' in transformations:
                trans.append('translate')
            if 'flip' in transformations:
                trans.append('flip')
            # else:
            #     print(f"Unknown transformation for {transformations}")
            qid2transformation[qid] = trans
        pred_by_type = defaultdict(list)
        gt_by_type = defaultdict(list)

        pred_by_steps = defaultdict(list)
        gt_by_steps = defaultdict(list)

        pred_by_difficulty = defaultdict(list)
        gt_by_difficulty = defaultdict(list)

        pred_by_transformation = defaultdict(list)
        gt_by_transformation = defaultdict(list)

        metrics = {}
        overall_gt = []
        overall_pred = []
        valid_samples = 0
        for k, v in data.items():
            pred = v['pred'].lower()
            gt_ans = v['gt_ans'].lower()
            variant = " ".join(k.split("_")[2:-1])
            difficulty = k.split("_")[1]
            num_steps = k.split("_")[0]
            if k not in qid2transformation:
                print(f"qid {k} not in gt_data")
                continue
            valid_samples += 1
            trans = qid2transformation[k]
            # if len(trans) == 1:
            pred_by_type[variant].append(pred.lower())
            gt_by_type[variant].append(gt_ans.lower())
        
            if num_steps == "" or variant == "" or difficulty == "":
                print(f"Empty keys for {k}")
                return {}

            overall_gt.append(gt_ans.lower())
            overall_pred.append(pred.lower())

            if visim and variant != "all":
                continue
            pred_by_difficulty[difficulty].append(pred.lower())
            gt_by_difficulty[difficulty].append(gt_ans.lower())
            pred_by_steps[num_steps].append(pred.lower())
            gt_by_steps[num_steps].append(gt_ans.lower())
            for t_ in trans:
                pred_by_transformation[t_].append(pred.lower())
                gt_by_transformation[t_].append(gt_ans.lower())
            if debug and pred != gt_ans:
                import ipdb
                ipdb.set_trace()
        if len(overall_gt) == 0:
            print(f"Empty overall_gt")
            return {}
        metrics["overall_acc"] = sum([p == g for p, g in zip(overall_pred, overall_gt)])/len(overall_gt)
        for k in pred_by_type.keys():
            metrics[f"{k}_acc"] = sum([p == g for p, g in zip(pred_by_type[k], gt_by_type[k])])/len(gt_by_type[k])
        for k in pred_by_difficulty.keys():
            metrics[f"{k}_acc"] = sum([p == g for p, g in zip(pred_by_difficulty[k], gt_by_difficulty[k])])/len(gt_by_difficulty[k])
        for k in pred_by_steps.keys():
            metrics[f"{k}_acc"] = sum([p == g for p, g in zip(pred_by_steps[k], gt_by_steps[k])])/len(gt_by_steps[k])
        for k in pred_by_transformation.keys():
            metrics[f"{k}_acc"] = sum([p == g for p, g in zip(pred_by_transformation[k], gt_by_transformation[k])])/len(gt_by_transformation[k])
        return metrics
    if not visim:
        gt_data = {}
        for task_ in task_type:
            dataset = f"VisSim/{task}_{task_}_test"
            gt_data[task_] = load_dataset(dataset, cache_dir=dataset_cache_dir)["train"]
        overall_metrics = defaultdict(float)
        valid_sample = 0
        for task_ in task_type:
            metrics = defaultdict(float)
            task_valid_sample = 0
            for seed in range(3):
                file_path = f"{res_dir}/{model}_{task}_{task_}_test_{seed}.json.npy"
                if not os.path.exists(file_path):
                    print(f"File not found: {file_path}")
                    continue
                data = np.load(file_path, allow_pickle=True)
                try:
                    d_ = data[0]
                    new_data = {d_['qid']: d_ for d_ in data}
                    data = new_data
                except:
                    data = data.item()
                metrics_ = get_metrics(data, gt_data[task_])
                if len(metrics_) == 0:
                    print(f"Empty metrics for {model}_{task}_{task_}_test_{seed}.json")
                    continue
                for k, v in metrics_.items():
                    metrics[k] += v
                print(f">>>>>>>>>>{task_} metrics:")
                for k, v in metrics_.items():
                    print(f"\t\t{k}: {v}")
                valid_sample += 1
                task_valid_sample += 1
            print(f">>>>>>>>>>Task metrics for {task_}:")
            for k in metrics.keys():
                metrics[k] /= task_valid_sample
                print(f"{k}: {metrics[k]}") 
            for k, v in metrics.items():
                overall_metrics[k] += v
        print(f">>>>>>>>>>Overall metrics:")
        for k in overall_metrics.keys():
            overall_metrics[k] /= len(task_type)
            print(f"{k}: {overall_metrics[k]}")
    else:

        gt_data = {}
        for task_ in task_type:
            dataset = f"VisSim/{task}_{task_}_vissim_test"
            gt_data[task_] = load_dataset(dataset, cache_dir=dataset_cache_dir)["train"]
        metrics = defaultdict(float)
        for task_ in task_type:
            file_path = f"{res_dir}/{model}_{task}_{task_}_vissim_test_0.json.npy"
            if not os.path.exists(file_path):
                print(f"File not found: {file_path}")
                continue
            data = np.load(file_path, allow_pickle=True)
            try:
                d_ = data[0]
                new_data = {d_['qid']: d_ for d_ in data}
                data = new_data
            except:
                data = data.item()
            metrics_ = get_metrics(data, gt_data[task_])
            for k, v in metrics_.items():
                metrics[k] += v
            print(f">>>>>>>>>>{task_} metrics:")
            for k, v in metrics_.items():
                print(f"\t\t{k}: {v}")
        print(f">>>>>>>>>>Overall metrics:")
        for k in metrics.keys():
            if k not in ["all w ans_acc", "all last_acc"]:
                metrics[k] /= 2
            print(f"{k}: {metrics[k]}")
    
    return metrics

In [3]:
# def calc_folding_nets_f1(model="gpt-4o", task="folding_nets_test", 
#                          res_dir="/mnt/petrelfs/songmingyang/code/reasoning/others/stare_open/vissim_eval/scripts/results/close_source/evals2",
#                          dataset_cache_dir = "/mnt/petrelfs/songmingyang/songmingyang/data/mm/reasoning/vissim"
#                          ):
calc_folding_nets_f1()

overall_f1: 0.5027343623164179
q only_f1: 0.5015633006964447
q+steps_f1: 0.5043618816945162
5_f1: 0.5027343623164179
5_count: 193.0


defaultdict(float,
            {'overall_f1': 0.5027343623164179,
             'q only_f1': 0.5015633006964447,
             'q+steps_f1': 0.5043618816945162,
             '5_f1': 0.5027343623164179,
             '5_count': 193.0})

In [4]:
calc_tranformation_acc()

>>>>>>>>>>va metrics:
		overall_acc: 0.6895424836601307
		no_acc: 0.6895424836601307
		easy_acc: 0.7941176470588235
		medium_acc: 0.6862745098039216
		hard_acc: 0.5882352941176471
		1steps_acc: 0.5882352941176471
		2steps_acc: 0.7254901960784313
		3steps_acc: 0.7549019607843137
		shear_acc: 0.5714285714285714
		translate_acc: 0.704225352112676
		scale_acc: 0.8888888888888888
		rotate_acc: 0.6896551724137931
		flip_acc: 0.8214285714285714
>>>>>>>>>>va metrics:
		overall_acc: 0.6895424836601307
		no_acc: 0.6895424836601307
		easy_acc: 0.7843137254901961
		medium_acc: 0.6372549019607843
		hard_acc: 0.6470588235294118
		1steps_acc: 0.5784313725490197
		2steps_acc: 0.7647058823529411
		3steps_acc: 0.7254901960784313
		shear_acc: 0.4897959183673469
		translate_acc: 0.7276995305164319
		scale_acc: 0.8777777777777778
		rotate_acc: 0.6896551724137931
		flip_acc: 0.8214285714285714
>>>>>>>>>>va metrics:
		overall_acc: 0.7124183006535948
		no_acc: 0.7124183006535948
		easy_acc: 0.8333333333333334

defaultdict(float,
            {'overall_acc': 0.7267267267267267,
             'no_acc': 0.7267267267267267,
             'easy_acc': 0.7614942528735632,
             'medium_acc': 0.7040229885057471,
             'hard_acc': 0.712871287128713,
             '1steps_acc': 0.6726726726726726,
             '2steps_acc': 0.7755775577557756,
             '3steps_acc': 0.7355371900826446,
             'translate_acc': 0.7330729166666666,
             'scale_acc': 0.9061488673139159,
             'rotate_acc': 0.6780821917808219,
             'flip_acc': 0.6578947368421053})

In [5]:
calc_tranformation_acc("claude-3-5-sonnet-20241022", "2d", visim=False, debug=False)

>>>>>>>>>>va metrics:
		overall_acc: 0.6830065359477124
		no_acc: 0.6830065359477124
		easy_acc: 0.7549019607843137
		medium_acc: 0.6666666666666666
		hard_acc: 0.6274509803921569
		1steps_acc: 0.6568627450980392
		2steps_acc: 0.7352941176470589
		3steps_acc: 0.6568627450980392
		shear_acc: 0.5306122448979592
		translate_acc: 0.6995305164319249
		scale_acc: 0.8777777777777778
		rotate_acc: 0.6379310344827587
		flip_acc: 0.7142857142857143
>>>>>>>>>>va metrics:
		overall_acc: 0.7026143790849673
		no_acc: 0.7026143790849673
		easy_acc: 0.803921568627451
		medium_acc: 0.6862745098039216
		hard_acc: 0.6176470588235294
		1steps_acc: 0.6470588235294118
		2steps_acc: 0.7745098039215687
		3steps_acc: 0.6862745098039216
		shear_acc: 0.6326530612244898
		translate_acc: 0.7370892018779343
		scale_acc: 0.8666666666666667
		rotate_acc: 0.603448275862069
		flip_acc: 0.8214285714285714
>>>>>>>>>>va metrics:
		overall_acc: 0.6666666666666666
		no_acc: 0.6666666666666666
		easy_acc: 0.7352941176470589


defaultdict(float,
            {'overall_acc': 0.6346346346346347,
             'no_acc': 0.6346346346346347,
             'easy_acc': 0.6867816091954023,
             'medium_acc': 0.6178160919540231,
             'hard_acc': 0.594059405940594,
             '1steps_acc': 0.6516516516516516,
             '2steps_acc': 0.6501650165016502,
             '3steps_acc': 0.6060606060606061,
             'translate_acc': 0.6184895833333334,
             'scale_acc': 0.8446601941747572,
             'rotate_acc': 0.5684931506849316,
             'flip_acc': 0.5263157894736842})

In [6]:
calc_tranformation_acc("gemini-2.0-flash-thinking-exp-01-21", "2d", visim=False, debug=False)

>>>>>>>>>>va metrics:
		overall_acc: 0.5261437908496732
		no_acc: 0.5261437908496732
		easy_acc: 0.696078431372549
		medium_acc: 0.45098039215686275
		hard_acc: 0.43137254901960786
		1steps_acc: 0.5098039215686274
		2steps_acc: 0.4803921568627451
		3steps_acc: 0.5882352941176471
		shear_acc: 0.40816326530612246
		translate_acc: 0.5352112676056338
		scale_acc: 0.7333333333333333
		rotate_acc: 0.4827586206896552
		flip_acc: 0.6071428571428571
>>>>>>>>>>va metrics:
		overall_acc: 0.5620915032679739
		no_acc: 0.5620915032679739
		easy_acc: 0.6666666666666666
		medium_acc: 0.5784313725490197
		hard_acc: 0.4411764705882353
		1steps_acc: 0.5588235294117647
		2steps_acc: 0.5980392156862745
		3steps_acc: 0.5294117647058824
		shear_acc: 0.46938775510204084
		translate_acc: 0.5727699530516432
		scale_acc: 0.7444444444444445
		rotate_acc: 0.4827586206896552
		flip_acc: 0.5357142857142857
>>>>>>>>>>va metrics:
		overall_acc: 0.5424836601307189
		no_acc: 0.5424836601307189
		easy_acc: 0.627450980392

defaultdict(float,
            {'overall_acc': 0.6681681681681682,
             'no_acc': 0.6681681681681682,
             'easy_acc': 0.6551724137931034,
             'medium_acc': 0.6939655172413793,
             'hard_acc': 0.6534653465346535,
             '1steps_acc': 0.617117117117117,
             '2steps_acc': 0.6732673267326733,
             '3steps_acc': 0.7107438016528926,
             'translate_acc': 0.677734375,
             'scale_acc': 0.8300970873786407,
             'rotate_acc': 0.6061643835616439,
             'flip_acc': 0.631578947368421})

In [7]:
metrics = calc_tranformation_acc(visim=True)

>>>>>>>>>>va metrics:
		overall_acc: 0.7647058823529411
		all_acc: 0.7598039215686274
		partial_acc: 0.7745098039215687
		easy_acc: 0.8088235294117647
		medium_acc: 0.7941176470588235
		hard_acc: 0.6764705882352942
		2steps_acc: 0.7843137254901961
		3steps_acc: 0.7352941176470589
		rotate_acc: 0.7281553398058253
		translate_acc: 0.8
		shear_acc: 0.5476190476190477
		scale_acc: 0.9193548387096774
		flip_acc: 0.7692307692307693
>>>>>>>>>>text_instruct metrics:
		overall_acc: 0.8809815950920246
		all w ans_acc: 0.8690095846645367
		all_acc: 0.8939393939393939
		all last_acc: 0.8939393939393939
		partial_acc: 0.8679245283018868
		easy_acc: 0.9154929577464789
		medium_acc: 0.8939393939393939
		hard_acc: 0.8688524590163934
		2steps_acc: 0.8804347826086957
		3steps_acc: 0.9056603773584906
		rotate_acc: 0.8598130841121495
		translate_acc: 0.9151515151515152
		flip_acc: 0.9117647058823529
		scale_acc: 0.9324324324324325
>>>>>>>>>>Overall metrics:
overall_acc: 0.8228437387224828
all_acc: 0.82687

In [8]:
calc_tranformation_acc("claude-3-5-sonnet-20241022", "2d", visim=True, debug=False)

>>>>>>>>>>va metrics:
		overall_acc: 0.6928104575163399
		all_acc: 0.7058823529411765
		partial_acc: 0.6666666666666666
		easy_acc: 0.7647058823529411
		medium_acc: 0.7205882352941176
		hard_acc: 0.6323529411764706
		2steps_acc: 0.7058823529411765
		3steps_acc: 0.7058823529411765
		rotate_acc: 0.7087378640776699
		translate_acc: 0.7393939393939394
		shear_acc: 0.5
		scale_acc: 0.8548387096774194
		flip_acc: 0.7307692307692307
>>>>>>>>>>text_instruct metrics:
		overall_acc: 0.7251533742331289
		all w ans_acc: 0.7507987220447284
		all_acc: 0.7222222222222222
		all last_acc: 0.7070707070707071
		partial_acc: 0.6886792452830188
		easy_acc: 0.7887323943661971
		medium_acc: 0.6515151515151515
		hard_acc: 0.7213114754098361
		2steps_acc: 0.717391304347826
		3steps_acc: 0.7264150943396226
		rotate_acc: 0.7289719626168224
		translate_acc: 0.7393939393939394
		flip_acc: 0.5588235294117647
		scale_acc: 0.8378378378378378
>>>>>>>>>>Overall metrics:
overall_acc: 0.7089819158747344
all_acc: 0.714052

defaultdict(float,
            {'overall_acc': 0.7089819158747344,
             'all_acc': 0.7140522875816994,
             'partial_acc': 0.6776729559748427,
             'easy_acc': 0.7767191383595691,
             'medium_acc': 0.6860516934046346,
             'hard_acc': 0.6768322082931533,
             '2steps_acc': 0.7116368286445013,
             '3steps_acc': 0.7161487236403996,
             'rotate_acc': 0.7188549133472462,
             'translate_acc': 0.7393939393939394,
             'shear_acc': 0.25,
             'scale_acc': 0.8463382737576286,
             'flip_acc': 0.6447963800904977,
             'all w ans_acc': 0.7507987220447284,
             'all last_acc': 0.7070707070707071})

In [9]:
calc_tranformation_acc("gemini-2.0-flash-thinking-exp-01-21", "2d", visim=True, debug=False)

>>>>>>>>>>va metrics:
		overall_acc: 0.5555555555555556
		all_acc: 0.5245098039215687
		partial_acc: 0.6176470588235294
		easy_acc: 0.5441176470588235
		medium_acc: 0.5588235294117647
		hard_acc: 0.47058823529411764
		2steps_acc: 0.46078431372549017
		3steps_acc: 0.5882352941176471
		rotate_acc: 0.49514563106796117
		translate_acc: 0.5636363636363636
		shear_acc: 0.40476190476190477
		scale_acc: 0.7096774193548387
		flip_acc: 0.6153846153846154
>>>>>>>>>>text_instruct metrics:
		overall_acc: 0.7214723926380369
		all w ans_acc: 0.7476038338658147
		all_acc: 0.7323232323232324
		all last_acc: 0.6868686868686869
		partial_acc: 0.6886792452830188
		easy_acc: 0.7605633802816901
		medium_acc: 0.7424242424242424
		hard_acc: 0.6885245901639344
		2steps_acc: 0.7934782608695652
		3steps_acc: 0.6792452830188679
		rotate_acc: 0.6822429906542056
		translate_acc: 0.7393939393939394
		flip_acc: 0.7058823529411765
		scale_acc: 0.8918918918918919
>>>>>>>>>>Overall metrics:
overall_acc: 0.63851397409679

defaultdict(float,
            {'overall_acc': 0.6385139740967962,
             'all_acc': 0.6284165181224005,
             'partial_acc': 0.6531631520532741,
             'easy_acc': 0.6523405136702568,
             'medium_acc': 0.6506238859180036,
             'hard_acc': 0.579556412729026,
             '2steps_acc': 0.6271312872975277,
             '3steps_acc': 0.6337402885682575,
             'rotate_acc': 0.5886943108610834,
             'translate_acc': 0.6515151515151515,
             'shear_acc': 0.20238095238095238,
             'scale_acc': 0.8007846556233653,
             'flip_acc': 0.660633484162896,
             'all w ans_acc': 0.7476038338658147,
             'all last_acc': 0.6868686868686869})

In [12]:
# def calc_tranformation_acc(model="gpt-4o", task="2d", visim=False, debug=False, 
#                            res_dir="/mnt/petrelfs/songmingyang/code/reasoning/others/stare_open/vissim_eval/scripts/results/close_source/evals2",
#                            dataset_cache_dir = "/mnt/petrelfs/songmingyang/songmingyang/data/mm/reasoning/vissim"):
# calc_tranformation_acc()
# calc_tranformation_acc("claude-3-5-sonnet-20241022", "2d", visim=False, debug=False)
# calc_tranformation_acc("gemini-2.0-flash-thinking-exp-01-21", "2d", visim=False, debug=False)
# metrics = calc_tranformation_acc(visim=True)
# calc_tranformation_acc("claude-3-5-sonnet-20241022", "2d", visim=True, debug=False)
# calc_tranformation_acc("gemini-2.0-flash-thinking-exp-01-21", "2d", visim=True, debug=False)

>>>>>>>>>>va metrics:
		overall_acc: 0.5555555555555556
		all_acc: 0.5245098039215687
		partial_acc: 0.6176470588235294
		easy_acc: 0.5441176470588235
		medium_acc: 0.5588235294117647
		hard_acc: 0.47058823529411764
		2steps_acc: 0.46078431372549017
		3steps_acc: 0.5882352941176471
		rotate_acc: 0.49514563106796117
		translate_acc: 0.5636363636363636
		shear_acc: 0.40476190476190477
		scale_acc: 0.7096774193548387
		flip_acc: 0.6153846153846154
>>>>>>>>>>text_instruct metrics:
		overall_acc: 0.7214723926380369
		all w ans_acc: 0.7476038338658147
		all_acc: 0.7323232323232324
		all last_acc: 0.6868686868686869
		partial_acc: 0.6886792452830188
		easy_acc: 0.7605633802816901
		medium_acc: 0.7424242424242424
		hard_acc: 0.6885245901639344
		2steps_acc: 0.7934782608695652
		3steps_acc: 0.6792452830188679
		rotate_acc: 0.6822429906542056
		translate_acc: 0.7393939393939394
		flip_acc: 0.7058823529411765
		scale_acc: 0.8918918918918919
>>>>>>>>>>Overall metrics:
overall_acc: 0.63851397409679

In [14]:
calc_tranformation_acc(model="gpt-4o", task="3d", visim=False)
# calc_tranformation_acc("claude-3-5-sonnet-20241022", "2d", visim=False, debug=False)
# calc_tranformation_acc("gemini-2.0-flash-thinking-exp-01-21", "2d", visim=False, debug=False)
# metrics = calc_tranformation_acc(visim=True)
# calc_tranformation_acc("claude-3-5-sonnet-20241022", "2d", visim=True, debug=False)
# calc_tranformation_acc("gemini-2.0-flash-thinking-exp-01-21", "2d", visim=True, debug=False)

>>>>>>>>>>va metrics:
		overall_acc: 0.6830065359477124
		no_acc: 0.6830065359477124
		easy_acc: 0.7549019607843137
		medium_acc: 0.6666666666666666
		hard_acc: 0.6274509803921569
		1steps_acc: 0.5980392156862745
		2steps_acc: 0.7254901960784313
		3steps_acc: 0.7254901960784313
		translate_acc: 0.7816091954022989
		shear_acc: 0.5897435897435898
		rotate_acc: 0.5833333333333334
		scale_acc: 0.7666666666666667
>>>>>>>>>>va metrics:
		overall_acc: 0.6733333333333333
		no_acc: 0.6733333333333333
		easy_acc: 0.7352941176470589
		medium_acc: 0.6041666666666666
		hard_acc: 0.6470588235294118
		1steps_acc: 0.6372549019607843
		2steps_acc: 0.75
		translate_acc: 0.8157894736842105
		shear_acc: 0.49019607843137253
		rotate_acc: 0.5925925925925926
		scale_acc: 0.7241379310344828
>>>>>>>>>>va metrics:
		overall_acc: 0.7026143790849673
		no_acc: 0.7026143790849673
		easy_acc: 0.7352941176470589
		medium_acc: 0.6764705882352942
		hard_acc: 0.696078431372549
		1steps_acc: 0.6470588235294118
		2steps_a

In [15]:
calc_tranformation_acc("claude-3-5-sonnet-20241022", "3d", visim=False, debug=False)


>>>>>>>>>>va metrics:
		overall_acc: 0.545751633986928
		no_acc: 0.545751633986928
		easy_acc: 0.5490196078431373
		medium_acc: 0.5588235294117647
		hard_acc: 0.5294117647058824
		1steps_acc: 0.4803921568627451
		2steps_acc: 0.5784313725490197
		3steps_acc: 0.5784313725490197
		translate_acc: 0.6264367816091954
		shear_acc: 0.452991452991453
		rotate_acc: 0.5520833333333334
		scale_acc: 0.6333333333333333
>>>>>>>>>>va metrics:
		overall_acc: 0.5196078431372549
		no_acc: 0.5196078431372549
		easy_acc: 0.5490196078431373
		medium_acc: 0.5294117647058824
		hard_acc: 0.4803921568627451
		1steps_acc: 0.4215686274509804
		2steps_acc: 0.5686274509803921
		3steps_acc: 0.5686274509803921
		translate_acc: 0.6264367816091954
		shear_acc: 0.4700854700854701
		rotate_acc: 0.4479166666666667
		scale_acc: 0.6333333333333333
File not found: /mnt/petrelfs/songmingyang/code/reasoning/others/stare_open/vissim_eval/scripts/results/close_source/evals2/claude-3-5-sonnet-20241022_3d_va_test_2.json.npy
>>>>>>

In [16]:
calc_tranformation_acc("gemini-2.0-flash-thinking-exp-01-21", "3d", visim=False, debug=False)

>>>>>>>>>>va metrics:
		overall_acc: 0.5424836601307189
		no_acc: 0.5424836601307189
		easy_acc: 0.5588235294117647
		medium_acc: 0.5392156862745098
		hard_acc: 0.5294117647058824
		1steps_acc: 0.4803921568627451
		2steps_acc: 0.5588235294117647
		3steps_acc: 0.5882352941176471
		translate_acc: 0.6264367816091954
		shear_acc: 0.49572649572649574
		rotate_acc: 0.40625
		scale_acc: 0.6444444444444445
>>>>>>>>>>va metrics:
		overall_acc: 0.5032679738562091
		no_acc: 0.5032679738562091
		easy_acc: 0.5098039215686274
		medium_acc: 0.5196078431372549
		hard_acc: 0.4803921568627451
		1steps_acc: 0.4803921568627451
		2steps_acc: 0.5
		3steps_acc: 0.5294117647058824
		translate_acc: 0.6149425287356322
		shear_acc: 0.38461538461538464
		rotate_acc: 0.4166666666666667
		scale_acc: 0.5666666666666667
>>>>>>>>>>va metrics:
		overall_acc: 0.5424836601307189
		no_acc: 0.5424836601307189
		easy_acc: 0.5686274509803921
		medium_acc: 0.5588235294117647
		hard_acc: 0.5
		1steps_acc: 0.47058823529411764
	

In [20]:
calc_tranformation_acc(model="gpt-4o", task="3d", visim=True)

>>>>>>>>>>va metrics:
		overall_acc: 0.7352941176470589
		all_acc: 0.7156862745098039
		partial_acc: 0.7745098039215687
		easy_acc: 0.8088235294117647
		medium_acc: 0.75
		hard_acc: 0.5882352941176471
		2steps_acc: 0.7058823529411765
		3steps_acc: 0.7254901960784313
		translate_acc: 0.7596899224806202
		scale_acc: 0.782051282051282
		rotate_acc: 0.6428571428571429
		shear_acc: 0.6428571428571429
>>>>>>>>>>text_instruct metrics:
		overall_acc: 0.6642156862745098
		all w ans_acc: 0.6797385620915033
		all_acc: 0.6519607843137255
		all last_acc: 0.6519607843137255
		partial_acc: 0.6666666666666666
		easy_acc: 0.75
		medium_acc: 0.6470588235294118
		hard_acc: 0.5588235294117647
		2steps_acc: 0.6176470588235294
		3steps_acc: 0.6862745098039216
		shear_acc: 0.5466666666666666
		translate_acc: 0.6851851851851852
		rotate_acc: 0.6262626262626263
		scale_acc: 0.7526881720430108
>>>>>>>>>>Overall metrics:
overall_acc: 0.6997549019607843
all_acc: 0.6838235294117647
partial_acc: 0.7205882352941176


In [18]:
calc_tranformation_acc("claude-3-5-sonnet-20241022", "3d", visim=True, debug=False)

>>>>>>>>>>va metrics:
		overall_acc: 0.5784313725490197
		all_acc: 0.5735294117647058
		partial_acc: 0.5882352941176471
		easy_acc: 0.6764705882352942
		medium_acc: 0.5294117647058824
		hard_acc: 0.5147058823529411
		2steps_acc: 0.5686274509803921
		3steps_acc: 0.5784313725490197
		translate_acc: 0.5968992248062015
		scale_acc: 0.6923076923076923
		rotate_acc: 0.5119047619047619
		shear_acc: 0.5952380952380952
>>>>>>>>>>text_instruct metrics:
		overall_acc: 0.5625
		all w ans_acc: 0.5686274509803921
		all_acc: 0.5833333333333334
		all last_acc: 0.5588235294117647
		partial_acc: 0.5098039215686274
		easy_acc: 0.6617647058823529
		medium_acc: 0.5735294117647058
		hard_acc: 0.5147058823529411
		2steps_acc: 0.6568627450980392
		3steps_acc: 0.5098039215686274
		shear_acc: 0.48
		translate_acc: 0.5925925925925926
		rotate_acc: 0.5555555555555556
		scale_acc: 0.6451612903225806
>>>>>>>>>>Overall metrics:
overall_acc: 0.5704656862745099
all_acc: 0.5784313725490196
partial_acc: 0.54901960784313

In [19]:
calc_tranformation_acc("gemini-2.0-flash-thinking-exp-01-21", "3d", visim=True, debug=False)

>>>>>>>>>>va metrics:
		overall_acc: 0.5751633986928104
		all_acc: 0.5833333333333334
		partial_acc: 0.5588235294117647
		easy_acc: 0.7205882352941176
		medium_acc: 0.5441176470588235
		hard_acc: 0.4852941176470588
		2steps_acc: 0.5588235294117647
		3steps_acc: 0.6078431372549019
		translate_acc: 0.6666666666666666
		scale_acc: 0.6025641025641025
		rotate_acc: 0.5
		shear_acc: 0.47619047619047616
>>>>>>>>>>text_instruct metrics:
		overall_acc: 0.5343137254901961
		all w ans_acc: 0.5620915032679739
		all_acc: 0.5392156862745098
		all last_acc: 0.5294117647058824
		partial_acc: 0.45098039215686275
		easy_acc: 0.6323529411764706
		medium_acc: 0.4852941176470588
		hard_acc: 0.5
		2steps_acc: 0.5392156862745098
		3steps_acc: 0.5392156862745098
		shear_acc: 0.4666666666666667
		translate_acc: 0.5925925925925926
		rotate_acc: 0.48484848484848486
		scale_acc: 0.5913978494623656
>>>>>>>>>>Overall metrics:
overall_acc: 0.5547385620915033
all_acc: 0.5612745098039216
partial_acc: 0.504901960784313