In [7]:
import pandas as pd
import pandas
from functools import partial
from _utils import calculate_accuracy
import glob
import os
import re
from pathlib import Path
from collections import defaultdict
from tqdm.auto import tqdm
import traceback

In [8]:
import sys
sys.path.insert(0, '/home/y-guo/self-ensemble/self-ensemble')
sys.path.insert(0, '/home/y-guo/self-ensemble/self-ensemble/notebook')

# Import utils from notebook directory (has take_until_punct_or_space)
import utils

In [None]:
# TODO: For every model，For every perprompt.feather, calculate every (TT TF FT FF) with same shots and save to a new file for example, for /home/xzhao/workspace/GYB_self-ensemble/datasets/myriadlama/llama3.2_1b_it/baseline_per_prompt.0shots.feather, let it be tpath and calculate the (TT TF FT FF), here let /home/xzhao/workspace/GYB_self-ensemble/datasets/myriadlama/llama3.2_1b_it/myriadlama.logits.avg.0fshots.5samples.5paras.feather , vscode-remote://ssh-remote%2Btokyo106/home/xzhao/workspace/GYB_self-ensemble/datasets/myriadlama/llama3.2_1b_it/myriadlama.logits.avg.avglayer.layer12.alpha100.token-last.multilayer.0fshots.5samples.5paras.feather and so on be t2path, note thet the shots (shown in file neme) should be the same. read /home/xzhao/workspace/GYB_self-ensemble/datasets/myriadlama for file structure

In [172]:
tpath = f"/home/xzhao/workspace/GYB_self-ensemble/datasets/myriadlama/llama3.2_3b/baseline_per_prompt.0shots.feather"
df_perprompt = pandas.read_feather(tpath)

In [171]:
t2path = f"/home/xzhao/workspace/GYB_self-ensemble/datasets/myriadlama/llama3.2_3b/myriadlama.logits.avg.0fshots.5samples.5paras.feather"
df_logit = pandas.read_feather(t2path)


In [9]:
def TF(generations, _gold_answers):
    generations = utils.take_until_punct_or_space(generations)
    if len(generations) == 0:
        return False
    return utils.partial_match(generations, _gold_answers, True)

In [10]:
# Configuration
BASE_PATH = "/home/y-guo/self-ensemble/myriadlama"
OUTPUT_BASE = "/home/y-guo/self-ensemble/analyzeResults"

# Create output directory if it doesn't exist
os.makedirs(OUTPUT_BASE, exist_ok=True)

In [11]:
def extract_shot_value(filename):
    """Extract shot value from baseline or myriadlama filename.
    
    Args:
        filename: Name of the file (not full path)
        
    Returns:
        Shot value as integer, or None if not found
    """
    # Baseline pattern: baseline_per_prompt.{X}shots.feather
    baseline_match = re.search(r'baseline_per_prompt\.(\d+)shots\.feather', filename)
    if baseline_match:
        return int(baseline_match.group(1))
    
    # Myriadlama pattern with explicit shots: *.{X}fshots.*
    fshots_match = re.search(r'\.(\d+)fshots\.', filename)
    if fshots_match:
        return int(fshots_match.group(1))
    
    # Myriadlama pattern without shots (default to 5 shots)
    # Pattern: myriadlama.*.5samples.*.feather (no fshots)
    if filename.startswith('myriadlama.') and 'fshots' not in filename:
        # Check if it has samples pattern (indicating it's a result file)
        if re.search(r'\d+samples\.', filename):
            return 5
    
    return None


def match_files_by_shot(model_dir):
    """Match baseline and myriadlama files by shot count.
    
    Args:
        model_dir: Path to model directory
        
    Returns:
        Dictionary mapping shot_count -> {'baseline': Path or None, 'myriadlama': [Path, ...]}
    """
    model_dir = Path(model_dir)
    
    # Group files by shot count
    files_by_shot = defaultdict(lambda: {'baseline': None, 'myriadlama': []})
    
    for file_path in model_dir.glob('*.feather'):
        filename = file_path.name
        shot_value = extract_shot_value(filename)
        
        if shot_value is None:
            continue
        
        if filename.startswith('baseline_per_prompt.'):
            files_by_shot[shot_value]['baseline'] = file_path
        elif filename.startswith('myriadlama.'):
            files_by_shot[shot_value]['myriadlama'].append(file_path)
    
    return dict(files_by_shot)


def extract_method_name(filepath):
    """Extract method name from myriadlama filename (keeping full detail).
    
    Args:
        filepath: Path object or string
        
    Returns:
        Method name string
    """
    filename = Path(filepath).name
    # Remove 'myriadlama.' prefix and '.feather' suffix
    method = filename.replace('myriadlama.', '').replace('.feather', '')
    return method

In [12]:
def calculate_ttff(baseline_path, method_path):
    """Calculate TT, TF, FT, FF metrics comparing baseline and method results.
    
    Args:
        baseline_path: Path to baseline_per_prompt.*.feather file
        method_path: Path to myriadlama.*.feather file
        
    Returns:
        Dictionary with keys: TT, TF, FT, FF, total_samples, status, error
    """
    try:
        # Load files
        df_baseline = pd.read_feather(baseline_path)
        df_method = pd.read_feather(method_path)
        
        # Check required columns
        required_baseline_cols = ['paraphrase', 'generation_lemmas', 'answer_lemmas']
        required_method_cols = ['paraphrases', 'generation_lemmas', 'answer_lemmas']
        
        for col in required_baseline_cols:
            if col not in df_baseline.columns:
                return {
                    'TT': None, 'TF': None, 'FT': None, 'FF': None,
                    'total_samples': 0, 'status': 'error',
                    'error': f'Missing column {col} in baseline'
                }
        
        for col in required_method_cols:
            if col not in df_method.columns:
                return {
                    'TT': None, 'TF': None, 'FT': None, 'FF': None,
                    'total_samples': 0, 'status': 'error',
                    'error': f'Missing column {col} in method file'
                }
        
        # Build paraphrase lookup dictionary from baseline
        paraphrase_dict = defaultdict(list)
        for idx, row in df_baseline.iterrows():
            paraphrase_dict[row["paraphrase"]].append({
                "generation_lemmas": row["generation_lemmas"],
                "answer_lemmas": row["answer_lemmas"]
            })
        
        # Calculate TT, TF, FT, FF
        countBA_TT = 0
        countBA_TF = 0
        countBA_FT = 0
        countBA_FF = 0
        
        for paras, gen_lemmas, ans_lemmas in zip(
            df_method["paraphrases"], 
            df_method["generation_lemmas"], 
            df_method["answer_lemmas"]):
            
            # Check if ANY paraphrase in baseline was correct
            pp_anymatch = False
            for para in paras:
                if para in paraphrase_dict:
                    for item in paraphrase_dict[para]:
                        if TF(item["generation_lemmas"], item["answer_lemmas"]):
                            pp_anymatch = True
                            break
                if pp_anymatch:
                    break
            
            # Check if current method is correct
            current_match = TF(gen_lemmas, ans_lemmas)
            
            # Update counts
            countBA_TT += int(pp_anymatch and current_match)
            countBA_TF += int(pp_anymatch and (not current_match))
            countBA_FT += int((not pp_anymatch) and current_match)
            countBA_FF += int((not pp_anymatch) and (not current_match))
        
        total = countBA_TT + countBA_TF + countBA_FT + countBA_FF
        
        return {
            'TT': countBA_TT,
            'TF': countBA_TF,
            'FT': countBA_FT,
            'FF': countBA_FF,
            'total_samples': total,
            'status': 'success',
            'error': None
        }
        
    except Exception as e:
        return {
            'TT': None, 'TF': None, 'FT': None, 'FF': None,
            'total_samples': 0, 'status': 'error',
            'error': str(e)
        }

In [13]:
def process_model(model_name, model_dir, output_dir):
    """Process all shots and methods for a single model.
    
    Args:
        model_name: Name of the model
        model_dir: Path to model directory containing feather files
        output_dir: Path to output directory for results
        
    Returns:
        DataFrame with all results for this model
    """
    print(f"\n{'='*80}")
    print(f"Processing model: {model_name}")
    print(f"{'='*80}")
    
    # Match files by shot count
    files_by_shot = match_files_by_shot(model_dir)
    
    if not files_by_shot:
        print(f"  No files found for {model_name}")
        return None
    
    results = []
    total_processed = 0
    total_incomplete = 0
    total_errors = 0
    
    # Process each shot configuration
    for shot_count in sorted(files_by_shot.keys()):
        shot_data = files_by_shot[shot_count]
        baseline_file = shot_data['baseline']
        method_files = shot_data['myriadlama']
        
        print(f"\n  Shot count: {shot_count}")
        print(f"    Baseline: {baseline_file.name if baseline_file else 'MISSING'}")
        print(f"    Method files: {len(method_files)}")
        
        # Process each method file
        for method_file in tqdm(method_files, desc=f"    Processing {shot_count}-shot methods", leave=False):
            method_name = extract_method_name(method_file)
            
            if baseline_file is None:
                # No baseline - mark as incomplete
                results.append({
                    'model': model_name,
                    'shot_count': shot_count,
                    'method_name': method_name,
                    'baseline_file': 'MISSING',
                    'method_file': method_file.name,
                    'TT': None,
                    'TF': None,
                    'FT': None,
                    'FF': None,
                    'total_samples': None,
                    'baseline_acc': None,
                    'method_acc': None,
                    'improvement': None,
                    'status': 'incomplete',
                    'error': 'No baseline file found for this shot count'
                })
                total_incomplete += 1
            else:
                # Calculate TTFF metrics
                ttff = calculate_ttff(baseline_file, method_file)
                
                if ttff['status'] == 'success':
                    # Calculate accuracies
                    total = ttff['total_samples']
                    baseline_acc = (ttff['TT'] + ttff['TF']) / total if total > 0 else 0
                    method_acc = (ttff['TT'] + ttff['FT']) / total if total > 0 else 0
                    improvement = method_acc - baseline_acc
                    
                    results.append({
                        'model': model_name,
                        'shot_count': shot_count,
                        'method_name': method_name,
                        'baseline_file': baseline_file.name,
                        'method_file': method_file.name,
                        'TT': ttff['TT'],
                        'TF': ttff['TF'],
                        'FT': ttff['FT'],
                        'FF': ttff['FF'],
                        'total_samples': ttff['total_samples'],
                        'baseline_acc': f"{baseline_acc:.4f}",
                        'method_acc': f"{method_acc:.4f}",
                        'improvement': f"{improvement:.4f}",
                        'status': 'success',
                        'error': None
                    })
                    total_processed += 1
                else:
                    # Error during calculation
                    results.append({
                        'model': model_name,
                        'shot_count': shot_count,
                        'method_name': method_name,
                        'baseline_file': baseline_file.name,
                        'method_file': method_file.name,
                        'TT': None,
                        'TF': None,
                        'FT': None,
                        'FF': None,
                        'total_samples': None,
                        'baseline_acc': None,
                        'method_acc': None,
                        'improvement': None,
                        'status': 'error',
                        'error': ttff['error']
                    })
                    total_errors += 1
    
    print(f"\n  Summary for {model_name}:")
    print(f"    ✓ Successfully processed: {total_processed}")
    print(f"    ⚠ Incomplete (no baseline): {total_incomplete}")
    print(f"    ✗ Errors: {total_errors}")
    print(f"    Total results: {len(results)}")
    
    # Convert to DataFrame
    df_results = pd.DataFrame(results)
    
    # Save results
    model_output_dir = Path(output_dir) / model_name
    model_output_dir.mkdir(parents=True, exist_ok=True)
    
    output_file = model_output_dir / "ttff_results.csv"
    df_results.to_csv(output_file, index=False)
    print(f"    Saved to: {output_file}")
    
    # Also save as feather for faster loading
    output_feather = model_output_dir / "ttff_results.feather"
    df_results.to_feather(output_feather)
    print(f"    Saved to: {output_feather}")
    
    return df_results

In [14]:
# Test on one model first
test_model = "llama3.2_3b"
test_model_dir = Path(BASE_PATH) / test_model

if test_model_dir.exists():
    df_test = process_model(test_model, test_model_dir, OUTPUT_BASE)
    if df_test is not None:
        print(f"\nFirst 5 results:")
        display(df_test.head())
else:
    print(f"Test model directory not found: {test_model_dir}")


Processing model: llama3.2_3b

  Shot count: 0
    Baseline: baseline_per_prompt.0shots.feather
    Method files: 27


    Processing 0-shot methods:   0%|          | 0/27 [00:00<?, ?it/s]


  Shot count: 1
    Baseline: baseline_per_prompt.1shots.feather
    Method files: 7


    Processing 1-shot methods:   0%|          | 0/7 [00:00<?, ?it/s]


  Shot count: 2
    Baseline: baseline_per_prompt.2shots.feather
    Method files: 12


    Processing 2-shot methods:   0%|          | 0/12 [00:00<?, ?it/s]


  Shot count: 3
    Baseline: baseline_per_prompt.3shots.feather
    Method files: 9


    Processing 3-shot methods:   0%|          | 0/9 [00:00<?, ?it/s]


  Shot count: 4
    Baseline: baseline_per_prompt.4shots.feather
    Method files: 12


    Processing 4-shot methods:   0%|          | 0/12 [00:00<?, ?it/s]


  Shot count: 5
    Baseline: baseline_per_prompt.5shots.feather
    Method files: 51


    Processing 5-shot methods:   0%|          | 0/51 [00:00<?, ?it/s]


  Shot count: 6
    Baseline: baseline_per_prompt.6shots.feather
    Method files: 9


    Processing 6-shot methods:   0%|          | 0/9 [00:00<?, ?it/s]


  Shot count: 7
    Baseline: baseline_per_prompt.7shots.feather
    Method files: 2


    Processing 7-shot methods:   0%|          | 0/2 [00:00<?, ?it/s]


  Shot count: 8
    Baseline: baseline_per_prompt.8shots.feather
    Method files: 8


    Processing 8-shot methods:   0%|          | 0/8 [00:00<?, ?it/s]


  Shot count: 10
    Baseline: MISSING
    Method files: 5


    Processing 10-shot methods:   0%|          | 0/5 [00:00<?, ?it/s]


  Summary for llama3.2_3b:
    ✓ Successfully processed: 137
    ⚠ Incomplete (no baseline): 5
    ✗ Errors: 0
    Total results: 142
    Saved to: /home/y-guo/self-ensemble/analyzeResults/llama3.2_3b/ttff_results.csv
    Saved to: /home/y-guo/self-ensemble/analyzeResults/llama3.2_3b/ttff_results.feather

First 5 results:


Unnamed: 0,model,shot_count,method_name,baseline_file,method_file,TT,TF,FT,FF,total_samples,baseline_acc,method_acc,improvement,status,error
0,llama3.2_3b,0,modifyattn.modifyrope.singleparaqapair.0fshots...,baseline_per_prompt.0shots.feather,myriadlama.modifyattn.modifyrope.singleparaqap...,4023.0,1997.0,198.0,3782.0,10000.0,0.602,0.4221,-0.1799,success,
1,llama3.2_3b,0,modifyattn.modifyrope.scalescore20.singleparaq...,baseline_per_prompt.0shots.feather,myriadlama.modifyattn.modifyrope.scalescore20....,3472.0,2548.0,156.0,3824.0,10000.0,0.602,0.3628,-0.2392,success,
2,llama3.2_3b,0,logits.max.avglayer.layer21.alpha100.token-las...,baseline_per_prompt.0shots.feather,myriadlama.logits.max.avglayer.layer21.alpha10...,4307.0,1713.0,166.0,3814.0,10000.0,0.602,0.4473,-0.1547,success,
3,llama3.2_3b,0,logits.avg.avglayer.layer1.alpha100.token-last...,baseline_per_prompt.0shots.feather,myriadlama.logits.avg.avglayer.layer1.alpha100...,4221.0,1799.0,166.0,3814.0,10000.0,0.602,0.4387,-0.1633,success,
4,llama3.2_3b,0,modifyrope.singleparaqapair.0fshots.5samples.5...,baseline_per_prompt.0shots.feather,myriadlama.modifyrope.singleparaqapair.0fshots...,2919.0,3101.0,176.0,3804.0,10000.0,0.602,0.3095,-0.2925,success,


In [15]:
def process_all_models(base_path, output_base):
    """Process all models in the base path directory.
    
    Args:
        base_path: Path to directory containing model subdirectories
        output_base: Path to output directory for results
    """
    base_path = Path(base_path)
    
    # Find all model directories that have baseline files
    model_dirs = []
    for item in base_path.iterdir():
        if item.is_dir():
            # Check if directory has any baseline files
            baseline_files = list(item.glob('baseline_per_prompt.*.feather'))
            if baseline_files:
                model_dirs.append(item)
    
    print(f"Found {len(model_dirs)} models with baseline files:")
    for model_dir in sorted(model_dirs):
        print(f"  - {model_dir.name}")
    
    # Process each model
    all_results = []
    for model_dir in tqdm(model_dirs, desc="Processing models"):
        model_name = model_dir.name
        try:
            df_model = process_model(model_name, model_dir, output_base)
            if df_model is not None:
                all_results.append(df_model)
        except Exception as e:
            print(f"\n✗ ERROR processing {model_name}: {e}")
            traceback.print_exc()
    
    # Combine all results
    if all_results:
        df_all = pd.concat(all_results, ignore_index=True)
        
        # Save combined results
        combined_output = Path(output_base) / "all_models_ttff_results.csv"
        df_all.to_csv(combined_output, index=False)
        print(f"\n{'='*80}")
        print(f"Combined results saved to: {combined_output}")
        print(f"Total rows: {len(df_all)}")
        print(f"{'='*80}")
        
        return df_all
    else:
        print("\nNo results to combine.")
        return None

In [None]:
# Run full batch processing for all models
df_all_results = process_all_models(BASE_PATH, OUTPUT_BASE)

Found 12 models with baseline files:
  - llama3.1_8b
  - llama3.1_8b_it
  - llama3.2_1b
  - llama3.2_1b_it
  - llama3.2_3b
  - llama3.2_3b_it
  - qwen2.5_14b
  - qwen2.5_14b_it
  - qwen2.5_3b
  - qwen2.5_3b_it
  - qwen2.5_7b
  - qwen2.5_7b_it


Processing models:   0%|          | 0/12 [00:00<?, ?it/s]


Processing model: qwen2.5_14b_it

  Shot count: 0
    Baseline: baseline_per_prompt.0shots.feather
    Method files: 5


    Processing 0-shot methods:   0%|          | 0/5 [00:00<?, ?it/s]


  Shot count: 1
    Baseline: baseline_per_prompt.1shots.feather
    Method files: 1


    Processing 1-shot methods:   0%|          | 0/1 [00:00<?, ?it/s]


  Shot count: 2
    Baseline: baseline_per_prompt.2shots.feather
    Method files: 1


    Processing 2-shot methods:   0%|          | 0/1 [00:00<?, ?it/s]


  Shot count: 3
    Baseline: baseline_per_prompt.3shots.feather
    Method files: 7


    Processing 3-shot methods:   0%|          | 0/7 [00:00<?, ?it/s]


  Shot count: 4
    Baseline: baseline_per_prompt.4shots.feather
    Method files: 1


    Processing 4-shot methods:   0%|          | 0/1 [00:00<?, ?it/s]


  Shot count: 5
    Baseline: baseline_per_prompt.5shots.feather
    Method files: 37


    Processing 5-shot methods:   0%|          | 0/37 [00:00<?, ?it/s]


  Shot count: 6
    Baseline: baseline_per_prompt.6shots.feather
    Method files: 0


    Processing 6-shot methods: 0it [00:00, ?it/s]


  Shot count: 8
    Baseline: baseline_per_prompt.8shots.feather
    Method files: 0


    Processing 8-shot methods: 0it [00:00, ?it/s]


  Summary for qwen2.5_14b_it:
    ✓ Successfully processed: 52
    ⚠ Incomplete (no baseline): 0
    ✗ Errors: 0
    Total results: 52
    Saved to: /home/y-guo/self-ensemble/analyzeResults/qwen2.5_14b_it/ttff_results.csv
    Saved to: /home/y-guo/self-ensemble/analyzeResults/qwen2.5_14b_it/ttff_results.feather

Processing model: llama3.2_1b_it

  Shot count: 0
    Baseline: baseline_per_prompt.0shots.feather
    Method files: 13


    Processing 0-shot methods:   0%|          | 0/13 [00:00<?, ?it/s]


  Shot count: 1
    Baseline: baseline_per_prompt.1shots.feather
    Method files: 6


    Processing 1-shot methods:   0%|          | 0/6 [00:00<?, ?it/s]


  Shot count: 2
    Baseline: baseline_per_prompt.2shots.feather
    Method files: 12


    Processing 2-shot methods:   0%|          | 0/12 [00:00<?, ?it/s]


  Shot count: 3
    Baseline: baseline_per_prompt.3shots.feather
    Method files: 8


    Processing 3-shot methods:   0%|          | 0/8 [00:00<?, ?it/s]


  Shot count: 4
    Baseline: baseline_per_prompt.4shots.feather
    Method files: 12


    Processing 4-shot methods:   0%|          | 0/12 [00:00<?, ?it/s]


  Shot count: 5
    Baseline: baseline_per_prompt.5shots.feather
    Method files: 64


    Processing 5-shot methods:   0%|          | 0/64 [00:00<?, ?it/s]


  Shot count: 6
    Baseline: baseline_per_prompt.6shots.feather
    Method files: 9


    Processing 6-shot methods:   0%|          | 0/9 [00:00<?, ?it/s]


  Shot count: 8
    Baseline: baseline_per_prompt.8shots.feather
    Method files: 8


    Processing 8-shot methods:   0%|          | 0/8 [00:00<?, ?it/s]


  Shot count: 10
    Baseline: MISSING
    Method files: 8


    Processing 10-shot methods:   0%|          | 0/8 [00:00<?, ?it/s]


  Summary for llama3.2_1b_it:
    ✓ Successfully processed: 132
    ⚠ Incomplete (no baseline): 8
    ✗ Errors: 0
    Total results: 140
    Saved to: /home/y-guo/self-ensemble/analyzeResults/llama3.2_1b_it/ttff_results.csv
    Saved to: /home/y-guo/self-ensemble/analyzeResults/llama3.2_1b_it/ttff_results.feather

Processing model: qwen2.5_3b_it

  Shot count: 0
    Baseline: baseline_per_prompt.0shots.feather
    Method files: 13


    Processing 0-shot methods:   0%|          | 0/13 [00:00<?, ?it/s]


  Shot count: 1
    Baseline: baseline_per_prompt.1shots.feather
    Method files: 6


    Processing 1-shot methods:   0%|          | 0/6 [00:00<?, ?it/s]


  Shot count: 2
    Baseline: baseline_per_prompt.2shots.feather
    Method files: 11


    Processing 2-shot methods:   0%|          | 0/11 [00:00<?, ?it/s]


  Shot count: 3
    Baseline: baseline_per_prompt.3shots.feather
    Method files: 7


    Processing 3-shot methods:   0%|          | 0/7 [00:00<?, ?it/s]


  Shot count: 4
    Baseline: baseline_per_prompt.4shots.feather
    Method files: 9


    Processing 4-shot methods:   0%|          | 0/9 [00:00<?, ?it/s]


  Shot count: 5
    Baseline: baseline_per_prompt.5shots.feather
    Method files: 69


    Processing 5-shot methods:   0%|          | 0/69 [00:00<?, ?it/s]


  Shot count: 6
    Baseline: baseline_per_prompt.6shots.feather
    Method files: 3


    Processing 6-shot methods:   0%|          | 0/3 [00:00<?, ?it/s]


  Shot count: 8
    Baseline: baseline_per_prompt.8shots.feather
    Method files: 3


    Processing 8-shot methods:   0%|          | 0/3 [00:00<?, ?it/s]


  Shot count: 10
    Baseline: MISSING
    Method files: 3


    Processing 10-shot methods:   0%|          | 0/3 [00:00<?, ?it/s]


  Summary for qwen2.5_3b_it:
    ✓ Successfully processed: 121
    ⚠ Incomplete (no baseline): 3
    ✗ Errors: 0
    Total results: 124
    Saved to: /home/y-guo/self-ensemble/analyzeResults/qwen2.5_3b_it/ttff_results.csv
    Saved to: /home/y-guo/self-ensemble/analyzeResults/qwen2.5_3b_it/ttff_results.feather

Processing model: llama3.2_1b

  Shot count: 0
    Baseline: baseline_per_prompt.0shots.feather
    Method files: 19


    Processing 0-shot methods:   0%|          | 0/19 [00:00<?, ?it/s]


  Shot count: 1
    Baseline: baseline_per_prompt.1shots.feather
    Method files: 8


    Processing 1-shot methods:   0%|          | 0/8 [00:00<?, ?it/s]


  Shot count: 2
    Baseline: baseline_per_prompt.2shots.feather
    Method files: 12


    Processing 2-shot methods:   0%|          | 0/12 [00:00<?, ?it/s]


  Shot count: 3
    Baseline: baseline_per_prompt.3shots.feather
    Method files: 10


    Processing 3-shot methods:   0%|          | 0/10 [00:00<?, ?it/s]


  Shot count: 4
    Baseline: baseline_per_prompt.4shots.feather
    Method files: 12


    Processing 4-shot methods:   0%|          | 0/12 [00:00<?, ?it/s]


  Shot count: 5
    Baseline: baseline_per_prompt.5shots.feather
    Method files: 60


    Processing 5-shot methods:   0%|          | 0/60 [00:00<?, ?it/s]


  Shot count: 6
    Baseline: baseline_per_prompt.6shots.feather
    Method files: 9


    Processing 6-shot methods:   0%|          | 0/9 [00:00<?, ?it/s]


  Shot count: 7
    Baseline: baseline_per_prompt.7shots.feather
    Method files: 3


    Processing 7-shot methods:   0%|          | 0/3 [00:00<?, ?it/s]


  Shot count: 8
    Baseline: baseline_per_prompt.8shots.feather
    Method files: 9


    Processing 8-shot methods:   0%|          | 0/9 [00:00<?, ?it/s]


  Shot count: 9
    Baseline: baseline_per_prompt.9shots.feather
    Method files: 3


    Processing 9-shot methods:   0%|          | 0/3 [00:00<?, ?it/s]


  Shot count: 10
    Baseline: MISSING
    Method files: 9


    Processing 10-shot methods:   0%|          | 0/9 [00:00<?, ?it/s]


  Summary for llama3.2_1b:
    ✓ Successfully processed: 145
    ⚠ Incomplete (no baseline): 9
    ✗ Errors: 0
    Total results: 154
    Saved to: /home/y-guo/self-ensemble/analyzeResults/llama3.2_1b/ttff_results.csv
    Saved to: /home/y-guo/self-ensemble/analyzeResults/llama3.2_1b/ttff_results.feather

Processing model: qwen2.5_3b

  Shot count: 0
    Baseline: baseline_per_prompt.0shots.feather
    Method files: 20


    Processing 0-shot methods:   0%|          | 0/20 [00:00<?, ?it/s]


  Shot count: 1
    Baseline: baseline_per_prompt.1shots.feather
    Method files: 6


    Processing 1-shot methods:   0%|          | 0/6 [00:00<?, ?it/s]


  Shot count: 2
    Baseline: baseline_per_prompt.2shots.feather
    Method files: 11


    Processing 2-shot methods:   0%|          | 0/11 [00:00<?, ?it/s]


  Shot count: 3
    Baseline: baseline_per_prompt.3shots.feather
    Method files: 7


    Processing 3-shot methods:   0%|          | 0/7 [00:00<?, ?it/s]


  Shot count: 4
    Baseline: baseline_per_prompt.4shots.feather
    Method files: 11


    Processing 4-shot methods:   0%|          | 0/11 [00:00<?, ?it/s]


  Shot count: 5
    Baseline: baseline_per_prompt.5shots.feather
    Method files: 40


    Processing 5-shot methods:   0%|          | 0/40 [00:00<?, ?it/s]


  Shot count: 6
    Baseline: baseline_per_prompt.6shots.feather
    Method files: 9


    Processing 6-shot methods:   0%|          | 0/9 [00:00<?, ?it/s]


  Shot count: 8
    Baseline: baseline_per_prompt.8shots.feather
    Method files: 9


    Processing 8-shot methods:   0%|          | 0/9 [00:00<?, ?it/s]


  Shot count: 10
    Baseline: MISSING
    Method files: 8


    Processing 10-shot methods:   0%|          | 0/8 [00:00<?, ?it/s]


  Summary for qwen2.5_3b:
    ✓ Successfully processed: 90
    ⚠ Incomplete (no baseline): 8
    ✗ Errors: 23
    Total results: 121
    Saved to: /home/y-guo/self-ensemble/analyzeResults/qwen2.5_3b/ttff_results.csv
    Saved to: /home/y-guo/self-ensemble/analyzeResults/qwen2.5_3b/ttff_results.feather

Processing model: qwen2.5_7b

  Shot count: 0
    Baseline: baseline_per_prompt.0shots.feather
    Method files: 8


    Processing 0-shot methods:   0%|          | 0/8 [00:00<?, ?it/s]


  Shot count: 1
    Baseline: baseline_per_prompt.1shots.feather
    Method files: 5


    Processing 1-shot methods:   0%|          | 0/5 [00:00<?, ?it/s]


  Shot count: 2
    Baseline: baseline_per_prompt.2shots.feather
    Method files: 5


    Processing 2-shot methods:   0%|          | 0/5 [00:00<?, ?it/s]


  Shot count: 3
    Baseline: baseline_per_prompt.3shots.feather
    Method files: 7


    Processing 3-shot methods:   0%|          | 0/7 [00:00<?, ?it/s]


  Shot count: 4
    Baseline: baseline_per_prompt.4shots.feather
    Method files: 5


    Processing 4-shot methods:   0%|          | 0/5 [00:00<?, ?it/s]


  Shot count: 5
    Baseline: baseline_per_prompt.5shots.feather
    Method files: 38


    Processing 5-shot methods:   0%|          | 0/38 [00:00<?, ?it/s]


  Shot count: 6
    Baseline: baseline_per_prompt.6shots.feather
    Method files: 0


    Processing 6-shot methods: 0it [00:00, ?it/s]


  Shot count: 8
    Baseline: baseline_per_prompt.8shots.feather
    Method files: 0


    Processing 8-shot methods: 0it [00:00, ?it/s]


  Summary for qwen2.5_7b:
    ✓ Successfully processed: 68
    ⚠ Incomplete (no baseline): 0
    ✗ Errors: 0
    Total results: 68
    Saved to: /home/y-guo/self-ensemble/analyzeResults/qwen2.5_7b/ttff_results.csv
    Saved to: /home/y-guo/self-ensemble/analyzeResults/qwen2.5_7b/ttff_results.feather

Processing model: qwen2.5_7b_it

  Shot count: 0
    Baseline: baseline_per_prompt.0shots.feather
    Method files: 11


    Processing 0-shot methods:   0%|          | 0/11 [00:00<?, ?it/s]


  Shot count: 1
    Baseline: baseline_per_prompt.1shots.feather
    Method files: 5


    Processing 1-shot methods:   0%|          | 0/5 [00:00<?, ?it/s]


  Shot count: 2
    Baseline: baseline_per_prompt.2shots.feather
    Method files: 5


    Processing 2-shot methods:   0%|          | 0/5 [00:00<?, ?it/s]


  Shot count: 3
    Baseline: baseline_per_prompt.3shots.feather
    Method files: 7


    Processing 3-shot methods:   0%|          | 0/7 [00:00<?, ?it/s]

In [173]:
# compute TT, TF, FT, FF
from collections import defaultdict

paraphrase_dict = defaultdict(list)

for idx, row in df_perprompt.iterrows():
    paraphrase_dict[row["paraphrase"]].append({
        "generation_lemmas": row["generation_lemmas"],
        "answer_lemmas": row["answer_lemmas"]
    })
countBA_TT = 0
countBA_TF = 0
countBA_FT = 0
countBA_FF = 0

for paras, gen_lemmas, ans_lemmas in zip(df_logit["paraphrases"], df_logit["generation_lemmas"], df_logit["answer_lemmas"]):
    # 直接从字典查找，避免 isin 查询
    items = []
    pp_anymatch = False
    for para in paras:
        if para in paraphrase_dict:
            for item in paraphrase_dict[para]:
                items.append([lemma for lemma in item["generation_lemmas"].tolist()])
                if TF(item["generation_lemmas"], item["answer_lemmas"]):
                    pp_anymatch = True
                    break
        if pp_anymatch:
            break
    
    current_match = TF(gen_lemmas, ans_lemmas)
    countBA_TT += int(pp_anymatch and current_match)
    countBA_TF += int(pp_anymatch and (not current_match))
    countBA_FT += int((not pp_anymatch) and current_match)
    countBA_FF += int((not pp_anymatch) and (not current_match))
    # if (not pp_anymatch) and current_match:
    #     print("No paraphrase match but current match:")
    #     print(gen_lemmas)
    #     print(ans_lemmas)
    #     for item in items:
    #         print(item)        
        

print(f"TT: {countBA_TT}, TF: {countBA_TF}, FT: {countBA_FT}, FF: {countBA_FF}")

TT: 3811, TF: 2209, FT: 100, FF: 3880


In [None]:
print(paraphrase_dict)

[]
