# MyriadLAMA Results Analysis

This notebook provides comprehensive analysis tools for MyriadLAMA generation results.

## Features:
1. **One-click search** for all generation files and baselines
2. **Built-in lemmatization** functionality
3. **Comparison tables** generation
4. **Detailed examples** for in-depth analysis

## Usage:
1. Set the `DATASET_ROOT` variable to your dataset path
2. Run all cells to load results and generate analysis
3. Use the provided functions to explore specific results

## 1. Setup and Imports

In [1]:
import os
import sys
import pandas as pd
import numpy as np
from pathlib import Path
from glob import glob
import warnings
warnings.filterwarnings('ignore')

# Add parent directory to path
sys.path.insert(0, str(Path.cwd().parent))
from utils import partial_match, partial_match_scores

## 2. Configuration

In [2]:
# Configure your dataset path here
DATASET_NAME = "myriadlama"
MODEL_NAME = "qwen2.5_7b_it"  # Change to your model name
DATASET_ROOT = f"/net/tokyo100-10g/data/str01_01/y-guo/datasets/myriadlama/llama3.1_8b/myriadlama_custom_1paras.csv"

# Check if path exists
if not os.path.exists(DATASET_ROOT):
    print(f"‚ö†Ô∏è  Warning: Dataset root not found: {DATASET_ROOT}")
    print(f"   Please update DATASET_ROOT variable")
else:
    print(f"‚úÖ Dataset root found: {DATASET_ROOT}")

‚úÖ Dataset root found: /net/tokyo100-10g/data/str01_01/y-guo/datasets/myriadlama/llama3.1_8b/myriadlama_custom_1paras.csv


## 3. Lemmatization Setup

In [8]:
# Import spacy for lemmatization
try:
    import spacy
    nlp = spacy.load("en_core_web_lg")
    print("‚úÖ Spacy loaded successfully")
except ImportError:
    print("‚ö†Ô∏è  Spacy not found. Install with: pip install spacy")
    print("   Then download model: python -m spacy download en_core_web_lg")
    nlp = None
except OSError:
    print("‚ö†Ô∏è  Spacy model not found. Download with: python -m spacy download en_core_web_lg")
    nlp = None

def lemmatize_text(text):
    """Lemmatize a text string."""
    if nlp is None:
        return text.lower().split()
    doc = nlp(text)
    return [token.lemma_.lower() for token in doc]

def lemmatize_answers(answers):
    """Lemmatize a list of answer strings."""
    return [lemmatize_text(ans) for ans in answers]

‚úÖ Spacy loaded successfully


## 4. File Discovery - One-Click Search for All Results

In [4]:
def discover_all_results(dataset_root):
    """
    Discover all generation result files in the dataset directory.
    
    Returns:
        dict: Dictionary mapping method names to file paths
    """
    results = {}
    
    if not os.path.exists(dataset_root):
        print(f"‚ùå Dataset root not found: {dataset_root}")
        return results
    
    # Search for all .feather files
    feather_files = glob(os.path.join(dataset_root, "*.feather"))
    
    for file_path in feather_files:
        file_name = os.path.basename(file_path)
        method_name = file_name.replace(".feather", "")
        results[method_name] = file_path
    
    return results

# Discover all results
all_results = discover_all_results(DATASET_ROOT)

print(f"\n{'='*70}")
print("Discovered Result Files")
print(f"{'='*70}")
print(f"\nTotal files found: {len(all_results)}\n")

# Categorize results
baselines = [k for k in all_results.keys() if 'baseline' in k]
ensembles = [k for k in all_results.keys() if 'ensemble' in k]
flex_attention = [k for k in all_results.keys() if 'flex_attention' in k]
others = [k for k in all_results.keys() if k not in baselines + ensembles + flex_attention]

if baselines:
    print("üìä Baselines:")
    for method in sorted(baselines):
        print(f"   - {method}")

if ensembles:
    print("\nüîÑ Ensemble Methods:")
    for method in sorted(ensembles):
        print(f"   - {method}")

if flex_attention:
    print("\n‚ö° FlexAttention Methods:")
    for method in sorted(flex_attention):
        print(f"   - {method}")

if others:
    print("\nüìÅ Other Results:")
    for method in sorted(others):
        print(f"   - {method}")

print(f"\n{'='*70}")


Discovered Result Files

Total files found: 0




## 5. Load and Process Results

This cell loads all discovered result files and automatically applies lemmatization to files that don't have it.

**Automatic Lemmatization:**
- Enabled by default for files missing `predict_lemma` or `answer_lemmas` columns
- Requires Spacy to be loaded (see Section 3)
- Set `apply_lemmatization=False` to disable if needed

In [5]:
def load_result_file(file_path, apply_lemmatization=False):
    """
    Load a result file and optionally apply lemmatization.
    
    Args:
        file_path: Path to the .feather file
        apply_lemmatization: Whether to apply lemmatization if not present
    
    Returns:
        DataFrame with results
    """
    df = pd.read_feather(file_path)
    
    # Check if lemmatization is needed
    needs_lemmatization = apply_lemmatization and (
        'predict_lemma' not in df.columns or 
        'answer_lemmas' not in df.columns
    )
    
    if needs_lemmatization and nlp is not None:
        print(f"   Applying lemmatization to {os.path.basename(file_path)}...")
        
        if 'prediction' in df.columns and 'predict_lemma' not in df.columns:
            df['predict_lemma'] = df['prediction'].apply(lemmatize_text)
        
        if 'answers' in df.columns and 'answer_lemmas' not in df.columns:
            df['answer_lemmas'] = df['answers'].apply(lemmatize_answers)
    
    return df

def load_all_results(all_results, apply_lemmatization=False):
    """
    Load all discovered result files.
    
    Args:
        all_results: Dictionary of method names to file paths
        apply_lemmatization: Whether to apply lemmatization
    
    Returns:
        dict: Dictionary mapping method names to DataFrames
    """
    loaded_results = {}
    
    print("\nLoading all result files...")
    for method, file_path in all_results.items():
        try:
            df = load_result_file(file_path, apply_lemmatization)
            loaded_results[method] = df
            print(f"‚úÖ Loaded {method}: {len(df)} samples")
        except Exception as e:
            print(f"‚ùå Error loading {method}: {e}")
    
    return loaded_results

# Load all results (automatic lemmatization enabled by default for files without lemmas)
# Set apply_lemmatization=False if you want to skip automatic lemmatization
loaded_results = load_all_results(all_results, apply_lemmatization=True)


Loading all result files...


## 6. Calculate Accuracies

In [7]:
def calculate_accuracy(df):
    """
    Calculate accuracy for a result DataFrame.
    
    Args:
        df: DataFrame with results
    
    Returns:
        float: Accuracy score (0-1)
    """
    if 'predict_lemma' not in df.columns or 'answer_lemmas' not in df.columns:
        return None
    
    # Process answer_lemmas to ensure proper format
    df_copy = df.copy()
    df_copy['answer_lemmas'] = df_copy['answer_lemmas'].apply(
        lambda xs: [list(x) if not isinstance(x, list) else x for x in xs] 
        if isinstance(xs, list) else xs
    )
    
    predictions = df_copy['predict_lemma'].tolist()
    answers = df_copy['answer_lemmas'].tolist()
    
    try:
        accuracy = partial_match_scores(predictions, answers)
        return accuracy
    except Exception as e:
        print(f"Error calculating accuracy: {e}")
        return None

# Calculate accuracies for all loaded results
accuracies = {}

print("\nCalculating accuracies...")
for method, df in loaded_results.items():
    acc = calculate_accuracy(df)
    if acc is not None:
        accuracies[method] = acc
        print(f"‚úÖ {method}: {acc:.3f}")
    else:
        print(f"‚ö†Ô∏è  {method}: Lemmatization not available")


Calculating accuracies...


## 7. Generate Comparison Table

In [None]:
def generate_comparison_table(accuracies, loaded_results):
    """
    Generate a comprehensive comparison table.
    
    Args:
        accuracies: Dictionary of method names to accuracy scores
        loaded_results: Dictionary of method names to DataFrames
    
    Returns:
        DataFrame: Comparison table
    """
    comparison_data = []
    
    for method in accuracies.keys():
        df = loaded_results[method]
        acc = accuracies[method]
        
        # Determine method category
        if 'baseline' in method:
            category = 'Baseline'
        elif 'ensemble' in method:
            category = 'Ensemble'
        elif 'flex_attention' in method:
            category = 'FlexAttention'
        else:
            category = 'Other'
        
        comparison_data.append({
            'Method': method,
            'Category': category,
            'Accuracy': acc,
            'Total Samples': len(df),
            'Unique Questions': df['uuid'].nunique() if 'uuid' in df.columns else 'N/A'
        })
    
    comparison_df = pd.DataFrame(comparison_data)
    
    # Sort by accuracy (descending)
    comparison_df = comparison_df.sort_values('Accuracy', ascending=False)
    
    return comparison_df

# Generate comparison table
if accuracies:
    comparison_table = generate_comparison_table(accuracies, loaded_results)
    
    print("\n" + "="*80)
    print("COMPARISON TABLE")
    print("="*80 + "\n")
    
    # Display the table
    display(comparison_table)
    
    # Calculate improvements over best baseline
    baseline_methods = comparison_table[comparison_table['Category'] == 'Baseline']
    if not baseline_methods.empty:
        best_baseline_acc = baseline_methods['Accuracy'].max()
        print(f"\nüìä Best Baseline Accuracy: {best_baseline_acc:.3f}")
        
        print("\nüöÄ Improvements over Best Baseline:")
        for _, row in comparison_table.iterrows():
            if row['Category'] != 'Baseline':
                improvement = row['Accuracy'] - best_baseline_acc
                pct_improvement = (improvement / best_baseline_acc) * 100
                print(f"   {row['Method']:<40} {improvement:+.4f} ({pct_improvement:+.2f}%)")
else:
    print("\n‚ö†Ô∏è  No accuracies available for comparison. Results may need lemmatization.")

## 8. Generate Detailed Examples

In [None]:
def generate_detailed_examples(method_name, df, num_examples=5, show_correct=True, show_incorrect=True):
    """
    Generate detailed examples from a result file for analysis.
    
    Args:
        method_name: Name of the method
        df: DataFrame with results
        num_examples: Number of examples to show
        show_correct: Whether to show correct predictions
        show_incorrect: Whether to show incorrect predictions
    """
    print(f"\n{'='*80}")
    print(f"DETAILED EXAMPLES: {method_name}")
    print(f"{'='*80}\n")
    
    # Check if we can determine correctness
    can_check_correctness = (
        'predict_lemma' in df.columns and 
        'answer_lemmas' in df.columns
    )
    
    if can_check_correctness:
        # Add correctness column
        df_with_correctness = df.copy()
        df_with_correctness['answer_lemmas'] = df_with_correctness['answer_lemmas'].apply(
            lambda xs: [list(x) if not isinstance(x, list) else x for x in xs] 
            if isinstance(xs, list) else xs
        )
        
        correctness = []
        for pred, ans in zip(df_with_correctness['predict_lemma'], df_with_correctness['answer_lemmas']):
            try:
                is_correct = partial_match(pred, ans)
                correctness.append(is_correct)
            except:
                correctness.append(None)
        
        df_with_correctness['is_correct'] = correctness
        
        # Filter based on correctness preference
        if show_correct and show_incorrect:
            filtered_df = df_with_correctness
        elif show_correct:
            filtered_df = df_with_correctness[df_with_correctness['is_correct'] == True]
        elif show_incorrect:
            filtered_df = df_with_correctness[df_with_correctness['is_correct'] == False]
        else:
            filtered_df = df_with_correctness
    else:
        filtered_df = df
    
    # Sample examples
    sample_df = filtered_df.head(num_examples)
    
    for idx, (i, row) in enumerate(sample_df.iterrows(), 1):
        print(f"\n{'‚îÄ'*80}")
        print(f"Example {idx}/{num_examples}")
        print(f"{'‚îÄ'*80}\n")
        
        # Display UUID if available
        if 'uuid' in row:
            print(f"UUID: {row['uuid']}")
        
        # Display question
        if 'question' in row:
            print(f"\nOriginal Question:\n{row['question']}")
        
        # Display paraphrases if available
        if 'paraphrase' in row:
            para_val = row['paraphrase']
            if not (isinstance(para_val, float) and pd.isna(para_val)):
                print(f"\nParaphrase:\n{para_val}")
        elif 'paraphrases' in row:
            paraphrases = row['paraphrases']
            if not (isinstance(paraphrases, float) and pd.isna(paraphrases)):
                if isinstance(paraphrases, (list, tuple)):
                    print(f"\nParaphrases:")
                    for i, para in enumerate(paraphrases, 1):
                        print(f"  {i}. {para}")
                else:
                    print(f"\nParaphrases:\n{paraphrases}")
        
        # Display prompt (truncated)
        if 'prompt' in row:
            prompt_val = row['prompt']
            if not (isinstance(prompt_val, float) and pd.isna(prompt_val)):
                prompt_preview = str(prompt_val)[:300] + "..." if len(str(prompt_val)) > 300 else str(prompt_val)
                print(f"\nPrompt (preview):\n{prompt_preview}")
        
        # Display generation
        if 'generation' in row:
            gen_val = row['generation']
            if not (isinstance(gen_val, float) and pd.isna(gen_val)):
                gen_preview = str(gen_val)[:200] + "..." if len(str(gen_val)) > 200 else str(gen_val)
                print(f"\nGeneration:\n{gen_preview}")
        
        # Display prediction
        if 'prediction' in row:
            print(f"\nPrediction: {row['prediction']}")
        
        # Display correct answers
        if 'answers' in row:
            print(f"\nCorrect Answers: {row['answers']}")
        
        # Display lemmatized versions
        if 'predict_lemma' in row:
            pred_lemma = row['predict_lemma']
            if not (isinstance(pred_lemma, float) and pd.isna(pred_lemma)):
                print(f"\nPrediction (lemmatized): {pred_lemma}")
        
        if 'answer_lemmas' in row:
            ans_lemmas = row['answer_lemmas']
            if not (isinstance(ans_lemmas, float) and pd.isna(ans_lemmas)):
                print(f"Answer Lemmas: {ans_lemmas}")
        
        # Display correctness
        if can_check_correctness and 'is_correct' in row:
            status = "‚úÖ CORRECT" if row['is_correct'] else "‚ùå INCORRECT"
            print(f"\nStatus: {status}")
    
    print(f"\n{'='*80}\n")

### 8.1 View Examples from a Specific Method

Choose a method from the loaded results and generate detailed examples.

In [None]:
# List available methods
print("Available methods for detailed analysis:")
for i, method in enumerate(loaded_results.keys(), 1):
    print(f"{i}. {method}")

# Select a method to analyze (change this)
METHOD_TO_ANALYZE = list(loaded_results.keys())[0] if loaded_results else None

if METHOD_TO_ANALYZE and METHOD_TO_ANALYZE in loaded_results:
    # Generate examples
    # Parameters:
    #   num_examples: number of examples to show
    #   show_correct: show correct predictions
    #   show_incorrect: show incorrect predictions
    generate_detailed_examples(
        METHOD_TO_ANALYZE, 
        loaded_results[METHOD_TO_ANALYZE],
        num_examples=5,
        show_correct=True,
        show_incorrect=True
    )
else:
    print("No methods available for analysis")

### 8.2 Compare Examples Across Methods

View the same examples from different methods for side-by-side comparison.

In [None]:
def compare_examples_across_methods(methods, loaded_results, num_examples=3, uuid_filter=None):
    """
    Compare the same examples across different methods.
    
    Args:
        methods: List of method names to compare
        loaded_results: Dictionary of loaded results
        num_examples: Number of examples to compare
        uuid_filter: Optional list of specific UUIDs to compare
    """
    print(f"\n{'='*80}")
    print(f"CROSS-METHOD COMPARISON")
    print(f"{'='*80}\n")
    
    # Get common UUIDs across all methods
    common_uuids = None
    for method in methods:
        if method not in loaded_results:
            continue
        df = loaded_results[method]
        if 'uuid' not in df.columns:
            print(f"‚ö†Ô∏è  Method {method} does not have UUID column")
            return
        
        uuids = set(df['uuid'].unique())
        if common_uuids is None:
            common_uuids = uuids
        else:
            common_uuids = common_uuids.intersection(uuids)
    
    if uuid_filter:
        common_uuids = [u for u in uuid_filter if u in common_uuids]
    else:
        common_uuids = list(common_uuids)[:num_examples]
    
    for idx, uuid in enumerate(common_uuids, 1):
        print(f"\n{'='*80}")
        print(f"Example {idx}/{len(common_uuids)} - UUID: {uuid}")
        print(f"{'='*80}\n")
        
        for method in methods:
            if method not in loaded_results:
                continue
            
            df = loaded_results[method]
            row = df[df['uuid'] == uuid].iloc[0]
            
            print(f"\n{'‚îÄ'*40}")
            print(f"Method: {method}")
            print(f"{'‚îÄ'*40}\n")
            
            if 'prediction' in row:
                print(f"Prediction: {row['prediction']}")
            
            if 'predict_lemma' in row:
                pred_lemma = row['predict_lemma']
                if not (isinstance(pred_lemma, float) and pd.isna(pred_lemma)):
                    print(f"Prediction (lemmatized): {pred_lemma}")
            
            # Show correctness if available
            if 'predict_lemma' in row and 'answer_lemmas' in row:
                try:
                    answer_lemmas = row['answer_lemmas']
                    if isinstance(answer_lemmas, list):
                        answer_lemmas = [list(x) if not isinstance(x, list) else x for x in answer_lemmas]
                    is_correct = partial_match(row['predict_lemma'], answer_lemmas)
                    status = "‚úÖ CORRECT" if is_correct else "‚ùå INCORRECT"
                    print(f"Status: {status}")
                except:
                    pass
        
        # Show correct answer once
        first_method = methods[0]
        if first_method in loaded_results:
            df = loaded_results[first_method]
            row = df[df['uuid'] == uuid].iloc[0]
            if 'answers' in row:
                print(f"\n{'‚îÄ'*40}")
                print(f"Correct Answers: {row['answers']}")
                print(f"{'‚îÄ'*40}")

# Example usage: Compare baseline with flex_attention
if len(loaded_results) >= 2:
    methods_to_compare = list(loaded_results.keys())[:2]  # Change this to compare specific methods
    compare_examples_across_methods(methods_to_compare, loaded_results, num_examples=3)
else:
    print("Need at least 2 methods to compare")

## 9. Export Detailed Analysis to CSV

In [None]:
def export_detailed_analysis(method_name, df, output_dir="."):
    """
    Export detailed analysis to CSV for external review.
    
    Args:
        method_name: Name of the method
        df: DataFrame with results
        output_dir: Directory to save the CSV file
    """
    # Create analysis DataFrame
    analysis_data = []
    
    for idx, row in df.iterrows():
        item = {
            'Index': idx,
            'UUID': row.get('uuid', 'N/A'),
            'Question': row.get('question', 'N/A'),
            'Prediction': row.get('prediction', 'N/A'),
            'Correct_Answers': str(row.get('answers', 'N/A')),
        }
        
        # Add lemmatized versions
        if 'predict_lemma' in df.columns:
            pred_lemma = row.get('predict_lemma')
            if not (isinstance(pred_lemma, float) and pd.isna(pred_lemma)):
                item['Prediction_Lemma'] = str(pred_lemma)
        
        if 'answer_lemmas' in df.columns:
            ans_lemmas = row.get('answer_lemmas')
            if not (isinstance(ans_lemmas, float) and pd.isna(ans_lemmas)):
                item['Answer_Lemmas'] = str(ans_lemmas)
        
        # Check correctness
        if 'predict_lemma' in df.columns and 'answer_lemmas' in df.columns:
            try:
                answer_lemmas = row['answer_lemmas']
                if isinstance(answer_lemmas, list):
                    answer_lemmas = [list(x) if not isinstance(x, list) else x for x in answer_lemmas]
                is_correct = partial_match(row['predict_lemma'], answer_lemmas)
                item['Is_Correct'] = is_correct
            except:
                item['Is_Correct'] = 'N/A'
        
        analysis_data.append(item)
    
    analysis_df = pd.DataFrame(analysis_data)
    
    # Export to CSV
    output_file = os.path.join(output_dir, f"{method_name}_detailed_analysis.csv")
    analysis_df.to_csv(output_file, index=False, encoding='utf-8-sig')
    
    print(f"‚úÖ Detailed analysis exported to: {output_file}")
    print(f"   Total entries: {len(analysis_df)}")
    
    return analysis_df

# Example: Export detailed analysis for a specific method
if loaded_results:
    method_to_export = list(loaded_results.keys())[0]  # Change this
    export_detailed_analysis(method_to_export, loaded_results[method_to_export], output_dir=DATASET_ROOT)

## 10. Summary and Statistics

In [None]:
def print_summary_statistics():
    """
    Print comprehensive summary statistics.
    """
    print(f"\n{'='*80}")
    print("SUMMARY STATISTICS")
    print(f"{'='*80}\n")
    
    print(f"Dataset: {DATASET_NAME}")
    print(f"Model: {MODEL_NAME}")
    print(f"Dataset Root: {DATASET_ROOT}\n")
    
    print(f"Total result files found: {len(all_results)}")
    print(f"Successfully loaded: {len(loaded_results)}")
    print(f"With accuracy metrics: {len(accuracies)}\n")
    
    if accuracies:
        best_method = max(accuracies.items(), key=lambda x: x[1])
        worst_method = min(accuracies.items(), key=lambda x: x[1])
        avg_accuracy = sum(accuracies.values()) / len(accuracies)
        
        print(f"üèÜ Best Method: {best_method[0]} ({best_method[1]:.3f})")
        print(f"üìâ Worst Method: {worst_method[0]} ({worst_method[1]:.3f})")
        print(f"üìä Average Accuracy: {avg_accuracy:.3f}")
        print(f"üìà Range: {worst_method[1]:.3f} - {best_method[1]:.3f}")
    
    print(f"\n{'='*80}\n")

print_summary_statistics()

## 11. Custom Analysis Functions

Add your own custom analysis functions below.

In [None]:
# Your custom analysis code here
# Example: Analyze error patterns, visualize results, etc.

def analyze_error_patterns(method_name, df):
    """
    Analyze patterns in incorrect predictions.
    
    Args:
        method_name: Name of the method
        df: DataFrame with results
    """
    if 'predict_lemma' not in df.columns or 'answer_lemmas' not in df.columns:
        print("Lemmatization required for error analysis")
        return
    
    print(f"\n{'='*80}")
    print(f"ERROR PATTERN ANALYSIS: {method_name}")
    print(f"{'='*80}\n")
    
    # Calculate correctness
    df_copy = df.copy()
    df_copy['answer_lemmas'] = df_copy['answer_lemmas'].apply(
        lambda xs: [list(x) if not isinstance(x, list) else x for x in xs] 
        if isinstance(xs, list) else xs
    )
    
    correctness = []
    for pred, ans in zip(df_copy['predict_lemma'], df_copy['answer_lemmas']):
        try:
            is_correct = partial_match(pred, ans)
            correctness.append(is_correct)
        except:
            correctness.append(None)
    
    df_copy['is_correct'] = correctness
    
    # Statistics
    total = len(df_copy)
    correct = sum(1 for c in correctness if c is True)
    incorrect = sum(1 for c in correctness if c is False)
    unknown = sum(1 for c in correctness if c is None)
    
    print(f"Total samples: {total}")
    print(f"Correct: {correct} ({correct/total*100:.1f}%)")
    print(f"Incorrect: {incorrect} ({incorrect/total*100:.1f}%)")
    if unknown > 0:
        print(f"Unknown: {unknown} ({unknown/total*100:.1f}%)")
    
    # Show some incorrect examples
    incorrect_df = df_copy[df_copy['is_correct'] == False].head(5)
    
    if len(incorrect_df) > 0:
        print(f"\nüîç Sample Incorrect Predictions:\n")
        for idx, (i, row) in enumerate(incorrect_df.iterrows(), 1):
            print(f"{idx}. Predicted: {row['prediction']} | Correct: {row['answers']}")
    
    print(f"\n{'='*80}\n")

# Example usage
if loaded_results:
    method_to_analyze = list(loaded_results.keys())[0]
    analyze_error_patterns(method_to_analyze, loaded_results[method_to_analyze])