In [None]:
import torch
# Report whether a CUDA device is visible to PyTorch and, if so, which one.
cuda_available = torch.cuda.is_available()
print(f"CUDA available: {cuda_available}")

if cuda_available:
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    # The raw value is divided by 1e9, so label the figure as GB instead of
    # printing a bare number.
    print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")
else:
    print("No GPU detected. Go to Runtime > Change runtime type > GPU")

In [None]:
# Import of A3CG files

from google.colab import drive
import os
import shutil
import json

print("IMPORTING A3CG FILES")

# Mount Google Drive
print("Mounting Google Drive")
try:
    drive.mount('/content/drive')
    print("Drive mounted successfully")
except Exception as e:
    print(f"Error: {e}")
    # Re-raise instead of calling exit(): in a notebook kernel exit() requests
    # a kernel shutdown rather than simply aborting this cell.
    raise

# Finding the fold_1 folder
print("\nSearching for the 'fold_1' folder...")

possible_path = "/content/drive/MyDrive/A3CG_Dataset/folds/fold_1"
drive_fold_path = None

if os.path.exists(possible_path):
    drive_fold_path = possible_path
    print(f"Found: {possible_path}")
else:
    print("Folder 'fold_1' not found.")
    # Abort the cell with a descriptive error; the rest of the cell cannot
    # work without the source folder.
    raise FileNotFoundError(f"Expected dataset folder is missing: {possible_path}")

# The four JSON files this notebook expects to find in the fold folder.
required_files = ["seen_train.json", "seen_val.json", "seen_test.json", "unseen_test.json"]

# Show every JSON file present on Drive, flagging the expected ones.
print(f"\nContents of: {drive_fold_path}")
files_in_folder = os.listdir(drive_fold_path)

for file in files_in_folder:
    if not file.endswith('.json'):
        continue
    file_path = os.path.join(drive_fold_path, file)
    size_kb = os.path.getsize(file_path) / 1024
    if file in required_files:
        status = "[OK]"
    else:
        status = "[INFO]"
    print(f"  {status} {file} ({size_kb:.1f} KB)")

# Local destination on the Colab VM.
local_path = "/content/A3CG_DATASET/folds/fold_1"
os.makedirs(local_path, exist_ok=True)
print(f"\nDirectory created: {local_path}")

# Copy each expected file and confirm the copy parses as JSON.
print("\nCopying files...")
copied_count = 0

for filename in required_files:
    source = os.path.join(drive_fold_path, filename)
    dest = os.path.join(local_path, filename)

    if not os.path.exists(source):
        print(f"  [MISSING] {filename} not found")
        continue

    try:
        shutil.copy2(source, dest)
        size_kb = os.path.getsize(dest) / 1024

        # A file only counts as copied once it loads as valid JSON.
        with open(dest, 'r', encoding='utf-8') as f:
            data = json.load(f)

        print(f"  [OK] {filename}: {len(data)} samples ({size_kb:.1f} KB)")
        copied_count += 1
    except Exception as e:
        print(f"  [ERROR] {filename}: {e}")

# Final result
# Derive the denominator from the list of expected files so the messages stay
# correct if that list is ever edited.
total_required = len(required_files)

print(f"\nRESULT:")
print(f"  Files copied: {copied_count}/{total_required}")

# Threshold kept as in the original: three copied files count as success.
if copied_count >= 3:
    print(f"SUCCESS! Files are ready.")
    print(f"Path: {local_path}")

    # Show final structure
    dataset_root = "/content/A3CG_DATASET"
    print(f"\nFinal Directory Structure:")
    for root, dirs, files in os.walk(dataset_root):
        # Depth below the dataset root drives the indentation of the tree.
        level = root.replace(dataset_root, '').count(os.sep)
        indent = '  ' * level
        folder_name = os.path.basename(root) or "A3CG_DATASET"
        print(f'{indent}- {folder_name}/')

        sub_indent = '  ' * (level + 1)
        for file in files:
            if file.endswith('.json'):
                file_path = os.path.join(root, file)
                size_kb = os.path.getsize(file_path) / 1024
                print(f'{sub_indent}- {file} ({size_kb:.1f} KB)')

    print(f"\nUse this path in your code:")
    # Printed from local_path so the hint cannot drift from the real location.
    print(f"{local_path}/")

else:
    print(f"FAILURE: Only {copied_count}/{total_required} files were copied.")
    print("Please verify that all required JSON files are in your Google Drive folder.")

In [None]:
# HuggingFace token : get it at https://huggingface.co/settings/tokens
from huggingface_hub import login, whoami

# Prompt for the token interactively.
login()

# Confirm the login worked by asking the Hub which account is connected.
try:
    user_info = whoami()
except Exception as e:
    print(f"Authentication error: {e}")
else:
    try:
        print(f"Connected as: {user_info['name']}")
    except Exception as e:
        print(f"Authentication error: {e}")

In [None]:
# ===================
# INSTALLATION SCRIPT
# ===================

!pip install -q --upgrade transformers peft bitsandbytes accelerate torch datasets scikit-learn "numpy<2.0"

print("Installation completed.")

print("Installation successful with compatible versions.")
print("\nNEXT STEPS:")
print("1. RESTART THE RUNTIME (MANDATORY)")
print("   Runtime > Restart session")
print("2. Wait 30 seconds after restart")

print("\nWHY THIS RESTART IS CRITICAL:")
print("   - Prevents NumPy 1.x/2.x conflicts in memory")
print("   - Clears Python cache")
print("   - Ensures correct versions are loaded")

In [None]:
# CELL 1: DEPENDENCIES INSTALLATION
# Run this cell, then manually restart the runtime environment.

print("STEP 0: Installing and upgrading required packages...")

# Install PyTorch from official CUDA source (cu121) to ensure compatibility and fix torchvision::nms error
!pip install -q -U torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121

# Install other required packages
!pip install -q -U transformers peft bitsandbytes accelerate datasets
!pip install -q flash-attn --no-build-isolation

print("\nINSTALLATION COMPLETE. PLEASE RESTART THE RUNTIME NOW.")
print("(Runtime -> Restart session)")

In [None]:
# NO TRAINING - BASELINE EVALUATION WITH FEW-SHOT

import os
import json
import torch
import torch.nn as nn
import torch.nn.functional as F
import time
import re
import pandas as pd
import random
import numpy as np

from datetime import datetime
from typing import List, Dict, Tuple, Optional
from dataclasses import dataclass
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig
)
from peft import PeftModel, PeftConfig
import warnings
warnings.filterwarnings('ignore')

# STEP 1: Model Loading
print("\nSTEP 1: Model loading...")

# Seed the RNGs this notebook relies on: `random` picks the few-shot examples
# and torch drives sampled generation. Without a seed every run evaluates a
# different prompt/sample stream; with it, runs are repeatable as far as the
# backend allows.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# 4-bit NF4 quantization with double quantization; compute in bfloat16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

model_name = "meta-llama/Meta-Llama-3-8B-Instruct"

print(f"Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    padding_side="left"
)

# Reuse EOS as the padding token when the tokenizer does not define one.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

print(f"Loading LLaMA-3 8B model...")
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map={'': 0},  # place the whole model on device 0
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2"
)

# No adapter is attached: the baseline evaluates the base model directly.
model = base_model

print("Model configured")

# Local copy of the dataset fold read by the evaluation steps.
DATA_DIR = "/content/A3CG_DATASET/folds/fold_1"

# STEP 2: Few-shot data processor
print("\nSTEP 2: Few-shot data processor...")

class A3CGEvaluationPromptGenerator:
    """Builds few-shot chat prompts for aspect-action pair extraction.

    Holds the instruction text and a fixed pool of worked examples, and
    renders them through the tokenizer's chat template.
    """

    def __init__(self, tokenizer):
        """Store the tokenizer and define the instruction text and example pool."""
        self.tokenizer = tokenizer
        # NOTE(review): the OUTPUT FORMAT lines below ask for tuples, while the
        # few-shot examples and the instructions in create_prompt ask for JSON.
        self.system_prompt = """You are an expert in ESG analysis. Extract aspect-action pairs from sustainability statements.

CRITICAL INSTRUCTIONS:
- Extract EXACT terms from the input text
- Do not paraphrase or interpret creatively
- Use literal wording from the sentence
- Focus on specific terms rather than general concepts

DEFINITIONS:
- Aspect: A sustainability-related entity, goal, sub-area, or activity (use exact wording)
- Action: "implemented", "planning", or "indeterminate"

OUTPUT FORMAT: ("aspect1", "action1"), ("aspect2", "action2"), ...
If none: ("no aspect", "no action")"""

        # Pool of worked examples; each prompt draws a random subset of these.
        self.few_shot_examples = [
            {
                "text": "The Group revitalised and rejuvenated five existing coffeeshops in FY2022 to improve customers' dining experience and hygiene standards.",
                "output": '{"aspect-action_pairs": [{"aspect": "hygiene standards", "action": "implemented"}, {"aspect": "customers\' dining experience", "action": "implemented"}]}'
            },
            {
                "text": "It is imperative that we manage our business prudently with high standard of corporate governance and integrity.",
                "output": '{"aspect-action_pairs": [{"aspect": "corporate governance", "action": "indeterminate"}]}'
            },
            {
                "text": "We are committed to improve on our occupational health and safety initiatives and conduct regular reviews of our programmes, processes, risk assessments and controls.",
                "output": '{"aspect-action_pairs": [{"aspect": "occupational health and safety initiatives", "action": "planning"}, {"aspect": "risk assessments and controls", "action": "planning"}]}'
            },
            {
                "text": "This project will commence in 2022 and take place over the next 3 years.",
                "output": '{"aspect-action_pairs": []}'
            },
            {
                "text": "In 2020, we achieved a 44% reduction in carbon emissions intensity against 2007 levels, putting us on track to achieving our SBTi-validated target of 59% by 2030.",
                "output": '{"aspect-action_pairs": [{"aspect": "carbon emissions intensity", "action": "implemented"}]}'
            },
            {
                "text": "We have in place robust worksite inspection procedures and monthly audits to identify workplace hazards and ensure all activities comply with all Group and regulatory requirements.",
                "output": '{"aspect-action_pairs": [{"aspect": "workplace hazards", "action": "implemented"}, {"aspect": "worksite inspection procedures", "action": "implemented"}]}'
            },
            {
                "text": "Moving ahead, we are targeting to include automation in tracking order status and completion, routing and invoicing to improve customer experience.",
                "output": '{"aspect-action_pairs": [{"aspect": "customer experience", "action": "planning"}]}'
            },
            {
                "text": "Continuous learning is necessary to help enhance workmen proficiency.",
                "output": '{"aspect-action_pairs": [{"aspect": "workmen proficiency", "action": "indeterminate"}, {"aspect": "continuous learning", "action": "indeterminate"}]}'
            },
            {
                "text": "We will continue to maintain or lower our energy and water consumption in FY2022.",
                "output": '{"aspect-action_pairs": [{"aspect": "energy and water consumption", "action": "planning"}]}'
            },
            {
                "text": "We are committed to foster a non-discriminatory workplace environment.",
                "output": '{"aspect-action_pairs": [{"aspect": "non-discriminatory workplace environment", "action": "planning"}]}'
            }
        ]

    def get_few_shot_examples(self, n_examples: int = 3) -> str:
        """Return a text block of randomly chosen worked examples.

        Args:
            n_examples: How many examples to draw; capped at the pool size.

        Returns:
            The chosen examples rendered as numbered "Example i" sections.
        """
        pool = self.few_shot_examples
        chosen = random.sample(pool, min(n_examples, len(pool)))

        return "".join(
            f"\nExample {idx}:\n"
            f"  Text: {shot['text']}\n"
            f"  Output: {shot['output']}\n"
            for idx, shot in enumerate(chosen, 1)
        )

    def create_prompt(self, sentence: str) -> str:
        """Render the full chat-formatted prompt for one sentence.

        The instruction text, three sampled examples and the target sentence
        are packed into a single user message and passed through the
        tokenizer's chat template with the generation prompt appended.

        Args:
            sentence: The sentence to extract aspect-action pairs from.

        Returns:
            The templated prompt as a string (not tokenized).
        """
        shots_text = self.get_few_shot_examples(n_examples=3)

        # Instructions, examples and the query all travel in the user turn.
        user_content = f"""{self.system_prompt}

{shots_text}

Now extract aspect-action pairs from this new sentence:
Text: {sentence}

IMPORTANT:
- Action must be EXACTLY one of: "implemented", "planning", or "indeterminate"
- Extract sustainability-related aspects only (environmental, social, governance)
- Use exact wording from the sentence for aspects
- Format as JSON: {{"aspect-action_pairs": [{{"aspect": "...", "action": "..."}}, ...]}}

Response:"""

        chat = [{"role": "user", "content": user_content}]
        return self.tokenizer.apply_chat_template(
            chat,
            tokenize=False,
            add_generation_prompt=True
        )

# Build the shared prompt generator around the already-loaded tokenizer.
prompt_generator = A3CGEvaluationPromptGenerator(tokenizer)

print("Prompt generator configured")

# STEP 3: Generation function and parsing
print("\nSTEP 3: Configuring generation function and parsing...")

def generate_prediction_enhanced(model, tokenizer, sentence: str, max_length: int = 512) -> str:
    """Generate the model's raw text answer for one sentence.

    The prompt is built by the module-level ``prompt_generator``, tokenized,
    and passed to ``model.generate``. Only the newly generated tokens are
    decoded and returned.

    Args:
        model: Model object exposing ``generate`` (and usually ``parameters``).
        tokenizer: Tokenizer used to encode the prompt and decode the output.
        sentence: Sentence to extract aspect-action pairs from.
        max_length: Kept for interface compatibility; not used by the body
            (the prompt is truncated at 1024 tokens and at most 200 new
            tokens are generated).

    Returns:
        The decoded completion with surrounding whitespace stripped.
    """

    prompt = prompt_generator.create_prompt(sentence)

    # The prompt comes from apply_chat_template, which already inserts the
    # special tokens; tokenizing with the default add_special_tokens=True would
    # prepend a second BOS token.
    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        truncation=True,
        max_length=1024,
        padding=True,
        add_special_tokens=False
    )

    # Move to device
    device = next(model.parameters()).device if hasattr(model, 'parameters') else 'cuda' if torch.cuda.is_available() else 'cpu'
    inputs = {k: v.to(device) for k, v in inputs.items()}

    # Generation parameters optimized for Llama 3-8B
    gen_params = {
        "max_new_tokens": 200,
        "do_sample": True,
        "temperature": 0.1,
        "top_p": 0.9,
        "pad_token_id": tokenizer.eos_token_id,
        "eos_token_id": tokenizer.eos_token_id,
    }

    # Generate
    with torch.no_grad():
        try:
            outputs = model.generate(**inputs, **gen_params)
        except Exception as e:
            print(f"WARNING: Generation error: {e}")
            # Fallback: greedy decoding. The attention mask is passed along so
            # the model is not left to guess it (pad and EOS share an id here).
            outputs = model.generate(
                inputs['input_ids'],
                attention_mask=inputs.get('attention_mask'),
                max_new_tokens=150,
                do_sample=False,
                pad_token_id=tokenizer.eos_token_id,
                eos_token_id=tokenizer.eos_token_id
            )

    # Decode only the generated part (everything after the prompt tokens)
    generated_ids = outputs[0][inputs['input_ids'].shape[1]:]
    response = tokenizer.decode(generated_ids, skip_special_tokens=True)

    return response.strip()

def parse_prediction_enhanced(prediction: str, model_type: str = "base") -> List[Tuple[str, str]]:
    """Extract (aspect, action) pairs from a raw model completion.

    Strategy, in order:
      1. Decode the first balanced ``{...}`` block as JSON and read the pair
         list from one of the known keys.
      2. If no JSON could be decoded (no braces at all, unbalanced/truncated
         braces, or invalid JSON), fall back to a regex for
         ``{"aspect": ..., "action": ...}`` objects.
      3. If that finds nothing, fall back to a regex for
         ``("aspect", "action")`` tuples.

    Previously steps 2 and 3 only ran when ``json.loads`` raised, so
    tuple-style answers and truncated JSON silently produced no pairs.

    Args:
        prediction: Raw text produced by the model.
        model_type: Unused; kept for interface compatibility.

    Returns:
        Lower-cased, stripped (aspect, action) pairs whose action is one of
        "implemented", "planning" or "indeterminate".
    """

    valid_actions = ("implemented", "planning", "indeterminate")
    pairs = []
    prediction = prediction.strip()

    # Locate the first balanced {...} block, if there is one.
    json_start = prediction.find('{')
    json_end = -1
    if json_start != -1:
        brace_count = 0
        for i in range(json_start, len(prediction)):
            if prediction[i] == '{':
                brace_count += 1
            elif prediction[i] == '}':
                brace_count -= 1
                if brace_count == 0:
                    json_end = i + 1
                    break

    decoded = False
    if json_end != -1:
        # Collapse all whitespace runs (including newlines/tabs) to one space.
        json_str = re.sub(r'\s+', ' ', prediction[json_start:json_end])

        try:
            data = json.loads(json_str)
            decoded = True
        except json.JSONDecodeError:
            print(f"DEBUG: JSON parsing failed, trying regex fallback")
        else:
            # Try different possible keys; the first one holding a list wins.
            for key in ["aspect-action_pairs", "aspect_action_pairs", "pairs", "results"]:
                if key in data and isinstance(data[key], list):
                    for pair in data[key]:
                        if isinstance(pair, dict) and "aspect" in pair and "action" in pair:
                            aspect = str(pair["aspect"]).strip().lower()
                            action = str(pair["action"]).strip().lower()

                            if aspect and action in valid_actions:
                                pairs.append((aspect, action))
                    break

    if decoded:
        # A successfully decoded answer is authoritative, even if it is empty.
        return pairs

    # Fallback: regex for JSON-like pair objects
    json_pattern = r'\{\s*["\']aspect["\']\s*:\s*["\']([^"\']+)["\']\s*,\s*["\']action["\']\s*:\s*["\']([^"\']+)["\']\s*\}'
    for aspect, action in re.findall(json_pattern, prediction, re.IGNORECASE):
        aspect = aspect.strip().lower()
        action = action.strip().lower()
        if action in valid_actions:
            pairs.append((aspect, action))

    # Additional fallback: look for tuple-like patterns
    if not pairs:
        tuple_pattern = r'\(\s*["\']([^"\']+)["\']\s*,\s*["\']([^"\']+)["\']\s*\)'
        for aspect, action in re.findall(tuple_pattern, prediction):
            aspect = aspect.strip().lower()
            action = action.strip().lower()
            if action in valid_actions:
                pairs.append((aspect, action))

    return pairs

print("Generation and parsing functions configured")

# STEP 4: Evaluation functions - Exact Match only
print("\nSTEP 4: Evaluation functions configuration...")

def calculate_metrics_exact_match(predictions: List[List[Tuple[str, str]]],
                                 ground_truth: List[List[Tuple[str, str]]]) -> Dict:
    """Compute exact-match metrics over paired prediction / gold pair lists.

    Pairs are compared as sets per sample. Pair-level true/false positives and
    false negatives are summed over all samples and turned into precision,
    recall and F1. Two sample-level rates are also reported: samples whose
    non-empty prediction set equals the gold set, and samples sharing at least
    one pair with the gold set.

    Args:
        predictions: One list of (aspect, action) pairs per sample.
        ground_truth: Gold pairs, aligned with ``predictions``.

    Returns:
        Dict of rates and raw counts (keys as in the return statement).
    """

    print("Calculating Exact Match metrics (A3CG paper)...")

    def _safe_div(numerator, denominator):
        # Zero denominators yield 0 instead of raising.
        return numerator / denominator if denominator > 0 else 0

    tp = fp = fn = 0
    n_exact = n_partial = 0
    n_pred_pairs = n_true_pairs = 0

    for pred_pairs, true_pairs in zip(predictions, ground_truth):
        # Raw list lengths: duplicates count here, unlike in the set maths.
        n_pred_pairs += len(pred_pairs)
        n_true_pairs += len(true_pairs)

        pred_set, true_set = set(pred_pairs), set(true_pairs)
        hits = pred_set & true_set

        tp += len(hits)
        fp += len(pred_set - true_set)
        fn += len(true_set - pred_set)

        # Empty-vs-empty is deliberately not counted as an exact match.
        if len(pred_set) > 0 and pred_set == true_set:
            n_exact += 1

        if len(hits) > 0:
            n_partial += 1

    n_samples = len(predictions)

    precision = _safe_div(tp, tp + fp)
    recall = _safe_div(tp, tp + fn)
    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

    return {
        'exact_match_accuracy': _safe_div(n_exact, n_samples),
        'partial_match_accuracy': _safe_div(n_partial, n_samples),
        'exact_precision': precision,
        'exact_recall': recall,
        'exact_f1_score': f1,
        'exact_true_positives': tp,
        'exact_false_positives': fp,
        'exact_false_negatives': fn,
        'total_predictions': n_pred_pairs,
        'total_ground_truth': n_true_pairs,
    }

def load_test_data_flexible(file_path: str) -> Tuple[List[str], List[List[Tuple[str, str]]]]:
    """Load sentences and gold (aspect, action) pairs from a JSON file.

    The file is expected to hold a list of objects with a ``text`` field and
    an ``aspects`` mapping of aspect -> action or list of actions. Items that
    do not have both fields are skipped.

    Gold aspects and actions are stripped and lower-cased. The prediction
    parser lower-cases its output, and the metrics compare pairs by exact
    equality, so un-normalised gold pairs containing capitals could never
    match.

    Args:
        file_path: Path to the JSON file.

    Returns:
        ``(sentences, ground_truth)`` where ``ground_truth[i]`` is the list of
        normalised pairs for ``sentences[i]``.
    """
    print(f"Loading: {os.path.basename(file_path)}")

    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    sentences = []
    ground_truth = []

    if isinstance(data, list):
        for item in data:
            if not (isinstance(item, dict) and 'text' in item and 'aspects' in item):
                continue

            pairs = []
            aspects_dict = item['aspects']
            if isinstance(aspects_dict, dict):
                for aspect, actions in aspects_dict.items():
                    # A bare string is treated as a single-action list.
                    if isinstance(actions, str):
                        actions = [actions]
                    if isinstance(actions, list):
                        for action in actions:
                            pairs.append((str(aspect).strip().lower(), str(action).strip().lower()))

            sentences.append(item['text'])
            ground_truth.append(pairs)

    print(f"Loaded {len(sentences)} samples with {sum(len(gt) for gt in ground_truth)} total pairs")
    return sentences, ground_truth

print("Exact match evaluation functions configured")

# STEP 5: Load test data
print("\nSTEP 5: Loading test data...")

# Evaluation splits, keyed by the name used in the reports.
test_files = {
    'seen_test': f"{DATA_DIR}/seen_test.json",
    'unseen_test': f"{DATA_DIR}/unseen_test.json"
}

test_data = {}
for name, file_path in test_files.items():
    if not os.path.exists(file_path):
        print(f"File not found: {file_path}")
        continue

    try:
        sentences, ground_truth = load_test_data_flexible(file_path)
        # Splits that load but contain no usable samples are left out.
        if len(sentences) > 0:
            test_data[name] = (sentences, ground_truth)
            print(f"{name}: {len(sentences)} samples")
    except Exception as e:
        print(f"ERROR loading {name}: {e}")

# With nothing loaded, fall back to a single hand-written sample so the rest
# of the cell can still run.
if len(test_data) == 0:
    print("WARNING: No test files found - creating demo data")
    demo_sentences = ["This company has implemented strong environmental policies to reduce carbon emissions."]
    demo_pairs = [[("environmental policies", "implemented"), ("carbon emissions", "implemented")]]
    test_data['demo'] = (demo_sentences, demo_pairs)

# STEP 6: Model evaluation
banner = "=" * 60
print("\nSTEP 6: Model evaluation...")
print(banner)
print(f"Metrics: Exact Match (paper implementation)")
print(banner)

results = {}

for dataset_name, (sentences, ground_truth) in test_data.items():
    print(f"\nEvaluation on {dataset_name}...")
    print(f"Number of samples: {len(sentences)}")

    predictions = []
    start_time = time.time()

    # The whole split is evaluated; the slices are kept so a smaller n_test
    # can be substituted.
    n_test = len(sentences)
    test_sentences = sentences[:n_test]
    test_ground_truth = ground_truth[:n_test]

    print(f"Testing on {n_test} samples...")

    for i, sentence in enumerate(test_sentences):
        # Progress line every tenth sample.
        if i % 10 == 0:
            print(f"   Progress: {i}/{n_test} ({i/n_test*100:.1f}%)")

        try:
            prediction_text = generate_prediction_enhanced(model, tokenizer, sentence)
            pred_pairs = parse_prediction_enhanced(prediction_text)
            predictions.append(pred_pairs)

            # Show raw and parsed output for the first three samples.
            if i < 3:
                print(f"   Sample {i+1}: {len(pred_pairs)} pairs predicted")
                print(f"   Raw output: {prediction_text[:100]}...")
                print(f"   Parsed pairs: {pred_pairs}")

        except Exception as e:
            # A failed sample is recorded as an empty prediction.
            print(f"WARNING: Error sample {i}: {e}")
            predictions.append([])

    evaluation_time = time.time() - start_time

    print(f"Calculating metrics...")
    exact_metrics = calculate_metrics_exact_match(predictions, test_ground_truth)

    # Only the first five predictions / gold lists are stored for display.
    throughput = n_test / evaluation_time
    results[dataset_name] = {
        'exact_metrics': exact_metrics,
        'evaluation_time': evaluation_time,
        'samples_per_second': throughput,
        'predictions': predictions[:5],
        'ground_truth': test_ground_truth[:5],
        'n_samples': n_test,
        'base_model': model_name,
    }

    print(f"Evaluation completed in {evaluation_time:.2f}s")
    print(f"Speed: {throughput:.2f} samples/sec")

# STEP 7: Results display
print("\n" + "="*80)
print("EVALUATION RESULTS - EXACT MATCH")
print("="*80)

timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

# Row label (with its alignment padding) paired with the metric key to print.
metric_rows = [
    ("   Exact Match Accuracy:     ", 'exact_match_accuracy'),
    ("   Exact Precision:          ", 'exact_precision'),
    ("   Exact Recall:             ", 'exact_recall'),
    ("   Exact F1-Score:           ", 'exact_f1_score'),
]

for dataset_name, result in results.items():
    exact_metrics = result['exact_metrics']

    print(f"\nRESULTS - {dataset_name.upper()}")
    print(f"Base model: {result['base_model']}")
    print("-" * 60)

    print(f"EXACT MATCH METRICS (A3CG paper implementation):")
    # Each metric is shown both as a fraction and as a percentage.
    for label, key in metric_rows:
        value = exact_metrics[key]
        print(f"{label}{value:.4f} ({value*100:.2f}%)")

    print(f"\nPERFORMANCE:")
    print(f"   Evaluation speed:         {result['samples_per_second']:.2f} samples/sec")

# STEP 8: Detailed examples
print(f"\n" + "="*80)
print("DETAILED EXAMPLES ANALYSIS")
print("="*80)

for dataset_name, result in results.items():
    print(f"\nDETAILED EXAMPLES - {dataset_name.upper()}")
    print("-" * 70)

    sentences, ground_truth = test_data[dataset_name]
    predictions = result['predictions']

    # Walk through at most the first three stored predictions.
    n_shown = min(3, len(predictions))
    for i in range(n_shown):
        predicted = predictions[i]
        expected = ground_truth[i]

        print(f"\nExample {i+1}:")
        print(f"Sentence: {sentences[i][:120]}...")
        print(f"Ground truth: {expected}")
        print(f"Prediction: {predicted}")

        # Pairs present in both the prediction and the gold annotation.
        exact_matches = set(predicted) & set(expected)

        print(f"Exact Match Analysis:")
        if len(exact_matches) == 0:
            print(f"   No exact matches")
        else:
            print(f"   Exact matches: {exact_matches}")

        if not predicted:
            print(f"   WARNING: No prediction generated")

# STEP 9: Performance summary
print("\n" + "="*80)
print("PERFORMANCE SUMMARY LLAMA 3-8B BASELINE")
print("="*80)

# Calculate average metrics across datasets (unweighted mean over datasets)
if results:
    avg_exact_f1 = np.mean([r['exact_metrics']['exact_f1_score'] for r in results.values()])
    avg_exact_precision = np.mean([r['exact_metrics']['exact_precision'] for r in results.values()])
    avg_exact_recall = np.mean([r['exact_metrics']['exact_recall'] for r in results.values()])

    print(f"BASE MODEL: {model_name}")
    print(f"EVALUATION TYPE: Few-Shot Baseline (No Training)")
    print("-" * 50)

    print(f"EXACT MATCH METRICS (A3CG paper):")
    print(f"   F1-Score:        {avg_exact_f1:.4f} ({avg_exact_f1*100:.2f}%)")
    print(f"   Precision:       {avg_exact_precision:.4f} ({avg_exact_precision*100:.2f}%)")
    print(f"   Recall:          {avg_exact_recall:.4f} ({avg_exact_recall*100:.2f}%)")

    # Performance comparison with paper
    print(f"\nCOMPARISON WITH A3CG PAPER:")
    print(f"   Paper GRACE (best):          47.51% F1 (exact)")
    print(f"   Our Baseline:                {avg_exact_f1*100:.2f}% F1")

    if avg_exact_f1 >= 0.25:
        print(f"   Baseline performance acceptable")
    else:
        print(f"   Baseline performance below expectations")

print("\nBASELINE EVALUATION COMPLETED!")
print("=" * 70)
print(f"End time: {time.strftime('%H:%M:%S')}")
print(f"Datasets evaluated: {len(results)}")

if results:
    print(f"Baseline F1-Score: {avg_exact_f1:.4f} ({avg_exact_f1*100:.2f}%)")

print("=" * 70)

print(f"\nFINAL SUMMARY:")
# avg_exact_f1 only exists when at least one dataset was evaluated, so this
# line is guarded like the ones above instead of raising NameError.
if results:
    print(f"   Exact Match F1 (baseline): {avg_exact_f1*100:.1f}%")
else:
    print("   Exact Match F1 (baseline): n/a (no datasets evaluated)")
print(f"   Evaluation type: Few-Shot Baseline (No Training)")
print(f"   This serves as performance baseline for fine-tuned models")