In [1]:
import os
import json
import math
from pathlib import Path
from tqdm.auto import tqdm
import numpy as np
import pandas as pd
from dotenv import load_dotenv
from langchain_postgres import PGVector
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_core.documents import Document
from sentence_transformers import SentenceTransformer
import warnings
warnings.filterwarnings('ignore')
print('Imports ready')

Imports ready


In [4]:
# Paths setup
BASE = Path('c:/Users/rayaa/OneDrive/Documents/VSCode/CSCI5832/Semeval')
RAG_TASKS_PATH = BASE / 'human' / 'generation_tasks' / 'RAG.jsonl'
CORPUS_PATH = BASE / 'corpora' / 'passage_level' / 'cloud.jsonl'

# Model and database setup
EMBED_MODEL = 'Snowflake/snowflake-arctic-embed-l-v2.0'
GENERATION_MODEL = "Qwen/Qwen2.5-0.5B-Instruct"
PG_ENV_PATH = BASE / '.pg_env'

print('Paths and models configured')

Paths and models configured


In [3]:
def load_rag_tasks(jsonl_path, collection_name="mt-rag-ibmcloud-elser-512-100-20240502"):
    """Load RAG tasks from JSONL file, filtering by id.Collection."""
    tasks = []
    with open(jsonl_path, 'r', encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                continue

            obj = json.loads(line)

            # Safely extract Collection field
            collection = obj.get("Collection", "")

            if collection == collection_name:
                tasks.append(obj)

    return tasks

# Load RAG tasks
rag_tasks = load_rag_tasks(RAG_TASKS_PATH)
print(f"Loaded {len(rag_tasks)} filtered RAG tasks")


Loaded 205 filtered RAG tasks


In [4]:
def extract_conversation_text(task):
    """Extract the current question from conversation input"""
    input_data = task.get('input', [])
    if isinstance(input_data, list) and len(input_data) > 0:
        # Get the last user message
        for msg in reversed(input_data):
            if msg.get('speaker') == 'user':
                return msg.get('text', '')
    return ''

from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import torch

def setup_generator(model_name=GENERATION_MODEL):
    """Setup the text generation model"""
    try:
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            torch_dtype=torch.float16,
            device_map="auto",
            low_cpu_mem_usage=True
        )
        
        generator = pipeline(
            "text-generation",
            model=model,
            tokenizer=tokenizer,
            max_new_tokens=512,
            temperature=0.7,
            do_sample=True
        )
        return generator
    except Exception as e:
        print(f"Error setting up generator: {e}")
        return None

# Setup the generator
generator = setup_generator()
print('Generator setup complete')

`torch_dtype` is deprecated! Use `dtype` instead!
Device set to use cuda:0


Generator setup complete


In [5]:
def create_generation_prompt(question, contexts, conversation_history=None):
    """Create a prompt for answer generation using retrieved contexts"""
    
    # Build context string
    context_text = ""
    for i, ctx in enumerate(contexts, 1):
        context_text += f"Context {i}: {ctx['text']}\n\n"
    
    # Build conversation history if available
    history = ""
    if conversation_history:
        turns = [
            f"{t['speaker'].capitalize()}: {t['text']}"
            for t in conversation_history
        ]
        history = "Conversation:\n" + "\n".join(turns) + "\n\n"

    prompt = f"""You are a concise conversational assistant.

Use ONLY the information found in the contexts.  
If the answer is not in the contexts, say exactly: **"The contexts do not contain the answer."**

Rules:
- Do NOT explain your reasoning.
- Do NOT mention the instructions.
- Do NOT invent information.
- Answer in a single short, natural paragraph (1‚Äì2 sentences).
- No meta-commentary (e.g., "I will now...").

{history}Contexts:
{context_text}

User: {question}
Assistant:"""

    return prompt

def generate_answer(prompt, generator):
    """Generate answer using the language model"""
    if generator is None:
        return "[Generation model not available]"
    
    try:
        outputs = generator(
            prompt,
            return_full_text=False,
            pad_token_id=generator.tokenizer.eos_token_id
        )
        
        if outputs and len(outputs) > 0:
            return outputs[0]['generated_text'].strip()
        else:
            return "[No response generated]"
    except Exception as e:
        print(f"Error in generation: {e}")
        return f"[Generation error: {e}]"
    
def trim_to_token_limit(text, tokenizer, max_length=4000):
    tokens = tokenizer.encode(text)
    if len(tokens) <= max_length:
        return text
    # Keep last max_length tokens so the question stays
    trimmed = tokenizer.decode(tokens[-max_length:])
    return trimmed


In [6]:
def run_task_b_rag(tasks, generator, output_path, do_subset=False):
    """Run full Task B pipeline: reference generation"""
    
    results = []
    
    if (do_subset):
        tasks = tasks[:100]
    
    for task in tqdm(tasks, desc="Processing RAG tasks"):
        # Extract current question
        current_question = extract_conversation_text(task)
        
        if not current_question:
            print(f"Warning: No question found for task {task.get('task_id')}")
            continue

        provided_contexts = task.get('contexts', [])

        formatted_contexts = []
        for i, ctx in enumerate(provided_contexts):
            if isinstance(ctx, dict):
                formatted_contexts.append({
                    'document_id': ctx.get('document_id', f'provided_{i}'),
                    'text': ctx.get('text', ''),
                    'score': 1.0  # All provided contexts get max score
                })
            else:
                # If context is just text
                formatted_contexts.append({
                    'document_id': f'provided_{i}',
                    'text': str(ctx),
                    'score': 1.0
                })
        
        # Get conversation history (all but last user message)
        conversation_history = []
        input_data = task.get('input', [])
        if isinstance(input_data, list):
            # Include all but the last user message (current question)
            found_last_user = False
            for msg in reversed(input_data):
                if msg.get('speaker') == 'user' and not found_last_user:
                    found_last_user = True
                    continue
                conversation_history.insert(0, msg)
        
        # Generate answer
        prompt = create_generation_prompt(current_question, formatted_contexts, conversation_history)
        prompt = trim_to_token_limit(prompt, generator.tokenizer, max_length=4000)
        generated_answer = generate_answer(prompt, generator)
        
        # Prepare result in evaluation format
        result_task = task.copy()
        
        # Add prediction in the format expected by evaluation script
        result_task['predictions'] = [{
            'text': generated_answer
        }]
        
        results.append(result_task)
    
    # Save results
    output_dir = Path(output_path).parent
    output_dir.mkdir(parents=True, exist_ok=True)
    
    with open(output_path, 'w', encoding='utf-8') as f:
        for result in results:
            f.write(json.dumps(result) + '\n')
    
    print(f"Saved {len(results)} results to {output_path}")
    return results

# Run Task B
output_file = BASE / 'rayaan' / 'outputs' / 'task_b_rag_predictions.jsonl'
task_b_results = run_task_b_rag(rag_tasks, generator, output_file, do_subset=True)

Processing RAG tasks:   0%|          | 0/100 [00:00<?, ?it/s]

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


Saved 100 results to c:\Users\rayaa\OneDrive\Documents\VSCode\CSCI5832\Semeval\rayaan\outputs\task_b_rag_predictions.jsonl


In [None]:
output_file = BASE / 'rayaan' / 'outputs' / 'task_b_rag_predictions.jsonl'

def run_generation_evaluation_script_advanced(input_file, output_file, provider="hf", judge_model=None, openai_key=None, azure_host=None):
    """Run the generation evaluation pipeline with comprehensive error handling"""
    
    print("\n" + "="*60)
    print("RUNNING GENERATION EVALUATION PIPELINE")
    print("="*60)
    
    # Validate inputs
    input_path = Path(input_file)
    if not input_path.exists():
        print(f"‚ùå ERROR: Input file not found at {input_path}")
        return False
    
    # Count input tasks
    with open(input_path, 'r', encoding='utf-8') as f:
        input_count = sum(1 for line in f if line.strip())
    print(f"üìÅ Input file: {input_path} ({input_count} tasks)")
    
    # Ensure output directory exists
    output_path = Path(output_file)
    output_dir = output_path.parent
    output_dir.mkdir(parents=True, exist_ok=True)
    print(f"üìÅ Output directory: {output_dir}")
    
    # Locate evaluation script
    eval_script_path = BASE / 'scripts' / 'evaluation' / 'run_generation_eval.py'
    if not eval_script_path.exists():
        print(f"‚ùå ERROR: Evaluation script not found at {eval_script_path}")
        return False
    
    # Locate config file
    config_path = BASE / 'scripts' / 'evaluation' / 'config.yaml'
    if not config_path.exists():
        print(f"‚ùå ERROR: Config file not found at {config_path}")
        return False
    
    print(f"üîß Evaluation script: {eval_script_path}")
    
    # Build command
    cmd = [
        'python', str(eval_script_path),
        '--input', str(input_path),
        '--output', str(output_path),
        '--algorithmic_evaluators', str(config_path),
        '--provider', provider
    ]
    
    # Add provider-specific arguments
    if provider == "hf" and judge_model:
        cmd.extend(['--judge_model', judge_model])
        print(f"ü§ñ Using HF judge model: {judge_model}")
    elif provider == "openai":
        if openai_key:
            cmd.extend(['--openai_key', openai_key])
            print("üîë OpenAI key provided")
        if azure_host:
            cmd.extend(['--azure_host', azure_host])
            print(f"üåê Azure host: {azure_host}")
    
    print(f"üöÄ Command: {' '.join(cmd)}")
    print("\n‚è≥ Starting evaluation... This may take a while...")
    
    try:
        # Run the evaluation script with timeout
        import subprocess
        import time
        
        start_time = time.time()
        result = subprocess.run(cmd, capture_output=True, text=True, timeout=17200)
        end_time = time.time()
        
        execution_time = end_time - start_time
        print(f"‚è±Ô∏è  Execution time: {execution_time:.2f} seconds ({execution_time/60:.2f} minutes)")
        
        # Process results
        print("\n" + "="*40)
        print("EVALUATION RESULTS")
        print("="*40)
        
        if result.stdout:
            print("üìã Script output:")
            print(result.stdout)
        
        if result.returncode == 0:
            print("‚úÖ Evaluation completed successfully!")
            
            # Verify output file
            if output_path.exists():
                with open(output_path, 'r', encoding='utf-8') as f:
                    output_count = sum(1 for line in f if line.strip())
                print(f"üìä Results: {output_count}/{input_count} tasks evaluated")
                print(f"üíæ Results saved to: {output_path}")
                return True
            else:
                print("‚ùå ERROR: Output file was not created")
                return False
        else:
            print(f"‚ùå Evaluation failed with return code: {result.returncode}")
            if result.stderr:
                print("üî¥ Error details:")
                print(result.stderr)
            return False
            
    except subprocess.TimeoutExpired:
        print("‚è∞ ERROR: Evaluation script timed out after 2 hours")
        return False
    except Exception as e:
        print(f"üí• Unexpected error: {e}")
        return False

# Run evaluation on Task B results
eval_output_file = BASE / 'rayaan' / 'outputs' / 'task_b_evaluation_results.jsonl'

# Configuration - Choose your setup
provider = "hf"  # Options: "hf" or "openai"
judge_model = GENERATION_MODEL  # Required for HF provider

# For OpenAI provider, uncomment and fill these:
# provider = "openai"
# openai_key = "your_openai_key_here"
# azure_host = "your_azure_endpoint_here"

success = run_generation_evaluation_script_advanced(
    input_file=output_file,
    output_file=eval_output_file,
    provider=provider,
    judge_model=judge_model
    # Add if using OpenAI:
    # openai_key=openai_key,
    # azure_host=azure_host
)

if success:
    print("\nüéâ Task B evaluation pipeline completed successfully!")
    print("You can now run the analysis cells to see the results.")
else:
    print("\nüí• Task B evaluation pipeline failed. Please check the errors above.")


RUNNING GENERATION EVALUATION PIPELINE
üìÅ Input file: c:\Users\rayaa\OneDrive\Documents\VSCode\CSCI5832\Semeval\rayaan\outputs\task_b_rag_predictions.jsonl (100 tasks)
üìÅ Output directory: c:\Users\rayaa\OneDrive\Documents\VSCode\CSCI5832\Semeval\rayaan\outputs
üîß Evaluation script: c:\Users\rayaa\OneDrive\Documents\VSCode\CSCI5832\Semeval\scripts\evaluation\run_generation_eval.py
ü§ñ Using HF judge model: Qwen/Qwen2.5-0.5B-Instruct
üöÄ Command: python c:\Users\rayaa\OneDrive\Documents\VSCode\CSCI5832\Semeval\scripts\evaluation\run_generation_eval.py --input c:\Users\rayaa\OneDrive\Documents\VSCode\CSCI5832\Semeval\rayaan\outputs\task_b_rag_predictions.jsonl --output c:\Users\rayaa\OneDrive\Documents\VSCode\CSCI5832\Semeval\rayaan\outputs\task_b_evaluation_results.jsonl --algorithmic_evaluators c:\Users\rayaa\OneDrive\Documents\VSCode\CSCI5832\Semeval\scripts\evaluation\config.yaml --provider hf --judge_model Qwen/Qwen2.5-0.5B-Instruct

‚è≥ Starting evaluation... This may take

In [None]:
def analyze_evaluation_results(results_file):
    """Analyze and display evaluation results"""
    
    results = []
    with open(results_file, 'r', encoding='utf-8') as f:
        for line in f:
            if line.strip():
                results.append(json.loads(line))
    
    print(f"Analyzing {len(results)} evaluation results...")
    
    # Extract metrics
    metrics_data = []
    for result in results:
        task_id = result.get('task_id')
        metrics = result.get('metrics', {})
        
        metric_row = {'task_id': task_id}
        for metric_name, metric_values in metrics.items():
            if isinstance(metric_values, list) and len(metric_values) > 0:
                metric_row[metric_name] = metric_values[0]
            else:
                metric_row[metric_name] = metric_values
        
        metrics_data.append(metric_row)
    
    if metrics_data:
        metrics_df = pd.DataFrame(metrics_data)
        
        print("\n=== Evaluation Metrics Summary ===")
        
        # Display average scores for key metrics
        key_metrics = ['RL_F', 'RB_llm', 'RB_agg', 'RL_F_idk', 'RB_llm_idk', 'RB_agg_idk']
        available_metrics = [m for m in key_metrics if m in metrics_df.columns]
        
        if available_metrics:
            summary = metrics_df[available_metrics].mean()
            print("\nAverage Scores:")
            for metric, score in summary.items():
                print(f"  {metric}: {score:.4f}")
        
        # Display score distributions
        print("\nScore Distributions:")
        for metric in available_metrics:
            if metric in metrics_df.columns:
                print(f"\n{metric}:")
                print(metrics_df[metric].describe())
    
    return metrics_data

# Analyze results
metrics_data = analyze_evaluation_results(eval_output_file)

Analyzing 100 evaluation results...

=== Evaluation Metrics Summary ===

Average Scores:
  RB_agg: 0.2636

Score Distributions:

RB_agg:
count    100.000000
mean       0.263619
std        0.082125
min        0.061627
25%        0.219482
50%        0.264770
75%        0.314496
max        0.515473
Name: RB_agg, dtype: float64


In [8]:
def create_final_report(results_file, output_path):
    """Create a comprehensive final report"""
    
    results = []
    with open(results_file, 'r', encoding='utf-8') as f:
        for line in f:
            if line.strip():
                results.append(json.loads(line))
    
    # Calculate overall metrics
    all_metrics = {}
    for result in results:
        metrics = result.get('metrics', {})
        for metric_name, metric_values in metrics.items():
            if isinstance(metric_values, list) and len(metric_values) > 0:
                if metric_name not in all_metrics:
                    all_metrics[metric_name] = []
                all_metrics[metric_name].append(metric_values[0])
    
    # Create report
    report = {
        "task": "Subtask B - Generation with Reference Passages (RAG)",
        "total_tasks": len(results),
        "evaluation_timestamp": pd.Timestamp.now().isoformat(),
        "metrics_summary": {},
        "model_used": GENERATION_MODEL,
        "retrieval_setting": "Reference RAG (top 5 passages)",
        "warnings": []
    }
    
    for metric_name, values in all_metrics.items():
        # Filter out None values before calculating statistics
        clean_values = [v for v in values if v is not None]
        
        if not clean_values:  # If all values are None
            report["warnings"].append(f"Metric '{metric_name}' has no valid values (all None)")
            report["metrics_summary"][metric_name] = {
                "mean": None,
                "std": None,
                "min": None,
                "max": None,
                "median": None,
                "count": 0,
                "total_count": len(values),
                "valid_count": 0
            }
            continue
        
        # Calculate statistics safely and convert to native Python types
        try:
            stats_dict = {
                "mean": float(np.mean(clean_values)) if np.mean(clean_values) is not None else None,
                "std": float(np.std(clean_values)) if np.std(clean_values) is not None else None,
                "min": float(np.min(clean_values)) if np.min(clean_values) is not None else None,
                "max": float(np.max(clean_values)) if np.max(clean_values) is not None else None,
                "median": float(np.median(clean_values)) if np.median(clean_values) is not None else None,
                "count": len(clean_values),
                "total_count": len(values),
                "valid_count": len(clean_values)
            }
            
            # Handle potential NaN values
            for key in ["mean", "std", "min", "max", "median"]:
                if stats_dict[key] is not None and np.isnan(stats_dict[key]):
                    stats_dict[key] = None
                    report["warnings"].append(f"Metric '{metric_name}' has NaN for {key}")
            
            report["metrics_summary"][metric_name] = stats_dict
            
        except (TypeError, ValueError) as e:
            report["warnings"].append(f"Error calculating stats for '{metric_name}': {str(e)}")
            report["metrics_summary"][metric_name] = {
                "mean": None,
                "std": None,
                "min": None,
                "max": None,
                "median": None,
                "count": len(clean_values),
                "total_count": len(values),
                "valid_count": len(clean_values),
                "error": str(e)
            }
        
        # Optional: add warning if some values were None
        if len(clean_values) < len(values):
            report["warnings"].append(
                f"Metric '{metric_name}' had {len(values) - len(clean_values)} None values (ignored in stats)"
            )
    
    # Convert report to JSON-serializable format
    def convert_to_serializable(obj):
        """Recursively convert numpy types to native Python types"""
        if isinstance(obj, dict):
            return {key: convert_to_serializable(value) for key, value in obj.items()}
        elif isinstance(obj, list):
            return [convert_to_serializable(item) for item in obj]
        elif isinstance(obj, (np.integer, np.int32, np.int64)):
            return int(obj)
        elif isinstance(obj, (np.floating, np.float32, np.float64)):
            return float(obj)
        elif isinstance(obj, np.ndarray):
            return obj.tolist()
        elif isinstance(obj, np.bool_):
            return bool(obj)
        elif obj is pd.NaT or pd.isna(obj):
            return None
        else:
            return obj
    
    # Apply conversion
    serializable_report = convert_to_serializable(report)
    
    # Save report
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(serializable_report, f, indent=2, default=str)
    
    print(f"Final report saved to: {output_path}")
    
    # Print summary
    print("\n=== TASK B FINAL REPORT ===")
    print(f"Total tasks evaluated: {report['total_tasks']}")
    print(f"Generation model: {report['model_used']}")
    print(f"Retrieval setting: {report['retrieval_setting']}")
    
    # Show warnings if any
    if report['warnings']:
        print("\n‚ö†Ô∏è  Warnings:")
        for warning in report['warnings'][:10]:  # Show only first 10 warnings
            print(f"  - {warning}")
        if len(report['warnings']) > 10:
            print(f"  ... and {len(report['warnings']) - 10} more warnings")
    
    print("\nKey Metrics:")
    
    key_metrics = ['RL_F', 'RB_llm', 'RB_agg', 'RL_F_idk', 'RB_llm_idk', 'RB_agg_idk']
    for metric in key_metrics:
        if metric in report['metrics_summary']:
            stats = report['metrics_summary'][metric]
            if stats.get('mean') is not None and not np.isnan(stats['mean']):
                std_val = stats.get('std', 0)
                if std_val is not None and not np.isnan(std_val):
                    print(f"  {metric}: {stats['mean']:.4f} (¬±{std_val:.4f}) [valid: {stats.get('valid_count', 0)}/{stats.get('total_count', 0)}]")
                else:
                    print(f"  {metric}: {stats['mean']:.4f} (¬±N/A) [valid: {stats.get('valid_count', 0)}/{stats.get('total_count', 0)}]")
            else:
                print(f"  {metric}: No valid values [valid: {stats.get('valid_count', 0)}/{stats.get('total_count', 0)}]")
    
    return report

# Create final report
report_file = BASE / 'rayaan' / 'outputs' / 'task_b_final_report.json'
final_report = create_final_report(eval_output_file, report_file)

Final report saved to: c:\Users\rayaa\OneDrive\Documents\VSCode\CSCI5832\Semeval\rayaan\outputs\task_b_final_report.json

=== TASK B FINAL REPORT ===
Total tasks evaluated: 100
Generation model: Qwen/Qwen2.5-0.5B-Instruct
Retrieval setting: Reference RAG (top 5 passages)

  - Error calculating stats for 'idk_eval': ufunc 'add' did not contain a loop with signature matching types (dtype('<U32'), dtype('<U32')) -> None

Key Metrics:
  RB_agg: 0.2636 (¬±0.0817) [valid: 100/100]
