In [1]:
%pip install --user ragas langchain-openai pandas datasets




In [2]:
# Imports and Model Configuration (Ragas)
import json
import pandas as pd
import os
import numpy as np
from datasets import Dataset
import sys
import traceback

# Ragas specific imports
from ragas import evaluate
from ragas.metrics import (
    # Independent of Ground Truth (Reference-Free)
    context_precision,      
    answer_relevancy,       
    faithfulness,           
    
    # Dependent on Ground Truth (Reference-Required)
    context_recall,         
    context_entity_recall,  
    answer_correctness,     
    answer_similarity       
)

# Ragas 1.0+ uses Langchain models
from langchain_openai import AzureChatOpenAI, AzureOpenAIEmbeddings 

from ragas.exceptions import RagasException

In [None]:
# --- AZURE CONFIGURATION ---------------------------------------------------
# Credentials are taken from the environment first so that no secret has to be
# typed into (and accidentally saved or committed with) the notebook:
#   AZURE_OPENAI_API_KEY   - the Azure OpenAI key
#   AZURE_OPENAI_ENDPOINT  - e.g. https://<resource-name>.openai.azure.com/
# The placeholder strings below are only used when a variable is not set.
AZURE_API_KEY = os.environ.get("AZURE_OPENAI_API_KEY", "<YOUR_AZURE_OPENAI_API_KEY>")
AZURE_ENDPOINT = os.environ.get(
    "AZURE_OPENAI_ENDPOINT",
    "https://<YOUR_AZURE_OPENAI_ENDPOINT>.openai.azure.com/",
)
AZURE_CHAT_DEPLOYMENT = "gpt-4o-08-06" # Model used for evaluation logic (e.g., faithfulness, relevancy)
AZURE_EMBEDDINGS_DEPLOYMENT = "text_embedding_ada_002" # Model used for semantic metrics (e.g., similarity)
AZURE_API_VERSION = "2024-12-01-preview"
# -----------------------------------------------------------------

# Fail loudly (but do not stop the notebook) while a placeholder is still in use;
# every later API call would otherwise fail with a far less obvious auth error.
if AZURE_API_KEY.startswith("<") or "<" in AZURE_ENDPOINT:
    print("⚠️ Azure OpenAI credentials are still placeholders. "
          "Set AZURE_OPENAI_API_KEY and AZURE_OPENAI_ENDPOINT before running the evaluation.")

# Set environment variable for Langchain/Ragas to pick up the key
os.environ["AZURE_OPENAI_API_KEY"] = AZURE_API_KEY

In [4]:
# Chat model handed to Ragas as the evaluation LLM (temperature fixed at 0.0).
# Any construction error is reported and the cell is aborted via sys.exit(1).
try:
    ragas_llm = AzureChatOpenAI(
        temperature=0.0,
        api_version=AZURE_API_VERSION,
        deployment_name=AZURE_CHAT_DEPLOYMENT,
        azure_endpoint=AZURE_ENDPOINT,
        api_key=AZURE_API_KEY,
    )
except Exception as e:
    print(f"❌ Error initializing Ragas AzureChatOpenAI LLM: {e}")
    sys.exit(1)
else:
    print("✅ Ragas Chat LLM initialized successfully!")

# Embeddings model handed to Ragas; the deployment name is passed both as the
# deployment and as the model identifier.
try:
    ragas_embeddings_llm = AzureOpenAIEmbeddings(
        model=AZURE_EMBEDDINGS_DEPLOYMENT,
        openai_api_version=AZURE_API_VERSION,
        deployment=AZURE_EMBEDDINGS_DEPLOYMENT,
        azure_endpoint=AZURE_ENDPOINT,
        api_key=AZURE_API_KEY,
    )
except Exception as e:
    print(f"❌ Error initializing Ragas AzureOpenAIEmbeddings LLM: {e}")
    sys.exit(1)
else:
    print("✅ Ragas Embeddings LLM initialized successfully!")

✅ Ragas Chat LLM initialized successfully!
✅ Ragas Embeddings LLM initialized successfully!


In [5]:
# Metrics to run (7 Core Functional Metrics).
# The list order is the order in which metric names are printed below and in the
# per-sample score listings produced later in the notebook.
REQUESTED_METRICS = [
    # Metrics grouped by the original author as reference-free (independent of ground truth).
    faithfulness,         # Measures how factually accurate the generated answer is based on the context.
    answer_relevancy,     # Measures how relevant the generated answer is to the question.
    # NOTE(review): Ragas documents context_precision as being judged against the
    # reference answer, so it probably belongs in the reference-required group — confirm
    # against the installed Ragas version.
    context_precision,    # Measures how relevant the retrieved context chunks are to answering the question.

    # Reference-Required Metrics (Dependent on Ground Truth)
    context_recall,       # Measures the ability of the retrieval system to surface all necessary information.
    context_entity_recall,# Measures recall based on entities present in the ground truth.
    answer_correctness,   # Measures how close the generated answer is to the ground truth.
    answer_similarity     # Measures the semantic similarity between the generated answer and ground truth.
]

# Echo the metric names so the run log records exactly which metrics were requested.
print(f"Running evaluation on: {[m.name for m in REQUESTED_METRICS]}")

Running evaluation on: ['faithfulness', 'answer_relevancy', 'context_precision', 'context_recall', 'context_entity_recall', 'answer_correctness', 'answer_similarity']


In [6]:
def _load_eval_samples(json_file_path):
    """Parse the evaluation file as a single JSON list or as JSONL.

    Returns the list of parsed items, or None when the file has no content.
    Raises json.JSONDecodeError when the content cannot be parsed.
    """
    # 'utf-8-sig' also reads plain UTF-8, but strips a leading BOM that would
    # otherwise hide the '[' used to detect the JSON-list format.
    with open(json_file_path, 'r', encoding='utf-8-sig') as f:
        full_content = f.read().strip()

    if not full_content:
        print("File is empty.")
        return None

    if full_content.startswith('['):
        data_list = json.loads(full_content)
        print(f"Loaded {len(data_list)} samples from a single JSON list format.")
        return data_list

    data_list = []
    for line in full_content.splitlines():
        line = line.strip()
        if line:
            data_list.append(json.loads(line))
    print(f"Loaded {len(data_list)} samples from JSONL format.")
    return data_list


def _build_ragas_sample(raw_data):
    """Map one raw record (question / answer / context / truth) to the dict given to Ragas."""
    context_data = raw_data.get('context')
    if isinstance(context_data, list):
        contexts = context_data
    elif context_data is None or (isinstance(context_data, float) and context_data != context_data):
        # Missing context (None or a NaN float) becomes an empty context list.
        contexts = []
    else:
        # A single value is wrapped so that 'contexts' is always a list.
        contexts = [str(context_data)]

    # 'truth' maps to 'reference' (str) and 'ground_truths' (list[str])
    truth_str = raw_data.get('truth', '')
    return {
        'question': raw_data.get('question', ''),
        'answer': raw_data.get('answer', ''),
        'contexts': contexts,
        'reference': truth_str,
        'ground_truths': [truth_str],
    }


def _print_sample_scores(line_num, sample_df):
    """Print the requested metric scores found in the first row of a per-sample result frame."""
    print(f"Sample {line_num} Scores:")
    if sample_df.empty:
        return
    single_row_scores = sample_df.iloc[0].to_dict()
    for metric in [m.name for m in REQUESTED_METRICS]:
        score = single_row_scores.get(metric)
        if score is None:
            continue
        label = metric.replace('_', ' ').title()
        if isinstance(score, (int, float, np.floating)):
            print(f"  {label:<25}: {score:.3f}")
        else:
            # Unexpected score type: show it verbatim instead of failing on the format spec.
            print(f"  {label:<25}: {score}")


def evaluate_dataset_iteratively(json_file_path):
    """
    Load data (as a single JSON list or JSONL) and evaluate all samples
    in the JSON file one by one (iteratively).

    Each sample is evaluated in its own `evaluate` call using the module-level
    REQUESTED_METRICS, ragas_llm and ragas_embeddings_llm, so a failure on one
    sample is reported and skipped without affecting the others.

    Args:
        json_file_path: Path to a file holding either one JSON list of records or
            one JSON record per line. Records are dicts read through the keys
            'question', 'answer', 'context' and 'truth'.

    Returns:
        A DataFrame with one row per successfully evaluated sample, or None when
        the file is empty, cannot be parsed, or no sample could be evaluated.

    Raises:
        FileNotFoundError: if json_file_path does not exist.
    """
    print(f"Loading data from: {json_file_path}")

    if not os.path.exists(json_file_path):
        raise FileNotFoundError(f"File not found: {json_file_path}")

    # Load data robustly to handle single JSON list or JSONL
    try:
        data_list = _load_eval_samples(json_file_path)
    except json.JSONDecodeError as e:
        print(f"Fatal Error: Could not parse JSON file. Check for formatting errors. {e}")
        return None
    except Exception as e:
        print(f"An unexpected error occurred during file loading: {e}")
        return None

    if data_list is None:
        return None
    if not data_list:
        print("No valid data loaded for evaluation.")
        return None

    all_results_list = []
    for line_num, raw_data in enumerate(data_list, 1):

        if not isinstance(raw_data, dict):
            print(f"Warning: Skipping item {line_num}. Expected dict, got {type(raw_data)}.")
            continue

        print("\n" + "="*50)
        print(f"Processing Sample {line_num} / {len(data_list)}")
        print("="*50)

        try:
            # Transform the single sample into Ragas format and wrap it in a one-row Dataset
            sample_dataset = Dataset.from_list([_build_ragas_sample(raw_data)])

            # Run Ragas evaluation on the single sample
            result = evaluate(
                sample_dataset,
                metrics=REQUESTED_METRICS,
                llm=ragas_llm,
                embeddings=ragas_embeddings_llm,
            )

            # Convert once and store immediately, so that a problem while
            # displaying the scores can never discard a finished evaluation.
            sample_df = result.to_pandas()
            all_results_list.append(sample_df)

        except Exception as e:
            # The question may be missing, None or not a string; coerce it so that
            # reporting one failure cannot raise and abort the remaining samples.
            question_preview = str(raw_data.get('question', 'N/A'))
            # Print a warning but continue processing other samples
            print(f"Warning: Failed to evaluate sample at index {line_num}: {e}")
            print(f"  Question: {question_preview[:100]}...")
            continue

        _print_sample_scores(line_num, sample_df)

    if not all_results_list:
        print("No data processed, skipping evaluation.")
        return None

    # Combine all 1-row DataFrames into a single DataFrame
    print("\n" + "="*50)
    print("Combining all results...")
    print("="*50)
    final_results_df = pd.concat(all_results_list, ignore_index=True)

    return final_results_df

In [7]:
def _json_safe(value):
    """Recursively convert a cell value into something json.dump can write as strict JSON."""
    if value is pd.NA or value is pd.NaT:
        return None
    if isinstance(value, dict):
        return {key: _json_safe(item) for key, item in value.items()}
    if isinstance(value, (list, tuple)):
        return [_json_safe(item) for item in value]
    if isinstance(value, np.ndarray):
        return [_json_safe(item) for item in value.tolist()]
    if isinstance(value, np.generic):
        value = value.item()
    # NaN / infinity have no JSON representation; json.dump would emit the
    # non-standard tokens NaN / Infinity, so they are stored as null instead.
    if isinstance(value, float) and (value != value or value in (float("inf"), float("-inf"))):
        return None
    return value


def save_results(results_df, output_file="evaluation_results.json"):
    """Save the detailed results DataFrame to a JSON file (one object per row).

    Args:
        results_df: DataFrame of per-sample results; None makes the call a no-op.
        output_file: Destination path of the JSON file.

    Cell values are normalised first: numpy arrays and scalars become plain
    lists / numbers and missing scores (NaN) become null. Any other object that
    json cannot encode is written as its str() form (best effort).
    """
    if results_df is None:
        return

    records = [_json_safe(record) for record in results_df.to_dict('records')]
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(records, f, indent=2, ensure_ascii=False, default=str)
    print(f"Detailed results saved to {output_file}")

def export_to_csv(results_df, csv_file="evaluation_results.csv"):
    """Export the results DataFrame to CSV for easier analysis.

    Args:
        results_df: DataFrame of per-sample results; None makes the call a no-op.
        csv_file: Destination path of the CSV file.

    Any column holding list-like cells (list, tuple or numpy array) is flattened
    to a single '|'-separated string, whatever the column is called, so the
    export does not depend on specific column names. The input frame is not
    modified.
    """
    if results_df is None:
        return

    def _is_listlike(cell):
        return isinstance(cell, (list, tuple, np.ndarray))

    def _flatten(cell):
        # str() each item so non-string entries cannot break the join.
        return '|'.join(str(item) for item in cell) if _is_listlike(cell) else cell

    # Convert list columns to strings for CSV compatibility
    csv_df = results_df.copy()
    for col in csv_df.columns:
        column = csv_df[col]
        # List-like cells can only live in object-dtype columns.
        if column.dtype == object and column.map(_is_listlike).any():
            csv_df[col] = column.map(_flatten)

    csv_df.to_csv(csv_file, index=False, encoding='utf-8')
    print(f"Results exported to {csv_file}")

def create_summary_statistics(results_df, summary_file="evaluation_summary.json", metric_names=None):
    """Create per-metric summary statistics from the results DataFrame and save them as JSON.

    Args:
        results_df: DataFrame of per-sample results; None returns an empty dict
            and writes nothing.
        summary_file: Destination path of the JSON summary.
        metric_names: Column names to summarise. Defaults to the names of the
            module-level REQUESTED_METRICS.

    Returns:
        Dict mapping each metric present in results_df to its statistics:
        'mean', 'min', 'max', 'count' and 'std_dev' (population standard
        deviation, 0 for a single score). A metric with no usable score is
        reported as {'mean': 0, 'count': 0}; check 'count' before trusting
        the mean.
    """
    if results_df is None:
        return {}

    if metric_names is None:
        metric_names = [m.name for m in REQUESTED_METRICS]

    enhanced_summary = {}
    for metric_name in metric_names:
        if metric_name not in results_df.columns:
            continue

        # Non-numeric cells are coerced to NaN and dropped together with the NaN
        # scores that occur when a metric fails on a sample.
        scores = pd.to_numeric(results_df[metric_name], errors='coerce').dropna().tolist()
        if not scores:
            enhanced_summary[metric_name] = {'mean': 0, 'count': 0}
            continue

        # float(...) keeps numpy scalar types out of the returned dict.
        enhanced_summary[metric_name] = {
            'mean': round(float(np.mean(scores)), 3),
            'min': round(float(min(scores)), 3),
            'max': round(float(max(scores)), 3),
            'count': len(scores),
            'std_dev': round(float(np.std(scores)), 3) if len(scores) > 1 else 0
        }

    with open(summary_file, 'w', encoding='utf-8') as f:
        json.dump(enhanced_summary, f, indent=2, ensure_ascii=False)

    print(f"Summary statistics saved to {summary_file}")
    return enhanced_summary

def print_quick_summary(summary_dict):
    """Write the mean score of every metric in the summary dict to the console.

    Does nothing for an empty or missing summary. Each entry must provide a
    'mean' value; metric names are shown title-cased with underscores replaced
    by spaces.
    """
    if summary_dict:
        rule = "=" * 50

        # Banner
        print("\n" + rule)
        print("RAGAS QUICK SUMMARY (Mean Scores)")
        print(rule)

        # One left-aligned line per metric
        for metric in summary_dict:
            stats = summary_dict[metric]
            print(f"{metric.replace('_', ' ').title():<25}: {stats['mean']:.3f}")

        print(rule)

In [9]:
# Smoke-test run: evaluate the small sample file, export the results and show a preview.
results_df = None
summary_stats = {}

json_file_path = "2_sample_test_qa_eval.jsonl"

try:
    print(f"Starting iterative evaluation for: {json_file_path}")
    
    results_df = evaluate_dataset_iteratively(json_file_path)
    
    if results_df is not None and not results_df.empty:
        # Save all results
        save_results(results_df)
        export_to_csv(results_df)
        
        # Create and print summary
        summary_stats = create_summary_statistics(results_df)
        print_quick_summary(summary_stats)
        
        print("\n ✅ Evaluation completed successfully!")
        print(f"Total successful samples evaluated: {len(results_df)}")

        print("\nFirst 3 results preview:")
        print("="*80)
        
        # Display the start of the question in the preview
        for i in range(min(3, len(results_df))):
            result = results_df.iloc[i]
            print(f"\nSample {i+1}:")
            
            # The results frame may expose the question as 'question' or as
            # 'user_input' (a lookup on 'question' alone printed 'N/A' here), so
            # try both. str() guards the slice against non-string values.
            question_preview = next(
                (result[col] for col in ("question", "user_input") if col in result.index),
                'N/A',
            )
            print(f"Question (Start): {str(question_preview)[:100]}...")
            
            for metric in REQUESTED_METRICS:
                score = result.get(metric.name, 'N/A')
                if isinstance(score, float):
                    score = f"{score:.3f}"
                print(f"{metric.name.replace('_', ' ').title()}: {score}")
            
            print("-" * 40)

    else:
        print("Evaluation did not produce any successful results.")

except FileNotFoundError as e:
    print(f"❌ File not found: {e}")
    print(f"Please check if the file exists at: {json_file_path}")

except RagasException as e:
    print(f"\n❌ [RAGAS ERROR] {e}")
    print("Evaluation halted. Check API keys, endpoints, and rate limits.")
    
except Exception as e:
    print(f"❌ Unexpected error in main execution: {e}")
    print(f"Full traceback:\n{traceback.format_exc()}")

Starting iterative evaluation for: 2_sample_test_qa_eval.jsonl
Loading data from: 2_sample_test_qa_eval.jsonl
Loaded 3 samples from JSONL format.

Processing Sample 1 / 3


Evaluating:   0%|          | 0/7 [00:00<?, ?it/s]

LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.


Sample 1 Scores:
  Faithfulness             : 0.917
  Answer Relevancy         : 0.975
  Context Precision        : 1.000
  Context Recall           : 1.000
  Context Entity Recall    : 0.500
  Answer Correctness       : 0.329
  Answer Similarity        : 0.964

Processing Sample 2 / 3


Evaluating:   0%|          | 0/7 [00:00<?, ?it/s]

LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.


Sample 2 Scores:
  Faithfulness             : 1.000
  Answer Relevancy         : 0.974
  Context Precision        : 1.000
  Context Recall           : 1.000
  Context Entity Recall    : 1.000
  Answer Correctness       : 0.370
  Answer Similarity        : 0.959

Processing Sample 3 / 3


Evaluating:   0%|          | 0/7 [00:00<?, ?it/s]

LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.


Sample 3 Scores:
  Faithfulness             : 1.000
  Answer Relevancy         : 0.955
  Context Precision        : 1.000
  Context Recall           : 1.000
  Context Entity Recall    : 1.000
  Answer Correctness       : 0.579
  Answer Similarity        : 0.983

Combining all results...
Detailed results saved to evaluation_results.json
Results exported to evaluation_results.csv
Summary statistics saved to evaluation_summary.json

RAGAS QUICK SUMMARY (Mean Scores)
Faithfulness             : 0.972
Answer Relevancy         : 0.968
Context Precision        : 1.000
Context Recall           : 1.000
Context Entity Recall    : 0.833
Answer Correctness       : 0.426
Answer Similarity        : 0.969

 ✅ Evaluation completed successfully!
Total successful samples evaluated: 3

First 3 results preview:

Sample 1:
Question (Start): N/A...
Faithfulness: 0.917
Answer Relevancy: 0.975
Context Precision: 1.000
Context Recall: 1.000
Context Entity Recall: 0.500
Answer Correctness: 0.329
Answer Similarit

In [16]:
# --- This is the main execution block ---
# (Note: We remove the 'if __name__ == "__main__":' check for notebooks)
# Full run: evaluate the complete data file, export the results and show a preview.

results_df = None
summary_stats = {}

# ⚠️ Set the path to your data file
# This path was from your original script. Update it if needed.
json_file_path = "200_sample_test_qa_eval.jsonl"

try:
    print(f"Starting iterative evaluation for: {json_file_path}")
    
    results_df = evaluate_dataset_iteratively(json_file_path)
    
    if results_df is not None and not results_df.empty:
        # Save all results
        save_results(results_df)
        export_to_csv(results_df)
        
        # Create and print summary
        summary_stats = create_summary_statistics(results_df)
        print_quick_summary(summary_stats)
        
        print("\n ✅ Evaluation completed successfully!")
        print(f"Total successful samples evaluated: {len(results_df)}")

        print("\nFirst 3 results preview:")
        print("="*80)
        
        # Display the start of the question in the preview
        for i in range(min(3, len(results_df))):
            result = results_df.iloc[i]
            print(f"\nSample {i+1}:")
            
            # The results frame may expose the question as 'question' or as
            # 'user_input', so try both before falling back to 'N/A'. str()
            # guards the slice against non-string values.
            question_preview = next(
                (result[col] for col in ("question", "user_input") if col in result.index),
                'N/A',
            )
            print(f"Question (Start): {str(question_preview)[:100]}...")
            
            for metric in REQUESTED_METRICS:
                score = result.get(metric.name, 'N/A')
                if isinstance(score, float):
                    score = f"{score:.3f}"
                print(f"{metric.name.replace('_', ' ').title()}: {score}")
            
            print("-" * 40)

    else:
        print("Evaluation did not produce any successful results.")

except FileNotFoundError as e:
    print(f"❌ File not found: {e}")
    print(f"Please check if the file exists at: {json_file_path}")

except RagasException as e:
    print(f"\n❌ [RAGAS ERROR] {e}")
    print("Evaluation halted. Check API keys, endpoints, and rate limits.")
    
except Exception as e:
    print(f"❌ Unexpected error in main execution: {e}")
    print(f"Full traceback:\n{traceback.format_exc()}")

Starting iterative evaluation for: 200_sample_test_qa_eval.jsonl
Loading data from: 200_sample_test_qa_eval.jsonl
Loaded 200 samples from JSONL format.

Processing Sample 1 / 200


Evaluating:   0%|          | 0/7 [00:00<?, ?it/s]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating: 100%|██████████| 7/7 [00:45<00:00,  6.57s/it]


Sample 1 Scores:
  Faithfulness             : 0.923
  Answer Relevancy         : 0.975
  Context Precision        : 1.000
  Context Recall           : 1.000
  Context Entity Recall    : 0.333
  Answer Correctness       : 0.429
  Answer Similarity        : 0.964

Processing Sample 2 / 200


Evaluating:  29%|██▊       | 2/7 [00:08<00:18,  3.74s/it]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating: 100%|██████████| 7/7 [00:48<00:00,  6.91s/it]


Sample 2 Scores:
  Faithfulness             : 1.000
  Answer Relevancy         : 0.983
  Context Precision        : 1.000
  Context Recall           : 1.000
  Context Entity Recall    : 0.667
  Answer Correctness       : 0.311
  Answer Similarity        : 0.959

Processing Sample 3 / 200


Evaluating:  14%|█▍        | 1/7 [00:03<00:18,  3.07s/it]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating: 100%|██████████| 7/7 [00:42<00:00,  6.14s/it]


Sample 3 Scores:
  Faithfulness             : 1.000
  Answer Relevancy         : 0.955
  Context Precision        : 1.000
  Context Recall           : 1.000
  Context Entity Recall    : 1.000
  Answer Correctness       : 0.579
  Answer Similarity        : 0.983

Processing Sample 4 / 200


Evaluating:   0%|          | 0/7 [00:00<?, ?it/s]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating: 100%|██████████| 7/7 [00:48<00:00,  6.90s/it]


Sample 4 Scores:
  Faithfulness             : 0.727
  Answer Relevancy         : 0.895
  Context Precision        : 0.000
  Context Recall           : 0.000
  Context Entity Recall    : 0.500
  Answer Correctness       : 0.227
  Answer Similarity        : 0.907

Processing Sample 5 / 200


Evaluating:   0%|          | 0/7 [00:00<?, ?it/s]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating: 100%|██████████| 7/7 [00:46<00:00,  6.60s/it]


Sample 5 Scores:
  Faithfulness             : 1.000
  Answer Relevancy         : 0.924
  Context Precision        : 1.000
  Context Recall           : 0.500
  Context Entity Recall    : 0.500
  Answer Correctness       : 0.558
  Answer Similarity        : 0.969

Processing Sample 6 / 200


Evaluating:   0%|          | 0/7 [00:00<?, ?it/s]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating: 100%|██████████| 7/7 [00:53<00:00,  7.68s/it]


Sample 6 Scores:
  Faithfulness             : 1.000
  Answer Relevancy         : 0.965
  Context Precision        : 1.000
  Context Recall           : 0.500
  Context Entity Recall    : 0.250
  Answer Correctness       : 0.312
  Answer Similarity        : 0.961

Processing Sample 7 / 200


Evaluating:  14%|█▍        | 1/7 [00:02<00:17,  2.92s/it]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating: 100%|██████████| 7/7 [00:50<00:00,  7.16s/it]


Sample 7 Scores:
  Faithfulness             : 1.000
  Answer Relevancy         : 0.929
  Context Precision        : 1.000
  Context Recall           : 0.500
  Context Entity Recall    : 1.000
  Answer Correctness       : 0.350
  Answer Similarity        : 0.954

Processing Sample 8 / 200


Evaluating:   0%|          | 0/7 [00:00<?, ?it/s]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating: 100%|██████████| 7/7 [00:46<00:00,  6.65s/it]


Sample 8 Scores:
  Faithfulness             : 1.000
  Answer Relevancy         : 0.899
  Context Precision        : 1.000
  Context Recall           : 0.333
  Context Entity Recall    : 0.000
  Answer Correctness       : 0.319
  Answer Similarity        : 0.901

Processing Sample 9 / 200


Evaluating:  29%|██▊       | 2/7 [00:08<00:23,  4.69s/it]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating: 100%|██████████| 7/7 [00:45<00:00,  6.46s/it]


Sample 9 Scores:
  Faithfulness             : 0.941
  Answer Relevancy         : 1.000
  Context Precision        : 1.000
  Context Recall           : 1.000
  Context Entity Recall    : 0.500
  Answer Correctness       : 0.489
  Answer Similarity        : 0.957

Processing Sample 10 / 200


Evaluating:  14%|█▍        | 1/7 [00:03<00:20,  3.44s/it]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating: 100%|██████████| 7/7 [00:49<00:00,  7.11s/it]


Sample 10 Scores:
  Faithfulness             : 0.950
  Answer Relevancy         : 0.962
  Context Precision        : 1.000
  Context Recall           : 1.000
  Context Entity Recall    : 0.500
  Answer Correctness       : 0.435
  Answer Similarity        : 0.957

Processing Sample 11 / 200


Evaluating:   0%|          | 0/7 [00:00<?, ?it/s]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating: 100%|██████████| 7/7 [00:49<00:00,  7.05s/it]


Sample 11 Scores:
  Faithfulness             : 1.000
  Answer Relevancy         : 0.984
  Context Precision        : 1.000
  Context Recall           : 0.000
  Context Entity Recall    : 1.000
  Answer Correctness       : 0.234
  Answer Similarity        : 0.936

Processing Sample 12 / 200


Evaluating:   0%|          | 0/7 [00:00<?, ?it/s]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating: 100%|██████████| 7/7 [00:44<00:00,  6.38s/it]


Sample 12 Scores:
  Faithfulness             : 0.909
  Answer Relevancy         : 0.972
  Context Precision        : 1.000
  Context Recall           : 1.000
  Context Entity Recall    : 0.200
  Answer Correctness       : 0.612
  Answer Similarity        : 0.948

Processing Sample 13 / 200


Evaluating:  29%|██▊       | 2/7 [00:08<00:21,  4.32s/it]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating: 100%|██████████| 7/7 [00:43<00:00,  6.18s/it]


Sample 13 Scores:
  Faithfulness             : 1.000
  Answer Relevancy         : 0.881
  Context Precision        : 1.000
  Context Recall           : 0.500
  Context Entity Recall    : 0.000
  Answer Correctness       : 0.313
  Answer Similarity        : 0.937

Processing Sample 14 / 200


Evaluating:  14%|█▍        | 1/7 [00:02<00:16,  2.83s/it]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating: 100%|██████████| 7/7 [00:39<00:00,  5.66s/it]


Sample 14 Scores:
  Faithfulness             : 0.900
  Answer Relevancy         : 0.989
  Context Precision        : 1.000
  Context Recall           : 0.500
  Context Entity Recall    : 0.000
  Answer Correctness       : 0.371
  Answer Similarity        : 0.937

Processing Sample 15 / 200


Evaluating:  14%|█▍        | 1/7 [00:02<00:16,  2.70s/it]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating: 100%|██████████| 7/7 [00:37<00:00,  5.42s/it]


Sample 15 Scores:
  Faithfulness             : 1.000
  Answer Relevancy         : 0.999
  Context Precision        : 1.000
  Context Recall           : 1.000
  Context Entity Recall    : 0.400
  Answer Correctness       : 0.462
  Answer Similarity        : 0.926

Processing Sample 16 / 200


Evaluating:  14%|█▍        | 1/7 [00:02<00:16,  2.80s/it]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating: 100%|██████████| 7/7 [00:34<00:00,  4.95s/it]


Sample 16 Scores:
  Faithfulness             : 1.000
  Answer Relevancy         : 1.000
  Context Precision        : 1.000
  Context Recall           : 1.000
  Context Entity Recall    : 1.000
  Answer Correctness       : 0.871
  Answer Similarity        : 0.985

Processing Sample 17 / 200


Evaluating:  14%|█▍        | 1/7 [00:02<00:16,  2.68s/it]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating: 100%|██████████| 7/7 [00:33<00:00,  4.82s/it]


Sample 17 Scores:
  Faithfulness             : 1.000
  Answer Relevancy         : 1.000
  Context Precision        : 1.000
  Context Recall           : 1.000
  Context Entity Recall    : 0.000
  Answer Correctness       : 0.740
  Answer Similarity        : 0.959

Processing Sample 18 / 200


Evaluating:  29%|██▊       | 2/7 [00:08<00:21,  4.33s/it]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating: 100%|██████████| 7/7 [00:44<00:00,  6.38s/it]


Sample 18 Scores:
  Faithfulness             : 0.941
  Answer Relevancy         : 0.978
  Context Precision        : 1.000
  Context Recall           : 0.500
  Context Entity Recall    : 0.000
  Answer Correctness       : 0.570
  Answer Similarity        : 0.946

Processing Sample 19 / 200


Evaluating:   0%|          | 0/7 [00:00<?, ?it/s]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating: 100%|██████████| 7/7 [00:37<00:00,  5.35s/it]


Sample 19 Scores:
  Faithfulness             : 1.000
  Answer Relevancy         : 0.912
  Context Precision        : 1.000
  Context Recall           : 0.667
  Context Entity Recall    : 0.500
  Answer Correctness       : 0.738
  Answer Similarity        : 0.952

Processing Sample 20 / 200


Evaluating:  43%|████▎     | 3/7 [00:15<00:18,  4.75s/it]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating: 100%|██████████| 7/7 [00:41<00:00,  5.99s/it]


Sample 20 Scores:
  Faithfulness             : 1.000
  Answer Relevancy         : 0.853
  Context Precision        : 1.000
  Context Recall           : 1.000
  Context Entity Recall    : 0.000
  Answer Correctness       : 0.326
  Answer Similarity        : 0.930

Processing Sample 21 / 200


Evaluating:   0%|          | 0/7 [00:00<?, ?it/s]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating: 100%|██████████| 7/7 [00:47<00:00,  6.73s/it]


Sample 21 Scores:
  Faithfulness             : 1.000
  Answer Relevancy         : 1.000
  Context Precision        : 1.000
  Context Recall           : 1.000
  Context Entity Recall    : 0.250
  Answer Correctness       : 0.459
  Answer Similarity        : 0.935

Processing Sample 22 / 200


Evaluating:   0%|          | 0/7 [00:00<?, ?it/s]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating: 100%|██████████| 7/7 [00:47<00:00,  6.83s/it]


Sample 22 Scores:
  Faithfulness             : 0.857
  Answer Relevancy         : 0.973
  Context Precision        : 1.000
  Context Recall           : 1.000
  Context Entity Recall    : 1.000
  Answer Correctness       : 0.487
  Answer Similarity        : 0.948

Processing Sample 23 / 200


Evaluating:   0%|          | 0/7 [00:00<?, ?it/s]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating: 100%|██████████| 7/7 [00:40<00:00,  5.78s/it]


Sample 23 Scores:
  Faithfulness             : 1.000
  Answer Relevancy         : 0.860
  Context Precision        : 1.000
  Context Recall           : 0.667
  Context Entity Recall    : 0.000
  Answer Correctness       : 0.370
  Answer Similarity        : 0.907

Processing Sample 24 / 200


Evaluating:   0%|          | 0/7 [00:00<?, ?it/s]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating: 100%|██████████| 7/7 [00:40<00:00,  5.81s/it]


Sample 24 Scores:
  Faithfulness             : 0.818
  Answer Relevancy         : 0.919
  Context Precision        : 1.000
  Context Recall           : 0.500
  Context Entity Recall    : 0.714
  Answer Correctness       : 0.763
  Answer Similarity        : 0.933

Processing Sample 25 / 200


Evaluating:   0%|          | 0/7 [00:00<?, ?it/s]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating: 100%|██████████| 7/7 [00:49<00:00,  7.08s/it]


Sample 25 Scores:
  Faithfulness             : 1.000
  Answer Relevancy         : 0.993
  Context Precision        : 1.000
  Context Recall           : 1.000
  Context Entity Recall    : 0.250
  Answer Correctness       : 0.311
  Answer Similarity        : 0.944

Processing Sample 26 / 200


Evaluating:   0%|          | 0/7 [00:00<?, ?it/s]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating: 100%|██████████| 7/7 [00:41<00:00,  5.96s/it]


Sample 26 Scores:
  Faithfulness             : 0.667
  Answer Relevancy         : 0.916
  Context Precision        : 1.000
  Context Recall           : 0.500
  Context Entity Recall    : 0.000
  Answer Correctness       : 0.232
  Answer Similarity        : 0.929

Processing Sample 27 / 200


Evaluating:   0%|          | 0/7 [00:00<?, ?it/s]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating: 100%|██████████| 7/7 [00:40<00:00,  5.79s/it]


Sample 27 Scores:
  Faithfulness             : 0.889
  Answer Relevancy         : 0.956
  Context Precision        : 1.000
  Context Recall           : 0.000
  Context Entity Recall    : 1.000
  Answer Correctness       : 0.477
  Answer Similarity        : 0.909

Processing Sample 28 / 200


Evaluating:   0%|          | 0/7 [00:00<?, ?it/s]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating: 100%|██████████| 7/7 [00:48<00:00,  6.90s/it]


Sample 28 Scores:
  Faithfulness             : 0.500
  Answer Relevancy         : 0.973
  Context Precision        : 0.000
  Context Recall           : 0.000
  Context Entity Recall    : 0.000
  Answer Correctness       : 0.221
  Answer Similarity        : 0.883

Processing Sample 29 / 200


Evaluating:  14%|█▍        | 1/7 [00:02<00:16,  2.78s/it]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating: 100%|██████████| 7/7 [00:51<00:00,  7.34s/it]


Sample 29 Scores:
  Faithfulness             : 0.895
  Answer Relevancy         : 0.986
  Context Precision        : 0.000
  Context Recall           : 0.000
  Context Entity Recall    : 0.125
  Answer Correctness       : 0.226
  Answer Similarity        : 0.906

Processing Sample 30 / 200


Evaluating:  14%|█▍        | 1/7 [00:04<00:24,  4.01s/it]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating: 100%|██████████| 7/7 [00:48<00:00,  6.95s/it]


Sample 30 Scores:
  Faithfulness             : 0.964
  Answer Relevancy         : 1.000
  Context Precision        : 0.000
  Context Recall           : 0.000
  Context Entity Recall    : 0.500
  Answer Correctness       : 0.227
  Answer Similarity        : 0.908

Processing Sample 31 / 200


Evaluating:  14%|█▍        | 1/7 [00:02<00:16,  2.81s/it]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating: 100%|██████████| 7/7 [00:47<00:00,  6.76s/it]


Sample 31 Scores:
  Faithfulness             : 0.750
  Answer Relevancy         : 0.000
  Context Precision        : 0.000
  Context Recall           : 0.000
  Context Entity Recall    : 0.083
  Answer Correctness       : 0.234
  Answer Similarity        : 0.936

Processing Sample 32 / 200


Evaluating:  14%|█▍        | 1/7 [00:03<00:23,  3.90s/it]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating: 100%|██████████| 7/7 [00:45<00:00,  6.51s/it]


Sample 32 Scores:
  Faithfulness             : 0.667
  Answer Relevancy         : 0.000
  Context Precision        : 0.000
  Context Recall           : 0.000
  Context Entity Recall    : 0.143
  Answer Correctness       : 0.234
  Answer Similarity        : 0.936

Processing Sample 33 / 200


Evaluating:  14%|█▍        | 1/7 [00:02<00:17,  2.99s/it]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating: 100%|██████████| 7/7 [00:49<00:00,  7.01s/it]


Sample 33 Scores:
  Faithfulness             : 0.889
  Answer Relevancy         : 1.000
  Context Precision        : 0.000
  Context Recall           : 0.000
  Context Entity Recall    : 0.250
  Answer Correctness       : 0.231
  Answer Similarity        : 0.922

Processing Sample 34 / 200


Evaluating:   0%|          | 0/7 [00:00<?, ?it/s]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating: 100%|██████████| 7/7 [00:59<00:00,  8.44s/it]


Sample 34 Scores:
  Faithfulness             : 1.000
  Answer Relevancy         : 0.990
  Context Precision        : 1.000
  Context Recall           : 0.667
  Context Entity Recall    : 0.750
  Answer Correctness       : 0.379
  Answer Similarity        : 0.970

Processing Sample 35 / 200


Evaluating:  14%|█▍        | 1/7 [00:02<00:17,  2.96s/it]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating: 100%|██████████| 7/7 [00:49<00:00,  7.06s/it]


Sample 35 Scores:
  Faithfulness             : 1.000
  Answer Relevancy         : 0.986
  Context Precision        : 1.000
  Context Recall           : 1.000
  Context Entity Recall    : 0.500
  Answer Correctness       : 0.745
  Answer Similarity        : 0.981

Processing Sample 36 / 200


Evaluating:   0%|          | 0/7 [00:00<?, ?it/s]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating: 100%|██████████| 7/7 [00:46<00:00,  6.71s/it]


Sample 36 Scores:
  Faithfulness             : 0.750
  Answer Relevancy         : 0.919
  Context Precision        : 1.000
  Context Recall           : 1.000
  Context Entity Recall    : 0.500
  Answer Correctness       : 0.223
  Answer Similarity        : 0.894

Processing Sample 37 / 200


Evaluating:   0%|          | 0/7 [00:00<?, ?it/s]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating: 100%|██████████| 7/7 [00:53<00:00,  7.61s/it]


Sample 37 Scores:
  Faithfulness             : 1.000
  Answer Relevancy         : 1.000
  Context Precision        : 1.000
  Context Recall           : 1.000
  Context Entity Recall    : 0.333
  Answer Correctness       : 0.288
  Answer Similarity        : 0.912

Processing Sample 38 / 200


Evaluating:   0%|          | 0/7 [00:00<?, ?it/s]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating: 100%|██████████| 7/7 [00:41<00:00,  6.00s/it]


Sample 38 Scores:
  Faithfulness             : 0.600
  Answer Relevancy         : 0.992
  Context Precision        : 1.000
  Context Recall           : 1.000
  Context Entity Recall    : 0.833
  Answer Correctness       : 0.490
  Answer Similarity        : 0.961

Processing Sample 39 / 200


Evaluating:  14%|█▍        | 1/7 [00:02<00:15,  2.58s/it]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating: 100%|██████████| 7/7 [00:42<00:00,  6.06s/it]


Sample 39 Scores:
  Faithfulness             : 1.000
  Answer Relevancy         : 0.960
  Context Precision        : 1.000
  Context Recall           : 1.000
  Context Entity Recall    : 0.500
  Answer Correctness       : 0.394
  Answer Similarity        : 0.943

Processing Sample 40 / 200


Evaluating:   0%|          | 0/7 [00:00<?, ?it/s]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating: 100%|██████████| 7/7 [00:40<00:00,  5.85s/it]


Sample 40 Scores:
  Faithfulness             : 1.000
  Answer Relevancy         : 0.855
  Context Precision        : 1.000
  Context Recall           : 1.000
  Context Entity Recall    : 0.333
  Answer Correctness       : 0.539
  Answer Similarity        : 0.955

Processing Sample 41 / 200


Evaluating:  14%|█▍        | 1/7 [00:03<00:20,  3.39s/it]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating: 100%|██████████| 7/7 [00:48<00:00,  6.97s/it]


Sample 41 Scores:
  Faithfulness             : 1.000
  Answer Relevancy         : 0.901
  Context Precision        : 1.000
  Context Recall           : 1.000
  Context Entity Recall    : 1.000
  Answer Correctness       : 0.465
  Answer Similarity        : 0.935

Processing Sample 42 / 200


Evaluating:   0%|          | 0/7 [00:00<?, ?it/s]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating: 100%|██████████| 7/7 [00:53<00:00,  7.61s/it]


Sample 42 Scores:
  Faithfulness             : 1.000
  Answer Relevancy         : 0.954
  Context Precision        : 1.000
  Context Recall           : 1.000
  Context Entity Recall    : 1.000
  Answer Correctness       : 0.587
  Answer Similarity        : 0.936

Processing Sample 43 / 200


Evaluating:   0%|          | 0/7 [00:00<?, ?it/s]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating: 100%|██████████| 7/7 [00:48<00:00,  6.93s/it]


Sample 43 Scores:
  Faithfulness             : 0.600
  Answer Relevancy         : 0.988
  Context Precision        : 1.000
  Context Recall           : 1.000
  Context Entity Recall    : 0.667
  Answer Correctness       : 0.558
  Answer Similarity        : 0.967

Processing Sample 44 / 200


Evaluating:  14%|█▍        | 1/7 [00:03<00:20,  3.49s/it]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating: 100%|██████████| 7/7 [00:45<00:00,  6.55s/it]


Sample 44 Scores:
  Faithfulness             : 1.000
  Answer Relevancy         : 1.000
  Context Precision        : 1.000
  Context Recall           : 1.000
  Context Entity Recall    : 0.333
  Answer Correctness       : 0.334
  Answer Similarity        : 0.960

Processing Sample 45 / 200


Evaluating:  14%|█▍        | 1/7 [00:03<00:22,  3.80s/it]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating: 100%|██████████| 7/7 [00:42<00:00,  6.07s/it]


Sample 45 Scores:
  Faithfulness             : 0.875
  Answer Relevancy         : 0.995
  Context Precision        : 1.000
  Context Recall           : 1.000
  Context Entity Recall    : 1.000
  Answer Correctness       : 0.406
  Answer Similarity        : 0.958

Processing Sample 46 / 200


Evaluating:   0%|          | 0/7 [00:00<?, ?it/s]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating: 100%|██████████| 7/7 [00:51<00:00,  7.40s/it]


Sample 46 Scores:
  Faithfulness             : 0.964
  Answer Relevancy         : 0.966
  Context Precision        : 1.000
  Context Recall           : 0.667
  Context Entity Recall    : 0.333
  Answer Correctness       : 0.373
  Answer Similarity        : 0.931

Processing Sample 47 / 200


Evaluating:   0%|          | 0/7 [00:00<?, ?it/s]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating: 100%|██████████| 7/7 [00:43<00:00,  6.23s/it]


Sample 47 Scores:
  Faithfulness             : 1.000
  Answer Relevancy         : 0.891
  Context Precision        : 1.000
  Context Recall           : 1.000
  Context Entity Recall    : 1.000
  Answer Correctness       : 0.426
  Answer Similarity        : 0.955

Processing Sample 48 / 200


Evaluating:   0%|          | 0/7 [00:00<?, ?it/s]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating: 100%|██████████| 7/7 [00:47<00:00,  6.80s/it]


Sample 48 Scores:
  Faithfulness             : 1.000
  Answer Relevancy         : 1.000
  Context Precision        : 1.000
  Context Recall           : 1.000
  Context Entity Recall    : 0.600
  Answer Correctness       : 0.408
  Answer Similarity        : 0.966

Processing Sample 49 / 200


Evaluating:  14%|█▍        | 1/7 [00:03<00:18,  3.08s/it]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating: 100%|██████████| 7/7 [00:43<00:00,  6.23s/it]


Sample 49 Scores:
  Faithfulness             : 0.667
  Answer Relevancy         : 0.975
  Context Precision        : 1.000
  Context Recall           : 1.000
  Context Entity Recall    : 0.833
  Answer Correctness       : 0.478
  Answer Similarity        : 0.965

Processing Sample 50 / 200


Evaluating:   0%|          | 0/7 [00:00<?, ?it/s]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating: 100%|██████████| 7/7 [00:39<00:00,  5.63s/it]


Sample 50 Scores:
  Faithfulness             : 1.000
  Answer Relevancy         : 0.979
  Context Precision        : 1.000
  Context Recall           : 1.000
  Context Entity Recall    : 0.400
  Answer Correctness       : 0.560
  Answer Similarity        : 0.956

Processing Sample 51 / 200


Evaluating:   0%|          | 0/7 [00:00<?, ?it/s]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating: 100%|██████████| 7/7 [00:41<00:00,  5.96s/it]


Sample 51 Scores:
  Faithfulness             : 0.917
  Answer Relevancy         : 0.948
  Context Precision        : 1.000
  Context Recall           : 1.000
  Context Entity Recall    : 0.333
  Answer Correctness       : 0.329
  Answer Similarity        : 0.940

Processing Sample 52 / 200


Evaluating:   0%|          | 0/7 [00:00<?, ?it/s]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating: 100%|██████████| 7/7 [00:46<00:00,  6.58s/it]


Sample 52 Scores:
  Faithfulness             : 0.944
  Answer Relevancy         : 0.912
  Context Precision        : 1.000
  Context Recall           : 1.000
  Context Entity Recall    : 0.250
  Answer Correctness       : 0.370
  Answer Similarity        : 0.960

Processing Sample 53 / 200


Evaluating:  14%|█▍        | 1/7 [00:03<00:18,  3.15s/it]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating: 100%|██████████| 7/7 [00:42<00:00,  6.06s/it]


Sample 53 Scores:
  Faithfulness             : 0.889
  Answer Relevancy         : 0.980
  Context Precision        : 1.000
  Context Recall           : 1.000
  Context Entity Recall    : 0.167
  Answer Correctness       : 0.642
  Answer Similarity        : 0.970

Processing Sample 54 / 200


Evaluating:   0%|          | 0/7 [00:00<?, ?it/s]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating: 100%|██████████| 7/7 [00:43<00:00,  6.26s/it]


Sample 54 Scores:
  Faithfulness             : 1.000
  Answer Relevancy         : 1.000
  Context Precision        : 1.000
  Context Recall           : 0.667
  Context Entity Recall    : 0.500
  Answer Correctness       : 0.391
  Answer Similarity        : 0.965

Processing Sample 55 / 200


Evaluating:  14%|█▍        | 1/7 [00:02<00:17,  2.98s/it]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating: 100%|██████████| 7/7 [00:47<00:00,  6.79s/it]


Sample 55 Scores:
  Faithfulness             : 1.000
  Answer Relevancy         : 1.000
  Context Precision        : 1.000
  Context Recall           : 1.000
  Context Entity Recall    : 0.167
  Answer Correctness       : 0.516
  Answer Similarity        : 0.973

Processing Sample 56 / 200


Evaluating:  14%|█▍        | 1/7 [00:02<00:16,  2.77s/it]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating: 100%|██████████| 7/7 [00:49<00:00,  7.09s/it]


Sample 56 Scores:
  Faithfulness             : 0.950
  Answer Relevancy         : 0.968
  Context Precision        : 0.000
  Context Recall           : 0.000
  Context Entity Recall    : 0.200
  Answer Correctness       : 0.286
  Answer Similarity        : 0.931

Processing Sample 57 / 200


Evaluating:   0%|          | 0/7 [00:00<?, ?it/s]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating: 100%|██████████| 7/7 [00:41<00:00,  5.97s/it]


Sample 57 Scores:
  Faithfulness             : 1.000
  Answer Relevancy         : 0.848
  Context Precision        : 1.000
  Context Recall           : 1.000
  Context Entity Recall    : 0.333
  Answer Correctness       : 0.318
  Answer Similarity        : 0.940

Processing Sample 58 / 200


Evaluating:  29%|██▊       | 2/7 [00:07<00:20,  4.09s/it]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating: 100%|██████████| 7/7 [00:52<00:00,  7.53s/it]


Sample 58 Scores:
  Faithfulness             : 1.000
  Answer Relevancy         : 0.862
  Context Precision        : 1.000
  Context Recall           : 1.000
  Context Entity Recall    : 0.500
  Answer Correctness       : 0.373
  Answer Similarity        : 0.947

Processing Sample 59 / 200


Evaluating:  14%|█▍        | 1/7 [00:03<00:19,  3.21s/it]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating: 100%|██████████| 7/7 [00:52<00:00,  7.48s/it]


Sample 59 Scores:
  Faithfulness             : 1.000
  Answer Relevancy         : 0.968
  Context Precision        : 1.000
  Context Recall           : 0.500
  Context Entity Recall    : 0.500
  Answer Correctness       : 0.298
  Answer Similarity        : 0.959

Processing Sample 60 / 200


Evaluating:  14%|█▍        | 1/7 [00:02<00:16,  2.68s/it]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating: 100%|██████████| 7/7 [00:41<00:00,  5.87s/it]


Sample 60 Scores:
  Faithfulness             : 1.000
  Answer Relevancy         : 0.000
  Context Precision        : 0.000
  Context Recall           : 0.000
  Context Entity Recall    : 0.250
  Answer Correctness       : 0.234
  Answer Similarity        : 0.935

Processing Sample 61 / 200


Evaluating:  14%|█▍        | 1/7 [00:04<00:25,  4.33s/it]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating: 100%|██████████| 7/7 [00:47<00:00,  6.81s/it]


Sample 61 Scores:
  Faithfulness             : 1.000
  Answer Relevancy         : 0.994
  Context Precision        : 1.000
  Context Recall           : 0.667
  Context Entity Recall    : 0.667
  Answer Correctness       : 0.470
  Answer Similarity        : 0.934

Processing Sample 62 / 200


Evaluating:  14%|█▍        | 1/7 [00:02<00:17,  2.99s/it]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating: 100%|██████████| 7/7 [00:41<00:00,  5.91s/it]


Sample 62 Scores:
  Faithfulness             : 1.000
  Answer Relevancy         : 0.951
  Context Precision        : 1.000
  Context Recall           : 1.000
  Context Entity Recall    : 0.500
  Answer Correctness       : 0.510
  Answer Similarity        : 0.950

Processing Sample 63 / 200


Evaluating:   0%|          | 0/7 [00:00<?, ?it/s]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating: 100%|██████████| 7/7 [00:52<00:00,  7.49s/it]


Sample 63 Scores:
  Faithfulness             : 1.000
  Answer Relevancy         : 0.942
  Context Precision        : 1.000
  Context Recall           : 0.500
  Context Entity Recall    : 0.500
  Answer Correctness       : 0.290
  Answer Similarity        : 0.928

Processing Sample 64 / 200


Evaluating:   0%|          | 0/7 [00:00<?, ?it/s]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating: 100%|██████████| 7/7 [00:51<00:00,  7.39s/it]


Sample 64 Scores:
  Faithfulness             : 1.000
  Answer Relevancy         : 0.997
  Context Precision        : 1.000
  Context Recall           : 1.000
  Context Entity Recall    : 0.500
  Answer Correctness       : 0.834
  Answer Similarity        : 0.936

Processing Sample 65 / 200


Evaluating:  14%|█▍        | 1/7 [00:06<00:39,  6.63s/it]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating: 100%|██████████| 7/7 [00:45<00:00,  6.46s/it]


Sample 65 Scores:
  Faithfulness             : 0.933
  Answer Relevancy         : 0.976
  Context Precision        : 1.000
  Context Recall           : 0.500
  Context Entity Recall    : 0.750
  Answer Correctness       : 0.328
  Answer Similarity        : 0.958

Processing Sample 66 / 200


Evaluating:   0%|          | 0/7 [00:00<?, ?it/s]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating: 100%|██████████| 7/7 [00:42<00:00,  6.12s/it]


Sample 66 Scores:
  Faithfulness             : 1.000
  Answer Relevancy         : 0.899
  Context Precision        : 1.000
  Context Recall           : 0.500
  Context Entity Recall    : 0.000
  Answer Correctness       : 0.651
  Answer Similarity        : 0.966

Processing Sample 67 / 200


Evaluating:   0%|          | 0/7 [00:00<?, ?it/s]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating: 100%|██████████| 7/7 [00:44<00:00,  6.33s/it]


Sample 67 Scores:
  Faithfulness             : 0.818
  Answer Relevancy         : 0.894
  Context Precision        : 1.000
  Context Recall           : 0.500
  Context Entity Recall    : 0.000
  Answer Correctness       : 0.449
  Answer Similarity        : 0.937

Processing Sample 68 / 200


Evaluating:   0%|          | 0/7 [00:00<?, ?it/s]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating: 100%|██████████| 7/7 [00:49<00:00,  7.06s/it]


Sample 68 Scores:
  Faithfulness             : 0.800
  Answer Relevancy         : 0.950
  Context Precision        : 1.000
  Context Recall           : 0.500
  Context Entity Recall    : 0.750
  Answer Correctness       : 0.231
  Answer Similarity        : 0.924

Processing Sample 69 / 200


Evaluating:   0%|          | 0/7 [00:00<?, ?it/s]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating: 100%|██████████| 7/7 [00:40<00:00,  5.83s/it]


Sample 69 Scores:
  Faithfulness             : 0.857
  Answer Relevancy         : 0.976
  Context Precision        : 1.000
  Context Recall           : 1.000
  Context Entity Recall    : 0.000
  Answer Correctness       : 0.572
  Answer Similarity        : 0.956

Processing Sample 70 / 200


Evaluating:  14%|█▍        | 1/7 [00:02<00:17,  2.85s/it]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating: 100%|██████████| 7/7 [00:51<00:00,  7.29s/it]


Sample 70 Scores:
  Faithfulness             : 0.909
  Answer Relevancy         : 0.969
  Context Precision        : 1.000
  Context Recall           : 1.000
  Context Entity Recall    : 0.000
  Answer Correctness       : 0.464
  Answer Similarity        : 0.933

Processing Sample 71 / 200


Evaluating:   0%|          | 0/7 [00:00<?, ?it/s]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating: 100%|██████████| 7/7 [00:46<00:00,  6.71s/it]


Sample 71 Scores:
  Faithfulness             : 0.941
  Answer Relevancy         : 0.941
  Context Precision        : 1.000
  Context Recall           : 0.500
  Context Entity Recall    : 0.500
  Answer Correctness       : 0.303
  Answer Similarity        : 0.898

Processing Sample 72 / 200


Evaluating:   0%|          | 0/7 [00:00<?, ?it/s]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating: 100%|██████████| 7/7 [00:41<00:00,  5.89s/it]


Sample 72 Scores:
  Faithfulness             : 1.000
  Answer Relevancy         : 0.970
  Context Precision        : 0.000
  Context Recall           : 0.500
  Context Entity Recall    : 0.500
  Answer Correctness       : 0.231
  Answer Similarity        : 0.925

Processing Sample 73 / 200


Evaluating:   0%|          | 0/7 [00:00<?, ?it/s]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating: 100%|██████████| 7/7 [00:42<00:00,  6.13s/it]


Sample 73 Scores:
  Faithfulness             : 0.778
  Answer Relevancy         : 0.992
  Context Precision        : 1.000
  Context Recall           : 0.500
  Context Entity Recall    : 0.667
  Answer Correctness       : 0.382
  Answer Similarity        : 0.958

Processing Sample 74 / 200


Evaluating:  14%|█▍        | 1/7 [00:02<00:17,  2.98s/it]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating: 100%|██████████| 7/7 [00:46<00:00,  6.63s/it]


Sample 74 Scores:
  Faithfulness             : 1.000
  Answer Relevancy         : 0.971
  Context Precision        : 1.000
  Context Recall           : 0.000
  Context Entity Recall    : 0.500
  Answer Correctness       : 0.308
  Answer Similarity        : 0.915

Processing Sample 75 / 200


Evaluating:   0%|          | 0/7 [00:00<?, ?it/s]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating: 100%|██████████| 7/7 [00:50<00:00,  7.18s/it]


Sample 75 Scores:
  Faithfulness             : 0.933
  Answer Relevancy         : 0.887
  Context Precision        : 1.000
  Context Recall           : 1.000
  Context Entity Recall    : 0.200
  Answer Correctness       : 0.537
  Answer Similarity        : 0.950

Processing Sample 76 / 200


Evaluating:  14%|█▍        | 1/7 [00:03<00:19,  3.23s/it]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating: 100%|██████████| 7/7 [00:41<00:00,  5.99s/it]


Sample 76 Scores:
  Faithfulness             : 0.917
  Answer Relevancy         : 1.000
  Context Precision        : 1.000
  Context Recall           : 1.000
  Context Entity Recall    : 1.000
  Answer Correctness       : 0.651
  Answer Similarity        : 0.936

Processing Sample 77 / 200


Evaluating:   0%|          | 0/7 [00:00<?, ?it/s]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating: 100%|██████████| 7/7 [00:40<00:00,  5.84s/it]


Sample 77 Scores:
  Faithfulness             : 1.000
  Answer Relevancy         : 0.891
  Context Precision        : 1.000
  Context Recall           : 1.000
  Context Entity Recall    : 0.000
  Answer Correctness       : 0.478
  Answer Similarity        : 0.913

Processing Sample 78 / 200


Evaluating:   0%|          | 0/7 [00:00<?, ?it/s]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating: 100%|██████████| 7/7 [00:45<00:00,  6.46s/it]


Sample 78 Scores:
  Faithfulness             : 0.933
  Answer Relevancy         : 0.980
  Context Precision        : 1.000
  Context Recall           : 1.000
  Context Entity Recall    : 0.750
  Answer Correctness       : 0.564
  Answer Similarity        : 0.968

Processing Sample 79 / 200


Evaluating:   0%|          | 0/7 [00:00<?, ?it/s]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating: 100%|██████████| 7/7 [00:41<00:00,  5.96s/it]


Sample 79 Scores:
  Faithfulness             : 1.000
  Answer Relevancy         : 1.000
  Context Precision        : 1.000
  Context Recall           : 0.500
  Context Entity Recall    : 0.000
  Answer Correctness       : 0.403
  Answer Similarity        : 0.946

Processing Sample 80 / 200


Evaluating:   0%|          | 0/7 [00:00<?, ?it/s]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating:  86%|████████▌ | 6/7 [00:58<00:08,  8.30s/it]Exception raised in Job[5]: TimeoutError()
Evaluating: 100%|██████████| 7/7 [03:00<00:00, 25.73s/it]


Sample 80 Scores:
  Faithfulness             : 0.867
  Answer Relevancy         : 0.986
  Context Precision        : 1.000
  Context Recall           : 0.500
  Context Entity Recall    : 0.500
  Answer Correctness       : nan
  Answer Similarity        : 0.955

Processing Sample 81 / 200


Evaluating:  14%|█▍        | 1/7 [00:06<00:40,  6.73s/it]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating: 100%|██████████| 7/7 [00:49<00:00,  7.03s/it]


Sample 81 Scores:
  Faithfulness             : 1.000
  Answer Relevancy         : 0.983
  Context Precision        : 1.000
  Context Recall           : 0.500
  Context Entity Recall    : 0.250
  Answer Correctness       : 0.317
  Answer Similarity        : 0.893

Processing Sample 82 / 200


Evaluating:   0%|          | 0/7 [00:00<?, ?it/s]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating: 100%|██████████| 7/7 [00:41<00:00,  5.94s/it]


Sample 82 Scores:
  Faithfulness             : 1.000
  Answer Relevancy         : 0.956
  Context Precision        : 1.000
  Context Recall           : 1.000
  Context Entity Recall    : 0.333
  Answer Correctness       : 0.544
  Answer Similarity        : 0.974

Processing Sample 83 / 200


Evaluating:  14%|█▍        | 1/7 [00:02<00:16,  2.71s/it]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating: 100%|██████████| 7/7 [00:44<00:00,  6.41s/it]


Sample 83 Scores:
  Faithfulness             : 1.000
  Answer Relevancy         : 1.000
  Context Precision        : 1.000
  Context Recall           : 0.000
  Context Entity Recall    : 0.250
  Answer Correctness       : 0.241
  Answer Similarity        : 0.962

Processing Sample 84 / 200


Evaluating:   0%|          | 0/7 [00:00<?, ?it/s]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating: 100%|██████████| 7/7 [00:49<00:00,  7.05s/it]


Sample 84 Scores:
  Faithfulness             : 1.000
  Answer Relevancy         : 0.911
  Context Precision        : 1.000
  Context Recall           : 1.000
  Context Entity Recall    : 0.500
  Answer Correctness       : 0.344
  Answer Similarity        : 0.947

Processing Sample 85 / 200


Evaluating:  14%|█▍        | 1/7 [00:02<00:17,  2.99s/it]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating: 100%|██████████| 7/7 [00:48<00:00,  6.87s/it]


Sample 85 Scores:
  Faithfulness             : 1.000
  Answer Relevancy         : 0.984
  Context Precision        : 0.000
  Context Recall           : 0.000
  Context Entity Recall    : 0.500
  Answer Correctness       : 0.235
  Answer Similarity        : 0.941

Processing Sample 86 / 200


Evaluating:  14%|█▍        | 1/7 [00:02<00:16,  2.68s/it]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating: 100%|██████████| 7/7 [00:42<00:00,  6.04s/it]


Sample 86 Scores:
  Faithfulness             : 1.000
  Answer Relevancy         : 0.949
  Context Precision        : 1.000
  Context Recall           : 1.000
  Context Entity Recall    : 0.333
  Answer Correctness       : 0.667
  Answer Similarity        : 0.954

Processing Sample 87 / 200


Evaluating:   0%|          | 0/7 [00:00<?, ?it/s]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating: 100%|██████████| 7/7 [00:46<00:00,  6.68s/it]


Sample 87 Scores:
  Faithfulness             : 0.773
  Answer Relevancy         : 0.968
  Context Precision        : 1.000
  Context Recall           : 1.000
  Context Entity Recall    : 0.333
  Answer Correctness       : 0.405
  Answer Similarity        : 0.929

Processing Sample 88 / 200


Evaluating:   0%|          | 0/7 [00:00<?, ?it/s]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating: 100%|██████████| 7/7 [00:49<00:00,  7.01s/it]


Sample 88 Scores:
  Faithfulness             : 1.000
  Answer Relevancy         : 0.844
  Context Precision        : 1.000
  Context Recall           : 1.000
  Context Entity Recall    : 0.333
  Answer Correctness       : 0.330
  Answer Similarity        : 0.892

Processing Sample 89 / 200


Evaluating:  14%|█▍        | 1/7 [00:03<00:18,  3.08s/it]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating: 100%|██████████| 7/7 [00:52<00:00,  7.47s/it]


Sample 89 Scores:
  Faithfulness             : 1.000
  Answer Relevancy         : 0.991
  Context Precision        : 1.000
  Context Recall           : 1.000
  Context Entity Recall    : 0.667
  Answer Correctness       : 0.363
  Answer Similarity        : 0.931

Processing Sample 90 / 200


Evaluating:   0%|          | 0/7 [00:00<?, ?it/s]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating: 100%|██████████| 7/7 [00:47<00:00,  6.79s/it]


Sample 90 Scores:
  Faithfulness             : 1.000
  Answer Relevancy         : 0.906
  Context Precision        : 1.000
  Context Recall           : 1.000
  Context Entity Recall    : 0.000
  Answer Correctness       : 0.414
  Answer Similarity        : 0.905

Processing Sample 91 / 200


Evaluating:   0%|          | 0/7 [00:00<?, ?it/s]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating: 100%|██████████| 7/7 [00:46<00:00,  6.62s/it]


Sample 91 Scores:
  Faithfulness             : 1.000
  Answer Relevancy         : 0.975
  Context Precision        : 1.000
  Context Recall           : 1.000
  Context Entity Recall    : 0.500
  Answer Correctness       : 0.438
  Answer Similarity        : 0.953

Processing Sample 92 / 200


Evaluating:   0%|          | 0/7 [00:00<?, ?it/s]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating: 100%|██████████| 7/7 [00:53<00:00,  7.58s/it]


Sample 92 Scores:
  Faithfulness             : 0.947
  Answer Relevancy         : 0.921
  Context Precision        : 1.000
  Context Recall           : 1.000
  Context Entity Recall    : 0.333
  Answer Correctness       : 0.499
  Answer Similarity        : 0.954

Processing Sample 93 / 200


Evaluating:   0%|          | 0/7 [00:00<?, ?it/s]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating: 100%|██████████| 7/7 [00:48<00:00,  6.90s/it]


Sample 93 Scores:
  Faithfulness             : 1.000
  Answer Relevancy         : 0.832
  Context Precision        : 1.000
  Context Recall           : 0.667
  Context Entity Recall    : 0.000
  Answer Correctness       : 0.552
  Answer Similarity        : 0.943

Processing Sample 94 / 200


Evaluating:   0%|          | 0/7 [00:00<?, ?it/s]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating: 100%|██████████| 7/7 [00:52<00:00,  7.46s/it]


Sample 94 Scores:
  Faithfulness             : 1.000
  Answer Relevancy         : 0.829
  Context Precision        : 1.000
  Context Recall           : 0.500
  Context Entity Recall    : 0.000
  Answer Correctness       : 0.305
  Answer Similarity        : 0.920

Processing Sample 95 / 200


Evaluating:  14%|█▍        | 1/7 [00:12<01:14, 12.48s/it]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating: 100%|██████████| 7/7 [00:42<00:00,  6.12s/it]


Sample 95 Scores:
  Faithfulness             : 1.000
  Answer Relevancy         : 1.000
  Context Precision        : 1.000
  Context Recall           : 1.000
  Context Entity Recall    : 0.600
  Answer Correctness       : 0.544
  Answer Similarity        : 0.977

Processing Sample 96 / 200


Evaluating:  29%|██▊       | 2/7 [00:11<00:26,  5.34s/it]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating: 100%|██████████| 7/7 [00:52<00:00,  7.52s/it]


Sample 96 Scores:
  Faithfulness             : 0.789
  Answer Relevancy         : 0.808
  Context Precision        : 0.000
  Context Recall           : 0.000
  Context Entity Recall    : 0.000
  Answer Correctness       : 0.223
  Answer Similarity        : 0.891

Processing Sample 97 / 200


Evaluating:   0%|          | 0/7 [00:00<?, ?it/s]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating: 100%|██████████| 7/7 [00:51<00:00,  7.39s/it]


Sample 97 Scores:
  Faithfulness             : 1.000
  Answer Relevancy         : 0.845
  Context Precision        : 1.000
  Context Recall           : 1.000
  Context Entity Recall    : 1.000
  Answer Correctness       : 0.420
  Answer Similarity        : 0.930

Processing Sample 98 / 200


Evaluating:   0%|          | 0/7 [00:00<?, ?it/s]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating: 100%|██████████| 7/7 [00:46<00:00,  6.62s/it]


Sample 98 Scores:
  Faithfulness             : 1.000
  Answer Relevancy         : 0.902
  Context Precision        : 0.000
  Context Recall           : 0.000
  Context Entity Recall    : 0.000
  Answer Correctness       : 0.219
  Answer Similarity        : 0.877

Processing Sample 99 / 200


Evaluating:  14%|█▍        | 1/7 [00:02<00:17,  2.88s/it]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating: 100%|██████████| 7/7 [00:44<00:00,  6.41s/it]


Sample 99 Scores:
  Faithfulness             : 1.000
  Answer Relevancy         : 1.000
  Context Precision        : 1.000
  Context Recall           : 0.500
  Context Entity Recall    : 0.000
  Answer Correctness       : 0.490
  Answer Similarity        : 0.960

Processing Sample 100 / 200


Evaluating:   0%|          | 0/7 [00:00<?, ?it/s]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating: 100%|██████████| 7/7 [00:53<00:00,  7.67s/it]


Sample 100 Scores:
  Faithfulness             : 0.500
  Answer Relevancy         : 1.000
  Context Precision        : 1.000
  Context Recall           : 1.000
  Context Entity Recall    : 0.000
  Answer Correctness       : 0.746
  Answer Similarity        : 0.983

Processing Sample 101 / 200


Evaluating:   0%|          | 0/7 [00:00<?, ?it/s]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating: 100%|██████████| 7/7 [00:48<00:00,  6.98s/it]


Sample 101 Scores:
  Faithfulness             : 1.000
  Answer Relevancy         : 0.912
  Context Precision        : 1.000
  Context Recall           : 1.000
  Context Entity Recall    : 0.200
  Answer Correctness       : 0.325
  Answer Similarity        : 0.925

Processing Sample 102 / 200


Evaluating:   0%|          | 0/7 [00:00<?, ?it/s]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating: 100%|██████████| 7/7 [00:49<00:00,  7.04s/it]


Sample 102 Scores:
  Faithfulness             : 0.917
  Answer Relevancy         : 0.874
  Context Precision        : 0.000
  Context Recall           : 0.000
  Context Entity Recall    : 0.000
  Answer Correctness       : 0.216
  Answer Similarity        : 0.863

Processing Sample 103 / 200


Evaluating:   0%|          | 0/7 [00:00<?, ?it/s]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating: 100%|██████████| 7/7 [01:09<00:00,  9.86s/it]


Sample 103 Scores:
  Faithfulness             : 1.000
  Answer Relevancy         : 0.961
  Context Precision        : 1.000
  Context Recall           : 1.000
  Context Entity Recall    : 0.500
  Answer Correctness       : 0.301
  Answer Similarity        : 0.931

Processing Sample 104 / 200


Evaluating:  14%|█▍        | 1/7 [00:02<00:17,  2.98s/it]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating: 100%|██████████| 7/7 [00:48<00:00,  6.95s/it]


Sample 104 Scores:
  Faithfulness             : 1.000
  Answer Relevancy         : 0.921
  Context Precision        : 1.000
  Context Recall           : 0.500
  Context Entity Recall    : 1.000
  Answer Correctness       : 0.356
  Answer Similarity        : 0.944

Processing Sample 105 / 200


Evaluating:   0%|          | 0/7 [00:00<?, ?it/s]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating: 100%|██████████| 7/7 [00:47<00:00,  6.84s/it]


Sample 105 Scores:
  Faithfulness             : 1.000
  Answer Relevancy         : 0.890
  Context Precision        : 1.000
  Context Recall           : 1.000
  Context Entity Recall    : 0.167
  Answer Correctness       : 0.653
  Answer Similarity        : 0.944

Processing Sample 106 / 200


Evaluating:  14%|█▍        | 1/7 [00:07<00:42,  7.16s/it]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating: 100%|██████████| 7/7 [00:49<00:00,  7.01s/it]


Sample 106 Scores:
  Faithfulness             : 0.867
  Answer Relevancy         : 0.921
  Context Precision        : 1.000
  Context Recall           : 0.500
  Context Entity Recall    : 0.000
  Answer Correctness       : 0.484
  Answer Similarity        : 0.937

Processing Sample 107 / 200


Evaluating:   0%|          | 0/7 [00:00<?, ?it/s]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating: 100%|██████████| 7/7 [00:46<00:00,  6.64s/it]


Sample 107 Scores:
  Faithfulness             : 1.000
  Answer Relevancy         : 0.936
  Context Precision        : 1.000
  Context Recall           : 1.000
  Context Entity Recall    : 0.000
  Answer Correctness       : 0.563
  Answer Similarity        : 0.917

Processing Sample 108 / 200


Evaluating:   0%|          | 0/7 [00:00<?, ?it/s]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating: 100%|██████████| 7/7 [00:59<00:00,  8.53s/it]


Sample 108 Scores:
  Faithfulness             : 1.000
  Answer Relevancy         : 0.922
  Context Precision        : 0.000
  Context Recall           : 0.000
  Context Entity Recall    : 0.500
  Answer Correctness       : 0.227
  Answer Similarity        : 0.908

Processing Sample 109 / 200


Evaluating:   0%|          | 0/7 [00:00<?, ?it/s]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating: 100%|██████████| 7/7 [00:43<00:00,  6.19s/it]


Sample 109 Scores:
  Faithfulness             : 0.875
  Answer Relevancy         : 1.000
  Context Precision        : 1.000
  Context Recall           : 1.000
  Context Entity Recall    : 0.000
  Answer Correctness       : 0.650
  Answer Similarity        : 0.964

Processing Sample 110 / 200


Evaluating:   0%|          | 0/7 [00:00<?, ?it/s]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating: 100%|██████████| 7/7 [00:45<00:00,  6.47s/it]


Sample 110 Scores:
  Faithfulness             : 1.000
  Answer Relevancy         : 0.923
  Context Precision        : 1.000
  Context Recall           : 1.000
  Context Entity Recall    : 0.500
  Answer Correctness       : 0.488
  Answer Similarity        : 0.951

Processing Sample 111 / 200


Evaluating:   0%|          | 0/7 [00:00<?, ?it/s]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating: 100%|██████████| 7/7 [00:54<00:00,  7.85s/it]


Sample 111 Scores:
  Faithfulness             : 1.000
  Answer Relevancy         : 0.901
  Context Precision        : 1.000
  Context Recall           : 0.667
  Context Entity Recall    : 0.444
  Answer Correctness       : 0.742
  Answer Similarity        : 0.968

Processing Sample 112 / 200


Evaluating:   0%|          | 0/7 [00:00<?, ?it/s]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating: 100%|██████████| 7/7 [00:48<00:00,  6.92s/it]


Sample 112 Scores:
  Faithfulness             : 1.000
  Answer Relevancy         : 0.962
  Context Precision        : 1.000
  Context Recall           : 0.500
  Context Entity Recall    : 0.333
  Answer Correctness       : 0.522
  Answer Similarity        : 0.963

Processing Sample 113 / 200


Evaluating:   0%|          | 0/7 [00:00<?, ?it/s]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating: 100%|██████████| 7/7 [00:55<00:00,  7.91s/it]


Sample 113 Scores:
  Faithfulness             : 1.000
  Answer Relevancy         : 0.985
  Context Precision        : 1.000
  Context Recall           : 0.000
  Context Entity Recall    : 0.333
  Answer Correctness       : 0.357
  Answer Similarity        : 0.926

Processing Sample 114 / 200


Evaluating:  14%|█▍        | 1/7 [00:03<00:22,  3.69s/it]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating: 100%|██████████| 7/7 [00:45<00:00,  6.57s/it]


Sample 114 Scores:
  Faithfulness             : 0.750
  Answer Relevancy         : 0.965
  Context Precision        : 1.000
  Context Recall           : 0.500
  Context Entity Recall    : 0.333
  Answer Correctness       : 0.346
  Answer Similarity        : 0.954

Processing Sample 115 / 200


Evaluating:   0%|          | 0/7 [00:00<?, ?it/s]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating: 100%|██████████| 7/7 [00:47<00:00,  6.76s/it]


Sample 115 Scores:
  Faithfulness             : 1.000
  Answer Relevancy         : 0.929
  Context Precision        : 1.000
  Context Recall           : 1.000
  Context Entity Recall    : 0.600
  Answer Correctness       : 0.513
  Answer Similarity        : 0.940

Processing Sample 116 / 200


Evaluating:  14%|█▍        | 1/7 [00:03<00:19,  3.29s/it]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating: 100%|██████████| 7/7 [00:49<00:00,  7.02s/it]


Sample 116 Scores:
  Faithfulness             : 1.000
  Answer Relevancy         : 0.978
  Context Precision        : 1.000
  Context Recall           : 1.000
  Context Entity Recall    : 0.667
  Answer Correctness       : 0.534
  Answer Similarity        : 0.936

Processing Sample 117 / 200


Evaluating:  14%|█▍        | 1/7 [00:12<01:13, 12.18s/it]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating: 100%|██████████| 7/7 [00:50<00:00,  7.26s/it]


Sample 117 Scores:
  Faithfulness             : 1.000
  Answer Relevancy         : 0.940
  Context Precision        : 1.000
  Context Recall           : 1.000
  Context Entity Recall    : 1.000
  Answer Correctness       : 0.490
  Answer Similarity        : 0.962

Processing Sample 118 / 200


Evaluating:  14%|█▍        | 1/7 [00:05<00:33,  5.50s/it]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating: 100%|██████████| 7/7 [00:50<00:00,  7.19s/it]


Sample 118 Scores:
  Faithfulness             : 1.000
  Answer Relevancy         : 0.919
  Context Precision        : 1.000
  Context Recall           : 1.000
  Context Entity Recall    : 1.000
  Answer Correctness       : 0.358
  Answer Similarity        : 0.933

Processing Sample 119 / 200


Evaluating:  14%|█▍        | 1/7 [00:07<00:42,  7.16s/it]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating: 100%|██████████| 7/7 [00:54<00:00,  7.82s/it]


Sample 119 Scores:
  Faithfulness             : 1.000
  Answer Relevancy         : 0.993
  Context Precision        : 1.000
  Context Recall           : 1.000
  Context Entity Recall    : 0.000
  Answer Correctness       : 0.355
  Answer Similarity        : 0.959

Processing Sample 120 / 200


Evaluating:   0%|          | 0/7 [00:00<?, ?it/s]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating: 100%|██████████| 7/7 [00:49<00:00,  7.01s/it]


Sample 120 Scores:
  Faithfulness             : 1.000
  Answer Relevancy         : 0.844
  Context Precision        : 1.000
  Context Recall           : 0.000
  Context Entity Recall    : 0.667
  Answer Correctness       : 0.235
  Answer Similarity        : 0.939

Processing Sample 121 / 200


Evaluating:   0%|          | 0/7 [00:00<?, ?it/s]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating: 100%|██████████| 7/7 [00:49<00:00,  7.09s/it]


Sample 121 Scores:
  Faithfulness             : 1.000
  Answer Relevancy         : 0.988
  Context Precision        : 1.000
  Context Recall           : 1.000
  Context Entity Recall    : 0.500
  Answer Correctness       : 0.398
  Answer Similarity        : 0.960

Processing Sample 122 / 200


Evaluating:   0%|          | 0/7 [00:00<?, ?it/s]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating: 100%|██████████| 7/7 [00:58<00:00,  8.38s/it]


Sample 122 Scores:
  Faithfulness             : 1.000
  Answer Relevancy         : 0.831
  Context Precision        : 1.000
  Context Recall           : 0.500
  Context Entity Recall    : 0.333
  Answer Correctness       : 0.303
  Answer Similarity        : 0.912

Processing Sample 123 / 200


Evaluating:  14%|█▍        | 1/7 [00:03<00:21,  3.60s/it]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating: 100%|██████████| 7/7 [00:44<00:00,  6.42s/it]


Sample 123 Scores:
  Faithfulness             : 0.667
  Answer Relevancy         : 0.000
  Context Precision        : 1.000
  Context Recall           : 0.000
  Context Entity Recall    : 0.000
  Answer Correctness       : 0.468
  Answer Similarity        : 0.949

Processing Sample 124 / 200


Evaluating:   0%|          | 0/7 [00:00<?, ?it/s]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating: 100%|██████████| 7/7 [01:07<00:00,  9.61s/it]


Sample 124 Scores:
  Faithfulness             : 0.944
  Answer Relevancy         : 1.000
  Context Precision        : 1.000
  Context Recall           : 0.500
  Context Entity Recall    : 0.333
  Answer Correctness       : 0.379
  Answer Similarity        : 0.945

Processing Sample 125 / 200


Evaluating:   0%|          | 0/7 [00:00<?, ?it/s]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating: 100%|██████████| 7/7 [00:43<00:00,  6.24s/it]


Sample 125 Scores:
  Faithfulness             : 1.000
  Answer Relevancy         : 0.966
  Context Precision        : 1.000
  Context Recall           : 1.000
  Context Entity Recall    : 0.250
  Answer Correctness       : 0.228
  Answer Similarity        : 0.910

Processing Sample 126 / 200


Evaluating:   0%|          | 0/7 [00:00<?, ?it/s]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating: 100%|██████████| 7/7 [00:49<00:00,  7.06s/it]


Sample 126 Scores:
  Faithfulness             : 1.000
  Answer Relevancy         : 1.000
  Context Precision        : 0.000
  Context Recall           : 0.000
  Context Entity Recall    : 0.000
  Answer Correctness       : 0.208
  Answer Similarity        : 0.833

Processing Sample 127 / 200


Evaluating:   0%|          | 0/7 [00:00<?, ?it/s]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating: 100%|██████████| 7/7 [00:52<00:00,  7.44s/it]


Sample 127 Scores:
  Faithfulness             : 0.636
  Answer Relevancy         : 0.987
  Context Precision        : 1.000
  Context Recall           : 1.000
  Context Entity Recall    : 0.000
  Answer Correctness       : 0.636
  Answer Similarity        : 0.943

Processing Sample 128 / 200


Evaluating:  14%|█▍        | 1/7 [00:12<01:16, 12.80s/it]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating: 100%|██████████| 7/7 [00:56<00:00,  8.10s/it]


Sample 128 Scores:
  Faithfulness             : 0.920
  Answer Relevancy         : 0.989
  Context Precision        : 1.000
  Context Recall           : 1.000
  Context Entity Recall    : 0.500
  Answer Correctness       : 0.346
  Answer Similarity        : 0.955

Processing Sample 129 / 200


Evaluating:   0%|          | 0/7 [00:00<?, ?it/s]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating: 100%|██████████| 7/7 [01:11<00:00, 10.20s/it]


Sample 129 Scores:
  Faithfulness             : 0.889
  Answer Relevancy         : 0.877
  Context Precision        : 1.000
  Context Recall           : 1.000
  Context Entity Recall    : 0.000
  Answer Correctness       : 0.354
  Answer Similarity        : 0.894

Processing Sample 130 / 200


Evaluating:   0%|          | 0/7 [00:00<?, ?it/s]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating: 100%|██████████| 7/7 [00:46<00:00,  6.58s/it]


Sample 130 Scores:
  Faithfulness             : 1.000
  Answer Relevancy         : 0.985
  Context Precision        : 1.000
  Context Recall           : 0.500
  Context Entity Recall    : 0.500
  Answer Correctness       : 0.235
  Answer Similarity        : 0.938

Processing Sample 131 / 200


Evaluating:   0%|          | 0/7 [00:00<?, ?it/s]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating: 100%|██████████| 7/7 [00:52<00:00,  7.56s/it]


Sample 131 Scores:
  Faithfulness             : 0.800
  Answer Relevancy         : 0.946
  Context Precision        : 1.000
  Context Recall           : 1.000
  Context Entity Recall    : 0.500
  Answer Correctness       : 0.600
  Answer Similarity        : 0.960

Processing Sample 132 / 200


Evaluating:   0%|          | 0/7 [00:00<?, ?it/s]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating: 100%|██████████| 7/7 [00:50<00:00,  7.17s/it]


Sample 132 Scores:
  Faithfulness             : 1.000
  Answer Relevancy         : 0.913
  Context Precision        : 1.000
  Context Recall           : 0.500
  Context Entity Recall    : 0.750
  Answer Correctness       : 0.381
  Answer Similarity        : 0.925

Processing Sample 133 / 200


Evaluating:   0%|          | 0/7 [00:00<?, ?it/s]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating: 100%|██████████| 7/7 [00:46<00:00,  6.70s/it]


Sample 133 Scores:
  Faithfulness             : 0.857
  Answer Relevancy         : 0.913
  Context Precision        : 1.000
  Context Recall           : 0.500
  Context Entity Recall    : 1.000
  Answer Correctness       : 0.403
  Answer Similarity        : 0.947

Processing Sample 134 / 200


Evaluating:   0%|          | 0/7 [00:00<?, ?it/s]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating: 100%|██████████| 7/7 [00:50<00:00,  7.21s/it]


Sample 134 Scores:
  Faithfulness             : 1.000
  Answer Relevancy         : 0.944
  Context Precision        : 1.000
  Context Recall           : 1.000
  Context Entity Recall    : 0.750
  Answer Correctness       : 0.538
  Answer Similarity        : 0.910

Processing Sample 135 / 200


Evaluating:   0%|          | 0/7 [00:00<?, ?it/s]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating: 100%|██████████| 7/7 [00:44<00:00,  6.42s/it]


Sample 135 Scores:
  Faithfulness             : 1.000
  Answer Relevancy         : 0.863
  Context Precision        : 1.000
  Context Recall           : 1.000
  Context Entity Recall    : 0.000
  Answer Correctness       : 0.318
  Answer Similarity        : 0.842

Processing Sample 136 / 200


Evaluating:   0%|          | 0/7 [00:00<?, ?it/s]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating: 100%|██████████| 7/7 [00:47<00:00,  6.80s/it]


Sample 136 Scores:
  Faithfulness             : 1.000
  Answer Relevancy         : 0.925
  Context Precision        : 0.000
  Context Recall           : 0.000
  Context Entity Recall    : 0.250
  Answer Correctness       : 0.361
  Answer Similarity        : 0.899

Processing Sample 137 / 200


Evaluating:   0%|          | 0/7 [00:00<?, ?it/s]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating: 100%|██████████| 7/7 [00:46<00:00,  6.63s/it]


Sample 137 Scores:
  Faithfulness             : 1.000
  Answer Relevancy         : 0.968
  Context Precision        : 1.000
  Context Recall           : 0.500
  Context Entity Recall    : 0.500
  Answer Correctness       : 0.363
  Answer Similarity        : 0.951

Processing Sample 138 / 200


Evaluating:   0%|          | 0/7 [00:00<?, ?it/s]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating: 100%|██████████| 7/7 [00:48<00:00,  6.96s/it]


Sample 138 Scores:
  Faithfulness             : 1.000
  Answer Relevancy         : 1.000
  Context Precision        : 1.000
  Context Recall           : 1.000
  Context Entity Recall    : 0.833
  Answer Correctness       : 0.836
  Answer Similarity        : 0.944

Processing Sample 139 / 200


Evaluating:  14%|█▍        | 1/7 [00:13<01:22, 13.82s/it]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating: 100%|██████████| 7/7 [00:46<00:00,  6.67s/it]


Sample 139 Scores:
  Faithfulness             : 1.000
  Answer Relevancy         : 0.966
  Context Precision        : 1.000
  Context Recall           : 1.000
  Context Entity Recall    : 0.500
  Answer Correctness       : 0.643
  Answer Similarity        : 0.973

Processing Sample 140 / 200


Evaluating:   0%|          | 0/7 [00:00<?, ?it/s]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating: 100%|██████████| 7/7 [00:51<00:00,  7.30s/it]


Sample 140 Scores:
  Faithfulness             : 0.947
  Answer Relevancy         : 0.991
  Context Precision        : 1.000
  Context Recall           : 1.000
  Context Entity Recall    : 0.333
  Answer Correctness       : 0.307
  Answer Similarity        : 0.928

Processing Sample 141 / 200


Evaluating:   0%|          | 0/7 [00:00<?, ?it/s]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating: 100%|██████████| 7/7 [00:50<00:00,  7.15s/it]


Sample 141 Scores:
  Faithfulness             : 0.583
  Answer Relevancy         : 0.976
  Context Precision        : 1.000
  Context Recall           : 0.500
  Context Entity Recall    : 0.333
  Answer Correctness       : 0.681
  Answer Similarity        : 0.960

Processing Sample 142 / 200


Evaluating:   0%|          | 0/7 [00:00<?, ?it/s]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating: 100%|██████████| 7/7 [00:51<00:00,  7.40s/it]


Sample 142 Scores:
  Faithfulness             : 1.000
  Answer Relevancy         : 0.993
  Context Precision        : 1.000
  Context Recall           : 0.000
  Context Entity Recall    : 0.000
  Answer Correctness       : 0.274
  Answer Similarity        : 0.894

Processing Sample 143 / 200


Evaluating:  14%|█▍        | 1/7 [00:02<00:16,  2.67s/it]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating: 100%|██████████| 7/7 [00:47<00:00,  6.77s/it]


Sample 143 Scores:
  Faithfulness             : 1.000
  Answer Relevancy         : 0.000
  Context Precision        : 0.000
  Context Recall           : 0.000
  Context Entity Recall    : 0.500
  Answer Correctness       : 0.231
  Answer Similarity        : 0.924

Processing Sample 144 / 200


Evaluating:   0%|          | 0/7 [00:00<?, ?it/s]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating: 100%|██████████| 7/7 [00:54<00:00,  7.81s/it]


Sample 144 Scores:
  Faithfulness             : 1.000
  Answer Relevancy         : 0.971
  Context Precision        : 0.000
  Context Recall           : 0.000
  Context Entity Recall    : 0.500
  Answer Correctness       : 0.227
  Answer Similarity        : 0.908

Processing Sample 145 / 200


Evaluating:   0%|          | 0/7 [00:00<?, ?it/s]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating: 100%|██████████| 7/7 [00:43<00:00,  6.25s/it]


Sample 145 Scores:
  Faithfulness             : 1.000
  Answer Relevancy         : 0.994
  Context Precision        : 1.000
  Context Recall           : 1.000
  Context Entity Recall    : 1.000
  Answer Correctness       : 0.999
  Answer Similarity        : 0.997

Processing Sample 146 / 200


Evaluating:   0%|          | 0/7 [00:00<?, ?it/s]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating: 100%|██████████| 7/7 [00:48<00:00,  6.96s/it]


Sample 146 Scores:
  Faithfulness             : 1.000
  Answer Relevancy         : 0.934
  Context Precision        : 1.000
  Context Recall           : 1.000
  Context Entity Recall    : 0.500
  Answer Correctness       : 0.335
  Answer Similarity        : 0.940

Processing Sample 147 / 200


Evaluating:   0%|          | 0/7 [00:00<?, ?it/s]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating:  71%|███████▏  | 5/7 [20:13<10:18, 309.38s/it]Exception raised in Job[5]: TimeoutError()
Exception raised in Job[0]: TimeoutError()
Evaluating: 100%|██████████| 7/7 [20:13<00:00, 173.31s/it]


Sample 147 Scores:
  Faithfulness             : nan
  Answer Relevancy         : 1.000
  Context Precision        : 1.000
  Context Recall           : 1.000
  Context Entity Recall    : 0.000
  Answer Correctness       : nan
  Answer Similarity        : 0.913

Processing Sample 148 / 200


Evaluating:   0%|          | 0/7 [00:00<?, ?it/s]Exception raised in Job[1]: APIConnectionError(Connection error.)
Exception raised in Job[4]: APIConnectionError(Connection error.)
Evaluating:  14%|█▍        | 1/7 [00:01<00:10,  1.80s/it]Exception raised in Job[6]: APIConnectionError(Connection error.)
Evaluating:  43%|████▎     | 3/7 [00:01<00:02,  1.98it/s]Exception raised in Job[3]: APIConnectionError(Connection error.)
Exception raised in Job[0]: APIConnectionError(Connection error.)
Exception raised in Job[2]: APIConnectionError(Connection error.)
Exception raised in Job[5]: APIConnectionError(Connection error.)
Evaluating: 100%|██████████| 7/7 [00:01<00:00,  3.50it/s]


Sample 148 Scores:
  Faithfulness             : nan
  Answer Relevancy         : nan
  Context Precision        : nan
  Context Recall           : nan
  Context Entity Recall    : nan
  Answer Correctness       : nan
  Answer Similarity        : nan

Processing Sample 149 / 200


Evaluating:   0%|          | 0/7 [00:00<?, ?it/s]Exception raised in Job[1]: APIConnectionError(Connection error.)
Evaluating:  14%|█▍        | 1/7 [00:01<00:08,  1.42s/it]Exception raised in Job[3]: APIConnectionError(Connection error.)
Exception raised in Job[6]: APIConnectionError(Connection error.)
Exception raised in Job[0]: APIConnectionError(Connection error.)
Evaluating:  57%|█████▋    | 4/7 [00:01<00:00,  3.16it/s]Exception raised in Job[5]: APIConnectionError(Connection error.)
Exception raised in Job[4]: APIConnectionError(Connection error.)
Evaluating:  86%|████████▌ | 6/7 [00:01<00:00,  4.83it/s]Exception raised in Job[2]: APIConnectionError(Connection error.)
Evaluating: 100%|██████████| 7/7 [00:01<00:00,  4.04it/s]


Sample 149 Scores:
  Faithfulness             : nan
  Answer Relevancy         : nan
  Context Precision        : nan
  Context Recall           : nan
  Context Entity Recall    : nan
  Answer Correctness       : nan
  Answer Similarity        : nan

Processing Sample 150 / 200


Evaluating:   0%|          | 0/7 [00:00<?, ?it/s]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating: 100%|██████████| 7/7 [00:48<00:00,  6.87s/it]


Sample 150 Scores:
  Faithfulness             : 1.000
  Answer Relevancy         : 0.986
  Context Precision        : 1.000
  Context Recall           : 1.000
  Context Entity Recall    : 0.000
  Answer Correctness       : 0.388
  Answer Similarity        : 0.922

Processing Sample 151 / 200


Evaluating:  29%|██▊       | 2/7 [00:04<00:11,  2.31s/it]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Exception raised in Job[1]: APIConnectionError(Connection error.)
Evaluating: 100%|██████████| 7/7 [00:46<00:00,  6.65s/it]


Sample 151 Scores:
  Faithfulness             : 1.000
  Answer Relevancy         : nan
  Context Precision        : 1.000
  Context Recall           : 0.500
  Context Entity Recall    : 0.500
  Answer Correctness       : 0.870
  Answer Similarity        : 0.979

Processing Sample 152 / 200


Evaluating:  14%|█▍        | 1/7 [00:02<00:13,  2.25s/it]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating: 100%|██████████| 7/7 [00:29<00:00,  4.19s/it]


Sample 152 Scores:
  Faithfulness             : 1.000
  Answer Relevancy         : 0.989
  Context Precision        : 1.000
  Context Recall           : 1.000
  Context Entity Recall    : 0.667
  Answer Correctness       : 0.997
  Answer Similarity        : 0.990

Processing Sample 153 / 200


Evaluating:  14%|█▍        | 1/7 [00:02<00:13,  2.25s/it]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating:  86%|████████▌ | 6/7 [00:26<00:03,  3.88s/it]Exception raised in Job[5]: APIConnectionError(Connection error.)
Evaluating: 100%|██████████| 7/7 [00:40<00:00,  5.79s/it]


Sample 153 Scores:
  Faithfulness             : 1.000
  Answer Relevancy         : 1.000
  Context Precision        : 1.000
  Context Recall           : 0.667
  Context Entity Recall    : 0.000
  Answer Correctness       : nan
  Answer Similarity        : 0.976

Processing Sample 154 / 200


Evaluating:   0%|          | 0/7 [00:00<?, ?it/s]Exception raised in Job[6]: APIConnectionError(Connection error.)
Evaluating:  14%|█▍        | 1/7 [00:01<00:07,  1.19s/it]Exception raised in Job[5]: APIConnectionError(Connection error.)
Exception raised in Job[3]: APIConnectionError(Connection error.)
Evaluating:  29%|██▊       | 2/7 [00:01<00:02,  1.81it/s]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating: 100%|██████████| 7/7 [00:18<00:00,  2.68s/it]


Sample 154 Scores:
  Faithfulness             : 0.833
  Answer Relevancy         : 0.990
  Context Precision        : 1.000
  Context Recall           : nan
  Context Entity Recall    : 0.667
  Answer Correctness       : nan
  Answer Similarity        : nan

Processing Sample 155 / 200


Evaluating:   0%|          | 0/7 [00:00<?, ?it/s]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating: 100%|██████████| 7/7 [00:31<00:00,  4.55s/it]


Sample 155 Scores:
  Faithfulness             : 1.000
  Answer Relevancy         : 1.000
  Context Precision        : 1.000
  Context Recall           : 1.000
  Context Entity Recall    : 0.667
  Answer Correctness       : 0.844
  Answer Similarity        : 0.977

Processing Sample 156 / 200


Evaluating:  14%|█▍        | 1/7 [00:02<00:15,  2.50s/it]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating: 100%|██████████| 7/7 [00:36<00:00,  5.28s/it]


Sample 156 Scores:
  Faithfulness             : 1.000
  Answer Relevancy         : 0.882
  Context Precision        : 1.000
  Context Recall           : 0.667
  Context Entity Recall    : 0.111
  Answer Correctness       : 0.325
  Answer Similarity        : 0.926

Processing Sample 157 / 200


Evaluating:  14%|█▍        | 1/7 [00:02<00:13,  2.27s/it]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating: 100%|██████████| 7/7 [00:41<00:00,  6.00s/it]


Sample 157 Scores:
  Faithfulness             : 1.000
  Answer Relevancy         : 1.000
  Context Precision        : 1.000
  Context Recall           : 0.500
  Context Entity Recall    : 0.400
  Answer Correctness       : 0.501
  Answer Similarity        : 0.943

Processing Sample 158 / 200


Evaluating:  14%|█▍        | 1/7 [00:02<00:13,  2.30s/it]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating: 100%|██████████| 7/7 [00:36<00:00,  5.22s/it]


Sample 158 Scores:
  Faithfulness             : 0.800
  Answer Relevancy         : 0.985
  Context Precision        : 1.000
  Context Recall           : 0.000
  Context Entity Recall    : 0.200
  Answer Correctness       : 0.333
  Answer Similarity        : 0.930

Processing Sample 159 / 200


Evaluating:  14%|█▍        | 1/7 [00:02<00:13,  2.29s/it]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating:  86%|████████▌ | 6/7 [00:30<00:04,  4.51s/it]Exception raised in Job[5]: APIConnectionError(Connection error.)
Evaluating: 100%|██████████| 7/7 [00:42<00:00,  6.13s/it]


Sample 159 Scores:
  Faithfulness             : 1.000
  Answer Relevancy         : 0.982
  Context Precision        : 0.000
  Context Recall           : 0.000
  Context Entity Recall    : 0.333
  Answer Correctness       : nan
  Answer Similarity        : 0.892

Processing Sample 160 / 200


Evaluating:   0%|          | 0/7 [00:00<?, ?it/s]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating: 100%|██████████| 7/7 [00:37<00:00,  5.31s/it]


Sample 160 Scores:
  Faithfulness             : 0.727
  Answer Relevancy         : 0.968
  Context Precision        : 0.000
  Context Recall           : 0.000
  Context Entity Recall    : 0.000
  Answer Correctness       : 0.229
  Answer Similarity        : 0.917

Processing Sample 161 / 200


Evaluating:  14%|█▍        | 1/7 [00:02<00:14,  2.38s/it]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating: 100%|██████████| 7/7 [00:33<00:00,  4.75s/it]


Sample 161 Scores:
  Faithfulness             : 0.375
  Answer Relevancy         : 1.000
  Context Precision        : 0.000
  Context Recall           : 1.000
  Context Entity Recall    : 0.500
  Answer Correctness       : 0.223
  Answer Similarity        : 0.892

Processing Sample 162 / 200


Evaluating:  14%|█▍        | 1/7 [00:02<00:14,  2.34s/it]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating: 100%|██████████| 7/7 [00:28<00:00,  4.09s/it]


Sample 162 Scores:
  Faithfulness             : 1.000
  Answer Relevancy         : 1.000
  Context Precision        : 1.000
  Context Recall           : 0.500
  Context Entity Recall    : 0.667
  Answer Correctness       : 0.481
  Answer Similarity        : 0.922

Processing Sample 163 / 200


Evaluating:  14%|█▍        | 1/7 [00:02<00:13,  2.27s/it]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating: 100%|██████████| 7/7 [00:27<00:00,  3.90s/it]


Sample 163 Scores:
  Faithfulness             : 1.000
  Answer Relevancy         : 1.000
  Context Precision        : 1.000
  Context Recall           : 0.500
  Context Entity Recall    : 0.667
  Answer Correctness       : 0.995
  Answer Similarity        : 0.981

Processing Sample 164 / 200


Evaluating:  14%|█▍        | 1/7 [00:02<00:13,  2.31s/it]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating:  43%|████▎     | 3/7 [00:16<00:24,  6.05s/it]Exception raised in Job[4]: APIConnectionError(Connection error.)
Evaluating:  71%|███████▏  | 5/7 [00:23<00:09,  4.71s/it]Exception raised in Job[5]: APIConnectionError(Connection error.)
Exception raised in Job[0]: APIConnectionError(Connection error.)
Evaluating: 100%|██████████| 7/7 [00:23<00:00,  3.37s/it]


Sample 164 Scores:
  Faithfulness             : nan
  Answer Relevancy         : 1.000
  Context Precision        : 1.000
  Context Recall           : 1.000
  Context Entity Recall    : nan
  Answer Correctness       : nan
  Answer Similarity        : 0.985

Processing Sample 165 / 200


Evaluating:   0%|          | 0/7 [00:00<?, ?it/s]Exception raised in Job[6]: APIConnectionError(Connection error.)
Evaluating:  14%|█▍        | 1/7 [00:01<00:07,  1.25s/it]Exception raised in Job[4]: APIConnectionError(Connection error.)
Exception raised in Job[3]: APIConnectionError(Connection error.)
Exception raised in Job[5]: APIConnectionError(Connection error.)
Exception raised in Job[0]: APIConnectionError(Connection error.)
Exception raised in Job[2]: APIConnectionError(Connection error.)
Evaluating:  71%|███████▏  | 5/7 [00:01<00:00,  4.60it/s]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating: 100%|██████████| 7/7 [00:07<00:00,  1.14s/it]


Sample 165 Scores:
  Faithfulness             : nan
  Answer Relevancy         : 0.933
  Context Precision        : nan
  Context Recall           : nan
  Context Entity Recall    : nan
  Answer Correctness       : nan
  Answer Similarity        : nan

Processing Sample 166 / 200


Evaluating:   0%|          | 0/7 [00:00<?, ?it/s]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating: 100%|██████████| 7/7 [00:39<00:00,  5.67s/it]


Sample 166 Scores:
  Faithfulness             : 1.000
  Answer Relevancy         : 1.000
  Context Precision        : 1.000
  Context Recall           : 1.000
  Context Entity Recall    : 0.333
  Answer Correctness       : 0.739
  Answer Similarity        : 0.955

Processing Sample 167 / 200


Evaluating:  14%|█▍        | 1/7 [00:02<00:13,  2.25s/it]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating: 100%|██████████| 7/7 [00:28<00:00,  4.06s/it]


Sample 167 Scores:
  Faithfulness             : 0.700
  Answer Relevancy         : 0.986
  Context Precision        : 1.000
  Context Recall           : 0.667
  Context Entity Recall    : 0.875
  Answer Correctness       : 0.863
  Answer Similarity        : 0.982

Processing Sample 168 / 200


Evaluating:  14%|█▍        | 1/7 [00:02<00:13,  2.27s/it]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating: 100%|██████████| 7/7 [00:34<00:00,  4.91s/it]


Sample 168 Scores:
  Faithfulness             : 1.000
  Answer Relevancy         : 0.982
  Context Precision        : 1.000
  Context Recall           : 0.500
  Context Entity Recall    : 0.857
  Answer Correctness       : 0.860
  Answer Similarity        : 0.967

Processing Sample 169 / 200


Evaluating:  14%|█▍        | 1/7 [00:02<00:14,  2.35s/it]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating: 100%|██████████| 7/7 [00:36<00:00,  5.23s/it]


Sample 169 Scores:
  Faithfulness             : 0.800
  Answer Relevancy         : 0.966
  Context Precision        : 1.000
  Context Recall           : 1.000
  Context Entity Recall    : 1.000
  Answer Correctness       : 0.385
  Answer Similarity        : 0.939

Processing Sample 170 / 200


Evaluating:  14%|█▍        | 1/7 [00:02<00:14,  2.35s/it]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Exception raised in Job[1]: APIConnectionError(Connection error.)
Evaluating: 100%|██████████| 7/7 [00:34<00:00,  4.90s/it]


Sample 170 Scores:
  Faithfulness             : 0.750
  Answer Relevancy         : nan
  Context Precision        : 0.000
  Context Recall           : 0.000
  Context Entity Recall    : 0.333
  Answer Correctness       : 0.441
  Answer Similarity        : 0.963

Processing Sample 171 / 200


Evaluating:  14%|█▍        | 1/7 [00:02<00:13,  2.26s/it]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating: 100%|██████████| 7/7 [00:40<00:00,  5.83s/it]


Sample 171 Scores:
  Faithfulness             : 1.000
  Answer Relevancy         : 0.989
  Context Precision        : 1.000
  Context Recall           : 1.000
  Context Entity Recall    : 0.250
  Answer Correctness       : 0.850
  Answer Similarity        : 0.929

Processing Sample 172 / 200


Evaluating:  14%|█▍        | 1/7 [00:02<00:13,  2.29s/it]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating: 100%|██████████| 7/7 [00:30<00:00,  4.40s/it]


Sample 172 Scores:
  Faithfulness             : 0.800
  Answer Relevancy         : 0.995
  Context Precision        : 1.000
  Context Recall           : 1.000
  Context Entity Recall    : 0.632
  Answer Correctness       : 0.906
  Answer Similarity        : 0.957

Processing Sample 173 / 200


Evaluating:  14%|█▍        | 1/7 [00:02<00:13,  2.30s/it]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating: 100%|██████████| 7/7 [00:27<00:00,  3.88s/it]


Sample 173 Scores:
  Faithfulness             : 1.000
  Answer Relevancy         : 1.000
  Context Precision        : 1.000
  Context Recall           : 0.500
  Context Entity Recall    : 0.200
  Answer Correctness       : 0.997
  Answer Similarity        : 0.989

Processing Sample 174 / 200


Evaluating:  14%|█▍        | 1/7 [00:02<00:15,  2.55s/it]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Exception raised in Job[1]: APIConnectionError(Connection error.)
Evaluating: 100%|██████████| 7/7 [00:41<00:00,  5.98s/it]


Sample 174 Scores:
  Faithfulness             : 1.000
  Answer Relevancy         : nan
  Context Precision        : 1.000
  Context Recall           : 0.500
  Context Entity Recall    : 0.000
  Answer Correctness       : 0.468
  Answer Similarity        : 0.925

Processing Sample 175 / 200


Evaluating:  14%|█▍        | 1/7 [00:02<00:13,  2.31s/it]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating: 100%|██████████| 7/7 [00:28<00:00,  4.03s/it]


Sample 175 Scores:
  Faithfulness             : 1.000
  Answer Relevancy         : 1.000
  Context Precision        : 1.000
  Context Recall           : 1.000
  Context Entity Recall    : 0.000
  Answer Correctness       : 0.424
  Answer Similarity        : 0.948

Processing Sample 176 / 200


Evaluating:  14%|█▍        | 1/7 [00:02<00:13,  2.30s/it]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating: 100%|██████████| 7/7 [00:31<00:00,  4.52s/it]


Sample 176 Scores:
  Faithfulness             : 1.000
  Answer Relevancy         : 0.987
  Context Precision        : 0.000
  Context Recall           : 0.000
  Context Entity Recall    : 0.000
  Answer Correctness       : 0.522
  Answer Similarity        : 0.887

Processing Sample 177 / 200


Evaluating:   0%|          | 0/7 [00:00<?, ?it/s]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating: 100%|██████████| 7/7 [00:26<00:00,  3.74s/it]


Sample 177 Scores:
  Faithfulness             : 1.000
  Answer Relevancy         : 1.000
  Context Precision        : 1.000
  Context Recall           : 0.500
  Context Entity Recall    : 0.667
  Answer Correctness       : 0.496
  Answer Similarity        : 0.985

Processing Sample 178 / 200


Evaluating:  14%|█▍        | 1/7 [00:02<00:13,  2.30s/it]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating: 100%|██████████| 7/7 [00:36<00:00,  5.22s/it]


Sample 178 Scores:
  Faithfulness             : 1.000
  Answer Relevancy         : 0.868
  Context Precision        : 1.000
  Context Recall           : 0.500
  Context Entity Recall    : 0.800
  Answer Correctness       : 0.609
  Answer Similarity        : 0.938

Processing Sample 179 / 200


Evaluating:  14%|█▍        | 1/7 [00:02<00:14,  2.35s/it]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating: 100%|██████████| 7/7 [00:34<00:00,  4.93s/it]


Sample 179 Scores:
  Faithfulness             : 1.000
  Answer Relevancy         : 0.995
  Context Precision        : 1.000
  Context Recall           : 0.500
  Context Entity Recall    : 1.000
  Answer Correctness       : 0.871
  Answer Similarity        : 0.983

Processing Sample 180 / 200


Evaluating:  14%|█▍        | 1/7 [00:02<00:13,  2.25s/it]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating: 100%|██████████| 7/7 [00:28<00:00,  4.04s/it]


Sample 180 Scores:
  Faithfulness             : 0.600
  Answer Relevancy         : 0.868
  Context Precision        : 1.000
  Context Recall           : 1.000
  Context Entity Recall    : 0.250
  Answer Correctness       : 0.475
  Answer Similarity        : 0.900

Processing Sample 181 / 200


Evaluating:  14%|█▍        | 1/7 [00:02<00:14,  2.41s/it]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating: 100%|██████████| 7/7 [00:35<00:00,  5.13s/it]


Sample 181 Scores:
  Faithfulness             : 0.769
  Answer Relevancy         : 0.980
  Context Precision        : 0.000
  Context Recall           : 0.429
  Context Entity Recall    : 0.000
  Answer Correctness       : 0.354
  Answer Similarity        : 0.917

Processing Sample 182 / 200


Evaluating:  14%|█▍        | 1/7 [00:02<00:14,  2.38s/it]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating:  71%|███████▏  | 5/7 [00:17<00:06,  3.05s/it]Exception raised in Job[0]: APIConnectionError(Connection error.)
Evaluating: 100%|██████████| 7/7 [00:30<00:00,  4.41s/it]


Sample 182 Scores:
  Faithfulness             : nan
  Answer Relevancy         : 0.994
  Context Precision        : 1.000
  Context Recall           : 1.000
  Context Entity Recall    : 0.000
  Answer Correctness       : 0.311
  Answer Similarity        : 0.912

Processing Sample 183 / 200


Evaluating:  14%|█▍        | 1/7 [00:02<00:14,  2.35s/it]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating: 100%|██████████| 7/7 [00:35<00:00,  5.09s/it]


Sample 183 Scores:
  Faithfulness             : 0.667
  Answer Relevancy         : 0.000
  Context Precision        : 0.000
  Context Recall           : 0.000
  Context Entity Recall    : 0.333
  Answer Correctness       : 0.186
  Answer Similarity        : 0.745

Processing Sample 184 / 200


Evaluating:  14%|█▍        | 1/7 [00:02<00:13,  2.31s/it]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating: 100%|██████████| 7/7 [00:34<00:00,  4.87s/it]


Sample 184 Scores:
  Faithfulness             : 1.000
  Answer Relevancy         : 0.000
  Context Precision        : 1.000
  Context Recall           : 0.500
  Context Entity Recall    : 0.750
  Answer Correctness       : 0.846
  Answer Similarity        : 0.985

Processing Sample 185 / 200


Evaluating:  14%|█▍        | 1/7 [00:02<00:13,  2.31s/it]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating: 100%|██████████| 7/7 [00:31<00:00,  4.51s/it]


Sample 185 Scores:
  Faithfulness             : 0.125
  Answer Relevancy         : 0.972
  Context Precision        : 1.000
  Context Recall           : 1.000
  Context Entity Recall    : 0.667
  Answer Correctness       : 0.229
  Answer Similarity        : 0.916

Processing Sample 186 / 200


Evaluating:  14%|█▍        | 1/7 [00:02<00:13,  2.27s/it]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Exception raised in Job[1]: APIConnectionError(Connection error.)
Evaluating: 100%|██████████| 7/7 [00:35<00:00,  5.10s/it]


Sample 186 Scores:
  Faithfulness             : 0.286
  Answer Relevancy         : nan
  Context Precision        : 1.000
  Context Recall           : 0.667
  Context Entity Recall    : 0.750
  Answer Correctness       : 0.207
  Answer Similarity        : 0.825

Processing Sample 187 / 200


Evaluating:  29%|██▊       | 2/7 [00:05<00:14,  2.87s/it]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating: 100%|██████████| 7/7 [00:36<00:00,  5.20s/it]


Sample 187 Scores:
  Faithfulness             : 1.000
  Answer Relevancy         : 0.927
  Context Precision        : 1.000
  Context Recall           : 0.500
  Context Entity Recall    : 0.667
  Answer Correctness       : 0.221
  Answer Similarity        : 0.885

Processing Sample 188 / 200


Evaluating:  14%|█▍        | 1/7 [00:02<00:17,  2.98s/it]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Exception raised in Job[1]: APIConnectionError(Connection error.)
Evaluating:  86%|████████▌ | 6/7 [00:33<00:05,  5.41s/it]Exception raised in Job[5]: APIConnectionError(Connection error.)
Evaluating: 100%|██████████| 7/7 [00:46<00:00,  6.62s/it]


Sample 188 Scores:
  Faithfulness             : 0.938
  Answer Relevancy         : nan
  Context Precision        : 1.000
  Context Recall           : 1.000
  Context Entity Recall    : 1.000
  Answer Correctness       : nan
  Answer Similarity        : 0.953

Processing Sample 189 / 200


Evaluating:   0%|          | 0/7 [00:00<?, ?it/s]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating: 100%|██████████| 7/7 [00:39<00:00,  5.66s/it]


Sample 189 Scores:
  Faithfulness             : 1.000
  Answer Relevancy         : 0.971
  Context Precision        : 1.000
  Context Recall           : 1.000
  Context Entity Recall    : 0.125
  Answer Correctness       : 0.450
  Answer Similarity        : 0.943

Processing Sample 190 / 200


Evaluating:  14%|█▍        | 1/7 [00:02<00:13,  2.29s/it]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating: 100%|██████████| 7/7 [00:34<00:00,  4.87s/it]


Sample 190 Scores:
  Faithfulness             : 1.000
  Answer Relevancy         : 0.966
  Context Precision        : 1.000
  Context Recall           : 1.000
  Context Entity Recall    : 0.200
  Answer Correctness       : 0.516
  Answer Similarity        : 0.922

Processing Sample 191 / 200


Evaluating:  14%|█▍        | 1/7 [00:02<00:14,  2.48s/it]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating: 100%|██████████| 7/7 [00:36<00:00,  5.24s/it]


Sample 191 Scores:
  Faithfulness             : 1.000
  Answer Relevancy         : 1.000
  Context Precision        : 1.000
  Context Recall           : 0.500
  Context Entity Recall    : 0.333
  Answer Correctness       : 0.408
  Answer Similarity        : 0.964

Processing Sample 192 / 200


Evaluating:  14%|█▍        | 1/7 [00:02<00:13,  2.28s/it]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating:  71%|███████▏  | 5/7 [00:17<00:07,  3.58s/it]Exception raised in Job[0]: APIConnectionError(Connection error.)
Evaluating:  86%|████████▌ | 6/7 [00:24<00:04,  4.63s/it]Exception raised in Job[5]: APIConnectionError(Connection error.)
Evaluating: 100%|██████████| 7/7 [00:24<00:00,  3.53s/it]


Sample 192 Scores:
  Faithfulness             : nan
  Answer Relevancy         : 0.951
  Context Precision        : 1.000
  Context Recall           : 1.000
  Context Entity Recall    : 0.500
  Answer Correctness       : nan
  Answer Similarity        : 0.974

Processing Sample 193 / 200


Evaluating:   0%|          | 0/7 [00:00<?, ?it/s]Exception raised in Job[0]: APIConnectionError(Connection error.)
Exception raised in Job[1]: APIConnectionError(Connection error.)
Evaluating:  14%|█▍        | 1/7 [00:01<00:07,  1.30s/it]Exception raised in Job[4]: APIConnectionError(Connection error.)
Exception raised in Job[3]: APIConnectionError(Connection error.)
Exception raised in Job[2]: APIConnectionError(Connection error.)
Exception raised in Job[5]: APIConnectionError(Connection error.)
Evaluating:  86%|████████▌ | 6/7 [00:01<00:00,  5.48it/s]Exception raised in Job[6]: APIConnectionError(Connection error.)
Evaluating: 100%|██████████| 7/7 [00:01<00:00,  4.79it/s]


Sample 193 Scores:
  Faithfulness             : nan
  Answer Relevancy         : nan
  Context Precision        : nan
  Context Recall           : nan
  Context Entity Recall    : nan
  Answer Correctness       : nan
  Answer Similarity        : nan

Processing Sample 194 / 200


Evaluating:   0%|          | 0/7 [00:00<?, ?it/s]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating: 100%|██████████| 7/7 [00:27<00:00,  3.96s/it]


Sample 194 Scores:
  Faithfulness             : 1.000
  Answer Relevancy         : 0.943
  Context Precision        : 1.000
  Context Recall           : 1.000
  Context Entity Recall    : 0.000
  Answer Correctness       : 0.646
  Answer Similarity        : 0.949

Processing Sample 195 / 200


Evaluating:  14%|█▍        | 1/7 [00:02<00:13,  2.28s/it]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating:  71%|███████▏  | 5/7 [00:17<00:06,  3.05s/it]Exception raised in Job[5]: APIConnectionError(Connection error.)
Evaluating: 100%|██████████| 7/7 [00:30<00:00,  4.42s/it]


Sample 195 Scores:
  Faithfulness             : 1.000
  Answer Relevancy         : 0.985
  Context Precision        : 1.000
  Context Recall           : 1.000
  Context Entity Recall    : 1.000
  Answer Correctness       : nan
  Answer Similarity        : 0.950

Processing Sample 196 / 200


Evaluating:   0%|          | 0/7 [00:00<?, ?it/s]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating: 100%|██████████| 7/7 [00:31<00:00,  4.56s/it]


Sample 196 Scores:
  Faithfulness             : 0.929
  Answer Relevancy         : 1.000
  Context Precision        : 1.000
  Context Recall           : 1.000
  Context Entity Recall    : 0.250
  Answer Correctness       : 0.413
  Answer Similarity        : 0.945

Processing Sample 197 / 200


Evaluating:  14%|█▍        | 1/7 [00:02<00:13,  2.30s/it]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating: 100%|██████████| 7/7 [00:40<00:00,  5.81s/it]


Sample 197 Scores:
  Faithfulness             : 1.000
  Answer Relevancy         : 0.993
  Context Precision        : 1.000
  Context Recall           : 0.667
  Context Entity Recall    : 0.500
  Answer Correctness       : 0.404
  Answer Similarity        : 0.948

Processing Sample 198 / 200


Evaluating:  14%|█▍        | 1/7 [00:02<00:13,  2.32s/it]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating: 100%|██████████| 7/7 [00:32<00:00,  4.68s/it]


Sample 198 Scores:
  Faithfulness             : 1.000
  Answer Relevancy         : 0.972
  Context Precision        : 1.000
  Context Recall           : 0.500
  Context Entity Recall    : 1.000
  Answer Correctness       : 0.426
  Answer Similarity        : 0.954

Processing Sample 199 / 200


Evaluating:  14%|█▍        | 1/7 [00:02<00:13,  2.31s/it]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating: 100%|██████████| 7/7 [00:38<00:00,  5.54s/it]


Sample 199 Scores:
  Faithfulness             : 1.000
  Answer Relevancy         : 0.984
  Context Precision        : 1.000
  Context Recall           : 1.000
  Context Entity Recall    : 1.000
  Answer Correctness       : 0.642
  Answer Similarity        : 0.968

Processing Sample 200 / 200


Evaluating:  14%|█▍        | 1/7 [00:02<00:13,  2.26s/it]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating: 100%|██████████| 7/7 [00:34<00:00,  4.93s/it]


Sample 200 Scores:
  Faithfulness             : 1.000
  Answer Relevancy         : 0.990
  Context Precision        : 1.000
  Context Recall           : 1.000
  Context Entity Recall    : 1.000
  Answer Correctness       : 0.315
  Answer Similarity        : 0.946

Combining all results...
Detailed results saved to evaluation_results.json
Results exported to evaluation_results.csv
Summary statistics saved to evaluation_summary.json

RAGAS QUICK SUMMARY (Mean Scores)
Faithfulness             : 0.926
Answer Relevancy         : 0.921
Context Precision        : 0.867
Context Recall           : 0.687
Context Entity Recall    : 0.423
Answer Correctness       : 0.452
Answer Similarity        : 0.939

 ✅ Evaluation completed successfully!
Total successful samples evaluated: 200

First 3 results preview:

Sample 1:
Question (Start): N/A...
Faithfulness: 0.923
Answer Relevancy: 0.975
Context Precision: 1.000
Context Recall: 1.000
Context Entity Recall: 0.333
Answer Correctness: 0.429
Answer Simil