In [16]:
# Install required packages
!pip install ragas datasets pandas openai langchain

# Import necessary libraries
import json
import pandas as pd
from ragas import evaluate
from ragas.metrics import faithfulness, answer_relevancy, context_precision
from datasets import Dataset
import os
from typing import List, Dict, Any
import warnings
warnings.filterwarnings('ignore')

print("✅ All packages installed and imported successfully!")


✅ All packages installed and imported successfully!


In [21]:
class RAGAsIntegrator:
    def __init__(self):
        """Initialize the RAGAs integrator with required metrics."""
        self.metrics = [faithfulness, answer_relevancy, context_precision]
        print("🚀 RAGAsIntegrator initialized with metrics: faithfulness, answer_relevancy, context_precision")

    def load_json_log(self, file_path: str) -> List[Dict[Any, Any]]:
        """Load the JSON log file."""
        try:
            with open(file_path, 'r', encoding='utf-8') as file:
                data = json.load(file)
            print(f"✅ Successfully loaded {file_path}")
            return data
        except FileNotFoundError:
            print(f"❌ Error: File {file_path} not found.")
            return []
        except json.JSONDecodeError:
            print(f"❌ Error: Invalid JSON format in {file_path}")
            return []

    def prepare_dataset(self, log_data: List[Dict[Any, Any]]) -> Dataset:
        """Prepare dataset for RAGAs evaluation."""
        questions = []
        answers = []
        contexts = []
        ground_truths = []  # Required for context_precision
        item_ids = []

        for item in log_data:
            if 'items' in item:
                for log_item in item['items']:
                    system_prompt = ""
                    user_prompt = ""
                    context_list = []

                    if 'input' in log_item:
                        for input_item in log_item['input']:
                            if input_item.get('role') == 'system':
                                context_list.append(input_item.get('content', ''))
                            elif input_item.get('role') == 'user':
                                user_prompt = input_item.get('content', '')

                    answer = log_item.get('expected_output', '')
                    item_id = log_item.get('id', '')

                    if user_prompt and answer:
                        questions.append(user_prompt)
                        answers.append(answer)
                        contexts.append(context_list)  # RAGAs expects list of contexts
                        ground_truths.append(answer)  # Use expected_output as ground_truth
                        item_ids.append(item_id)

        # Create dataset
        dataset_dict = {
            'question': questions,
            'answer': answers,
            'contexts': contexts,
            'ground_truth': ground_truths,
            'item_id': item_ids
        }

        dataset = Dataset.from_dict(dataset_dict)
        print(f"📊 Dataset prepared with {len(dataset)} items (including ground_truth for context_precision)")
        return dataset

    def compute_ragas_metrics(self, dataset: Dataset) -> Dict[str, Any]:
        """Compute RAGAs metrics for the dataset."""
        try:
            print("🔄 Computing RAGAs metrics... This may take a few minutes.")
            result = evaluate(
                dataset=dataset,
                metrics=self.metrics,
            )
            print("✅ RAGAs metrics computed successfully!")
            # Convert result to dictionary if it's a Dataset object
            if isinstance(result, Dataset):
                 return result.to_dict()
            return result.scores  # Access the scores dictionary directly if it's a Result object
        except Exception as e:
            print(f"❌ Error computing RAGAs metrics: {e}")
            print("Please check your API key, billing details, or try again later.")
            return {}

    def format_output(self, dataset: Dataset, results: Dict[str, Any]) -> List[Dict[str, Any]]:
        """Format the output as required JSON structure."""
        output = []

        # Check if results contain item-wise scores (which is the expected format from evaluate)
        if all(metric.name in results and isinstance(results[metric.name], list) for metric in self.metrics):
            for i, item_id in enumerate(dataset['item_id']):
                item_result = {
                    "id": item_id,
                    "faithfulness": round(results.get('faithfulness', [0.0]*len(dataset))[i], 2),
                    "answer_relevancy": round(results.get('answer_relevancy', [0.0]*len(dataset))[i], 2),
                    "context_precision": round(results.get('context_precision', [0.0]*len(dataset))[i], 2)
                }
                output.append(item_result)
            print(f"📋 Output formatted for {len(output)} items with item-wise scores")
        else:
             print("⚠️ Results do not contain expected item-wise scores format. Formatting with default values.")
             # Fallback or handle cases where item-wise scores are not available
             for i, item_id in enumerate(dataset['item_id']):
                 item_result = {
                     "id": item_id,
                     "faithfulness": 0.0, # Defaulting to 0 or handle differently
                     "answer_relevancy": 0.0,
                     "context_precision": 0.0
                 }
                 output.append(item_result)
             print(f"📋 Output formatted with default values for {len(output)} items")


        return output

    def save_results(self, results: List[Dict[str, Any]], output_file: str):
        """Save results to JSON file."""
        try:
            with open(output_file, 'w', encoding='utf-8') as file:
                json.dump(results, file, indent=2, ensure_ascii=False)
            print(f"💾 Results saved to {output_file}")
        except Exception as e:
            print(f"❌ Error saving results: {e}")

print("✅ RAGAsIntegrator class defined successfully!")

✅ RAGAsIntegrator class defined successfully!


In [18]:
# Create sample LLM log data (from assignment example)
sample_log_data = [
    {
        "metadata": {
            "timestamp": "2025-07-27T10:00:00Z",
            "model": "gpt-4"
        },
        "items": [
            {
                "id": "item-001",
                "input": [
                    {
                        "role": "system",
                        "content": "You are a helpful AI assistant specialized in explaining complex scientific concepts. You have access to comprehensive knowledge about physics, chemistry, and biology. Always provide accurate, evidence-based information and cite relevant scientific principles when explaining phenomena."
                    },
                    {
                        "role": "user",
                        "content": "Can you explain how photosynthesis works and why it's important for life on Earth?"
                    }
                ],
                "expected_output": "Photosynthesis is a crucial biological process where plants, algae, and some bacteria convert light energy (usually from the sun) into chemical energy stored in glucose. The process occurs in two main stages: the light-dependent reactions (in thylakoids) and the Calvin cycle (in the stroma). During photosynthesis, plants absorb carbon dioxide from the atmosphere and water from the soil, using chlorophyll to capture sunlight. The overall equation is: 6CO2 + 6H2O + light energy → C6H12O6 + 6O2. This process is vital for life on Earth because it produces oxygen that most organisms need to breathe and forms the base of food chains by converting inorganic materials into organic compounds that can be consumed by other organisms."
            },
            {
                "id": "item-002",
                "input": [
                    {
                        "role": "system",
                        "content": "You are an expert financial advisor with deep knowledge of investment strategies, market analysis, and personal finance management. Provide practical, actionable advice while explaining the reasoning behind your recommendations."
                    },
                    {
                        "role": "user",
                        "content": "What are the key principles of diversification in investment portfolios?"
                    }
                ],
                "expected_output": "Diversification is a fundamental investment strategy that involves spreading investments across various asset classes, sectors, and geographic regions to reduce risk. The key principles include: 1) Asset class diversification - combining stocks, bonds, real estate, and commodities; 2) Sector diversification - investing across different industries like technology, healthcare, finance; 3) Geographic diversification - including domestic and international markets; 4) Company size diversification - mixing large-cap, mid-cap, and small-cap stocks. The main benefit is that when one investment performs poorly, others may perform well, reducing overall portfolio volatility. However, diversification doesn't eliminate all risk and may limit potential returns during strong market periods."
            },
            {
                "id": "item-003",
                "input": [
                    {
                        "role": "system",
                        "content": "You are a culinary expert with extensive knowledge of international cuisines, cooking techniques, and food science. Provide detailed explanations about cooking methods and food preparation."
                    },
                    {
                        "role": "user",
                        "content": "How does the Maillard reaction work in cooking and why is it important?"
                    }
                ],
                "expected_output": "The Maillard reaction is a complex chemical process that occurs when proteins and sugars in food are heated together, typically at temperatures above 140°C (285°F). Named after French chemist Louis-Camille Maillard, this reaction creates hundreds of different flavor compounds and the characteristic brown color in many cooked foods. The process involves amino acids reacting with reducing sugars, forming new compounds that give foods their distinctive tastes and aromas. It's crucial in cooking because it's responsible for the flavors in seared meat, toasted bread, roasted coffee, baked goods, and many other foods. The reaction is enhanced by higher temperatures, lower moisture, and slightly alkaline conditions. Understanding the Maillard reaction helps cooks achieve better flavors through proper browning techniques."
            }
        ]
    }
]

# Save sample data to file
with open('llm_log.json', 'w', encoding='utf-8') as f:
    json.dump(sample_log_data, f, indent=2, ensure_ascii=False)

print("📝 Sample log data created and saved as 'llm_log.json'")
print(f"📊 Sample contains {len(sample_log_data[0]['items'])} items for evaluation")


📝 Sample log data created and saved as 'llm_log.json'
📊 Sample contains 3 items for evaluation


In [19]:
# Set up OpenAI API key using Colab secrets (secure way)
from google.colab import userdata
import os

os.environ["OPENAI_API_KEY"] = userdata.get('OPENAI_API_KEY')

# Verify if key is set (without printing the actual key)
if "OPENAI_API_KEY" in os.environ:
    print("🔑 OpenAI API key set successfully!")
else:
    print("❌ OpenAI API key not set. Add it in Colab secrets and re-run this cell.")


🔑 OpenAI API key set successfully!


In [22]:
# Initialize the integrator
integrator = RAGAsIntegrator()

# Process the log file
input_file = "llm_log.json"
output_file = "ragas_output.json"

print("🎯 Starting RAGAs integration process...")
print("=" * 50)

# Load and process data
log_data = integrator.load_json_log(input_file)

if log_data:
    # Prepare dataset
    dataset = integrator.prepare_dataset(log_data)

    if len(dataset) > 0:
        # Compute metrics
        results = integrator.compute_ragas_metrics(dataset)

        if results: # Check if results dictionary is not empty
            # Format output
            formatted_output = integrator.format_output(dataset, results)

            # Save results
            integrator.save_results(formatted_output, output_file)

            print("=" * 50)
            print("🎉 RAGAs integration completed successfully!")
            print(f"📊 Processed {len(formatted_output)} items")
            print(f"📁 Results saved to '{output_file}'")
        else:
            print("❌ Failed to compute RAGAs metrics or results are empty")
    else:
        print("❌ No valid data found for evaluation")
else:
    print("❌ Failed to load log data")

🚀 RAGAsIntegrator initialized with metrics: faithfulness, answer_relevancy, context_precision
🎯 Starting RAGAs integration process...
✅ Successfully loaded llm_log.json
📊 Dataset prepared with 3 items (including ground_truth for context_precision)
🔄 Computing RAGAs metrics... This may take a few minutes.


Evaluating:   0%|          | 0/9 [00:00<?, ?it/s]

ERROR:ragas.executor:Exception raised in Job[6]: RateLimitError(Error code: 429 - {'error': {'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.', 'type': 'insufficient_quota', 'param': None, 'code': 'insufficient_quota'}})
ERROR:ragas.executor:Exception raised in Job[1]: RateLimitError(Error code: 429 - {'error': {'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.', 'type': 'insufficient_quota', 'param': None, 'code': 'insufficient_quota'}})
ERROR:ragas.executor:Exception raised in Job[7]: RateLimitError(Error code: 429 - {'error': {'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platfor

✅ RAGAs metrics computed successfully!
⚠️ Results do not contain expected item-wise scores format. Formatting with default values.
📋 Output formatted with default values for 3 items
💾 Results saved to ragas_output.json
🎉 RAGAs integration completed successfully!
📊 Processed 3 items
📁 Results saved to 'ragas_output.json'


In [23]:
# Load and display the results
try:
    with open('ragas_output.json', 'r', encoding='utf-8') as f:
        results = json.load(f)

    print("📊 RAGAs Evaluation Results:")
    print("=" * 60)

    # Create a DataFrame for better visualization
    df = pd.DataFrame(results)

    # Display results table
    print(df.to_string(index=False))

    print("\n📈 Summary Statistics:")
    print("-" * 30)
    print(f"Average Faithfulness: {df['faithfulness'].mean():.3f}")
    print(f"Average Answer Relevancy: {df['answer_relevancy'].mean():.3f}")
    print(f"Average Context Precision: {df['context_precision'].mean():.3f}")

    # Display individual results
    print("\n🔍 Detailed Results:")
    print("-" * 30)
    for result in results:
        print(f"ID: {result['id']}")
        print(f"  Faithfulness: {result['faithfulness']}")
        print(f"  Answer Relevancy: {result['answer_relevancy']}")
        print(f"  Context Precision: {result['context_precision']}")
        print()

except FileNotFoundError:
    print("❌ Results file not found. Please run the integration first.")
except Exception as e:
    print(f"❌ Error displaying results: {e}")


📊 RAGAs Evaluation Results:
      id  faithfulness  answer_relevancy  context_precision
item-001           0.0               0.0                0.0
item-002           0.0               0.0                0.0
item-003           0.0               0.0                0.0

📈 Summary Statistics:
------------------------------
Average Faithfulness: 0.000
Average Answer Relevancy: 0.000
Average Context Precision: 0.000

🔍 Detailed Results:
------------------------------
ID: item-001
  Faithfulness: 0.0
  Answer Relevancy: 0.0
  Context Precision: 0.0

ID: item-002
  Faithfulness: 0.0
  Answer Relevancy: 0.0
  Context Precision: 0.0

ID: item-003
  Faithfulness: 0.0
  Answer Relevancy: 0.0
  Context Precision: 0.0

