# Prepare Evaluation Dataset

Source: https://docs.ragas.io/en/latest/getstarted/rag_testset_generation/#choose-your-llm

## Install Dependencies

In [None]:
%pip install ragas
%pip install unstructured
%pip install unstructured[pdf]
%pip install unstructured[docx]
%pip install 'langchain-openai>=0.2.1,<0.3.0'
%pip install 'tiktoken>=0.7.0,<0.8.0'

## Load Azure configurations

Always run this cell first: every later cell depends on the settings it loads.

In [13]:
from dotenv import load_dotenv
import os
from pathlib import Path

# Load key/value pairs from a local .env file into the process environment.
# Without this call the os.getenv() lookups below only see variables that were
# already exported to the kernel's environment. By default load_dotenv() does
# not override variables that are already set.
load_dotenv()

# Azure OpenAI settings (chat + embeddings deployments).
azure_openai_endpoint = os.getenv("AZURE_OPENAI_ENDPOINT")
azure_openai_key = os.getenv("AZURE_OPENAI_API_KEY")
azure_openai_deployment = os.getenv("AZURE_OPENAI_CHAT_DEPLOYMENT_NAME")
azure_openai_embeddings_deployment = os.getenv("AZURE_OPENAI_EMBEDDINGS_DEPLOYMENT")
azure_openai_api_version = os.getenv("AZURE_OPENAI_API_VERSION")

# Azure AI Search settings used as the retrieval data source.
azure_search_service_endpoint = os.getenv("AZURE_SEARCH_SERVICE_ENDPOINT")
azure_search_service_admin_key = os.getenv("AZURE_SEARCH_ADMIN_KEY")
azure_search_service_index_name = os.getenv("AZURE_SEARCH_INDEX_NAME")

# Endpoint URL passed to the search data source as its embedding dependency.
embedding_endpoint = os.getenv("EMBEDDING_ENDPOINT_URL")

## Load Documents

Note: The first time I loaded the files I got an error, which I fixed by running the following in the GitHub Codespaces terminal:
- sudo apt-get update
- sudo apt-get install -y libgl1-mesa-glx

In [4]:
from langchain_community.document_loaders import DirectoryLoader

# Folder holding the source PDFs the test set is generated from.
path = "../data/hikingproducts/dataset"

# Recursively pick up every PDF below `path` and load them as LangChain documents.
loader = DirectoryLoader(path=path, glob="**/*.pdf")
docs = loader.load()

  from .autonotebook import tqdm as notebook_tqdm


## Prepare Language Model

In [8]:
from langchain_openai import AzureChatOpenAI
from langchain_openai import AzureOpenAIEmbeddings
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper

# Chat model served by the Azure OpenAI deployment configured above.
azure_chat_model = AzureChatOpenAI(
    azure_endpoint=azure_openai_endpoint,
    api_key=azure_openai_key,
    openai_api_version=azure_openai_api_version,
    azure_deployment=azure_openai_deployment,
    model=azure_openai_deployment,
    validate_base_url=False,
)

# Embedding model served by the Azure OpenAI embeddings deployment.
azure_embedding_model = AzureOpenAIEmbeddings(
    azure_endpoint=azure_openai_endpoint,
    api_key=azure_openai_key,
    openai_api_version=azure_openai_api_version,
    azure_deployment=azure_openai_embeddings_deployment,
    model=azure_openai_embeddings_deployment,
)

# Wrap both LangChain objects in the adapters the ragas test-set generator takes.
generator_llm = LangchainLLMWrapper(azure_chat_model)
generator_embeddings = LangchainEmbeddingsWrapper(azure_embedding_model)

## Generate the Test Set

In [9]:
from ragas.testset import TestsetGenerator

# Build the generator from the wrapped LLM and embedding model.
generator = TestsetGenerator(
    llm=generator_llm,
    embedding_model=generator_embeddings,
)

# Request a test set of 10 samples from the loaded documents.
dataset = generator.generate_with_langchain_docs(
    documents=docs,
    testset_size=10,
)

Generating personas: 100%|██████████| 3/3 [00:01<00:00,  2.94it/s]                                             
Generating Scenarios: 100%|██████████| 3/3 [00:10<00:00,  3.62s/it]
Generating Samples: 100%|██████████| 12/12 [00:04<00:00,  2.78it/s]


## Save just the query and ground truth to a JSONL file for evaluation

In [14]:
import pandas as pd

# Convert the generated test set to a pandas DataFrame.
df = dataset.to_pandas()

# Build the evaluation frame in one step: the question and reference answer come
# from the generated test set, while response/context start out empty and are
# filled in by a later cell.
eval_collection = pd.DataFrame(
    {
        'query': df['user_input'],
        'response': '',
        'context': '',
        'ground_truth': df['reference'],
    }
)

# Persist one JSON object per line (JSONL).
eval_collection.to_json(
    '../data/hikingproducts/evaluation/hikingproductseval.jsonl',
    orient='records',
    lines=True,
)

## Generate the Response and Context from the Language Model

In [15]:
from openai import AzureOpenAI
import json

# Initialize Azure OpenAI client with key-based authentication
client = AzureOpenAI(
    azure_endpoint=azure_openai_endpoint,
    api_key=azure_openai_key,
    api_version=azure_openai_api_version,
)

# Azure AI Search data source sent with every chat request. It does not depend
# on the query, so it is built once here instead of on every loop iteration.
search_data_source = {
    "type": "azure_search",
    "parameters": {
        "endpoint": azure_search_service_endpoint,
        "index_name": azure_search_service_index_name,
        # NOTE(review): hard-coded while the index name comes from the
        # environment; confirm it matches the configured index.
        "semantic_configuration": "hikingproductsindex01-semantic-configuration",
        "query_type": "vector_semantic_hybrid",
        "fields_mapping": {
            "content_fields_separator": "\n",
            "content_fields": ["chunk"],
            "filepath_field": "title",
            "title_field": "chunk_id",
            "url_field": None,
            "vector_fields": ["text_vector"],
        },
        "in_scope": True,
        "filter": None,
        "strictness": 3,
        "top_n_documents": 3,
        "authentication": {
            "type": "api_key",
            "key": azure_search_service_admin_key,
        },
        "embedding_dependency": {
            "type": "endpoint",
            "endpoint": embedding_endpoint,
            "authentication": {
                "type": "api_key",
                "key": azure_openai_key,
            },
        },
    },
}

system_prompt = "You are an AI assistant that helps people find information."
citation_separator = "\n=============\n"

# Iterate over the query column only; the response/context columns are written
# back by index label, so the frame being updated is not the object iterated.
for index, query in eval_collection['query'].items():
    # Generate response using the Azure OpenAI client
    completion = client.chat.completions.create(
        model=azure_openai_deployment,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": query},
        ],
        max_tokens=800,
        temperature=1,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0,
        stop=None,
        stream=False,
        extra_body={"data_sources": [search_data_source]},
    )

    # The citations are read from the JSON form of the completion, under the
    # message's `context` field. A reply without context/citations yields an
    # empty context string instead of raising KeyError and aborting the run.
    response_json = json.loads(completion.to_json())
    message_json = response_json['choices'][0]['message']
    citations = (message_json.get('context') or {}).get('citations') or []
    citations_combined = citation_separator.join(
        citation.get('content') or '' for citation in citations
    )

    # Update the response and context in the DataFrame
    eval_collection.at[index, 'response'] = completion.choices[0].message.content
    eval_collection.at[index, 'context'] = citations_combined

# Save the updated DataFrame as a JSONL file
eval_collection.to_json('../data/hikingproducts/evaluation/hikingproductsevalfinal.jsonl', orient='records', lines=True)

# Print success message
print("EvalCollection has been saved to hikingproductsevalfinal.jsonl")

EvalCollection has been saved to hikingproductsevalfinal.jsonl
