# Prepare Evaluation Dataset

Source: https://docs.ragas.io/en/latest/getstarted/rag_testset_generation/#choose-your-llm

## Install Dependencies

In [None]:
# Install the packages this notebook needs, using the Jupyter %pip magic so
# they land in the environment of the running kernel.
# NOTE(review): later cells also import dotenv, langchain_community, pandas,
# azure.search.documents and openai, none of which are installed here --
# confirm they are already available in the environment.
%pip install ragas
%pip install unstructured
%pip install unstructured[pdf]
%pip install langchain-openai

## Load Azure configurations

Run this cell first in every session: the cells below rely on the variables it defines.

In [None]:
from dotenv import load_dotenv
import os

# Copy the key/value pairs from a local .env file into the process environment.
load_dotenv()

# --- Azure OpenAI -----------------------------------------------------------
# Endpoint, key and deployment names are read from the environment (None when
# a variable is unset); the API version is pinned in the notebook.
azure_openai_endpoint = os.environ.get("AZURE_OPENAI_ENDPOINT")
azure_openai_key = os.environ.get("AZURE_OPENAI_KEY")
azure_openai_deployment = os.environ.get("AZURE_OPENAI_DEPLOYMENT")
azure_openai_embeddings_deployment = os.environ.get(
    "AZURE_OPENAI_EMBEDDINGS_DEPLOYMENT"
)
azure_openai_api_version = "2024-10-01-preview"

# --- Azure AI Search --------------------------------------------------------
# Endpoint and admin key come from the environment; the index name is fixed.
azure_search_service_endpoint = os.environ.get("AZURE_SEARCH_SERVICE_ENDPOINT")
azure_search_service_admin_key = os.environ.get("AZURE_SEARCH_ADMIN_KEY")
azure_search_service_index_name = "az-search-index-001"

# --- Azure Storage ----------------------------------------------------------
azure_storage_connection_string = os.environ.get(
    "AZURE_STORAGE_CONNECTION_STRING"
)

## Load Documents

Note: The first time I loaded the files I got an error, so I had to run the following commands in the GitHub Codespaces terminal:
- sudo apt-get update
- sudo apt-get install -y libgl1-mesa-glx

In [None]:
from langchain_community.document_loaders import DirectoryLoader

# Load every PDF found under the data directory (searched recursively via the
# "**/*.pdf" glob) into a list of documents.
path = "data/nasa-book/"
loader = DirectoryLoader(path, glob="**/*.pdf")
docs = loader.load()

# Fail fast with a clear message: an empty document list would otherwise only
# surface later, as a harder-to-diagnose failure during test set generation.
if not docs:
    raise FileNotFoundError(
        f"No PDF documents were loaded from {path!r}; "
        "check the path and that it contains .pdf files."
    )

## Prepare Language Model

In [None]:
from langchain_openai import AzureChatOpenAI
from langchain_openai import AzureOpenAIEmbeddings
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper

# Azure OpenAI chat model; the deployment name is passed both as the
# deployment and as the model name.
azure_chat_model = AzureChatOpenAI(
    openai_api_version=azure_openai_api_version,
    azure_endpoint=azure_openai_endpoint,
    azure_deployment=azure_openai_deployment,
    model=azure_openai_deployment,
    validate_base_url=False,
    api_key=azure_openai_key,
)
# Wrap the LangChain chat model so ragas can use it.
generator_llm = LangchainLLMWrapper(azure_chat_model)

# Azure OpenAI embeddings model, configured against the embeddings deployment.
azure_embedding_model = AzureOpenAIEmbeddings(
    openai_api_version=azure_openai_api_version,
    azure_endpoint=azure_openai_endpoint,
    azure_deployment=azure_openai_embeddings_deployment,
    model=azure_openai_embeddings_deployment,
    api_key=azure_openai_key,
)
# Wrap the LangChain embeddings so ragas can use them.
generator_embeddings = LangchainEmbeddingsWrapper(azure_embedding_model)

## Generate the Test Set

In [None]:
from ragas.testset import TestsetGenerator

# Build the test set generator from the wrapped chat model and embeddings.
generator = TestsetGenerator(
    llm=generator_llm,
    embedding_model=generator_embeddings,
)

# Generate a test set of 10 samples from the loaded documents.
dataset = generator.generate_with_langchain_docs(
    docs,
    testset_size=10,
)

## Save just the query and ground truth to a JSONL file for evaluation

In [None]:
import pandas as pd

# Convert the generated test set into a pandas DataFrame for post-processing.
df = dataset.to_pandas()

import re

# A UUID as 32 hexadecimal digits, with or without the usual hyphens.
_UUID_PATTERN = re.compile(
    r"[0-9a-fA-F]{8}-?[0-9a-fA-F]{4}-?[0-9a-fA-F]{4}-?"
    r"[0-9a-fA-F]{4}-?[0-9a-fA-F]{12}"
)


def _strip_uuid_prefix(text):
    """Return *text* without a leading ``<uuid>`` + blank-line prefix, if present."""
    head, separator, remainder = text.partition('\n\n')
    # Only drop the first segment when it really is a UUID; otherwise it is
    # the first paragraph of real content and must be kept.
    if separator and _UUID_PATTERN.fullmatch(head.strip()):
        return remainder
    return text


def clean_text(text_list):
    """Remove the leading UUID line from each text in *text_list*.

    A text is only modified when it starts with a UUID followed by a blank
    line (two newlines). Texts without such a prefix are returned unchanged,
    so a context that simply contains paragraph breaks does not lose its
    first paragraph.

    Args:
        text_list: Iterable of strings (e.g. one row of reference contexts).

    Returns:
        A new list with the cleaned strings, in the same order.
    """
    return [_strip_uuid_prefix(text) for text in text_list]

import os

# Run clean_text over every row of the 'reference_contexts' column.
df['reference_contexts'] = df['reference_contexts'].apply(clean_text)

# pandas does not create missing parent directories, so make sure the output
# folder exists before writing any file into it.
os.makedirs('data/output', exist_ok=True)

# Keep the full generated test set as a CSV file.
df.to_csv('data/output/evalset.csv', index=False)

# Build the evaluation collection: the query and ground truth come from the
# test set, while response and context start empty so they can be filled in
# later.
eval_collection = pd.DataFrame({
    'query': df['user_input'],
    'response': '',
    'context': '',
    'ground_truth': df['reference'],
})

# Save the evaluation collection as JSON Lines (one record per line).
eval_collection.to_json('data/output/eval.jsonl', orient='records', lines=True)

## Generate the Response and Context from the Language Model

In [None]:
from azure.search.documents import SearchClient
from azure.search.documents.models import VectorizableTextQuery
from azure.core.credentials import AzureKeyCredential
from openai import AzureOpenAI

# Azure AI Search client for the configured index, authenticated with the
# admin key.
credential = AzureKeyCredential(azure_search_service_admin_key)
search_client = SearchClient(
    endpoint=azure_search_service_endpoint,
    index_name=azure_search_service_index_name,
    credential=credential,
)

# Azure OpenAI client.
# For available API versions see:
# https://learn.microsoft.com/en-us/azure/ai-services/openai/api-version-deprecation
openai_client = AzureOpenAI(
    azure_endpoint=azure_openai_endpoint,
    api_key=azure_openai_key,
    api_version=azure_openai_api_version,
)

# Instruction template for the model. The {query} and {sources} placeholders
# are meant to be filled in with str.format before the prompt is sent.
SYSTEM_PROMPT = """
You are an AI assistant that helps users learn from the information found in the source material.
Answer the query using only the sources provided below.
Use bullets if the answer has multiple points.
If the answer is longer than 3 sentences, provide a summary.
Answer ONLY with the facts listed in the list of sources below. Cite your source when you answer the question
If there isn't enough information below, say you don't know.
Do not generate answers that don't use the sources below.
Query: {query}
Sources:\n{sources}
"""
# Text placed between retrieved sources so they stay distinct in the prompt.
# Repeated equal signs followed by a newline were chosen because the source
# documents are unlikely to contain that sequence.
source_separator = "=================\n"

# For every evaluation query: retrieve sources, record them as the context,
# ask the model for an answer and record it as the response.
for row_label, user_query in eval_collection['query'].items():
    # Vector form of the query, matched against the "text_vector" field.
    vector_query = VectorizableTextQuery(
        text=user_query,
        k_nearest_neighbors=50,
        fields="text_vector",
        weight=1,
    )

    # Search with both the raw text and the vector query, using the semantic
    # configuration, and keep the title and chunk of the top 3 results.
    search_hits = search_client.search(
        search_text=user_query,
        vector_queries=[vector_query],
        query_type="semantic",
        semantic_configuration_name='my-semantic-config',
        select=["title", "chunk"],
        top=3,
    )

    # Render each hit as one source entry and join them with the separator.
    source_entries = [
        f'TITLE: {hit["title"]}, CONTENT: {hit["chunk"]}'
        for hit in search_hits
    ]
    sources_formatted = source_separator.join(source_entries)

    # Store the retrieved sources as this row's context.
    eval_collection.at[row_label, 'context'] = sources_formatted

    # Fill in the prompt template and send it as a single user message.
    prompt = SYSTEM_PROMPT.format(query=user_query, sources=sources_formatted)
    completion = openai_client.chat.completions.create(
        model=azure_openai_deployment,
        messages=[{"role": "user", "content": prompt}],
    )

    # Store the text of the first choice as this row's response.
    eval_collection.at[row_label, 'response'] = completion.choices[0].message.content

# Write the completed collection back to the JSONL file.
eval_collection.to_json('data/output/eval.jsonl', orient='records', lines=True)

# Report completion.
print("EvalCollection has been saved to eval.jsonl")
