In [2]:
# Importing libraries
import os
import requests
from dotenv import load_dotenv
from langchain_openai import ChatOpenAI
from ragas.llms import LangchainLLMWrapper
from ragas import evaluate as ragas_eval
from ragas import metrics, EvaluationDataset

# Load environment variables
load_dotenv()

True

In [3]:
# Setting up CropWizard specific variables
prod_url = str(os.getenv("UIUC_CHAT_PROD_URL"))
cropwizard_url = str(os.getenv("CROPWIZARD_API_URL"))
cropwiz_api_key = str(os.getenv("UIUC_CHAT_API_KEY"))
db_version="cropwizard-1.5"
cw_groups=["All Documents"]
token_limit=128000
cropwiz_sys_prompt = "You are a helpful assistant. Follow the user's instructions carefully. Respond using markdown. If the question is related to farming in ANY WAY then always answer the question even if the documents are not helpful. Give the users the most helpful response possible as best you can. Remember this: always give a reasonable answer to the users question and never refuse to answer even if you have little information to go on; give an array of helpful answers if necessary. ALWAYS respond with something helpful to answer their questions and propose solutions. ALWAYS provide citations, but ONLY when relevant. Rules for inline citations: (1) Never provide consecutive inline citations, ALWAYS with the relevant text and only for relevant documents! (2) If there are consecutive inline citations, they should ALWAYS be less than 5 (3) If there are consecutive inline citations, then ALWAYS provide them separately like [1], [2], [3] and so on... If the user asks an introductory question or greeting along the lines of \"hello\" or \"what can you do?\" or \"What's in here?\" or \"what is CropWizard?\" or similar, then please respond with a warm welcome to CropWizard, the AI farm assistant chatbot. Tell them that you can answer questions using the entire knowledge base of Extension plus a growing list of open-access research publications. Whether you need information on crop management, pest control, or any other farming-related topic, feel free to ask! When the provided documents don't contain the answer, say in bold italic text \"The CropWizard database doesn't have anything covering this exact question, but here's what I know from my general world knowledge.\" Always refer to the provided documents as \"the CropWizard database\" and use bold italics when giving this disclaimer."

In [4]:
def get_prompt_tokens(prompt:str, 
                      url:str =prod_url,
                      db:str=db_version, 
                      groups:list=cw_groups, 
                      limit:int=token_limit, 
                      verbose:bool=False) -> str:

    """
    Posts a prompt to CropWizard, and returns the token vector as a JSON.
    Arguments:
    url -- Address of CropWizard instance being prompted.
    prompt -- A string representing the prompt submitted to CropWizard.
    db -- A string representing the name of the queried database. | Default: cropwizard-1.5
    groups -- a list containing all databases to be queried. | Default: ["All Documents"]
    limit -- An integer representing the token limit for the query. | Default: 128000
    
    Returns:
    A dictionary of tokens representing the fragments, retrieved from the submitted prompt.
    """

    entry:dict = {
    "course_name": db,
    "doc_groups": groups,
    "search_query": prompt,
    "token_limit": limit
    }

    if verbose:
        print(entry)

    response = requests.post(url, json=entry)
    
    if response.status_code != 200:
        raise ValueError(f"failed to retrieve data (error_code: {response.status_code})")
    fragments = response.json()
    
    return fragments

In [5]:
def query_cropwizard(prompt:str,
                     model:str="gpt-4o-mini",
                     url:str=cropwizard_url, 
                     course:str=db_version, 
                     group:list=cw_groups, 
                     limit:int=token_limit) -> dict:
    """
    Function to send a prompt to CropWizard and get the response.
    """
    payload = {
    "model": model,
    "messages": [
        {
            "role": "system",
            "content": cropwiz_sys_prompt
        },
        {
            "role": "user",
            "content": prompt
        }
    ],
    "temperature": 0.1,
    "course_name": course,
    "doc_groups": group,
    "search_query": prompt,
    "token_limit": limit,
    "stream": True,
    "api_key": cropwiz_api_key,
}
    
    response = requests.post(url, json=payload)

    if response.status_code != 200:
        raise ValueError(f"failed to retrieve data (error_code: {response.status_code})")
    
    return response.text

In [6]:
def create_test_cases(question_answer_pairs:dict) -> dict:
    """
    Creates a test case dictionary from a question-answer dictionary.

    Args:
        question_answer_pairs (dict): Dictionary with keys representing questions and values representing expert answers.

    Returns:
        test_cases (dict): Dictionary with keys "question", "answer", "retrieved_contexts", and "ground_truth".
    """

    test_cases = {"question":[], "answer":[], "retrieved_contexts":[], "ground_truth":[]}
    for key,value in question_answer_pairs.items():
        test_cases["question"].append(key)
        test_cases["answer"].append(query_cropwizard(key))
        test_cases["retrieved_contexts"].append(get_prompt_tokens(key))
        test_cases["ground_truth"].append(value)

    return test_cases

In [7]:
def preprocess_test_cases(test_cases:dict) -> dict:
    """
    Extracts text from the "retrieved_contexts" key from a test_cases dictionary.

    Args:
        test_cases (dict): Dictionary with keys "question", "answer", "retrieved_contexts", and "ground_truth".

    Returns:
        dict: Dictionary with keys "question", "answer", "retrieved_contexts", and "ground_truth", where "retrieved_contexts" 
        now only contains the contents of its "text" key
    """

    return {
        "question": test_cases["question"],
        "answer": test_cases["answer"],
        "retrieved_contexts": [
            [entry["text"] for entry in inner_list] if isinstance(inner_list, list) else inner_list
            for inner_list in test_cases["retrieved_contexts"] if isinstance(test_cases["retrieved_contexts"], list)
        ],
        "ground_truth": test_cases["ground_truth"],
    }

In [8]:
# Create a dataset from the preprocessed dictionary

def create_dataset(data:dict):
    """
    Cleans the input dictionary by removing entries where `retrieved_contexts` is a string instead of a list.

    Args:
        data (dict): Dictionary with keys "question", "answer", "retrieved_contexts", and "ground_truths".

    Returns:
        cleaned_data (dict): Cleaned dictionary with valid entries.
        removed_entries (list): List of dictionaries containing removed entries for review.
    """
    removed_entries = []  # To store removed tuples for review

    # Ensure all lists have the same length
    keys = ["question", "answer", "retrieved_contexts", "ground_truth"]
    assert all(len(data[key]) == len(data[keys[0]]) for key in keys), "All lists must have the same length."

    # Iterate over retrieved_contexts and remove invalid entries
    valid_indices = []
    for i, retrieved_context in enumerate(data["retrieved_contexts"]):
        if isinstance(retrieved_context, list):
            valid_indices.append(i)  # Keep valid entries
        elif isinstance(retrieved_context, str) and "error" in retrieved_context.lower():
            # Add invalid entries to removed_entries
            removed_entries.append((
                data["question"][i],
                data["answer"][i],
                data["retrieved_contexts"][i],
                data["ground_truth"][i],
            ))

    # 

    # Filter the dictionary to keep only valid entries
    cleaned_data = {
        key: [data[key][i] for i in valid_indices] for key in keys
    }

    return cleaned_data, removed_entries

In [9]:
def convert_dict_to_list(data:dict) -> list:
    """
    Converts a dictionary with keys 'question', 'answer', 'retrieved_contexts', and 'ground_truth'
    into a list of dictionaries with the desired structure.

    Args:
        data (dict): Input dictionary with keys as lists of matching indexes.

    Returns:
        list: A list of dictionaries following the specified layout.
    """
    dataset = []
    for i in range(len(data["question"])):
        dataset.append({
            "user_input": data["question"][i],
            "retrieved_contexts": data["retrieved_contexts"][i],
            "response": data["answer"][i],
            "reference": data["ground_truth"][i],
        })
    return dataset

### Evaluation of test case dataset

In [10]:
from json import load
# Importing cached dataset from Json

with open("Ragas_test.json", "r") as json_file:
    Ragas_test = load(json_file)

Ragas_test

{'question': ['How do selective insecticides differ in their impact on natural enemies compared to broad-spectrum insecticides?',
  'How might the findings from the poultry litter study influence future agricultural practices in Northeast Arkansas?',
  'Analyze the significance of the April 17, 2024 event for agricultural professionals in Arizona.',
  'What symptoms were observed in redbay trees affected by Laurel Wilt?',
  'Explain the significance of adjusting the downward force in precision planting and its impact on seed depth.',
  'If manure use had not increased by 30% from 2012 to 2017, how might this have affected the integration of zeolites into crop production?',
  'True or False: The text suggests that sonic devices have been proven effective for pest management.',
  'What were the average direct expenses per acre for corn fields in the CGSRVP for 2009?',
  'What role does the University of Arkansas System Division of Agriculture play in managing plant diseases like Dogwood 

In [11]:
preprocessed_ragas_test = preprocess_test_cases(Ragas_test)

preprocessed_ragas_test

{'question': ['How do selective insecticides differ in their impact on natural enemies compared to broad-spectrum insecticides?',
  'How might the findings from the poultry litter study influence future agricultural practices in Northeast Arkansas?',
  'Analyze the significance of the April 17, 2024 event for agricultural professionals in Arizona.',
  'What symptoms were observed in redbay trees affected by Laurel Wilt?',
  'Explain the significance of adjusting the downward force in precision planting and its impact on seed depth.',
  'If manure use had not increased by 30% from 2012 to 2017, how might this have affected the integration of zeolites into crop production?',
  'True or False: The text suggests that sonic devices have been proven effective for pest management.',
  'What were the average direct expenses per acre for corn fields in the CGSRVP for 2009?',
  'What role does the University of Arkansas System Division of Agriculture play in managing plant diseases like Dogwood 

In [12]:
eval_dataset, error_log = create_dataset(preprocessed_ragas_test)

error_log

[('What were the average direct expenses per acre for corn fields in the CGSRVP for 2009?',
  "**_The CropWizard database doesn't have anything covering this exact question, but here's what I know from my general world knowledge._**\n\nFor the 2009 growing season, average direct expenses per acre for corn fields can vary based on location and specific farming practices. However, based on historical data from similar reports, direct expenses for corn typically include costs for seed, fertilizer, herbicides, fuel, and labor.\n\nIn general, average direct expenses for corn production have been reported to range from approximately $300 to $600 per acre in various regions, depending on input prices and farming practices. For a more precise figure, it would be best to consult specific agricultural reports or databases that focus on the 2009 crop year.\n\nIf you have access to specific reports or data from that year, you might find the exact figures you need. \n\nIf you have any other questio

##### Log results into Langsmith

In [13]:
# Setting up Langchain specific variables
os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_ENDPOINT"] = "https://api.smith.langchain.com"
os.environ["LANGCHAIN_API_KEY"] = str(os.getenv("LANGCHAIN_API_KEY"))
os.environ["LANGCHAIN_PROJECT"] = "cropwizard_testing"

In [14]:
langsmith_ragas_eval = EvaluationDataset.from_list(convert_dict_to_list(eval_dataset))

In [15]:
# Log results to LangSmith

llm = ChatOpenAI(model="gpt-4o-mini")
evaluator_llm = LangchainLLMWrapper(llm)

result = ragas_eval(
    dataset=langsmith_ragas_eval,
    metrics=[metrics.LLMContextRecall(), metrics.Faithfulness(), metrics.FactualCorrectness(), metrics.ContextPrecision(), metrics.ContextRecall()],
    llm=evaluator_llm,
)

result

Evaluating:   0%|          | 0/90 [00:00<?, ?it/s]

Exception raised in Job[1]: TimeoutError()
Exception raised in Job[5]: TimeoutError()
Exception raised in Job[6]: TimeoutError()
Exception raised in Job[11]: TimeoutError()
Exception raised in Job[14]: TimeoutError()
Exception raised in Job[16]: TimeoutError()
Exception raised in Job[19]: TimeoutError()
Exception raised in Job[20]: TimeoutError()
Exception raised in Job[21]: TimeoutError()
Exception raised in Job[24]: TimeoutError()
Exception raised in Job[25]: TimeoutError()
Exception raised in Job[26]: TimeoutError()
Exception raised in Job[29]: TimeoutError()
Exception raised in Job[34]: TimeoutError()
Exception raised in Job[36]: TimeoutError()
Exception raised in Job[40]: TimeoutError()
Exception raised in Job[41]: TimeoutError()
Exception raised in Job[44]: TimeoutError()
Exception raised in Job[45]: TimeoutError()
Exception raised in Job[46]: TimeoutError()
Exception raised in Job[49]: TimeoutError()
Exception raised in Job[51]: TimeoutError()
Exception raised in Job[54]: Timeou

{'context_recall': 1.0000, 'faithfulness': 0.9667, 'factual_correctness': 0.4606, 'context_precision': 0.6010}