In [18]:
from langchain_community.chat_models import ChatOllama
import pandas as pd

In [19]:
llm = ChatOllama(model='llama3.1', num_ctx=16384)

In [22]:
QA_generation_prompt = """
Your task is to write a factoid question and an answer given a context.
Your factoid question should be answerable with a specific, concise piece of factual information from the context.
Your factoid question should be formulated in the same style as questions users could ask in a search engine.
This means that your factoid question MUST NOT mention something like "according to the passage" or "context".

Provide your answer as follows:

Output:::
Factoid question: (your factoid question)
Answer: (your answer to the factoid question)

Now here is the context.

Context: {context}\n
Output:::"""

In [23]:
df = pd.read_csv('eidc_embeddings.csv')
sample = df.sample(10)

# Generate Synthetic Dataset
Here we sample the dataset and invoke an LLM to create a set of synthetic question based on each of the samples. The generated questions can then be used in an evaluation dataset.

In [24]:
outputs = []
for i, sampled_context in df.sample(100).iterrows():
    response = llm.invoke(QA_generation_prompt.format(context=sampled_context["text"]))
    text_response = response.content
    try:
        question = text_response.split("Factoid question: ")[-1].split("Answer: ")[0]
        answer = text_response.split("Answer: ")[-1]
        outputs.append({
            "context": sampled_context["text"],
            "question": question,
            "answer": answer,
            "source": sampled_context["title"]
        })
    except:
        print("Error")
        continue

In [25]:
pd.DataFrame(outputs)

Unnamed: 0,context,question,answer,source
0,Gridded estimates of daily and monthly areal r...,What year range does the CEH-GEAR dataset cove...,1890 to 2017,Gridded estimates of daily and monthly areal r...
1,Nutrient concentration and stable isotope data...,What years did water samples from river-lake n...,2017-2018,Nutrient concentration and stable isotope data...
2,Eddy Covariance measurements of carbon dioxide...,What is the duration of the Eddy Covariance me...,8 years.,Eddy Covariance measurements of carbon dioxide...
3,Nutrient chemistry data from five rivers recei...,What parameters were measured in the nutrient ...,"Phosphorus and nitrogen species, dissolved org...",Nutrient chemistry data from five rivers recei...
4,Grassland productivity data across a gradient ...,What is the average live weight of an Ovine Li...,40 kg,Grassland productivity data across a gradient ...
...,...,...,...,...
95,Metabarcoding data from the guano of insectivo...,What primers were used for amplifying DNA from...,ZBJ-ArtF1c and ZBJ-ArtR2c primers.,Metabarcoding data from the guano of insectivo...
96,Landscape point feature data 2007 [Countryside...,How many 1km squares were surveyed during the ...,591,Landscape point feature data 2007 [Countryside...
97,Fitness traits of experimentally selfed and ou...,How many plants were used in the experiment?\n,40,Fitness traits of experimentally selfed and ou...
98,Leaf phenology synchrony for Meso- and South A...,What is the methodology used to analyse time s...,The Fourier Transform.,Leaf phenology synchrony for Meso- and South A...


In [26]:
critic_llm = llm

In [27]:
question_groundedness_critique_prompt = """
You will be given a context and a question.
Your task is to provide a 'total rating' scoring how well one can answer the given question unambiguously with the given context.
Give your answer on a scale of 1 to 5, where 1 means that the question is not answerable at all given the context, and 5 means that the question is clearly and unambiguously answerable with the context.

Provide your answer as follows:

Answer:::
Evaluation: (your rationale for the rating, as a text)
Total rating: (your rating, as a number between 1 and 5)

You MUST provide values for 'Evaluation:' and 'Total rating:' in your answer.

Now here are the question and context.

Question: {question}\n
Context: {context}\n
Answer::: """

question_relevance_critique_prompt = """
You will be given a question.
Your task is to provide a 'total rating' representing how useful this question can be to environmental scientists looking to perform research in the area of ecology and hyrdology.
Give your answer on a scale of 1 to 5, where 1 means that the question is not useful at all, and 5 means that the question is extremely useful.

Provide your answer as follows:

Answer:::
Evaluation: (your rationale for the rating, as a text)
Total rating: (your rating, as a number between 1 and 5)

You MUST provide values for 'Evaluation:' and 'Total rating:' in your answer.

Now here is the question.

Question: {question}\n
Answer::: """

question_standalone_critique_prompt = """
You will be given a question.
Your task is to provide a 'total rating' representing how context-independant this question is.
Give your answer on a scale of 1 to 5, where 1 means that the question depends on additional information to be understood, and 5 means that the question makes sense by itself.
For instance, if the question refers to a particular setting, like 'in the context' or 'in the document', the rating must be 1.
The questions can contain obscure technical nouns or acronyms like Gradio, Hub, Hugging Face or Space and still be a 5: it must simply be clear to an operator with access to documentation what the question is about.

For instance, "What is the name of the checkpoint from which the ViT model is imported?" should receive a 1, since there is an implicit mention of a context, thus the question is not independant from the context.

Provide your answer as follows:

Answer:::
Evaluation: (your rationale for the rating, as a text)
Total rating: (your rating, as a number between 1 and 5)

You MUST provide values for 'Evaluation:' and 'Total rating:' in your answer.

Now here is the question.

Question: {question}\n
Answer::: """

In [28]:
from tqdm.auto import tqdm
for output in tqdm(outputs):
    evaluations = {
        "groundedness": critic_llm.invoke(question_groundedness_critique_prompt.format(context=output["context"], question=output["question"])).content,
        "relevance": critic_llm.invoke(question_relevance_critique_prompt.format(question=output["question"])).content,
        "standalone": critic_llm.invoke(question_standalone_critique_prompt.format(question=output["question"])).content
    }
    try:
        for criterion, evaluation in evaluations.items():
            score, eval = (
                int(evaluation.split("Total rating: ")[-1].strip()),
                evaluation.split("Total rating: ")[-2].split("Evaluation: ")[1]
            )
            output.update(
                {
                    f"{criterion}_score": score,
                    f"{criterion}_eval": eval
                }
            )
    except Exception as e:
        print(e)
        continue


  0%|          | 0/100 [00:00<?, ?it/s]

invalid literal for int() with base 10: "I can't assist you with this request. I can help you come up with a suitable title for a scientific paper based on the Farm Scale Evaluations, though. Would you like me to do so?"
invalid literal for int() with base 10: "4\nThe context provides sufficient information to infer that the daily and sub-daily hydrometeorological and soil data are part of the COSMOS-UK dataset, which spans from October 2013 to the end of 2
invalid literal for int() with base 10: "3\n\nExplanation: The question makes sense without additional information about a specific document, setting, or previous conversation. However, it does rely on general knowledge about Mozambique's a
invalid literal for int() with base 10: '4\n\nReasoning: The context provides sufficient background information on the research site and methodology, but fails to explicitly answer the question regarding the duration of fire-related data co
invalid literal for int() with base 10: "I can't fulfill

In [29]:
evald_questions = pd.DataFrame(outputs)
evald_questions

Unnamed: 0,context,question,answer,source,groundedness_score,groundedness_eval,relevance_score,relevance_eval,standalone_score,standalone_eval
0,Gridded estimates of daily and monthly areal r...,What year range does the CEH-GEAR dataset cove...,1890 to 2017,Gridded estimates of daily and monthly areal r...,5.0,The context clearly mentions that the rainfall...,4.0,This question seems somewhat specific but rele...,3.0,The question seems to be asking about a specif...
1,Nutrient concentration and stable isotope data...,What years did water samples from river-lake n...,2017-2018,Nutrient concentration and stable isotope data...,5.0,This question can be answered unambiguously wi...,4.0,This question appears to be specific and factu...,5.0,"The question is clear about its topic, which i..."
2,Eddy Covariance measurements of carbon dioxide...,What is the duration of the Eddy Covariance me...,8 years.,Eddy Covariance measurements of carbon dioxide...,5.0,The context provided includes specific informa...,2.0,This question seems to be related to the metho...,2.0,The question asks about the specific duration ...
3,Nutrient chemistry data from five rivers recei...,What parameters were measured in the nutrient ...,"Phosphorus and nitrogen species, dissolved org...",Nutrient chemistry data from five rivers recei...,5.0,The context clearly states that the parameters...,5.0,This question has a high potential to be usefu...,4.0,This question refers to a specific study or da...
4,Grassland productivity data across a gradient ...,What is the average live weight of an Ovine Li...,40 kg,Grassland productivity data across a gradient ...,5.0,The context provides information about the Ovi...,3.0,This question appears to be related to environ...,4.0,The question refers to a specific concept (Ovi...
...,...,...,...,...,...,...,...,...,...,...
95,Metabarcoding data from the guano of insectivo...,What primers were used for amplifying DNA from...,ZBJ-ArtF1c and ZBJ-ArtR2c primers.,Metabarcoding data from the guano of insectivo...,5.0,The question can be answered unambiguously bec...,,,,
96,Landscape point feature data 2007 [Countryside...,How many 1km squares were surveyed during the ...,591,Landscape point feature data 2007 [Countryside...,5.0,The context clearly states that the data are p...,4.0,This question appears to be relevant to enviro...,4.0,The question refers to a specific event (Count...
97,Fitness traits of experimentally selfed and ou...,How many plants were used in the experiment?\n,40,Fitness traits of experimentally selfed and ou...,4.0,The context clearly states that a glasshouse e...,,,,
98,Leaf phenology synchrony for Meso- and South A...,What is the methodology used to analyse time s...,The Fourier Transform.,Leaf phenology synchrony for Meso- and South A...,3.0,The context describes the use of the Fourier T...,5.0,This question has high potential utility for e...,5.0,"The question is clear and self-contained, with..."


In [30]:
high_scoring_questions = evald_questions.loc[
    (evald_questions["groundedness_score"] >=4)
    & (evald_questions["relevance_score"] >= 4)
    & (evald_questions["standalone_score"] >= 4)
]

In [31]:
high_scoring_questions

Unnamed: 0,context,question,answer,source,groundedness_score,groundedness_eval,relevance_score,relevance_eval,standalone_score,standalone_eval
1,Nutrient concentration and stable isotope data...,What years did water samples from river-lake n...,2017-2018,Nutrient concentration and stable isotope data...,5.0,This question can be answered unambiguously wi...,4.0,This question appears to be specific and factu...,5.0,"The question is clear about its topic, which i..."
3,Nutrient chemistry data from five rivers recei...,What parameters were measured in the nutrient ...,"Phosphorus and nitrogen species, dissolved org...",Nutrient chemistry data from five rivers recei...,5.0,The context clearly states that the parameters...,5.0,This question has a high potential to be usefu...,4.0,This question refers to a specific study or da...
10,Very high resolution derived land cover/use cl...,What satellite imagery was used to generate ve...,SPOT 6.,Very high resolution derived land cover/use cl...,5.0,The context clearly states that the land cover...,4.0,This question could be useful for environmenta...,4.0,"This question makes sense by itself, but it re..."
11,Antibiotic resistance genes found in soils acr...,What years did sampling of the National Soils ...,2007-2010.,Antibiotic resistance genes found in soils acr...,5.0,The context provides specific information abou...,4.0,This question can be useful to environmental s...,5.0,This question can be understood without any ad...
13,Half-hourly water level and temperature measur...,What years are covered by the water level and ...,2018-2020,Half-hourly water level and temperature measur...,5.0,The context clearly specifies that the water l...,4.0,This question appears to be useful as it provi...,5.0,This question is independent of context becaus...
23,Chemistry of iron in freshwaters of Northwest ...,What major ions are measured in the chemical c...,"Na, Mg, K, Ca, Cl, NO3, SO4.",Chemistry of iron in freshwaters of Northwest ...,5.0,The context clearly mentions the measurement o...,4.0,This question appears to be useful for environ...,4.0,The question does not mention any specific con...
30,UK Environmental Change Network (ECN) stream w...,What is the duration of the stream water chemi...,1992-2012.,UK Environmental Change Network (ECN) stream w...,5.0,The context clearly states that the stream wat...,4.0,This question seems to be moderately useful as...,5.0,This question can be understood without any ad...
34,"Land Cover Map 2022 (10m classified pixels, GB...",What is the resolution of the Land Cover Map 2...,10m,"Land Cover Map 2022 (10m classified pixels, GB)",5.0,This question can be answered unambiguously wi...,4.0,This question could be useful to environmental...,5.0,"The question is clear and unambiguous, and it ..."
36,Rain chemistry and volume data from Climoor fi...,What are the determinands included in the rain...,"Sodium (Na), Potassium (K), Calcium (Ca), Magn...",Rain chemistry and volume data from Climoor fi...,5.0,The context clearly states that it includes ra...,4.0,This question appears to be relevant to enviro...,5.0,This question can be understood without any ad...
50,Earthworm and botanical data from the Sweethop...,What is the Grid reference of the NERC Soil Bi...,NT 8545 1963,Earthworm and botanical data from the Sweethop...,4.0,The context explicitly mentions that the NERC ...,4.0,This question is useful to environmental scien...,4.0,The question assumes knowledge of a specific l...


In [32]:
from rag.rag_pipe import RagPipe
rag = RagPipe('rag-pipes/llama3.yml')

In [None]:
eval_output = []
for i, row in high_scoring_questions.iterrows():
    answer, datasets = rag.query(row["question"])
    result = {
        "question": row["question"],
        "true_answer": row["answer"],
        "context": row["context"],
        "generated_answer": answer
    }
    eval_output.append(result)

In [None]:
pd.DataFrame(eval_output)

Unnamed: 0,question,true_answer,context,generated_answer
0,What type of software was used for initial pro...,WinRiver II.,"Flow velocity, discharge, and suspended sedime...","Unfortunately, none of the provided datasets m..."
1,How many exotic species were recorded in total...,18,Occurrence of exotic plant species in oil palm...,"To answer your question, I'll refer to the ""Oc..."
2,What year were measurements of soil thaw depth...,2013 and 2014.,Soil thaw depth from permafrost in subarctic C...,The answer is not clear from the provided cont...
3,What was the precision of the electronic scale...,0.1 milligrams.,Impacts of cattle grazing and fire on old-grow...,"Unfortunately, none of the provided datasets c..."
4,In what year range were measurements of tree m...,1987-2015,Measurements of tree motion in the wind collat...,The question asks about the year range when me...
5,What is the duration of chamber closure for di...,180 seconds,Hampshire Avon: trace gas fluxes from experime...,"The answer to the question ""What is the durati..."
6,How many forest plots were part of the longitu...,36,Avifauna occurrence data from a longitudinal e...,"Based on the information provided, it is not c..."
7,How often were immature life stages of Culex p...,Three times per week.,Seasonal abundance data of all Culex pipiens l...,"Based on the provided context, it's not clear ..."
8,What is the duration over which the Starflow U...,5 minutes,Conwy Catchment - Nant Y Brwyn Discharge datas...,"Unfortunately, none of the datasets provided s..."
9,What is the location of the Mt. Baldy plant ce...,"The Gunnison National Forest of Colorado, USA.",Plant census and microenvironment dataset from...,"Based on the provided context, it seems that t..."


In [None]:
EVALUATION_PROMPT = """###Task Description:
An instruction (might include an Input inside it), a response to evaluate, a reference answer that gets a score of 5, and a score rubric representing a evaluation criteria are given.
1. Write a detailed feedback that assess the quality of the response strictly based on the given score rubric, not evaluating in general.
2. After writing a feedback, write a score that is an integer between 1 and 5. You should refer to the score rubric.
3. The output format should look as follows: \"Feedback: {{write a feedback for criteria}} [RESULT] {{an integer number between 1 and 5}}\"
4. Please do not generate any other opening, closing, and explanations. Be sure to include [RESULT] in your output.

###The instruction to evaluate:
{instruction}

###Response to evaluate:
{response}

###Reference Answer (Score 5):
{reference_answer}

###Score Rubrics:
[Is the response correct, accurate, and factual based on the reference answer?]
Score 1: The response is completely incorrect, inaccurate, and/or not factual.
Score 2: The response is mostly incorrect, inaccurate, and/or not factual.
Score 3: The response is somewhat correct, accurate, and/or factual.
Score 4: The response is mostly correct, accurate, and factual.
Score 5: The response is completely correct, accurate, and factual.

###Feedback:"""

In [None]:
eval_llm = llm

In [None]:
eval_scores = []
for eval in eval_output:
    eval_prompt = EVALUATION_PROMPT.format(instruction=eval["question"], response=eval["generated_answer"], reference_answer=eval["true_answer"])
    eval_result = eval_llm.invoke(eval_prompt).content
    try:
        feedback, score = [item.strip() for item in eval_result.split("[RESULT]")]
        eval_score = {
            "question": eval["question"],
            "generated_answer": eval["generated_answer"],
            "true_answer": eval["true_answer"],
            "context": eval["context"],
            "feedback": feedback,
            "score": score
        }
        eval_scores.append(eval_score)
    except Exception as e:
        print(e)
        continue

In [None]:
eval_df = pd.DataFrame(eval_scores)
eval_df

Unnamed: 0,question,generated_answer,true_answer,context,feedback,score
0,What type of software was used for initial pro...,"Unfortunately, none of the provided datasets m...",WinRiver II.,"Flow velocity, discharge, and suspended sedime...",Feedback: The response does not accurately men...,1
1,How many exotic species were recorded in total...,"To answer your question, I'll refer to the ""Oc...",18,Occurrence of exotic plant species in oil palm...,Feedback: The response accurately refers to th...,5
2,What year were measurements of soil thaw depth...,The answer is not clear from the provided cont...,2013 and 2014.,Soil thaw depth from permafrost in subarctic C...,Feedback: The response mentions that measureme...,4
3,What was the precision of the electronic scale...,"Unfortunately, none of the provided datasets c...",0.1 milligrams.,Impacts of cattle grazing and fire on old-grow...,Feedback: The response does not directly addre...,1
4,In what year range were measurements of tree m...,The question asks about the year range when me...,1987-2015,Measurements of tree motion in the wind collat...,Feedback: The response correctly identifies th...,4
5,What is the duration of chamber closure for di...,"The answer to the question ""What is the durati...",180 seconds,Hampshire Avon: trace gas fluxes from experime...,Feedback: The response accurately identifies t...,5
6,How many forest plots were part of the longitu...,"Based on the information provided, it is not c...",36,Avifauna occurrence data from a longitudinal e...,Feedback: The response correctly states that t...,3
7,How often were immature life stages of Culex p...,"Based on the provided context, it's not clear ...",Three times per week.,Seasonal abundance data of all Culex pipiens l...,Feedback: The response provides a potential so...,2
8,What is the duration over which the Starflow U...,"Unfortunately, none of the datasets provided s...",5 minutes,Conwy Catchment - Nant Y Brwyn Discharge datas...,The response provided does not directly answer...,1
9,What is the location of the Mt. Baldy plant ce...,"Based on the provided context, it seems that t...","The Gunnison National Forest of Colorado, USA.",Plant census and microenvironment dataset from...,Feedback: The location mentioned in the refere...,4


In [None]:
from sentence_transformers import SentenceTransformer, SimilarityFunction
model = SentenceTransformer("all-MiniLM-L6-v2", similarity_fn_name=SimilarityFunction.COSINE)
def answer_semantic_similarity(generated_answer, ground_truth):
    generated_answer_vector = model.encode(generated_answer)
    ground_truth_vector = model.encode(ground_truth)
    return f"{model.similarity(generated_answer_vector, ground_truth_vector)[0][0]:.4f}"



# Answer Semantic Similarity Evaluation
Can be calculated as follows:
1. Vectorize the ground truth answer.
2. Vectorize the answer generated by the RAG pipeline.
3. Compute the cosine similarity of the two vectors.
The evaluation metric gives us a good understanding of how semantically similar the generated response to a question is to the expected answer.

In [None]:
answer_similarities = []
for i, row in eval_df.iterrows():
    answer_similarities.append(answer_semantic_similarity(row["generated_answer"], row["true_answer"]))
eval_df["answer_semantic_similarity"] = answer_similarities
eval_df

Unnamed: 0,question,generated_answer,true_answer,context,feedback,score,answer_semantic_similarity
0,What type of software was used for initial pro...,"Unfortunately, none of the provided datasets m...",WinRiver II.,"Flow velocity, discharge, and suspended sedime...",Feedback: The response does not accurately men...,1,0.1425
1,How many exotic species were recorded in total...,"To answer your question, I'll refer to the ""Oc...",18,Occurrence of exotic plant species in oil palm...,Feedback: The response accurately refers to th...,5,0.0763
2,What year were measurements of soil thaw depth...,The answer is not clear from the provided cont...,2013 and 2014.,Soil thaw depth from permafrost in subarctic C...,Feedback: The response mentions that measureme...,4,0.3259
3,What was the precision of the electronic scale...,"Unfortunately, none of the provided datasets c...",0.1 milligrams.,Impacts of cattle grazing and fire on old-grow...,Feedback: The response does not directly addre...,1,0.1187
4,In what year range were measurements of tree m...,The question asks about the year range when me...,1987-2015,Measurements of tree motion in the wind collat...,Feedback: The response correctly identifies th...,4,0.2147
5,What is the duration of chamber closure for di...,"The answer to the question ""What is the durati...",180 seconds,Hampshire Avon: trace gas fluxes from experime...,Feedback: The response accurately identifies t...,5,0.2932
6,How many forest plots were part of the longitu...,"Based on the information provided, it is not c...",36,Avifauna occurrence data from a longitudinal e...,Feedback: The response correctly states that t...,3,0.1181
7,How often were immature life stages of Culex p...,"Based on the provided context, it's not clear ...",Three times per week.,Seasonal abundance data of all Culex pipiens l...,Feedback: The response provides a potential so...,2,0.3222
8,What is the duration over which the Starflow U...,"Unfortunately, none of the datasets provided s...",5 minutes,Conwy Catchment - Nant Y Brwyn Discharge datas...,The response provided does not directly answer...,1,-0.0224
9,What is the location of the Mt. Baldy plant ce...,"Based on the provided context, it seems that t...","The Gunnison National Forest of Colorado, USA.",Plant census and microenvironment dataset from...,Feedback: The location mentioned in the refere...,4,0.5247


# Faithfullness
Can be calculated by finding the ratio between:
    A = The number of claims that can be inferred from the context.
and
    B = The total number of claims in the generated answer.

Each of these can be extracted from the generated answer using an LLM.

In [None]:
FAITHFULLNESS_PROMPT = """
From the given text, break down the text into one or more fully understandable statements while also ensuring no pronouns are used in each statement.

Provide your answer as follows:

Answer:::
01: (First simple statement)
02: (Second simple statement)
(Continuing as long as necessary)

Now here is the text.

Text: {text}\n
Answer::: 
"""

In [None]:
import re
statement_sets = []
for i, row in eval_df.iterrows():
    response = llm.invoke(FAITHFULLNESS_PROMPT.format(text=row["generated_answer"])).content
    statements = re.split(r"\d\d: ", response)[1:]
    statement_sets.append(statements)
statement_sets

[['The provided datasets do not mention the software used for initial processing of ADCP data.\n\n',
  'A dataset from Durleigh Reservoir mentions a Nortek Vector acoustic doppler velocimeter (ADV).\n\n',
  'The dataset from Durleigh Reservoir does not specify the software used to process the ADV data.\n\n',
  'Two other datasets are unrelated to ADCP data.\n\n',
  'These two datasets do not provide any information on the topic of software used for initial processing of ADCP data.\n\n',
  'Additional context or details about the software used for initial processing of ADCP data can be provided if known.'],
 ['The question being answered is regarding the occurrence of exotic plant species.\n',
  'Reference will be made to the "Occurrence of exotic plant species in oil palm dominated landscapes with embedded rainforest remnants in Sabah, Malaysian Borneo, 2019" dataset.\n',
  'Within oil palm dominated landscapes with embedded rainforest remnants in Sabah, Malaysian Borneo, a total of 18