In [34]:
import pandas as pd

In [106]:
# Load the activity dataset; the path is relative to the notebook's working directory.
df = pd.read_csv("../data/data.csv")

## Ingestion

In [3]:
import os
from dotenv import load_dotenv

# Pull settings from a .env file (if one is found) into the process environment.
load_dotenv()

# Read the key from the environment; this is None when the variable is not set.
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY')

In [119]:
# One dict per DataFrame row, keyed by column name; this is what gets indexed.
documents = df.to_dict(orient='records')

In [120]:
import minsearch

In [121]:
# Every descriptive column is indexed as free text; no keyword (exact-match) fields are declared.
index = minsearch.Index(
    text_fields=[
        'activity_name',
        'activity_type',
        'materials_needed',
        'time_required',
        'age_group',
        'difficulty_level',
        'instructions',
    ],
    keyword_fields=[],
)

In [122]:
# Fit the index on the row dicts built from the dataset.
index.fit(documents)

<minsearch.Index at 0x1342e6f5690>

## RAG Flow

In [94]:
# Example user request, used for the plain (no-retrieval) model call.
query = "i want to plan an activity for my child, time duration is around one hour"

In [95]:
from openai import OpenAI

# The client is created without explicit arguments.
client = OpenAI()

# Baseline: the raw query is sent to the model with no retrieved context attached.
response = client.chat.completions.create(
    model='gpt-4o-mini',
    messages=[{"role": "user", "content": query}]
)

# Show the text of the first returned choice.
response.choices[0].message.content

"That sounds great! Here are a few ideas for a fun, one-hour activity for your child, depending on their age and interests:\n\n### 1. **Nature Scavenger Hunt** \n   - **Materials**: Paper and pencil (or a printed list), a bag for collecting items.\n   - **Activity**: Create a list of items commonly found in your backyard or a nearby park (like a leaf, a flower, a rock, etc.). Go outside with your child and see how many items they can find within an hour.\n\n### 2. **Arts and Crafts Project**\n   - **Materials**: Basic craft supplies (paper, markers, glue, scissors, etc.), any recyclable items you have at home.\n   - **Activity**: Choose a simple craft project, like making greeting cards or decorating picture frames. Let them get creative!\n\n### 3. **Indoor Obstacle Course**\n   - **Materials**: Household items like pillows, chairs, and blankets.\n   - **Activity**: Set up an obstacle course around your house using the materials you've gathered. Time your child as they navigate through

In [96]:
def search(query):
    """Return the top 5 documents from the module-level ``index`` for ``query``.

    Args:
        query: Free-text search string.

    Returns:
        Whatever ``index.search`` returns for the query (the matching documents).
    """
    # The previous boost dict named 'question' and 'section', which are not
    # among the fields this index was built with, so they read as if they tuned
    # the ranking while (presumably) having no effect — TODO confirm against
    # minsearch. An empty dict states the actual intent: no per-field boosts.
    boost = {}

    return index.search(
        query=query,
        filter_dict={},
        boost_dict=boost,
        num_results=5
    )

In [97]:
# Prompt skeleton for the answering model: {question} is the user query and
# {context} is the concatenation of rendered search results.
prompt_template = """
You're a activity planner. Answer the QUESTION based on the CONTEXT from our activity database.
Use only the facts from the CONTEXT when answering the QUESTION.

QUESTION: {question}

CONTEXT: 
{context}
""".strip()

# Rendering of a single document inside {context}: one "field:value" line per
# placeholder; each placeholder must be a key of the document dict.
entry_template = ''' 
activity_name:{activity_name}
activity_type:{activity_type}
materials_needed:{materials_needed}
time_required:{time_required}
age_group:{age_group}
difficulty_level:{difficulty_level} 
instructions:{instructions} 
'''.strip()

def build_prompt(query, search_results):
    """Render the answering prompt for ``query`` from the retrieved documents.

    Each document is formatted with ``entry_template`` and followed by a blank
    line; the pieces are concatenated into the ``context`` slot of
    ``prompt_template``. Surrounding whitespace is stripped from the result.
    """
    rendered_entries = [
        entry_template.format(**record) + "\n\n" for record in search_results
    ]
    context = "".join(rendered_entries)

    return prompt_template.format(question=query, context=context).strip()

In [244]:
def llm(prompt, model='gpt-4o-mini'):
    """Send ``prompt`` as a single user message and return the reply text.

    Uses the module-level ``client``; ``model`` selects which chat model
    answers. Only the content of the first choice is returned.
    """
    chat_messages = [{"role": "user", "content": prompt}]
    completion = client.chat.completions.create(
        model=model,
        messages=chat_messages,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

In [245]:
def rag(query, model='gpt-4o-mini'):
    """Answer ``query`` with retrieval-augmented generation.

    Retrieves documents with ``search``, turns them into a prompt with
    ``build_prompt`` and returns the text produced by ``llm`` using ``model``.
    """
    retrieved = search(query)
    return llm(build_prompt(query, retrieved), model=model)

In [102]:
# End-to-end check of the RAG pipeline on a single hand-written question.
query = 'I have seeds, what activity can I do with my kids'
answer = rag(query)

In [103]:
print(answer)

You can do the "Edible Garden Planting" activity with your kids, as it requires seeds, soil, and pots. This activity is suitable for school-aged children and takes about 1.5 hours. You will plant the seeds in pots or garden beds and care for them as they grow into edible plants, such as herbs or vegetables.


## Retrieval Evaluation

In [125]:
# Ground-truth questions for retrieval evaluation; each row carries a question and the
# id it is scored against (presumably the id of the source document — TODO confirm).
df_question = pd.read_csv('../data/ground-truth-retrieval.csv')

In [126]:
df_question.head()

Unnamed: 0,id,question
0,0,What materials do I need for the Family Picnic?
1,0,How long should we allocate for the Family Pic...
2,0,What age group is this Family Picnic suitable ...
3,0,Can you provide a simple instruction for organ...
4,0,What is the difficulty level of setting up the...


In [127]:
# One dict per ground-truth row, keyed by column name.
ground_truth = df_question.to_dict(orient='records')

In [128]:
ground_truth[0]

{'id': 0, 'question': 'What materials do I need for the Family Picnic?'}

In [129]:
def hit_rate(relevance_total):
    """Fraction of queries whose result list contains at least one relevant hit.

    Args:
        relevance_total: One list per query, holding a boolean per returned
            result (True where the result is the expected document).

    Returns:
        A float in [0, 1]; 0.0 when ``relevance_total`` is empty, so an empty
        evaluation set does not raise ZeroDivisionError.
    """
    if len(relevance_total) == 0:
        return 0.0

    hits = sum(1 for line in relevance_total if True in line)
    return hits / len(relevance_total)

def mrr(relevance_total):
    """Mean reciprocal rank of the first relevant result per query.

    Args:
        relevance_total: One list per query, holding a boolean per returned
            result in rank order (index 0 is rank 1).

    Returns:
        The average of 1/rank of the first relevant result, counting 0 for
        queries with no relevant result. Returns 0.0 for an empty input.
    """
    if len(relevance_total) == 0:
        return 0.0

    total_score = 0.0

    for line in relevance_total:
        for rank, is_relevant in enumerate(line, start=1):
            if is_relevant == True:
                total_score += 1 / rank
                # Only the first hit counts; summing later hits too would let
                # a single query contribute more than 1 to the score.
                break

    return total_score / len(relevance_total)

In [130]:
def minsearch_search(query, boost=None):
    """Return the top 10 documents from the module-level ``index`` for ``query``.

    Args:
        query: Free-text search string.
        boost: Optional dict of per-field boost weights passed to
            ``index.search``. Defaults to no boosts, which is what this
            function did before the parameter existed.

    Returns:
        Whatever ``index.search`` returns for the query.
    """
    if boost is None:
        boost = {}

    results = index.search(
        query=query,
        filter_dict={},
        boost_dict=boost,
        num_results=10
    )

    return results

In [189]:
def evaluate(ground_truth, search_function):
    """Score a search function against ground-truth questions.

    Args:
        ground_truth: Iterable of dicts, each with an ``'id'`` key naming the
            document expected to be retrieved for that question.
        search_function: Callable that takes one ground-truth dict and returns
            an iterable of result dicts, each with an ``'id'`` key.

    Returns:
        Dict with ``'hit_rate'`` and ``'mrr'`` computed over all questions.
    """
    relevance_total = []

    for q in tqdm(ground_truth):
        doc_id = str(q['id'])
        results = search_function(q)
        # Compare ids as strings on BOTH sides: ids loaded from CSV are often
        # integers, and an int never equals its string form, which would mark
        # every result as irrelevant.
        relevance = [str(d['id']) == doc_id for d in results]
        relevance_total.append(relevance)

    return {
        'hit_rate': hit_rate(relevance_total),
        'mrr': mrr(relevance_total),
    }

In [190]:
from tqdm.auto import tqdm

In [191]:
# Baseline retrieval quality: run every ground-truth question through minsearch_search.
evaluate(ground_truth, lambda q: minsearch_search(q['question']))

  0%|          | 0/1205 [00:00<?, ?it/s]

{'hit_rate': 0.8970954356846473, 'mrr': 0.8228716327471518}

## Finding the best parameters

In [221]:
# The first 100 questions form the validation split used for tuning; the rest are held out.
# NOTE(review): df_test is not referenced again in the visible cells — confirm it is
# actually used for the final score.
df_validation = df_question[:100]
df_test = df_question[100:]

In [222]:
import random

def simple_optimize(param_ranges, objective_function, n_iterations=10, seed=None):
    """Random search that MAXIMIZES ``objective_function``.

    Args:
        param_ranges: Dict mapping parameter name to a ``(min, max)`` pair.
            When both bounds are ints the value is drawn with ``randint``
            (inclusive); otherwise it is drawn with ``uniform``.
        objective_function: Callable taking a ``{name: value}`` dict and
            returning a score where higher is better.
        n_iterations: Number of random candidates to try.
        seed: Optional seed for a private random generator, making the search
            repeatable. When None, the global ``random`` module state is used,
            as before.

    Returns:
        ``(best_params, best_score)``. With ``n_iterations == 0`` this is
        ``(None, float('-inf'))``.
    """
    # A private generator keeps seeded runs from disturbing global random state.
    rng = random if seed is None else random.Random(seed)

    best_params = None
    best_score = float('-inf')  # maximizing: any real score beats -inf

    for _ in range(n_iterations):
        # Draw one random candidate from the search space.
        current_params = {}
        for param, (min_val, max_val) in param_ranges.items():
            if isinstance(min_val, int) and isinstance(max_val, int):
                current_params[param] = rng.randint(min_val, max_val)
            else:
                current_params[param] = rng.uniform(min_val, max_val)

        current_score = objective_function(current_params)

        # Keep the candidate only when it strictly improves the best score.
        if current_score > best_score:
            best_score = current_score
            best_params = current_params

    return best_params, best_score

In [223]:
# Validation questions as a list of row dicts.
gt_val = df_validation.to_dict(orient='records')

In [224]:
def minsearch_search(query, boost=None):
    """Look up ``query`` in the module-level ``index`` and return 10 results.

    ``boost`` is an optional dict of per-field weights; ``None`` means an
    empty dict is passed to the index.
    """
    effective_boost = {} if boost is None else boost

    return index.search(
        query=query,
        filter_dict={},
        boost_dict=effective_boost,
        num_results=10,
    )

In [225]:
# Search space for boost tuning: a (min, max) weight range per indexed text field.
# The bounds are floats, so simple_optimize samples them with random.uniform.
param_ranges = {
    'activity_name': (0.0, 5.0),
    'activity_type': (0.0, 5.0),
    'materials_needed': (0.0, 5.0),
    'time_required': (0.0, 5.0),
    'age_group': (0.0, 5.0),
    'difficulty_level': (0.0, 5.0),
    'instructions': (0.0, 5.0),
}

def objective(boost_params):
    """Score one boost configuration: MRR on the validation questions.

    ``boost_params`` is passed to ``minsearch_search`` for every question in
    ``gt_val``; the ``'mrr'`` entry of the ``evaluate`` result is returned.
    """
    scores = evaluate(
        gt_val,
        lambda q: minsearch_search(q['question'], boost_params),
    )
    return scores['mrr']

In [226]:
# Try 20 random boost configurations; the result is (best boost dict, best validation MRR).
simple_optimize(param_ranges, objective, n_iterations=20)

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

({'activity_name': 3.9380757174569143,
  'activity_type': 3.992601905600103,
  'materials_needed': 1.1356172652800272,
  'time_required': 0.7322746848502865,
  'age_group': 1.0490159109626285,
  'difficulty_level': 1.8574140178530485,
  'instructions': 1.9990201946518575},
 0.8523611111111111)

In [229]:
def minsearch_improved(query):
    """Search the module-level ``index`` with fixed per-field boost weights.

    Returns the 10 results produced by ``index.search`` for ``query``.
    """
    tuned_boost = dict(
        activity_name=3.94,
        activity_type=3.99,
        materials_needed=1.14,
        time_required=0.73,
        age_group=1.05,
        difficulty_level=1.86,
        instructions=2.00,
    )

    return index.search(
        query=query,
        filter_dict={},
        boost_dict=tuned_boost,
        num_results=10,
    )

# NOTE(review): `ground_truth` is the full question set, so it still contains the first
# 100 rows that served as the validation split when the boosts were tuned; the held-out
# `df_test` split is not used here. Consider scoring on the df_test records instead.
evaluate(ground_truth, lambda q: minsearch_improved(q['question']))

  0%|          | 0/1205 [00:00<?, ?it/s]

{'hit_rate': 0.8979253112033195, 'mrr': 0.8263930053349142}

## RAG Evaluation

In [230]:
# Judge prompt: asks the model to grade an answer's relevance to its question and reply
# as JSON. The doubled braces {{ }} are str.format escapes that render as literal { }.
prompt2_template = """
You are an expert evaluator for a RAG system.
Your task is to analyze the relevance of the generated answer to the given question.
Based on the relevance of the generated answer, you will classify it
as "NON_RELEVANT", "PARTLY_RELEVANT", or "RELEVANT".

Here is the data for evaluation:

Question: {question}
Generated Answer: {answer_llm}

Please analyze the content and context of the generated answer in relation to the question
and provide your evaluation in parsable JSON without using code blocks:

{{
  "Relevance": "NON_RELEVANT" | "PARTLY_RELEVANT" | "RELEVANT",
  "Explanation": "[Provide a brief explanation for your evaluation]"
}}
""".strip()

In [231]:
len(ground_truth)

1205

In [234]:
# Run the RAG pipeline on the first ground-truth question to get one sample answer.
record = ground_truth[0]
question = record['question']
answer_llm = rag(question)

In [236]:
print(question, answer_llm)

What materials do I need for the Family Picnic? For the Family Picnic, you will need the following materials: a picnic blanket, a basket of food, and drinks.


In [237]:
# Fill the judge prompt with the sample question/answer and inspect the rendered text.
prompt = prompt2_template.format(question=question, answer_llm=answer_llm)
print(prompt)

You are an expert evaluator for a RAG system.
Your task is to analyze the relevance of the generated answer to the given question.
Based on the relevance of the generated answer, you will classify it
as "NON_RELEVANT", "PARTLY_RELEVANT", or "RELEVANT".

Here is the data for evaluation:

Question: What materials do I need for the Family Picnic?
Generated Answer: For the Family Picnic, you will need the following materials: a picnic blanket, a basket of food, and drinks.

Please analyze the content and context of the generated answer in relation to the question
and provide your evaluation in parsable JSON without using code blocks:

{
  "Relevance": "NON_RELEVANT" | "PARTLY_RELEVANT" | "RELEVANT",
  "Explanation": "[Provide a brief explanation for your evaluation]"
}


In [238]:
import json

In [239]:
# Evaluate on a 200-question random sample; the fixed random_state keeps the sample repeatable.
df_sample = df_question.sample(n=200, random_state=1)

In [241]:
# Sampled rows as a list of dicts, ready to iterate over.
sample = df_sample.to_dict(orient='records')

In [243]:
# Answer each sampled question with the default RAG model, then have the judge
# prompt grade the answer. Each entry is (record, answer, parsed judge verdict).
evaluations = []

for record in tqdm(sample):
    question = record['question']
    answer_llm = rag(question)

    prompt = prompt2_template.format(
        question=question,
        answer_llm=answer_llm
    )

    raw_evaluation = llm(prompt)
    try:
        evaluation = json.loads(raw_evaluation)
    except json.JSONDecodeError:
        # The judge occasionally replies with non-JSON text. Record the failure
        # with the same keys the downstream cells read instead of aborting the
        # whole (slow, paid) loop on a single bad reply.
        evaluation = {'Relevance': 'PARSE_ERROR', 'Explanation': raw_evaluation}

    evaluations.append((record, answer_llm, evaluation))

  0%|          | 0/200 [00:00<?, ?it/s]

In [246]:
# Flatten the (record, answer, evaluation) triples into one row per question:
# pull the id/question out of the record and the verdict fields out of the
# judge's reply, then drop the two nested columns.
df_eval = (
    pd.DataFrame(evaluations, columns=['record', 'answer', 'evaluation'])
    .assign(
        id=lambda frame: frame['record'].apply(lambda rec: rec['id']),
        question=lambda frame: frame['record'].apply(lambda rec: rec['question']),
        relevance=lambda frame: frame['evaluation'].apply(lambda verdict: verdict['Relevance']),
        explanation=lambda frame: frame['evaluation'].apply(lambda verdict: verdict['Explanation']),
    )
    .drop(columns=['record', 'evaluation'])
)

In [247]:
df_eval.relevance.value_counts()

relevance
RELEVANT           153
PARTLY_RELEVANT     46
NON_RELEVANT         1
Name: count, dtype: int64

In [248]:
df_eval.relevance.value_counts(normalize=True)

relevance
RELEVANT           0.765
PARTLY_RELEVANT    0.230
NON_RELEVANT       0.005
Name: proportion, dtype: float64

In [249]:
# Persist the judged answers to CSV without writing the DataFrame index.
df_eval.to_csv('../data/rag-eval-gpt-4o-mini.csv', index=False)

In [250]:
df_eval[df_eval.relevance == 'NON_RELEVANT']

Unnamed: 0,answer,id,question,relevance,explanation
137,The recommended duration for the Laser Maze ch...,43,How long is the recommended duration for the L...,NON_RELEVANT,The answer states that the recommended duratio...


In [251]:
# Same evaluation as for the default model, but answers come from 'gpt-4o'.
# The judge call uses llm()'s default model, so only the answering model changes.
evaluations_gpt4o = []

for record in tqdm(sample):
    question = record['question']
    answer_llm = rag(question, model='gpt-4o')

    prompt = prompt2_template.format(
        question=question,
        answer_llm=answer_llm
    )

    raw_evaluation = llm(prompt)
    try:
        evaluation = json.loads(raw_evaluation)
    except json.JSONDecodeError:
        # The judge occasionally replies with non-JSON text. Record the failure
        # with the same keys the downstream cells read instead of aborting the
        # whole (slow, paid) loop on a single bad reply.
        evaluation = {'Relevance': 'PARSE_ERROR', 'Explanation': raw_evaluation}

    evaluations_gpt4o.append((record, answer_llm, evaluation))

  0%|          | 0/200 [00:00<?, ?it/s]

In [252]:
# Flatten the gpt-4o (record, answer, evaluation) triples into one row per
# question. NOTE: this rebinds df_eval, replacing the frame built earlier.
df_eval = (
    pd.DataFrame(evaluations_gpt4o, columns=['record', 'answer', 'evaluation'])
    .assign(
        id=lambda frame: frame['record'].apply(lambda rec: rec['id']),
        question=lambda frame: frame['record'].apply(lambda rec: rec['question']),
        relevance=lambda frame: frame['evaluation'].apply(lambda verdict: verdict['Relevance']),
        explanation=lambda frame: frame['evaluation'].apply(lambda verdict: verdict['Explanation']),
    )
    .drop(columns=['record', 'evaluation'])
)

In [253]:
df_eval.relevance.value_counts()

relevance
RELEVANT           158
PARTLY_RELEVANT     41
NON_RELEVANT         1
Name: count, dtype: int64

In [254]:
df_eval.relevance.value_counts(normalize=True)

relevance
RELEVANT           0.790
PARTLY_RELEVANT    0.205
NON_RELEVANT       0.005
Name: proportion, dtype: float64

In [255]:
# Persist the gpt-4o judged answers to CSV without writing the DataFrame index.
df_eval.to_csv('../data/rag-eval-gpt-4o.csv', index=False)

In [256]:
df_eval[df_eval.relevance == 'NON_RELEVANT']

Unnamed: 0,answer,id,question,relevance,explanation
137,The recommended duration for the Laser Maze ch...,43,How long is the recommended duration for the L...,NON_RELEVANT,The generated answer incorrectly states that t...
