In [117]:
import pandas as pd
import minsearch
from groq import Groq
from mistralai import Mistral
from dotenv import load_dotenv
import os

## Loading Environment Variables

In [115]:
load_dotenv()


True

In [118]:
# Access the API keys
mistral_api_key = os.getenv("MISTRAL_API_KEY")
groq_api_key = os.getenv("GROQ_API_KEY")

In [119]:
client = Groq(api_key=groq_api_key)
client_mistral=Mistral(api_key=mistral_api_key)

## Data Loading

In [17]:
df=pd.read_csv("../dataset/Mental_Health_FAQ.csv")

In [18]:
df.head(10)

Unnamed: 0,Question_ID,Questions,Answers
0,1590140,What does it mean to have a mental illness?,Mental illnesses are health conditions that di...
1,2110618,Who does mental illness affect?,It is estimated that mental illness affects 1 ...
2,6361820,What causes mental illness?,It is estimated that mental illness affects 1 ...
3,9434130,What are some of the warning signs of mental i...,Symptoms of mental health disorders vary depen...
4,7657263,Can people with mental illness recover?,"When healing from mental illness, early identi..."
5,1619387,What should I do if I know someone who appears...,Although this website cannot substitute for pr...
6,1030153,How can I find a mental health professional fo...,Feeling comfortable with the professional you ...
7,8022026,What treatment options are available?,Just as there are different types of medicatio...
8,1155199,"If I become involved in treatment, what do I n...",Since beginning treatment is a big step for in...
9,7760466,What is the difference between mental health p...,There are many types of mental health professi...


In [19]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 98 entries, 0 to 97
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Question_ID  98 non-null     int64 
 1   Questions    98 non-null     object
 2   Answers      98 non-null     object
dtypes: int64(1), object(2)
memory usage: 2.4+ KB


In [21]:
documents=df.to_dict('records')

In [22]:
documents[0]

{'Question_ID': 1590140,
 'Questions': 'What does it mean to have a mental illness?',
 'Answers': 'Mental illnesses are health conditions that disrupt a personâ€™s thoughts, emotions, relationships, and daily functioning. They are associated with distress and diminished capacity to engage in the ordinary activities of daily life.\nMental illnesses fall along a continuum of severity: some are fairly mild and only interfere with some aspects of life, such as certain phobias. On the other end of the spectrum lie serious mental illnesses, which result in major functional impairment and interference with daily life. These include such disorders as major depression, schizophrenia, and bipolar disorder, and may require that the person receives care in a hospital.\nIt is important to know that mental illnesses are medical conditions that have nothing to do with a personâ€™s character, intelligence, or willpower. Just as diabetes is a disorder of the pancreas, mental illness is a medical condit

## Indexing the data Using Minsearch

In [32]:
!curl -O https://raw.githubusercontent.com/DataTalksClub/llm-zoomcamp/main/01-intro/minsearch.py


  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed

  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
100  3832  100  3832    0     0   4260      0 --:--:-- --:--:-- --:--:--  4291


In [None]:
df

In [23]:
df = df.rename(columns={'Question_ID': 'question_id', 'Questions': 'questions','Answers':'answers'})


In [28]:
df.columns

Index(['question_id', 'questions', 'answers'], dtype='object')

In [29]:
documents=df.to_dict('records')

In [36]:
documents[0]

{'question_id': 1590140,
 'questions': 'What does it mean to have a mental illness?',
 'answers': 'Mental illnesses are health conditions that disrupt a personâ€™s thoughts, emotions, relationships, and daily functioning. They are associated with distress and diminished capacity to engage in the ordinary activities of daily life.\nMental illnesses fall along a continuum of severity: some are fairly mild and only interfere with some aspects of life, such as certain phobias. On the other end of the spectrum lie serious mental illnesses, which result in major functional impairment and interference with daily life. These include such disorders as major depression, schizophrenia, and bipolar disorder, and may require that the person receives care in a hospital.\nIt is important to know that mental illnesses are medical conditions that have nothing to do with a personâ€™s character, intelligence, or willpower. Just as diabetes is a disorder of the pancreas, mental illness is a medical condit

In [40]:
index=minsearch.Index(
    text_fields=['questions', 'answers'],
    keyword_fields=[]
)

In [41]:
index.fit(documents)

<minsearch.Index at 0x25470cb40d0>

In [42]:
query="What should I eat if I lost a friend"

In [45]:
index.search(query)

[{'question_id': 4759773,
  'questions': 'What should I do if I’m worried about a friend or relative?',
  'answers': 'This may depend on your relationship with them. Gently encouraging someone to seek appropriate support would be helpful to start with.'},
 {'question_id': 3388962,
  'questions': 'What should I know before starting a new medication?',
  'answers': 'The best source of information regarding medications is the physician prescribing them. He or she should be able to answer questions such as:    1. What is the medication supposed to do? 2. When should it begin to take effect, and how will I know when it is effective? 3. How is the medication taken and for how long? What food, drinks, other medicines, and activities should be avoided while taking this medication? 4. What are the side effects and what should be done if they occur? 5. What do I do if a dose is missed? 6. Is there any written information available about this medication? 7. Are there other medications that might 

## Evaluating Retrieval

In [48]:
df_questions = pd.read_csv('../dataset/ground_truth_data.csv')

In [49]:
df_questions.head()

Unnamed: 0,id,question
0,1590140,How do mental illnesses affect a person's dail...
1,1590140,What are some examples of serious mental illne...
2,1590140,Why is it a misconception to associate mental ...
3,1590140,"How are mental illnesses treated, and what is ..."
4,1590140,Can a person with mental illness become indepe...


In [51]:
ground_truth=df_questions.to_dict('records')

In [52]:
ground_truth[0]

{'id': 1590140,
 'question': "How do mental illnesses affect a person's daily functioning and relationships?"}

In [53]:
def search(query):
    boost = {}

    results = index.search(
        query=query,
        filter_dict={},
        boost_dict=boost,
        num_results=10
    )

    return results

In [57]:
def hit_rate(relevance_total):
    cnt = 0

    for line in relevance_total:
        if True in line:
            cnt = cnt + 1

    return cnt / len(relevance_total)

def mrr(relevance_total):
    total_score = 0.0

    for line in relevance_total:
        for rank in range(len(line)):
            if line[rank] == True:
                total_score = total_score + 1 / (rank + 1)

    return total_score / len(relevance_total)


In [72]:
def precision(relevance_list):
    """
    Precision: Proportion of retrieved documents that are relevant.
    """
    relevant_retrieved = sum(relevance_list)  # True indicates relevance
    total_retrieved = len(relevance_list)  # All retrieved documents
    if total_retrieved == 0:
        return 0.0  # Avoid division by zero
    return relevant_retrieved / total_retrieved

In [73]:
def recall(relevance_list, total_relevant):
    """
    Recall: Proportion of relevant documents that are retrieved.
    """
    relevant_retrieved = sum(relevance_list)  # True indicates relevance
    if total_relevant == 0:
        return 0.0  # Avoid division by zero
    return relevant_retrieved / total_relevant

In [86]:
def evaluate(ground_truth, search_function):
    relevance_total = []
    precision_scores = []
    recall_scores = []

    for q in tqdm(ground_truth):
        doc_id = q['id']
        
        # Get search results for this query
        results = search_function(q)
        
        # Check if the correct document (matching question_id) is in the results
        relevance = [doc['question_id'] == doc_id for doc in results]
        relevance_total.append(relevance)
        
        # Precision: Fraction of retrieved documents that are relevant
        precision_score = precision(relevance)
        precision_scores.append(precision_score)
        
        # Recall: There is only 1 relevant document per query, so recall is either 1 or 0
        recall_score = recall(relevance, 1)
        recall_scores.append(recall_score)

    return {
        'hit_rate': hit_rate(relevance_total),
        'mrr': mrr(relevance_total),
        'precision': sum(precision_scores) / len(precision_scores),  # Average precision
        'recall': sum(recall_scores) / len(recall_scores),  # Average recall
    }


In [61]:
from tqdm.auto import tqdm


In [87]:
evaluate(ground_truth, lambda q: search(q['question']))


  0%|          | 0/366 [00:00<?, ?it/s]

{'hit_rate': 0.9180327868852459,
 'mrr': 0.7107739179460493,
 'precision': 0.09180327868852516,
 'recall': 0.9180327868852459}

High Recall, Low Precision: This suggests that the system is finding most of the relevant documents, but it is also retrieving a lot of irrelevant ones. You might want to focus on improving the precision by better tuning your search algorithm (e.g., improving query relevance, adjusting boosts, or filtering irrelevant results).

## Rag Flow

In [126]:
entry_template="""
You are an expert mental health assistant specialized in providing detailed and accurate answers based on the given context.
Answer the QUESTION based on the CONTEXT from our meantal health database.
Use only the facts from the CONTEXT when answering the QUESTION.

Here is the context:

Context: {context}

Please answer the following question based on the provided context:

Question: {question}

Provide a detailed and informative response. Ensure that your answer is clear, concise, and directly addresses the question while being relevant to the context provided.

Your response should be in plain text and should not include any code blocks or extra formatting.

Answer:
""".strip()
def build_prompt(query, search_results):
    context = ""
    
    for doc in search_results:
        context = context.format(**doc) + "\n\n"

    prompt = prompt_template.format(question=query, context=context).strip()
    return prompt


In [123]:
def llm(prompt,model):
        response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}])
        return response.choices[0].message.content

In [124]:
def rag(query, model='gpt-4o-mini'):
    search_results = search(query)
    prompt = build_prompt(query, search_results)
    #print(prompt)
    answer = llm(prompt, model=model)
    return answer

In [127]:
rag(query)

KeyError: 'answer_llm'

In [110]:
prompt_template = """
You are an expert evaluator for a RAG system.
Your task is to analyze the relevance of the generated answer to the given question.
Based on the relevance of the generated answer, you will classify it
as "NON_RELEVANT", "PARTLY_RELEVANT", or "RELEVANT".

Here is the data for evaluation:

Question: {question}
Generated Answer: {answer_llm}

Please analyze the content and context of the generated answer in relation to the question
and provide your evaluation in parsable JSON without using code blocks:

{{
  "Relevance": "NON_RELEVANT" | "PARTLY_RELEVANT" | "RELEVANT",
  "Explanation": "[Provide a brief explanation for your evaluation]"
}}
""".strip()

In [105]:
record=ground_truth[0]

In [111]:
prompt=prompt_template.format(**record)

KeyError: 'answer_llm'

In [109]:
record

{'id': 1590140,
 'question': "How do mental illnesses affect a person's daily functioning and relationships?"}