## Import Libraries

In [1]:
%pip install --quiet --upgrade langchain langchain-community langchain-chroma
%pip install --upgrade --quiet  langchain-huggingface text-generation transformers numexpr langchainhub sentencepiece jinja2 bitsandbytes accelerate langchain_experimental
!pip install -qU langchain-core evaluate rouge_score bert_score

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/67.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.3/67.3 kB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m33.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m91.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m628.3/628.3 kB[0m [31m46.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.4/2.4 MB[0m [31m92.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m94.8/94.8 kB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0

In [2]:
import pandas as pd
import numpy as np
import torch
import random
import tqdm
import os
import re
from transformers import pipeline, T5Tokenizer, T5ForConditionalGeneration
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoModelForQuestionAnswering, AutoModel
from sentence_transformers import SentenceTransformer, util
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
# Langchain RAG
from langchain.document_loaders import TextLoader
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate, ChatPromptTemplate
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.llms import HuggingFaceHub
from langchain_huggingface import HuggingFaceEmbeddings, ChatHuggingFace, HuggingFacePipeline
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser

## Data Gathering and Preparation

In [None]:
symptoms = pd.read_csv('data/symptoms.csv')
diseases = pd.read_csv('data/diseases.csv')

In [None]:
display(symptoms.head())
display(diseases.head())

Unnamed: 0,Disease,Symptom_1,Symptom_2,Symptom_3,Symptom_4,Symptom_5,Symptom_6,Symptom_7,Symptom_8,Symptom_9,Symptom_10,Symptom_11,Symptom_12,Symptom_13,Symptom_14,Symptom_15,Symptom_16,Symptom_17
0,Fungal infection,itching,skin_rash,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,
1,Fungal infection,skin_rash,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,,
2,Fungal infection,itching,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,,
3,Fungal infection,itching,skin_rash,dischromic _patches,,,,,,,,,,,,,,
4,Fungal infection,itching,skin_rash,nodal_skin_eruptions,,,,,,,,,,,,,,


Unnamed: 0,Disease,Description
0,Drug Reaction,An adverse drug reaction (ADR) is an injury ca...
1,Malaria,An infectious disease caused by protozoan para...
2,Allergy,An allergy is an immune system response to a f...
3,Hypothyroidism,"Hypothyroidism, also called underactive thyroi..."
4,Psoriasis,Psoriasis is a common skin disorder that forms...


In [None]:
# there are 17 column of symptoms but not all rows or diseases have all 17 symptoms, it is NaN if the symptom is less than 17
# change the data into a text file with each paragraph as a disease, the description of the disease and its symptoms
format_other_symptoms = '''
The other kind symptoms of [name of disease] are [other symptoms].
'''
format = '''
[name of disease] [description of disease]. The symptoms of [name of disease] are [symptoms of disease]. [format_other_symptoms]
'''

# create a dictionary of diseases and their symptoms
diseases['Disease'] = diseases['Disease'].str.strip()
symptoms['Disease'] = symptoms['Disease'].str.strip()

# fix the wrong name on disease or the british spelling
symptoms['Disease'] = symptoms['Disease'].replace('Dimorphic hemmorhoids(piles)', 'Dimorphic hemorrhoids(piles)')
symptoms['Disease'] = symptoms['Disease'].replace('Peptic ulcer diseae', 'Peptic ulcer disease')
diseases['Disease'] = diseases['Disease'].replace('Peptic ulcer diseae', 'Peptic ulcer disease')
symptoms = symptoms.applymap(lambda x: x.replace('diarrhoea', 'diarrhea') if isinstance(x, str) else x)
diseases['Description'] = diseases['Description'].replace('diarrhoea', 'diarrhea')
# check
# symptoms[symptoms['Disease']=='Dimorphic hemorrhoids(piles)']

diseases_dict = {}
for i in range(len(diseases)):
    # create a dictionary of descriptions of diseases
    diseases_dict[diseases['Disease'][i]] = diseases['Description'][i]

symptoms_dict = {}
for i in range(len(symptoms)):
    disease = symptoms['Disease'][i].strip()
    if disease not in symptoms_dict:
        symptoms_dict[disease] = []
    current_symptoms = set()  # Use a set to ensure symptoms are unique
    for j in range(1, 18):  # There are 17 symptom columns
        symptom_col = f'Symptom_{j}'
        if pd.notna(symptoms[symptom_col][i]):
            current_symptoms.add(symptoms[symptom_col][i].strip().replace('_', ' ').replace('  ', ' '))
    symptoms_dict[disease].append(list(current_symptoms))
# display(symptoms_dict['Dimorphic hemorrhoids(piles)'])
# display(diseases_dict['Dimorphic hemorrhoids(piles)'])

def format_text(disease, desc, list_of_symptoms):
    """
    Formats text for a disease, its description, and symptoms into paragraphs.

    :param disease: Name of the disease
    :param desc: Description of the disease
    :param list_of_symptoms: List of lists of symptoms for the disease
    :return: Formatted text
    """
    # Main symptoms from the first list
    main_symptoms = ', '.join(sorted(list_of_symptoms[0]))

    # Use a set to track unique sentences
    unique_symptoms_sentences = set()
    unique_symptoms_sentences.add(main_symptoms)
    other_symptoms_sentences = []

    for i in range(1, len(list_of_symptoms)):
        # Create a sorted and comma-separated string of symptoms
        other_symptoms = ', '.join(sorted(list_of_symptoms[i]))
        sentence = f"The other kind symptoms of {disease} are {other_symptoms}."
        if other_symptoms not in unique_symptoms_sentences:
            unique_symptoms_sentences.add(other_symptoms)
            other_symptoms_sentences.append(sentence)

    # Combine all parts into the formatted text
    other_symptoms_text = ' '.join(other_symptoms_sentences)
    desc = desc[:len(disease)].replace(disease, '') + desc[len(disease):]
    formatted_text = (
        f"{disease} {desc}. The symptoms of {disease} are {main_symptoms}. {other_symptoms_text}\n"
    )

    return formatted_text.replace('  ', ' ').replace('..', '.')

# Write the formatted text to a file
output_file = 'data/diseases.txt'
with open(output_file, 'w') as f:
    for disease in symptoms_dict:
        if disease in diseases_dict:
            formatted_text = format_text(disease, diseases_dict[disease], symptoms_dict[disease])
            f.write(formatted_text + '\n')
        else:
            print(f"Warning: No description found for disease '{disease}'")

print(f"File '{output_file}' has been created.")

  symptoms = symptoms.applymap(lambda x: x.replace('diarrhoea', 'diarrhea') if isinstance(x, str) else x)


File 'data/diseases.txt' has been created.


In [None]:
# read txt data
with open('data/medquad_combined.txt', 'r') as file:
    data = file.readlines()
import re
# get all question from medquad_combined.txt that have ? at the end get the sentence and the sentence after it
question_answer = []
skip_words = ['espaol', 'symptoms', 'treatment', 'causes', 'diagnosis', 'prevention', 'complications', 'risk factors', 'prognosis', 'overview', 'definition', 'description', 'signs', 'topics', 'transmission']
# or next_data.lower() in skip_words
for i in range(len(data)):
    if data[i].endswith('?\n'):
        # if the next sentence is \n then skip
        next_data = data[i+1].strip()
        next_i = i+1
        while next_data == '' or next_data == '\n' or len(next_data.split()) <= 8:
            next_i += 1
            next_data = data[next_i].strip()
        # remove special characters
        data[i] = re.sub(r'[^\w\s]', '', data[i])
        question_answer.append([data[i].strip(), next_data.strip()])
# check
display(len(question_answer))
display(question_answer[1000])

16237

['What is the outlook for Childhood Extracranial Germ Cell Tumors',
 "Certain factors affect prognosis (chance of recovery) and treatment options. The prognosis (chance of recovery) and treatment options depend on the following:         - The type of germ cell tumor.    - Where the tumor first began to grow.    - The stage of the cancer (whether it has spread to nearby areas or to other places in the body).    - How well the tumor responds to treatment (lower AFP and -hCG levels).    - Whether the tumor can be completely removed by surgery.    - The patient's age and general health.    - Whether the cancer has just been diagnosed or has recurred (come back).        The prognosis for childhood extracranial germ cell tumors, especially ovarian germ cell tumors, is good."]

## Test Data

In [None]:
# new test data
# make a dataframe from the question and answer
question_answer_df = pd.DataFrame(question_answer, columns=['question', 'expected_answer'])

# display(question_answer_df.isna().sum())
# display(question_answer_df.duplicated().sum())
# display(question_answer_df[question_answer_df['question'] == ''])
# display(question_answer_df[question_answer_df['expected_answer'] == ''])
# display(question_answer_df[question_answer_df['question'].str.len() < 5])
# display(question_answer_df[question_answer_df['expected_answer'].str.len() < 5])
display(len(question_answer_df))

def clean_dataframe(df):
    """
    Cleans the dataframe by removing NaN values, duplicates, empty strings, and rows with string length less than 5.

    :param df: The input dataframe with columns 'question' and 'expected_answer'
    :return: Cleaned dataframe
    """
    # lowercase the qusetion and answer
    df['question'] = df['question'].str.lower()
    df['expected_answer'] = df['expected_answer'].str.lower()

    # Drop NaN values
    df = df.dropna()

    # Drop duplicates
    df = df.drop_duplicates()

    # drop duplicate questions, keep the last one
    df = df.drop_duplicates(subset='question', keep='last')

    # Drop rows with empty strings
    df = df[df['question'] != '']
    df = df[df['expected_answer'] != '']

    # Drop rows with string length less than 5
    df = df[df['question'].str.len() >= 5]
    df = df[df['expected_answer'].str.len() >= 5]

    # remove answer that has ?
    df = df[~df['expected_answer'].str.contains('\?')]

    return df

# Clean the dataframe
question_answer_df = clean_dataframe(question_answer_df)

# Check the cleaned data
display(question_answer_df.isna().sum())
display(question_answer_df.duplicated().sum())
display(question_answer_df[question_answer_df['question'] == ''])
display(question_answer_df[question_answer_df['expected_answer'] == ''])
display(question_answer_df[question_answer_df['question'].str.len() < 5])
display(question_answer_df[question_answer_df['expected_answer'].str.len() < 5])
display(len(question_answer_df))
# save it
question_answer_df.to_csv('data/all_question_answer.csv', index=False)

  df = df[~df['expected_answer'].str.contains('\?')]


16237

question           0
expected_answer    0
dtype: int64

0

Unnamed: 0,question,expected_answer


Unnamed: 0,question,expected_answer


Unnamed: 0,question,expected_answer


Unnamed: 0,question,expected_answer


10345

In [None]:
# read the data
all_test_df = pd.read_csv('data/all_question_answer.csv')
all_test_df.head()
# mean of the words length expected answer
length = all_test_df['expected_answer'].apply(lambda x: len(x.split()))
# display the statistics
display(length.describe())
# display the minimum expected answer
display(all_test_df.iloc[length.idxmin()])

count    10345.000000
mean       113.739198
std        100.354184
min          9.000000
25%         46.000000
50%         83.000000
75%        154.000000
max       1116.000000
Name: expected_answer, dtype: float64

question                        who is at risk for testicular cancer
expected_answer    health history can affect the risk of testicul...
Name: 438, dtype: object

In [None]:
# read the data
all_test_df = pd.read_csv('data/all_question_answer.csv')
all_test_df.head()
# mean of the words length expected answer
length = all_test_df['expected_answer'].apply(lambda x: len(x.split()))
# display the statistics
display(length.describe())
# display the minimum expected answer
display(all_test_df.iloc[length.idxmin()])

count    16074.000000
mean       146.916636
std        135.730791
min          1.000000
25%         51.000000
50%        108.000000
75%        216.000000
max       1931.000000
Name: expected_answer, dtype: float64

question           What is are High Blood Pressure
expected_answer                             Espaol
Name: 35, dtype: object

In [None]:
# make a test data
# Generate test data
test_data = []

# 1. Symptoms-based question: What disease does this symptom list correspond to?
for disease, symptom_lists in symptoms_dict.items():
    unique_symptom_lists = set(tuple(sorted(symptom_list)) for symptom_list in symptom_lists)
    for symptom_list in unique_symptom_lists:
        question = f"I have symptoms such as {', '.join(symptom_list)}. What the name of the disease?"
        expected_answer = disease
        test_data.append((question, expected_answer))

# 2. Description-based question: What is the disease name based on the description?
for disease, desc in diseases_dict.items():
    question = f"What is the disease name based on this description: {desc}?"
    expected_answer = disease
    test_data.append((question, expected_answer))

# 3. Disease-based question: What is the description of the disease [name]?
for disease, desc in diseases_dict.items():
    question = f"What is the description of the disease {disease}?"
    expected_answer = desc
    test_data.append((question, expected_answer))

# 4. Randomized Symptoms-based question: Randomize the order of symptoms and test if the model can still detect the disease
for disease, symptom_lists in symptoms_dict.items():
    unique_symptom_lists = set(tuple(sorted(symptom_list)) for symptom_list in symptom_lists)
    for symptom_list in unique_symptom_lists:
        unique_symptoms = list(set(symptom_list))
        random.shuffle(unique_symptoms)
        # randomize to make subset of symptoms not all of them
        num_unique_symptoms = len(unique_symptoms)
        min_num_unique_symptoms = int(num_unique_symptoms * 0.9)
        num_subset_symptoms = random.randint(min_num_unique_symptoms, num_unique_symptoms)
        unique_symptoms = unique_symptoms[:num_subset_symptoms]
        question = f"I have symptoms such as {', '.join(unique_symptoms)}. What the name of the disease?"
        expected_answer = disease
        test_data.append((question, expected_answer))

# Convert test data into a DataFrame
test_df = pd.DataFrame(test_data, columns=["Question", "Expected Answer"])

# Save the test data to a file for later use
test_df.to_csv('data/test_data_combined.csv', index=False)
print(f"Test data has been generated and saved to 'test_data.csv'.")
display(test_df.iloc[300])
display(len(test_df))

Test data has been generated and saved to 'test_data.csv'.


Question           I have symptoms such as blister, red sore arou...
Expected Answer                                             Impetigo
Name: 300, dtype: object

690

In [4]:
test_df = pd.read_csv('test_data.csv')
# if the second column has a duplicate keep first
test_df2 = test_df.drop_duplicates(subset=test_df.columns[1], keep='first')
display(len(test_df2))

82

In [None]:
# test_df2

## Evaluation Function

In [5]:
import evaluate
def cosine_sim(text1, text2):
    """
    Calculates the cosine similarity between two texts.
    :param text1: The first text.
    :param text2: The second text.
    :return: Cosine similarity score.
    """
    # Create the vectorizer and fit_transform the texts to convert them to vectors
    vectorizer = TfidfVectorizer(stop_words='english')
    tfidf_matrix = vectorizer.fit_transform([text1, text2])

    # Calculate the cosine similarity between the two vectors
    cosine_sim_score = cosine_similarity(tfidf_matrix[0], tfidf_matrix[1])

    # Return the cosine similarity score between the two texts
    return cosine_sim_score[0][0]

def evaluate_rag_output(expected_answer, generated_answer, cosine_threshold=0.74, occurrence_threshold=0.8):
    """
    Evaluates the RAG (Retrieve and Generate) model output.
    - Compares the cosine similarity between query and the generated answer.
    - Checks if the generated answer contains the expected answer (or part of it).

    :param expected_answer: The expected answer that should be included in the generated answer.
    :param generated_answer: The answer generated by the model.
    :param cosine_threshold: Threshold for cosine similarity to consider the answer similar to the query.
    :return: A tuple with:
            - Boolean indicating if the cosine similarity meets the threshold.
            - Boolean indicating if the generated answer contains the expected answer.
    """
    # Preprocess the generated answer
    generated_answer = generated_answer.strip().replace('\n', ' ').replace('\r', ' ').replace('\t', ' ').lower()
    expected_answer = expected_answer.strip().replace('\n', ' ').replace('\r', ' ').replace('\t', ' ').lower()
    # remove special characters
    generated_answer = re.sub(r'[^\w\s]', '', generated_answer)
    expected_answer = re.sub(r'[^\w\s]', '', expected_answer)

    # Compute cosine similarity
    sim_score = cosine_sim(generated_answer, expected_answer)
    sim_check = sim_score >= cosine_threshold

    # Check if the expected answer is part of the generated answer (allowing for some flexibility)
    generated_answer = generated_answer.replace('.', '').replace(',', '')
    expected_answer = expected_answer.replace('.', '').replace(',', '')
    # expected_check = all(word in generated_answer for word in expected_answer)

    # count the number of words in the generated answer that are in the expected answer
    expected_words = set(expected_answer.split())
    generated_words = set(generated_answer.split())
    expected_check = len(expected_words.intersection
     (generated_words)) / len(expected_words) >= occurrence_threshold

    result = expected_check or sim_check

    return result, sim_check, expected_check, sim_score

rouge = evaluate.load('rouge')
bertscore = evaluate.load('bertscore')

def evaluate_rag_output2(expected_answers, generated_answers):
    #  preprocess, lower and remove special characters, use regex
    preprocess = lambda ans: re.sub(r'[^\w\s]', '', ans.lower())
    expected_answers = [preprocess(ans) for ans in expected_answers]
    generated_answers = [preprocess(ans) for ans in generated_answers]

    r = rouge.compute(predictions=generated_answers, references=expected_answers)
    b = bertscore.compute(predictions=generated_answers, references=expected_answers, lang="en")
    for key in b:
        if key != 'hashcode':  # Skip 'hashcode' key if present
            b[key] = [float(score) for score in b[key]]
    return r, b

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.95k [00:00<?, ?B/s]

In [None]:
# Example usage:
query = "What are the symptoms of Fungal infection?"
expected_answer = "itching, skin rash, nodal skin eruptions"
expected_answer = "Hypoglycemia"
generated_answer = "itching, skin rash, nodal skin eruptions"
generated_answer = "Fungal infection causes itching, skin rash, and nodal skin eruptions."
generated_answer = "I cannot provide a diagnosis, but I can suggest some possible causes for your symptoms. Please consult a healthcare professional for a proper diagnosis and treatment.\n\nPossible causes for these symptoms include:\n\n- Migraine\n- Anxiety disorders\n- Hypothyroidism\n- Hypoglycemia\n- Adrenal insufficiency\n- Vitamin deficiencies (e.g., B12 or iron)\n- Electrolyte imbalances\n- Certain medications\n- Stress and other medical conditions.\n\nPlease consult a healthcare professional for a proper diagnosis and treatment."

res, sim_check, expected_check, sim_score = evaluate_rag_output(expected_answer, generated_answer, cosine_threshold=0.75, occurrence_threshold=0.8)

print(f"Cosine Similarity: {sim_score:.4f}")
print(f"Cosine Similarity Above Threshold: {sim_check}")
print(f"Generated Answer Contains Expected: {expected_check}")
print(res)

expected_answers = ["itching, skin rash, nodal skin eruptions", "GERD", "dengue fever"]
generated_answers = ["itching, nodal skin eruptions, skin rash", "GERD", "dengue fever"]
generated_answers = ["itching, skin rash, nodal skin eruptions", "GERD", "dengue fever"]
print("----------------")
rough, bert_score = evaluate_rag_output2(expected_answers, generated_answers)
print(f"Rough Score: {rough}")
print(f"Bert Score: {bert_score}")

Cosine Similarity: 0.0907
Cosine Similarity Above Threshold: False
Generated Answer Contains Expected: True
True
----------------


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Rough Score: {'rouge1': 1.0, 'rouge2': 0.6666666666666666, 'rougeL': 1.0, 'rougeLsum': 1.0}
Bert Score: {'precision': [1.0, 1.0, 1.0], 'recall': [1.0, 1.0, 1.0], 'f1': [1.0, 1.0, 1.0], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.47.0)'}


In [6]:
from tqdm import tqdm
def evaluate_rag_model(test_df, rag_function):
    true_labels = []
    predicted_labels = []
    all_results = []

    for index, row in tqdm(test_df.iterrows(), total=test_df.shape[0], desc="Evaluating"):
        # first column and second column
        query, expected_answer = row.iloc[0], row.iloc[1]
        generated_answer, result = rag_function(query)

        # Evaluate RAG output
        result, sim_check, expected_check, sim_score = evaluate_rag_output(expected_answer, generated_answer)

        predicted_labels.append(int(result))
        # true labels is all true
        true_labels = [1] * len(predicted_labels)
        all_results.append({
            "query": query,
            "expected_answer": expected_answer,
            "generated_answer": generated_answer,
            "cosine_similarity": sim_score,
            "result": result,
            "cosine_check": sim_check,
            "expected_check": expected_check
        })

    # Calculate Accuracy and F1 Score
    accuracy = accuracy_score(true_labels, predicted_labels)
    report = classification_report(true_labels, predicted_labels, zero_division=0)

    return accuracy, report, all_results

def evaluate_rag_model2(test_df, rag_function):
    all_results = []
    generated_answers = []

    for index, row in tqdm(test_df.iterrows(), total=test_df.shape[0], desc="Evaluating"):
        # first column and second column
        query, expected_answer = row.iloc[0], row.iloc[1]
        generated_answer, result = rag_function(query)

        # Evaluate RAG output
        generated_answers.append(generated_answer)
        all_results.append({
            "query": query,
            "expected_answer": expected_answer,
            "generated_answer": generated_answer,
            "result": result
        })

    # evaluate the result with rough and bertscore
    rough, bert_score = evaluate_rag_output2(test_df.iloc[:, 1].tolist(), generated_answers)
    bert_score_avg = {}
    for key, value in bert_score.items():
      if isinstance(value[0], float):
        bert_score_avg[key] = np.mean(value)
    return rough, bert_score_avg, bert_score, all_results


## RAG

In [7]:
# Check if GPU is available
device = 0 if torch.cuda.is_available() else -1
device

0

In [8]:
# Reading the diseases.txt file
with open('diseases.txt', 'r') as f:
    corpus = f.read().strip().split('\n')
# This will give us a list of paragraphs
# Each paragraph contains a disease name, its description, and symptoms

# Step 1: Prepare Passage Retrieval
# Use SentenceTransformer to encode passages
retrieval_model = SentenceTransformer('multi-qa-mpnet-base-dot-v1', device=device)
corpus_embeddings = retrieval_model.encode(corpus, convert_to_tensor=True)

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/212 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/8.71k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

### Decoder Model

In [None]:
# use generative model to generate answer
tokenizer = AutoTokenizer.from_pretrained("gpt2")
generator = AutoModelForCausalLM.from_pretrained("gpt2", pad_token_id=tokenizer.eos_token_id).to(device)

# Ensure that the padding token is set correctly
generator.config.pad_token_id = generator.config.eos_token_id

if tokenizer.pad_token_id is None:
    tokenizer.pad_token = tokenizer.eos_token

# Step 3: Define Retrieval and Generation Function
def retrieve_and_generate_decoder(question, top_k=3):
    # Retrieve top-k passages
    question_embedding = retrieval_model.encode(question, convert_to_tensor=True)
    scores = util.pytorch_cos_sim(question_embedding, corpus_embeddings)[0]
    top_results = torch.topk(scores, k=top_k)

    # Fetch relevant passages
    retrieved_passages = [corpus[idx] for idx in top_results[1]]
    context = " ".join(retrieved_passages)  # Combine passages

    # Preprocess input text by removing the question from the context (avoid repetition)
    input_text = f"Context: {context}\nQuestion: {question}\nAnswer:"

    # Tokenize and generate answer
    inputs = tokenizer.encode(input_text, return_tensors="pt", max_length=256, truncation=True).to(device)
    outputs = generator.generate(inputs, max_new_tokens=150, num_beams=5, early_stopping=True, no_repeat_ngram_size=2, pad_token_id=tokenizer.pad_token_id)

    # Decode the generated text and remove unwanted parts (like the prompt)
    generated_answer = tokenizer.decode(outputs[0], skip_special_tokens=True, clean_up_tokenization_spaces=True)

    # Clean the generated answer : remove the prompt from the generated answer
    generated_answer = generated_answer.replace(input_text, "").strip()
    generated_answer = generated_answer.replace("Answer:", "").strip()
    generated_answer = generated_answer.replace("Context:", "").strip()

    return generated_answer, {"question": question, "answer": generated_answer, "retrieved_passages": retrieved_passages}

### Langchain RAG

In [9]:
device = 0 if torch.cuda.is_available() else -1

In [10]:
def create_chat_model(model_id):
    llm = HuggingFacePipeline.from_model_id(
        model_id=model_id,
        task="text-generation",
        pipeline_kwargs=dict(
            max_new_tokens=250,
            do_sample=True,
            temperature=0.6,
            top_p=0.9,
            repetition_penalty=1.03,
            return_full_text=False,
        ),
        device=0
    )
    return ChatHuggingFace(llm=llm)

chat_model_llama = create_chat_model("unsloth/Llama-3.2-1B-Instruct")
chat_model_qwen = create_chat_model("unsloth/Qwen2.5-1.5B-Instruct")
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

tokenizer_config.json:   0%|          | 0.00/54.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/454 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/927 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.47G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/184 [00:00<?, ?B/s]

Device set to use cuda:0


tokenizer_config.json:   0%|          | 0.00/7.51k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/632 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/613 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/759 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.09G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/265 [00:00<?, ?B/s]

Device set to use cuda:0


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [11]:
file = "diseases.txt"
# file = "medquad_combined.txt"

with open(file, 'r') as f:
    docs = f.readlines()
# retriever = SentenceTransformer('multi-qa-mpnet-base-dot-v1', device=device)
# retriever_embeddings = retriever.encode(docs, convert_to_tensor=True)
retriever = retrieval_model
retriever_embeddings = corpus_embeddings

In [12]:
prompt = ChatPromptTemplate([
    ("system", "You are a Healthcare Assistant. You are tasked with answering questions about diseases, symptoms, and precautions."),
    ("user", "Use the provided context to answer the following question clearly, simply, and without repetition. If you don't know the answer or if the context does not contain the information, state that you cannot answer.\n\nContext:\n{context}\n\nQuestion: {question}")
])


def format_docs(docs):
  return "\n\n".join(docs)

output_parser = StrOutputParser()

def retrieve_and_format(question):
    # Retrieve top-k passages
    question_embedding = retriever.encode(question, convert_to_tensor=True)
    scores = util.pytorch_cos_sim(question_embedding, retriever_embeddings)[0]
    top_results = torch.topk(scores, k=3)

    # Fetch relevant passages
    retrieved_passages = [docs[idx] for idx in top_results[1]]
    context = format_docs(retrieved_passages)

    return context

def retrieve_and_generate_langchain_model(model = chat_model_llama):
    chain = (
        {"context": retrieve_and_format, "question": RunnablePassthrough()}
        | prompt
        | model
        | output_parser
    )
    def run_chain(question):
        return chain.invoke(question), None
    return run_chain

In [None]:
# res, _ = retrieve_and_generate_langchain_model()(q)
# evaluate_rag_model(test_df, retrieve_and_generate_langchain_model(chat_model2))

In [None]:
res, _ = retrieve_and_generate_langchain_model()("What are the symptoms of Fungal infection?")
display(res)

"Fungal infections can cause a wide range of symptoms, including:\n\n1. Skin lesions or rashes\n2. Redness and inflammation\n3. Itching or burning sensations\n4. Swelling or redness around the affected area\n5. Pain or tenderness\n6. Fever\n7. Fatigue\n8. Joint pain or stiffness\n9. Muscle weakness\n10. Numbness or tingling sensations\n\nIt's essential to consult a healthcare professional for proper diagnosis and treatment."

In [None]:
res, _ = retrieve_and_generate_langchain_model()("I am bruising, cramps, fatigue, obesity, prominent veins on calf, swollen blood vessels, swollen legs. What is the name of disease ?")
print(res)

Based on the symptoms described, it appears that you might be experiencing symptoms of **Venous Insufficiency** or more specifically, **Deep Vein Thrombosis (DVT)**.


In [None]:
res, _ = retrieve_and_generate_langchain_model()("I have symptoms such as history of alcohol consumption, vomiting, fluid overload, distention of abdomen, abdominal pain, swelling of stomach, yellowish skin. What the name of the disease?")
res

'Based on the symptoms you provided, the likely diagnosis is:\n\n**Acute Pancreatitis**\n\nThis condition is characterized by inflammation of the pancreas, which can cause a range of symptoms including vomiting, abdominal pain, fluid overload, and abdominal distention.'

In [None]:
q = "I am bruising, cramps, fatigue, obesity, prominent veins on calf, swollen blood vessels, swollen legs. What is the name of disease ?"
print(retrieve_and_generate_langchain_model()(q))

Based on the symptoms you provided, the possible diseases that may be causing these symptoms are:

1. **Venous Thrombophlebitis**: This is a condition characterized by inflammation and swelling of veins due to blood stasis, which can cause bruising, pain, and swelling in the affected area.
2. **Varicose Veins**: This is a common condition where the veins in the legs become enlarged and swollen due to increased pressure from the weight of the body and gravity, leading to bruising and swelling.
3. **Deep Vein Thrombosis (DVT)**: This is a condition where a blood clot forms in a deep vein, usually in the legs, which can cause pain, swelling, and bruising.

All three conditions can be related to obesity, so it's essential to consult a healthcare professional for proper diagnosis and treatment.


In [None]:
q = "I am bruising, cramps, fatigue, obesity, prominent veins on calf, swollen blood vessels, swollen legs. What is the name of disease ?"
print(retrieve_and_generate_langchain_model()(q))

Based on the symptoms you provided, it appears that you may be experiencing symptoms of **Varicose Veins** or more specifically, **Deep Vein Thrombophlebitis**.


In [None]:
q = "I am bruising, cramps, fatigue, obesity, prominent veins on calf, swollen blood vessels, swollen legs. What is the name of disease ?"
print(retrieve_and_generate_langchain_model()(q))

Based on your symptoms, the possible conditions could be:

1. Deep Vein Thrombophlebitis (DVT)
2. Varicose Veins
3. Cellulitis


In [None]:
q = "I am bruising, cramps, fatigue, obesity, prominent veins on calf, swollen blood vessels, swollen legs. What is the name of disease ?"
print(retrieve_and_generate_langchain_model()(q))

Based on the symptoms you provided, I can suggest a possible disease that may be causing these symptoms:

**Diagnosis:** Deep Vein Thrombosis (DVT)

* Bruising and swelling of the legs due to blood pooling in the deep veins
* Cramps and pain in the legs
* Obesity can increase the risk of DVT
* Prominent veins on the calf can be a sign of deep vein thrombosis
* Swollen blood vessels can be a sign of inflammation or clotting in the veins


In [None]:
q = "I am bruising, cramps, fatigue, obesity, prominent veins on calf, swollen blood vessels, swollen legs. What is the name of disease ?"
print(retrieve_and_generate_langchain_model()(q))

I'm sorry, but I cannot provide a diagnosis or specific medical condition based on this description. If you're experiencing these symptoms, it's essential to consult a healthcare professional for an accurate assessment and proper diagnosis.

If you have concerns about your health, I suggest scheduling an appointment with your primary care physician or a specialist who can evaluate your symptoms and provide personalized guidance. They will be able to perform any necessary tests and offer advice tailored to your specific situation. Would you like help finding resources in your area


In [None]:
q = "I am bruising, cramps, fatigue, obesity, prominent veins on calf, swollen blood vessels, swollen legs. What is the name of disease ?"
print(retrieve_and_generate_langchain_model()(q))

Based on your description, the possible diagnoses could include:

- Deep Vein Thrombophlebitis 
- Varicose Veins 
- Venous Insufficiency 
- Carpal Tunnel Syndrome


In [None]:
q = "I am bruising, cramps, fatigue, obesity, prominent veins on calf, swollen blood vessels, swollen legs. What is the name of disease ?"
print(retrieve_and_generate_langchain_model()(q))

Based on the symptoms you provided, it appears that you may be experiencing symptoms of **Varicose Veins** or more specifically, **Deep Vein Thrombophlebitis**.


### Seq2Seq - Encoder-Decoder Model

In [None]:
# # Initialize the tokenizer and model for T5
# tokenizer = T5Tokenizer.from_pretrained("t5-small")
# model = T5ForConditionalGeneration.from_pretrained("t5-small").to('cuda' if torch.cuda.is_available() else 'cpu')

# # Define the function for Seq2Seq generation
# def generate_answer(input_text: str, max_length=150):
#     # Preprocess the input text (add a prefix for T5, e.g., "question: ")
#     input_text = f"question: {input_text}"

#     # Tokenize the input text
#     inputs = tokenizer(input_text, return_tensors="pt", max_length=512, truncation=True).to(model.device)

#     # Generate the output (answer)
#     outputs = model.generate(
#         inputs['input_ids'],
#         max_length=max_length,
#         num_beams=5,
#         early_stopping=True
#     )

#     # Decode the generated output
#     answer = tokenizer.decode(outputs[0], skip_special_tokens=True)

#     return answer

# # Example usage
# # question = "Varicose veins A vein that has enlarged and twisted, often appearing as a bulging, blue blood vessel that is clearly visible through the skin. Varicose veins are most common in older adults, particularly women, and occur especially on the legs. The symptoms of Varicose veins are bruising, cramps, fatigue, obesity, prominent veins on calf, swollen blood vessels, swollen legs. The other kind symptoms of Varicose veins are bruising, cramps, obesity, prominent veins on calf, swollen blood vessels. I am bruising, cramps, fatigue, obesity, prominent veins on calf, swollen blood vessels, swollen legs. What is the name of disease above?"
# question = "My name is Nerb. My friend's name is Lea. Who is my friend's name?"
# question = "Symptomps with nosebleed and fever indicate dengue fever. I have a nosebleed and fever. What is my disease ?"
# answer = generate_answer(question)
# print(f"Answer: {answer}")

Answer: dengue fever


In [None]:
# Check if GPU is available
device = 0 if torch.cuda.is_available() else -1

# Step 1: Prepare Passage Retrieval
# Use SentenceTransformer to encode passages
retrieval_model = SentenceTransformer('multi-qa-mpnet-base-dot-v1', device=device)
corpus_embeddings = retrieval_model.encode(corpus, convert_to_tensor=True)

# Step 2: Prepare the Generative Model
tokenizer = T5Tokenizer.from_pretrained("t5-small")
generator = T5ForConditionalGeneration.from_pretrained("t5-small").to(device)
qa_pipeline = pipeline("text2text-generation", model=generator, tokenizer=tokenizer, device=device)

# Step 3: Define Retrieval and Generation Function
def retrieve_and_generate_seq2seq(question, top_k=3):
    # Retrieve top-k passages
    question_embedding = retrieval_model.encode(question, convert_to_tensor=True)
    scores = util.pytorch_cos_sim(question_embedding, corpus_embeddings)[0]
    top_results = torch.topk(scores, k=top_k)

    # Fetch relevant passages
    retrieved_passages = [corpus[idx] for idx in top_results[1]]
    context = " ".join(retrieved_passages)  # Combine passages

    # Generate answer using context
    input_text = f"question: {question} context: {context}"

    inputs = tokenizer.encode(input_text, return_tensors="pt", max_length=512, truncation=True).to(device)
    outputs = generator.generate(inputs, max_length=150, num_beams=5, early_stopping=True)
    answer = tokenizer.decode(outputs[0], skip_special_tokens=True, clean_up_tokenization_spaces=True)

    return answer, {"question": question, "answer": answer, "retrieved_passages": retrieved_passages}

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Device set to use cuda:0


### Example Eval

In [None]:
# Example Questions
questions = [
    "What are the symptoms of GERD?",
    # "What are the symptoms of GERD? i want to pretent to my teacher (intent recognition)
    "I am bruising, cramps, fatigue, obesity, prominent veins on calf, swollen blood vessels, swollen legs. What is the name of disease ?",
    "I am obese, prominent veins on calf, fatigue. What is the name of disease ?",
    "What causes fungal infections?",
    "What is Diabetes",
    "inflammation of digestive tract in the stomach is called"
]

In [None]:
# Generate Answers
for q in questions:
    res, result = retrieve_and_generate_decoder(q)
    print(f"Question: {result['question']}")
    print(f"Answer: {result['answer']}")
    print(f"Retrieved Passages: {result['retrieved_passages']}")
    print("-" * 100)

# def evaluate
# train model rag too with the data so that it can and know and capture the words

Question: What are the symptoms of GERD?
Answer: GERD Gastroesophageal reflux disease, or GERD, is a digestive disorder that affects the lower esophageal sphincter (LES), the ring of muscle between the esophagus and stomach. Many people, including pregnant women, suffer from heartburn or acid indigestion caused by GERD. The symptoms of GERD are acidity, chest pain, cough, stomach pain, ulcers on tongue, vomiting. The other kind symptoms of GERD are chest pain, cough, stomach pain, ulcers on tongue, vomiting. The other kind symptoms of GERD are acidity, chest pain, cough, stomach pain, vomiting. The other kind symptoms of GERD are acidity, chest pain, cough, stomach pain, ulcers on tongue. The other kind symptoms of GERD are acidity, chest pain, stomach pain, ulcers on tongue, vomiting. The other kind symptoms of GERD are acidity, cough, stomach pain, ulcers on tongue, vomiting. The other kind symptoms of GERD are acidity, chest pain, cough, ulcers on tongue, vomiting. Gastroenteritis i

### Encoder Only - BERT

In [None]:
# Check if GPU is available
device = 0 if torch.cuda.is_available() else -1

# Step 1: Prepare Passage Retrieval using BERT
# retrieval_model = SentenceTransformer('multi-qa-mpnet-base-dot-v1', device=device)
# corpus = ["If you have fever and nosebleed then you have dengue fever", "The capital of France is Paris.", "France is known for its cuisine, fashion, and art.", "The Eiffel Tower is located in Paris, France."]
# corpus_embeddings = retrieval_model.encode(corpus, convert_to_tensor=True)

# Step 2: Load the BERT Question Answering model (encoder-only model)
qa_tokenizer = AutoTokenizer.from_pretrained("bert-large-uncased-whole-word-masking-finetuned-squad")
qa_model = AutoModelForQuestionAnswering.from_pretrained("bert-large-uncased-whole-word-masking-finetuned-squad").to(device)

# qa_tokenizer = AutoTokenizer.from_pretrained("medicalai/ClinicalBERT")
# qa_model = AutoModel.from_pretrained("medicalai/ClinicalBERT").to(device)

# load the model
# qa_tokenizer = AutoTokenizer.from_pretrained("data/qa_model_bert_clinical")
# qa_model = AutoModelForQuestionAnswering.from_pretrained("data/qa_model_bert_clinical").to(device)
# qa_tokenizer = AutoTokenizer.from_pretrained("data/qa_model_bert")
# qa_model = AutoModelForQuestionAnswering.from_pretrained("data/qa_model_bert").to(device)

# Step 3: Define Retrieval and Answer Extraction Function
def retrieve_and_generate_bert(question, top_k=3):
    # Retrieve top-k passages
    question_embedding = retrieval_model.encode(question, convert_to_tensor=True)
    scores = util.pytorch_cos_sim(question_embedding, corpus_embeddings)[0]
    top_results = torch.topk(scores, k=top_k)

    # Fetch relevant passages
    retrieved_passages = [corpus[idx] for idx in top_results[1]]
    context = " ".join(retrieved_passages)  # Combine passages

    # Preprocess input text for BERT QA model
    inputs = qa_tokenizer(question, context, return_tensors="pt", truncation=True, padding=True).to(device)

    # Get the start and end positions for the answer
    with torch.no_grad():
        outputs = qa_model(**inputs)

    # Extract the answer from the model's output
    start_idx = torch.argmax(outputs.start_logits)
    end_idx = torch.argmax(outputs.end_logits)

    # Decode the answer
    answer = qa_tokenizer.convert_tokens_to_string(
        qa_tokenizer.convert_ids_to_tokens(inputs.input_ids[0][start_idx:end_idx+1])
    )

    return answer, {"question": question, "answer": answer, "retrieved_passages": retrieved_passages}

Some weights of the model checkpoint at bert-large-uncased-whole-word-masking-finetuned-squad were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
# Example usage
question = "What is the capital of France?"
question = "I have nosebleed and fever. What is my disease ?"
answer, info = retrieve_and_generate_bert(question)
print(f"Answer: {answer}")
print(f"Retrieved Passages: {info['retrieved_passages']}")

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/443 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-large-uncased-whole-word-masking-finetuned-squad were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Answer: dengue fever
Retrieved Passages: ['If you have fever and nosebleed then you have dengue fever', 'The capital of France is Paris.']


In [None]:
answer, info = retrieve_and_generate_bert("what is france known for?")
answer, info

('its cuisine, fashion, and art',
 {'question': 'what is france known for?',
  'answer': 'its cuisine, fashion, and art',
  'retrieved_passages': ['France is known for its cuisine, fashion, and art.',
   'The capital of France is Paris.']})

## Evaluation

In [13]:
# randomize the data
test_df_ = test_df.sample(frac=1, random_state=42).reset_index(drop=True)
# test_df_unique = test_df.drop_duplicates(subset='Expected Answer', keep='first')
# len(test_df_unique)

In [None]:
rough, bs_avg, bs, all_results = evaluate_rag_model2(test_df_.iloc[0:50], retrieve_and_generate_decoder)
display(rough)
display(bs_avg)

Evaluating: 100%|██████████| 50/50 [01:11<00:00,  1.44s/it]


{'rouge1': 0.08033525853808088,
 'rouge2': 0.03827190014059329,
 'rougeL': 0.07886812502420418,
 'rougeLsum': 0.07812743678967579}

{'precision': 0.7681157910823821,
 'recall': 0.8902929902076722,
 'f1': 0.823948609828949}

In [None]:
all_results

[{'query': 'I have symptoms such as bladder discomfort, continuous feel of urine, foul smell of urine. What the name of the disease?',
  'expected_answer': 'Urinary tract infection',
  'generated_answer': ': Urinary tract infection : An infection of the kidney, ureter, bladder, or uréthra. Abbreviated UTI. Not everyone with a uTI has symptoms, but common symptoms include an urge to urinate and pain or burning when urinating. The symptoms of Urineritis are bladder discomfort, burning micturition, continuous feel of urine, foul smell of urinary urine.',
  'result': {'question': 'I have symptoms such as bladder discomfort, continuous feel of urine, foul smell of urine. What the name of the disease?',
   'answer': ': Urinary tract infection : An infection of the kidney, ureter, bladder, or uréthra. Abbreviated UTI. Not everyone with a uTI has symptoms, but common symptoms include an urge to urinate and pain or burning when urinating. The symptoms of Urineritis are bladder discomfort, burni

In [14]:
rough, bs_avg, bs, all_results = evaluate_rag_model2(test_df_.iloc[0:50], retrieve_and_generate_langchain_model(chat_model_qwen))
display(rough)
display(bs_avg)

Evaluating:  20%|██        | 10/50 [00:29<01:52,  2.81s/it]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
Evaluating: 100%|██████████| 50/50 [01:58<00:00,  2.38s/it]


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'rouge1': 0.48361981061335285,
 'rouge2': 0.1953907203907204,
 'rougeL': 0.48375398732415487,
 'rougeLsum': 0.4866835467728544}

{'precision': 0.8816007256507874,
 'recall': 0.9042042052745819,
 'f1': 0.8923298144340515}

In [16]:
all_results

[{'query': 'I have symptoms such as bladder discomfort, continuous feel of urine, foul smell of urine. What the name of the disease?',
  'expected_answer': 'Urinary tract infection',
  'generated_answer': 'Urinary tract infection',
  'result': None},
 {'query': 'I have symptoms such as headache, muscle pain, vomiting, back pain, red spots over body, pain behind the eyes, skin rash, joint pain, malaise, high fever, chills, loss of appetite, nausea. What the name of the disease?',
  'expected_answer': 'Dengue',
  'generated_answer': 'The disease you are experiencing with symptoms such as headache, muscle pain, vomiting, back pain, red spots over body, pain behind the eyes, skin rash, joint pain, malaise, high fever, chills, loss of appetite, nausea is Dengue.',
  'result': None},
 {'query': 'I have symptoms such as anxiety, blurred and distorted vision, drying and tingling lips, excessive hunger, headache, irritability, nausea, palpitations, slurred speech, sweating, vomiting. What the n

In [17]:
rough, bs_avg, bs, all_results = evaluate_rag_model2(test_df_.iloc[0:50], retrieve_and_generate_langchain_model(chat_model_llama))
display(rough)
display(bs_avg)

Evaluating: 100%|██████████| 50/50 [01:25<00:00,  1.70s/it]


{'rouge1': 0.15590117054956487,
 'rouge2': 0.059384192660811325,
 'rougeL': 0.15659487054501858,
 'rougeLsum': 0.1577425277230317}

{'precision': 0.8124380791187287,
 'recall': 0.8453594613075256,
 'f1': 0.8281362640857697}

In [18]:
all_results

[{'query': 'I have symptoms such as bladder discomfort, continuous feel of urine, foul smell of urine. What the name of the disease?',
  'expected_answer': 'Urinary tract infection',
  'generated_answer': 'The name of the disease you mentioned is a Urinary Tract Infection (UTI).',
  'result': None},
 {'query': 'I have symptoms such as headache, muscle pain, vomiting, back pain, red spots over body, pain behind the eyes, skin rash, joint pain, malaise, high fever, chills, loss of appetite, nausea. What the name of the disease?',
  'expected_answer': 'Dengue',
  'generated_answer': 'Based on the provided information, the disease is Dengue fever.',
  'result': None},
 {'query': 'I have symptoms such as anxiety, blurred and distorted vision, drying and tingling lips, excessive hunger, headache, irritability, nausea, palpitations, slurred speech, sweating, vomiting. What the name of the disease?',
  'expected_answer': 'Hypoglycemia',
  'generated_answer': 'Based on the symptoms provided, th

In [None]:
rough, bs_avg, bs, all_results = evaluate_rag_model2(test_df_.iloc[0:50], retrieve_and_generate_seq2seq)
display(rough)
display(bs_avg)

Evaluating: 100%|██████████| 50/50 [00:06<00:00,  7.93it/s]


{'rouge1': 0.7461016949152542,
 'rouge2': 0.18561403508771931,
 'rougeL': 0.75,
 'rougeLsum': 0.7533333333333333}

{'precision': 0.9521969032287597,
 'recall': 0.9520050120353699,
 'f1': 0.9515127944946289}

In [None]:
rough, bs_avg, bs, all_results = evaluate_rag_model2(test_df_.iloc[0:50], retrieve_and_generate_bert)
display(rough)
display(bs_avg)

Evaluating: 100%|██████████| 50/50 [00:06<00:00,  8.26it/s]


{'rouge1': 0.5259939435233552,
 'rouge2': 0.1630626509086288,
 'rougeL': 0.5238832866479926,
 'rougeLsum': 0.5297824161922835}

{'precision': 0.7990534424781799,
 'recall': 0.8371058690547943,
 'f1': 0.8162295854091645}

In [None]:
rough, bs_avg, bs, all_results = evaluate_rag_model2(test_df_, retrieve_and_generate_seq2seq)
display(rough)
display(bs_avg)

Evaluating: 100%|██████████| 690/690 [01:36<00:00,  7.12it/s]


{'rouge1': 0.6300703960489071,
 'rouge2': 0.1994036500981785,
 'rougeL': 0.628137416775586,
 'rougeLsum': 0.6276173554427846}

{'precision': 0.927040212655413,
 'recall': 0.9338875870773757,
 'f1': 0.9295392132323721}

In [None]:
all_results

Output hidden; open in https://colab.research.google.com to view.

In [None]:
rough, bs_avg, bs, all_results = evaluate_rag_model2(test_df_, retrieve_and_generate_bert)
display(rough)
display(bs_avg)

Evaluating: 100%|██████████| 690/690 [01:20<00:00,  8.57it/s]


{'rouge1': 0.5005146284520836,
 'rouge2': 0.1771116599898814,
 'rougeL': 0.4988852152891299,
 'rougeLsum': 0.49857099154015594}

{'precision': 0.8302528583485147,
 'recall': 0.8503726910853732,
 'f1': 0.8388205859108248}

In [None]:
all_results

Output hidden; open in https://colab.research.google.com to view.

In [None]:
acc, report, all_results = evaluate_rag_model(test_df_.iloc[0:50], retrieve_and_generate_decoder)
display(acc)
print(report)

Evaluating: 100%|██████████| 50/50 [02:30<00:00,  3.02s/it]


0.84

              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       1.00      0.84      0.91        50

    accuracy                           0.84        50
   macro avg       0.50      0.42      0.46        50
weighted avg       1.00      0.84      0.91        50



In [None]:
acc, report, all_results = evaluate_rag_model(test_df_.iloc[0:50], retrieve_and_generate_langchain_model(chat_model_qwen))
display(acc)
print(report)

Evaluating:  20%|██        | 10/50 [00:26<01:38,  2.47s/it]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
Evaluating: 100%|██████████| 50/50 [01:41<00:00,  2.03s/it]


0.46

              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       1.00      0.46      0.63        50

    accuracy                           0.46        50
   macro avg       0.50      0.23      0.32        50
weighted avg       1.00      0.46      0.63        50



In [None]:
all_results

[{'query': 'I have symptoms such as bladder discomfort, continuous feel of urine, foul smell of urine. What the name of the disease?',
  'expected_answer': 'Urinary tract infection',
  'generated_answer': 'The symptoms you described suggest the presence of a urinary tract infection (UTI). This condition can cause bladder discomfort, a continuous feeling of needing to urinate, and a foul smell in the urine. It is important to consult a healthcare professional for proper diagnosis and treatment.',
  'cosine_similarity': 0.2488808930656855,
  'result': True,
  'cosine_check': False,
  'expected_check': True},
 {'query': 'I have symptoms such as headache, muscle pain, vomiting, back pain, red spots over body, pain behind the eyes, skin rash, joint pain, malaise, high fever, chills, loss of appetite, nausea. What the name of the disease?',
  'expected_answer': 'Dengue',
  'generated_answer': 'The symptoms you described match those commonly associated with Dengue Fever. Dengue Fever is a vir

In [None]:
acc, report, all_results = evaluate_rag_model(test_df_.iloc[0:50], retrieve_and_generate_langchain_model(chat_model_llama))
display(acc)
print(report)

Evaluating:  20%|██        | 10/50 [00:26<02:46,  4.16s/it]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
Evaluating: 100%|██████████| 50/50 [02:01<00:00,  2.43s/it]


0.34

              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       1.00      0.34      0.51        50

    accuracy                           0.34        50
   macro avg       0.50      0.17      0.25        50
weighted avg       1.00      0.34      0.51        50



In [None]:
all_results

[{'query': 'I have symptoms such as bladder discomfort, continuous feel of urine, foul smell of urine. What the name of the disease?',
  'expected_answer': 'Urinary tract infection',
  'generated_answer': 'Based on the provided information, it appears that the symptoms you are describing could be related to a urinary tract infection (UTI), specifically a type of UTI known as a "painless burning sensation while urinating" and/or a "foul odor of urine". However, another possibility is that it could be an overactive bladder syndrome, which is characterized by a persistent and uncontrollable urge to urinate, often accompanied by discomfort and a strong urge to urinate.',
  'cosine_similarity': 0.19125751941708313,
  'result': True,
  'cosine_check': False,
  'expected_check': True},
 {'query': 'I have symptoms such as headache, muscle pain, vomiting, back pain, red spots over body, pain behind the eyes, skin rash, joint pain, malaise, high fever, chills, loss of appetite, nausea. What the 

In [None]:
acc, report, all_results = evaluate_rag_model(test_df_.iloc[0:50], retrieve_and_generate_seq2seq)
display(acc)
print(report)

Evaluating: 100%|██████████| 50/50 [00:11<00:00,  4.40it/s]


0.74

              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       1.00      0.74      0.85        50

    accuracy                           0.74        50
   macro avg       0.50      0.37      0.43        50
weighted avg       1.00      0.74      0.85        50



In [None]:
acc, report, all_results = evaluate_rag_model(test_df_.iloc[0:50], retrieve_and_generate_bert)
display(acc)
print(report)

Evaluating: 100%|██████████| 50/50 [00:06<00:00,  8.12it/s]


0.68

              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       1.00      0.68      0.81        50

    accuracy                           0.68        50
   macro avg       0.50      0.34      0.40        50
weighted avg       1.00      0.68      0.81        50



In [None]:
acc, report, all_results = evaluate_rag_model(test_df_, retrieve_and_generate_bert)
display(acc)
print(report)

Evaluating: 100%|██████████| 690/690 [01:24<00:00,  8.16it/s]


0.5826086956521739

              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       1.00      0.58      0.74       690

    accuracy                           0.58       690
   macro avg       0.50      0.29      0.37       690
weighted avg       1.00      0.58      0.74       690



In [None]:
acc, report, all_results = evaluate_rag_model(test_df_, retrieve_and_generate_seq2seq)
display(acc)
print(report)

Evaluating: 100%|██████████| 690/690 [01:28<00:00,  7.77it/s]


0.6130434782608696

              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       1.00      0.61      0.76       690

    accuracy                           0.61       690
   macro avg       0.50      0.31      0.38       690
weighted avg       1.00      0.61      0.76       690



In [None]:
all_results

[{'query': 'I have symptoms such as chills, diarrhea, headache, high fever, muscle pain, nausea, sweating. What the name of the disease?',
  'expected_answer': 'Malaria',
  'generated_answer': 'Dengue',
  'cosine_similarity': 0.0,
  'result': False,
  'cosine_check': False,
  'expected_check': False},
 {'query': 'I have symptoms such as depression, visual disturbances, indigestion, stiff neck, headache, excessive hunger, acidity. What the name of the disease?',
  'expected_answer': 'Migraine',
  'generated_answer': 'Migraine',
  'cosine_similarity': 1.0,
  'result': True,
  'cosine_check': True,
  'expected_check': True},
 {'query': 'I have symptoms such as prominent veins on calf, swollen legs, cramps, swollen blood vessels, obesity, fatigue. What the name of the disease?',
  'expected_answer': 'Varicose veins',
  'generated_answer': 'Varicose veins',
  'cosine_similarity': 1.0000000000000002,
  'result': True,
  'cosine_check': True,
  'expected_check': True},
 {'query': 'What is the

In [None]:
q = 'I have symptoms such as acidity, blurred and distorted vision, depression, excessive hunger, headache, indigestion, irritability, stiff neck, visual disturbances. What the name of the disease?'
display(retrieve_and_generate_seq2seq(q))
q = 'I have symptoms such chills, diarrhea, high fever, muscle pain, nausea, sweating, vomiting. What the name of the disease?'
retrieve_and_generate_seq2seq(q)

('Migraine',
 {'question': 'I have symptoms such as acidity, blurred and distorted vision, depression, excessive hunger, headache, indigestion, irritability, stiff neck, visual disturbances. What the name of the disease?',
  'answer': 'Migraine',
  'retrieved_passages': ["Migraine A migraine can cause severe throbbing pain or a pulsing sensation, usually on one side of the head. It's often accompanied by nausea, vomiting, and extreme sensitivity to light and sound. Migraine attacks can last for hours to days, and the pain can be so severe that it interferes with your daily activities. The symptoms of Migraine are acidity, blurred and distorted vision, depression, excessive hunger, headache, indigestion, irritability, stiff neck, visual disturbances. The other kind symptoms of Migraine are blurred and distorted vision, depression, excessive hunger, headache, indigestion, irritability, stiff neck, visual disturbances. The other kind symptoms of Migraine are acidity, blurred and distorted

('Typhoid',
 {'question': 'I have symptoms such chills, diarrhea, high fever, muscle pain, nausea, sweating, vomiting. What the name of the disease?',
  'answer': 'Typhoid',
  'retrieved_passages': ['Gastroenteritis is an inflammation of the digestive tract, particularly the stomach, and large and small intestines. Viral and bacterial gastroenteritis are intestinal infections associated with symptoms of diarrhea , abdominal cramps, nausea , and vomiting . The symptoms of Gastroenteritis are dehydration, diarrhoea, sunken eyes, vomiting. The other kind symptoms of Gastroenteritis are dehydration, diarrhoea, sunken eyes. The other kind symptoms of Gastroenteritis are dehydration, diarrhoea, vomiting. The other kind symptoms of Gastroenteritis are diarrhoea, sunken eyes, vomiting. The other kind symptoms of Gastroenteritis are dehydration, sunken eyes, vomiting.',
   'Typhoid An acute illness characterized by fever caused by infection with the bacterium Salmonella typhi. Typhoid fever has

In [None]:
retrieve_and_generate_seq2seq('What disease does the following symptoms correspond to: chills, diarrhea, high fever, muscle pain, nausea, sweating, vomiting?')

('Typhoid fever',
 {'question': 'What disease does the following symptoms correspond to: chills, diarrhea, high fever, muscle pain, nausea, sweating, vomiting?',
  'answer': 'Typhoid fever',
  'retrieved_passages': ['Gastroenteritis is an inflammation of the digestive tract, particularly the stomach, and large and small intestines. Viral and bacterial gastroenteritis are intestinal infections associated with symptoms of diarrhea , abdominal cramps, nausea , and vomiting . The symptoms of Gastroenteritis are dehydration, diarrhoea, sunken eyes, vomiting. The other kind symptoms of Gastroenteritis are dehydration, diarrhoea, sunken eyes. The other kind symptoms of Gastroenteritis are dehydration, diarrhoea, vomiting. The other kind symptoms of Gastroenteritis are diarrhoea, sunken eyes, vomiting. The other kind symptoms of Gastroenteritis are dehydration, sunken eyes, vomiting.',
   'Typhoid An acute illness characterized by fever caused by infection with the bacterium Salmonella typhi. 

In [None]:
acc, report, all_results = evaluate_rag_model(test_df.iloc[300:320], retrieve_and_generate_seq2seq)
display(acc)
print(report)

Evaluating: 100%|██████████| 20/20 [00:03<00:00,  5.74it/s]


0.85

              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       1.00      0.85      0.92        20

    accuracy                           0.85        20
   macro avg       0.50      0.42      0.46        20
weighted avg       1.00      0.85      0.92        20



In [None]:
acc, report, all_results = evaluate_rag_model(test_df_, retrieve_and_generate_seq2seq)
results_df = pd.DataFrame(all_results)
results_df.to_csv('data/results_1.csv', index=False)

display(acc)
print(report)

# tetepa rag dan pembagiannya

Evaluating: 100%|██████████| 690/690 [01:56<00:00,  5.90it/s]


0.5985507246376811

              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       1.00      0.60      0.75       690

    accuracy                           0.60       690
   macro avg       0.50      0.30      0.37       690
weighted avg       1.00      0.60      0.75       690

