# Baseline Framework

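This is the baseline framework: it builds a FAISS vector store over the patients' health records and runs a retrieval-augmented generation (RAG) chain on top of it. We then use it for the similarity evaluation below.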

In [None]:
!pip install -r '/content/requirements.txt' -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m974.0/974.0 kB[0m [31m9.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m309.4/309.4 kB[0m [31m28.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m227.1/227.1 kB[0m [31m21.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m35.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.6/8.6 MB[0m [31m33.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.0/27.0 MB[0m [31m15.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.5/85.5 MB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m18.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━

In [None]:
# !pip install -r '/content/MedRAG/requirements.txt' -q

In [None]:
#attach drive

from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
from langchain_community.document_loaders import TextLoader
from langchain_mistralai.chat_models import ChatMistralAI
from langchain_mistralai.embeddings import MistralAIEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate
from langchain.chains import create_retrieval_chain

from langchain.embeddings.huggingface import HuggingFaceEmbeddings

import os

# Prefer a key supplied through the MISTRAL_API_KEY environment variable so the
# secret never has to be pasted into (and saved with) the notebook. The original
# placeholder is kept as the fallback, so editing it in place still works.
api_key = os.environ.get("MISTRAL_API_KEY") or "YOUR_API_KEY"

# Google Drive locations used throughout the notebook (Drive must be mounted first).
drive_dataset_path = '/content/drive/MyDrive/dataset_folder/'
drive_faiss_path = '/content/drive/MyDrive/faiss_index_full'
drive_trad_model_path = '/content/drive/MyDrive/traditional_med_model/'

# /content/drive/MyDrive/dataset_folder/health_report_{0}/health_report_{0}.txt

# Load one sample health record (patient 13) from Drive.
# The braces are literal characters in the path: this is a plain string,
# not an f-string or a format template.
loader = TextLoader(drive_dataset_path+"health_report_{13}/health_report_{13}.txt")
docs = loader.load()


# Split the record into chunks (no chunk size/overlap is passed, so the
# splitter's defaults apply).
text_splitter = RecursiveCharacterTextSplitter()
documents = text_splitter.split_documents(docs)

# Define the embedding model.
# It can come from MistralAI (commented alternative below) or be a locally
# loaded sentence-transformer; the local one is what is actually used.
# embeddings = MistralAIEmbeddings(model="mistral-embed", mistral_api_key=api_key)
embeddings= HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

# Build a FAISS vector store from the chunks of this single record.
vector = FAISS.from_documents(documents, embeddings)
# Expose the store through the retriever interface expected by the chain below.
retriever = vector.as_retriever()

# Define the LLM. No model name is passed, so the client's default is used.
# NOTE(review): `open-mixtral-8x7b` below looks like a reminder of an
# alternative model name — confirm which model the results were produced with.
# open-mixtral-8x7b
model = ChatMistralAI(mistral_api_key=api_key)

# Prompt template for question answering over a single health record.
# `{context}` and `{input}` are the two template variables; the question is
# supplied under the "input" key when the chain is invoked.
prompt = ChatPromptTemplate.from_template("""
You are a helpful, respectful and honest medical bot. Always answer as
helpfully as possible, while being safe.

If a question does not make any sense, or is not factually coherent, explain
why instead of answering something not correct. If you don't know the answer
to a question, please don't share false information.

Answer the following question based only on the provided context:

<context>
{context}
</context>

Question: {input}""")

# Combine the LLM + prompt into a document chain, then put the retriever in
# front of it. `retrieval_chain` is the object invoked in the following cells.
document_chain = create_stuff_documents_chain(model, prompt)
retrieval_chain = create_retrieval_chain(retriever, document_chain)


In [None]:
#a sample response for the health record of the 13th patient
response = retrieval_chain.invoke({"input": "What is are the symptoms of the person in question?"})
print(response["answer"])

Based on the provided context, the symptoms of the person in question are:

1. Pain around the navel that has been present for two or three days.
2. The pain can be sharp like being pricked with a needle, but it goes away after a few seconds.
3. There is no vomiting or fainting.
4. There is a loss of appetite and bloating.
5. The pain is described as faint or mild.
6. After eating, there is a sudden urge to have a bowel movement, which results in loose stools.

The person in question has not taken any medication and has not undergone any medical examination. The pain is suspected to be related to gastrointestinal dysfunction, which could be caused by diet, mental factors, or the autoimmune system.


# Similarity Evaluation

Here we go through all files of type `/content/drive/MyDrive/dataset_folder/health_report_i/health_report_i.txt` then:
1. create vector stores for each using FAISS
2. generate their summary based on RAG
3. and store the original text plus generated summary in a data frame


In [None]:

import pandas as pd
import os

# data: the original text of every health record that was found on Drive
# answers: the RAG-generated summary of the same record (same order as `data`)
data = []
answers = []

# Highest report index (exclusive) that is probed; missing reports are skipped.
NUM_REPORT_SLOTS = 76

prompt = ChatPromptTemplate.from_template("""

Answer the following question based only on the provided context:

<context>
{context}
</context>

Question: {input}""")


# Define the embedding model
# embeddings = MistralAIEmbeddings(model="mistral-embed", mistral_api_key=api_key)
embeddings= HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

# Define LLM
model = ChatMistralAI(mistral_api_key=api_key)

# The document chain only depends on the model and the prompt, so it is built once.
document_chain = create_stuff_documents_chain(model, prompt)

# The splitter carries no per-file state, so one instance serves every record.
text_splitter = RecursiveCharacterTextSplitter()

# Now process each health record
for i in range(0, NUM_REPORT_SLOTS):
  # Folder and file names contain literal braces, e.g. health_report_{13}.
  report_name = 'health_report_{'+str(i)+'}'
  filename = drive_dataset_path+report_name+'/'+report_name+'.txt'
  if not os.path.exists(filename):
    # Some indices have no report on Drive; skip them silently.
    continue

  # Load the record and split it into chunks
  loader = TextLoader(filename)
  docs = loader.load()
  documents = text_splitter.split_documents(docs)

  # Each record gets its own vector store so retrieval never mixes patients
  vector = FAISS.from_documents(documents, embeddings)
  retriever = vector.as_retriever()

  # Create a retrieval chain to answer questions about this record only
  retrieval_chain = create_retrieval_chain(retriever, document_chain)

  with open(filename, 'r') as file:
    text = file.read()

  # The LLM is prompted to generate a summary each time
  response = retrieval_chain.invoke({"input": "Summarize what the text says"})

  # Append to both lists only once the summary exists, so `data` and `answers`
  # can never get out of step if the LLM call fails part-way through the loop.
  data.append(text)
  answers.append(response['answer'])

# Both the original text and its summary are stored in a data frame
df = pd.DataFrame({'text': data, 'summary': answers})

df.head()




Unnamed: 0,text,summary
0,"Hello, there is a pain around the navel, I don...","A female individual, 29 years old, has been ex..."
1,I am 42 years old. I started to be dizzy this ...,The text is a conversation between two people ...
2,"In the morning, the stools are normal and thin...",A 35-year-old male has been experiencing chang...
3,Female female abdomen is painful on the left s...,A 34-year-old female is experiencing pain in h...
4,"The left rib part is very painful, it is pancr...",A 51-year-old female is experiencing severe pa...


In [None]:
df.shape

(74, 2)

### Cosine Similarity
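For every record we embed the original text and its generated summary with the same embedding model and compute the cosine similarity between the two vectors (higher means the summary is closer to the source text).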

In [None]:
from sklearn.metrics.pairwise import cosine_similarity


def _text_summary_cosine(original_text, generated_summary):
  """Cosine similarity between the embeddings of one record and of its summary."""
  pair_matrix = cosine_similarity(
      [embeddings.embed_query(original_text)],
      [embeddings.embed_query(generated_summary)],
  )
  # The inputs are two single-row matrices, so the result is a 1x1 matrix.
  return pair_matrix[0][0]


# One score per (original text, generated summary) row of the data frame.
cosine_similarity_scores = [
    _text_summary_cosine(record_text, record_summary)
    for record_text, record_summary in zip(df['text'], df['summary'])
]


# Print the cosine similarity
# cosine_similarity_scores

In [None]:
#adding the cosine similarity scores to the df
df['cosine_similarity'] = cosine_similarity_scores
len(cosine_similarity_scores)

74

In [None]:
#looking at the mean
df['cosine_similarity'].mean()

0.7309454276913732

### Euclidean Distance
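For every record we embed the original text and its generated summary with the same embedding model and compute the Euclidean distance between the two vectors (lower means the summary is closer to the source text).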

In [None]:
from scipy.spatial import distance
# bringing in HF embeddings - need these to represent document chunks

# One Euclidean distance per (original text, generated summary) row:
# both texts are embedded with the shared `embeddings` model and compared.
euclidean_distances = [
    distance.euclidean(
        embeddings.embed_query(record_text),
        embeddings.embed_query(record_summary),
    )
    for record_text, record_summary in zip(df['text'], df['summary'])
]

# Sanity check: number of distances computed
len(euclidean_distances)

74

In [None]:
df['euclidean_distance'] = euclidean_distances

In [None]:
df['euclidean_distance'].median()

0.73351579525755

In [None]:
df.to_csv('health_report_evaluation.csv', index=False)

# LLM Evaluation

Evaluating the RAG pipeline on the medical question-answering benchmarks from the MIRAGE repository (cloned below), the benchmark suite used by MedRAG.

In [None]:
!git clone https://github.com/Teddy-XiongGZ/MIRAGE.git

Cloning into 'MIRAGE'...
remote: Enumerating objects: 30680, done.[K
remote: Counting objects: 100% (30680/30680), done.[K
remote: Compressing objects: 100% (29064/29064), done.[K
remote: Total 30680 (delta 1597), reused 30627 (delta 1575), pack-reused 0[K
Receiving objects: 100% (30680/30680), 21.25 MiB | 19.20 MiB/s, done.
Resolving deltas: 100% (1597/1597), done.
Updating files: 100% (30769/30769), done.


In [None]:
# Load the benchmark file from the MIRAGE repo (cloned in the previous cell);
# it holds the question sets used for the medical evaluation.
import json

# A context manager closes the file handle as soon as parsing is done,
# instead of leaving it open until garbage collection.
with open('/content/MIRAGE/benchmark.json') as benchmark_file:
  benchmark = json.load(benchmark_file)

### MMLU-Med

In [None]:
# Loading the questions, options, and answers from the MMLU-Med bencmark

import pandas as pd

# Only the per-question records are needed; the benchmark's keys are ignored.
mmlu_items = list(benchmark['mmlu'].values())

# Pull the three fields of interest out into parallel lists.
questions = [item['question'] for item in mmlu_items]
options = [item['options'] for item in mmlu_items]
answers = [item['answer'] for item in mmlu_items]

# One row per benchmark question: text, options dict, gold answer letter.
mmlu_df = pd.DataFrame({'question': questions, 'options': options, 'answer': answers})


In [None]:
mmlu_df.head()

Unnamed: 0,question,options,answer
0,A lesion causing compression of the facial ner...,"{'A': 'paralysis of the facial muscles.', 'B':...",A
1,"A ""dished face"" profile is often associated with",{'A': 'a protruding mandible due to reactivati...,B
2,Which of the following best describes the stru...,"{'A': 'Bladder', 'B': 'Kidney', 'C': 'Ureter',...",A
3,Which of the following structures is derived f...,"{'A': 'Motor neurons', 'B': 'Skeletal muscles'...",C
4,Which of the following describes the cluster o...,"{'A': 'Afferent arteriole', 'B': 'Glomerulus',...",B


In [None]:
#attach drive

from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
# loading in the vector store for the textbooks from the drive, it was the only one less than 15GB to be stored in the drive
vector_db = FAISS.load_local('/content/drive/MyDrive/faiss_index_full', embeddings, allow_dangerous_deserialization=True)

In [None]:
# Retriever over the textbook vector store loaded from Drive in the previous cell.
retriever = vector_db.as_retriever()

# Prompt template for the MMLU-Med benchmark.
# The model is asked to answer in JSON with two keys, "step_by_step_thinking"
# and "answer_choice"; the answer letter is later parsed out of that text.
prompt = ChatPromptTemplate.from_template("""
You are a helpful medical expert, and your task is to answer a multi-choice medical question using the relevant documents.
Please first think step-by-step and then choose the answer from the provided options.
Organize your output in a json formatted as python Dict("step_by_step_thinking": Str(explanation), "answer_choice": Str(A/B/C/...)).
Your responses will be used for research purposes only, so please have a definite answer.

<context>
{context}
</context>

Please think step-by-step and generate your output in json:

The input will be a json file:
question: "question", options: 'A':.., 'B':.., 'C':.., 'D':...

Question: {input}""")

# Rebuild the chain so it uses the textbook retriever and the MMLU prompt.
# This rebinds `document_chain` and `retrieval_chain` from the earlier cells.
document_chain = create_stuff_documents_chain(model, prompt)
retrieval_chain = create_retrieval_chain(retriever, document_chain)

In [None]:
#viewing a sample question:options pair and the corresponding correct answer
question = {'question': mmlu_df['question'][0], 'options': mmlu_df['options'][0]}

print(question)

mmlu_df['answer'][0]

{'question': 'A lesion causing compression of the facial nerve at the stylomastoid foramen will cause ipsilateral', 'options': {'A': 'paralysis of the facial muscles.', 'B': 'paralysis of the facial muscles and loss of taste.', 'C': 'paralysis of the facial muscles, loss of taste and lacrimation.', 'D': 'paralysis of the facial muscles, loss of taste, lacrimation and decreased salivation.'}}


'A'

In [None]:
# Generate a sample response for the first MMLU-Med question.
# The chain has to be invoked here: otherwise `response` is whatever an earlier
# cell left behind (on a fresh top-to-bottom run, the last health-record
# summary), and the printed text would not belong to this question.
# The input is stringified the same way the full evaluation loop does it.
response = retrieval_chain.invoke({"input": str(question)})
print(response["answer"])

{
"step_by_step_thinking": "A lesion at the stylomastoid foramen would affect only the motor fibers of the facial nerve (red lines in Figure 455-2). This would cause paralysis of the facial muscles (option A) but not loss of taste, lacrimation, or decreased salivation (options B, C, D) as these are controlled by parasympathetic, visceral afferent, and special visceral efferent fibers which are not affected at this site.",
"answer_choice": "A"
}


In [None]:
#now iterating through every question:options pair in the MMLU-Med benchmark and storing the generated answers

import time

# Attempts per question before the error is allowed to propagate. A single
# transient network timeout otherwise aborts the whole (long) evaluation run.
MAX_ATTEMPTS = 3

model_answers = []

i = 0

for question, options in zip(mmlu_df['question'], mmlu_df['options']):
  # The chain takes a single string, so question + options are stringified together.
  test = {'question': question, 'options': options}

  for attempt in range(1, MAX_ATTEMPTS + 1):
    try:
      response = retrieval_chain.invoke({"input": str(test)})
      break
    except Exception as exc:
      # Never swallow a persistent failure: the last attempt re-raises, so the
      # behaviour on a real outage is the same as before (the cell stops).
      if attempt == MAX_ATTEMPTS:
        raise
      print(f"question {i + 1}: attempt {attempt} failed ({exc!r}); retrying")
      time.sleep(2 ** attempt)  # short exponential back-off: 2s, 4s

  model_answers.append(response["answer"])
  i += 1
  # Lightweight progress indicator
  if i%100 == 0:
    print(i)


# answer = json.loads(response["answer"])


100
200
300
400
500
600
700
800
900
1000


ReadTimeout: The read operation timed out

In [None]:
len(model_answers)

1061

In [None]:
#creating a df to store the model answers
df_model_ans = pd.DataFrame(model_answers, columns = ['model_answer'])
df_model_ans['model_answer'][0]

'{\n"step_by_step_thinking": "A lesion at the stylomastoid foramen affects only the motor fibers of the facial nerve (red line in Figure 455-2). It does not involve the chorda tympani, which carries taste fibers (green lines in Figure 455-2). Therefore, the lesion will not cause loss of taste. Also, the lesion is proximal to the parasympathetic fibers for lacrimation and salivation, so it will not affect these functions. The correct answer is option A: paralysis of the facial muscles.",\n"answer_choice": "A"\n}'

In [None]:
#saving the answers for MMLU in csv
df_model_ans.to_csv('mmlu_evaluation.csv', index=False)

In [None]:
#loading from csv
df_mmlu_test = pd.read_csv('/content/mmlu_evaluation.csv')

df_mmlu_test

Unnamed: 0,model_answer
0,"{\n""step_by_step_thinking"": ""A lesion at the s..."
1,"{\n""step_by_step_thinking"": ""The 'dished face'..."
2,"{\n""step_by_step_thinking"": ""The structure tha..."
3,"{\n""step_by_step_thinking"": ""The options provi..."
4,"{\n""step_by_step_thinking"": ""The cluster of bl..."
...,...
1056,"{\n""step_by_step_thinking"": ""The patient is a ..."
1057,"{\n""step_by_step_thinking"": ""The patient has s..."
1058,"{\n""step_by_step_thinking"": ""The patient's sym..."
1059,"{\n""step_by_step_thinking"": ""The patient is a ..."


In [None]:
#parsing through a text and looking for "answer_choice": then storing the character after it after a space

def extract_answer_choice(text):
    """
    Extracts the answer letter that follows the "answer_choice" key in a model response.

    The key must appear double-quoted, as in the JSON the prompt asks for. The
    value may or may not be quoted and any amount of whitespace (including line
    breaks) may surround the colon, e.g. `"answer_choice": "A"`,
    `"answer_choice":"A"` or `"answer_choice": A`.

    Args:
        text: The raw model response. Non-string values (for example the NaN
            pandas produces for an empty CSV cell) are treated as "not found".

    Returns:
        The first letter of the answer value, or None if the key is missing or
        is not followed by a letter.
    """
    # Local import keeps the function usable even if the cell that imports
    # `re` at notebook level has not been run yet.
    import re

    if not isinstance(text, str):
        return None

    # Match the key, the colon, an optional opening quote, then capture one letter.
    # A fixed character offset would break on compact or truncated JSON.
    match = re.search(r'"answer_choice"\s*:\s*["\']?\s*([A-Za-z])', text)

    return match.group(1) if match else None

# Apply the function to each row in the 'model_answer' column
answer_choice = df_mmlu_test['model_answer'].apply(extract_answer_choice)

# Print the 'answer_choice' column
answer_choice


0       A
1       B
2       B
3       C
4       B
       ..
1056    C
1057    D
1058    D
1059    A
1060    A
Name: model_answer, Length: 1061, dtype: object

In [None]:
# Accuracy of the generated MMLU-Med answers.
# Only responses from which a valid option letter could be extracted are
# scored, so `total` can be smaller than the number of generated answers.
# `correct` is used instead of `sum` so the builtin sum() is not shadowed for
# the rest of the session.
correct = 0
total = 0
# zip() stops at the shorter input, so gold answers without a generated
# response (e.g. after an interrupted run) are ignored without a hard-coded slice.
for gold, predicted in zip(mmlu_df['answer'], answer_choice):
  if predicted in ['A', 'B', 'C','D']:
    total += 1
    if gold == predicted:
      correct += 1

# Guard against a division by zero when no response could be parsed.
accuracy = correct/total if total else float('nan')

print(f"correct guesses {correct} out of {total}: Accuaray = {accuracy}")

correct guesses 804 out of 1032: Accuaray = 0.7790697674418605


### BIO-ASQ

In [None]:
#attach drive

from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
# Loading the questions, options, and answers from the Bio-ASQ benchmark

# Only the per-question records are needed; the benchmark's keys are ignored.
bioasq_items = list(benchmark['bioasq'].values())

# Pull the three fields of interest out into parallel lists.
questions = [item['question'] for item in bioasq_items]
options = [item['options'] for item in bioasq_items]
answers = [item['answer'] for item in bioasq_items]

# One row per benchmark question: text, options dict, gold answer letter.
bioasq_df = pd.DataFrame({'question': questions, 'options': options, 'answer': answers})

In [None]:
bioasq_df.head()

Unnamed: 0,question,options,answer
0,Can losartan reduce brain atrophy in Alzheimer...,"{'A': 'yes', 'B': 'no'}",B
1,Is PRP-40 regulation of microexons a conserved...,"{'A': 'yes', 'B': 'no'}",A
2,Is casimersen effective for the treatment of D...,"{'A': 'yes', 'B': 'no'}",A
3,Is medical hydrology the same as Spa therapy?,"{'A': 'yes', 'B': 'no'}",B
4,Is eteplirsen effective for the treatment of D...,"{'A': 'yes', 'B': 'no'}",A


In [None]:
#Looking at a sample question:options pair and the corresponding answer for BIO-ASQ

question = {'question': bioasq_df['question'][0], 'options': bioasq_df['options'][0]}

print(question)

bioasq_df['answer'][0]

{'question': "Can losartan reduce brain atrophy in Alzheimer's disease?", 'options': {'A': 'yes', 'B': 'no'}}


'B'

In [None]:
# Load the medical-textbook vector store from Drive and expose it as a retriever.
# `allow_dangerous_deserialization=True` is passed, so the index must only ever
# be loaded from a trusted location.
# NOTE(review): the same index path is already loaded into `vector_db` in the
# MMLU-Med section; this reload looks redundant unless the runtime was
# restarted in between — confirm.
vector_db = FAISS.load_local('/content/drive/MyDrive/faiss_index_full', embeddings, allow_dangerous_deserialization=True)
retriever = vector_db.as_retriever()

# Prompt template for the Bio-ASQ benchmark (yes/no questions).
# The expected output is JSON with the keys "step_by_step_thinking" and
# "answer_choice", where the choice is the letter A or B.
prompt = ChatPromptTemplate.from_template("""
You are a helpful medical expert, and your task is to answer a Yes & No medical question using knowledge and step_by_step_thinking.
Please first think step-by-step and then choose the answer from the provided options.
You can look at the relevant documents as well to see if they provide any insight.

Organize your output in a json formatted as python
Dict("step_by_step_thinking": Str(explanation), "answer_choice": Str(A/B)).

Your responses will be used for research purposes only, so please have a definite answer.

<context>
{context}
</context>

Please think step-by-step and generate your output in json:

The input will be in a json format:
'question': "question", 'options': 'A': 'yes', 'B': 'no'

Question: {input}""")

# Rebuild the chain so it uses the Bio-ASQ prompt; this rebinds
# `document_chain` and `retrieval_chain` from the earlier sections.
document_chain = create_stuff_documents_chain(model, prompt)
retrieval_chain = create_retrieval_chain(retriever, document_chain)

In [None]:
#iterating over the question:options pair and generating reponses for each the storing in a variable `model_answers_bioasq`

import time

# Attempts per question before the error is allowed to propagate. A single
# transient network timeout otherwise aborts the whole (long) evaluation run.
MAX_ATTEMPTS = 3

model_answers_bioasq = []

i = 0

# (Re)create the retrieval chain from the current model, prompt and retriever,
# so this cell does not depend on which chain an earlier cell left behind.
document_chain = create_stuff_documents_chain(model, prompt)
retrieval_chain = create_retrieval_chain(retriever, document_chain)

for question, options in zip(bioasq_df['question'], bioasq_df['options']):
  # The chain takes a single string, so question + options are stringified together.
  test = {'question': question, 'options': options}

  for attempt in range(1, MAX_ATTEMPTS + 1):
    try:
      response = retrieval_chain.invoke({"input": str(test)})
      break
    except Exception as exc:
      # Never swallow a persistent failure: the last attempt re-raises, so the
      # behaviour on a real outage is the same as before (the cell stops).
      if attempt == MAX_ATTEMPTS:
        raise
      print(f"question {i + 1}: attempt {attempt} failed ({exc!r}); retrying")
      time.sleep(2 ** attempt)  # short exponential back-off: 2s, 4s

  model_answers_bioasq.append(response["answer"])
  i += 1
  # Lightweight progress indicator
  if i%100 == 0:
    print(i)


100
200
300
400
500
600


In [None]:
#making a df of the bio-asq answers
df_bioasq_model_ans = pd.DataFrame(model_answers_bioasq, columns = ['model_answer_bioasq'])


#and then extracting the Answers generated by the LLM
answer_choices_bioasq = df_bioasq_model_ans['model_answer_bioasq'].apply(extract_answer_choice)

In [None]:
# Accuracy of the generated Bio-ASQ answers.
# It is lower than for MMLU-Med because only the medical textbooks were indexed
# for this demo; using the entire MedRAG corpus should yield a better result.
# Only responses from which a valid option letter could be extracted are
# scored, so `total` can be smaller than the number of questions.
# `correct` is used instead of `sum` so the builtin sum() is not shadowed for
# the rest of the session.
correct = 0
total = 0
for gold, predicted in zip(bioasq_df['answer'], answer_choices_bioasq):
  if predicted in ['A', 'B']:
    total += 1
    # Containment (not equality) is kept from the original scoring rule.
    if predicted in gold:
      correct += 1

# Guard against a division by zero when no response could be parsed.
accuracy = correct/total if total else float('nan')

print(f"correct guesses {correct} out of {total}: Accuaray = {accuracy}")

correct guesses 394 out of 584: Accuaray = 0.6746575342465754


In [None]:
#saving the model answers for Bio-ASQ as csv
df_bioasq_model_ans.to_csv("bioasq_eval_2.csv", index = False)

# Adding a Traditional Model

The cells below are early drafts for combining the framework with a traditional (non-LLM) classifier; they are intended to be integrated into the framework in a future iteration.

In [None]:
# Draft prompt that asks the LLM to return the patient's symptoms as a Python
# list literal (single-quoted strings) and nothing else.
prompt1 = ChatPromptTemplate.from_template("""
If you don't know the answer to a question, please don't share false information.


Answer the following question by only giving a python list with the symtoms of the person
based only on the provided context, don't give text based asnwers only a python list:

mention all possible symptoms but if certain symptoms are not present don't mention them in the list.

and use python characters don't use any invalid characters, and always use single quotes for python list.

<context>
{context}
</context>

Question: {input}""")

# Create a retrieval chain to answer questions.
# NOTE(review): `retriever` is not redefined in this cell, so it is whichever
# retriever an earlier cell left behind — on a top-to-bottom run that is the
# textbook index, not a per-patient store. Confirm this is intended.
document_chain = create_stuff_documents_chain(model, prompt1)
retrieval_chain = create_retrieval_chain(retriever, document_chain)


# Rebinds `answers`; the summaries were already copied into df['summary'].
answers = []

# Each patient's full record text is sent as the "question".
for question in df['text']:
  # Prints the raw record text — clear this output before sharing the notebook.
  print(question)
  response = retrieval_chain.invoke({"input": question})
  answers.append(response['answer'])

In [None]:
# Second draft of the symptom-extraction prompt. It differs from the previous
# draft only in the instruction about symptoms the patient explicitly denies.
prompt1 = ChatPromptTemplate.from_template("""
If you don't know the answer to a question, please don't share false information.


Answer the following question by only giving a python list with the symtoms of the person
based only on the provided context, don't give text based asnwers only a python list:

mention all possible symptoms but if the patient says they don't have a certain symptom don't mention it in the list.

and use python characters don't use any invalid characters, and always use single quotes for python list.

<context>
{context}
</context>

Question: {input}""")

# Create a retrieval chain to answer questions.
# NOTE(review): `retriever` is not redefined in this cell, so the context comes
# from whichever retriever an earlier cell left behind — confirm it points at
# the intended patient record.
document_chain = create_stuff_documents_chain(model, prompt1)
retrieval_chain = create_retrieval_chain(retriever, document_chain)
# Single sample query; `response` is parsed by the list-extraction cells below.
response = retrieval_chain.invoke({"input": "List the symptoms of person in question?"})
print(response["answer"])

Based on the provided context, the symptoms of the person can be listed as follows:

['suddenly vomiting blood', 'blood and blood clots in mouth upon waking up for two consecutive days', 'gum bleeding (unsure)', 'hypertension', 'teeth are a bit swollen', 'both legs were soft and panting', 'color of stool not specified']

However, it is important to note that the person's symptoms may be indicative of a more serious underlying condition, and it is recommended that they seek medical attention immediately.


In [None]:
# prompt: load .pkl file  as minmax_scaler

import pickle

# Load the pickled scaler object from Drive.
# Unpickling can execute arbitrary code, so only load files you created yourself.
# NOTE(review): `minmax_scaler` is not referenced anywhere else in the visible
# part of the notebook — confirm it is still needed.
with open('/content/drive/MyDrive/minmax_scaler.pkl', 'rb') as file:
  minmax_scaler = pickle.load(file)


In [None]:
# prompt: how to extract python list from an unstructed text

import re

def extract_python_list(text):
  """
  Extracts the first Python-style list found in an unstructured text.

  The bracketed span is parsed as a Python literal when possible, so quoted
  elements may contain commas and either quote style. If that fails (for
  example unquoted items or an unescaped apostrophe), the contents are split
  on commas as a best-effort fallback.

  Args:
    text: The text containing the Python list.

  Returns:
    A list of stripped strings (empty for `[]`), or None if `text` is not a
    string or contains no bracketed list.
  """
  # Local import so the function stays self-contained.
  import ast

  if not isinstance(text, str):
    return None

  # DOTALL lets the (non-greedy) match span line breaks: LLMs frequently
  # pretty-print a list with one element per line.
  match = re.search(r'\[(.*?)\]', text, re.DOTALL)
  if not match:
    return None

  list_contents = match.group(1)

  # "[]" is a valid, empty answer — not a list holding one empty string.
  if not list_contents.strip():
    return []

  # Preferred path: let Python parse the literal. literal_eval only accepts
  # literals, so nothing in the text is ever executed.
  try:
    parsed = ast.literal_eval(match.group(0))
  except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
    parsed = None

  if isinstance(parsed, list):
    return [str(element).strip() for element in parsed]

  # Fallback: split on commas, then drop surrounding whitespace and quotes.
  elements = [
      element.strip().strip("'\"").strip()
      for element in list_contents.split(',')
  ]

  # A trailing comma would otherwise leave an empty element behind.
  return [element for element in elements if element]




['pain around the navel', 'pain for 2-3 days', 'intermittent pain', 'no medication or examination', 'normal stool', 'no nausea or vomiting', 'loss of appetite', 'bloating', 'felt like a needle was tied', 'possible mild diarrhea']


In [None]:
# Example usage:
text = response["answer"]
python_list = extract_python_list(text)
print(python_list)

['granules like a bean in the anus', 'painful', 'hard lump(s) near anus']


In [None]:
import json
import pickle
import pandas as pd

# Load the artifacts of the traditional model from the Drive folder.
# Unpickling can execute arbitrary code, so only load files you created yourself.

# Symptom vocabulary; used below as the reference list the extracted symptoms
# are matched against.
with open(drive_trad_model_path+"symptom_list.json", "r") as f:
  symptom_list = json.load(f)

# Pickled classifier (used below through `.predict`).
with open(drive_trad_model_path+"rfc_model.sav", "rb") as f:
  rfc_model = pickle.load(f)

# Pickled label encoder (used below through `.inverse_transform`).
with open(drive_trad_model_path+"label_encoder.sav", "rb") as f:
  label_encoder = pickle.load(f)

# Table that is filtered by its 'Disease' column to look up precautions.
df_precautions = pd.read_csv(drive_trad_model_path+"symptom_precaution.csv")

In [None]:
# prompt: how to iterate through and match a list of str in terms of cosine similarity to another list of strings

from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def match_strings_cosine_similarity(list1, list2, threshold=0.6, embed_fn=None):
  """
  Flags which strings in `list2` are semantically matched by any string in `list1`.

  Every string is embedded, the cosine similarity of every (list1, list2) pair
  is computed, and an entry of `list2` is flagged when at least one string of
  `list1` is more similar to it than `threshold`.

  Args:
      list1 (list): The strings to look for (e.g. symptoms extracted by the LLM).
          None or an empty list means "nothing matches".
      list2 (list): The reference strings (e.g. the model's symptom vocabulary).
      threshold (float): Similarity that must be strictly exceeded for a match.
      embed_fn (callable): Maps one string to its embedding vector. Defaults to
          the notebook-level `embeddings.embed_query`.

  Returns:
      list: One int per element of `list2`, in order: 1 if matched, else 0.
  """
  if not list2:
    return []
  if not list1:
    return [0]*len(list2)

  if embed_fn is None:
    # Resolved at call time so the notebook-level model is picked up.
    embed_fn = embeddings.embed_query

  vectors1 = np.asarray([embed_fn(text) for text in list1], dtype=float)
  vectors2 = np.asarray([embed_fn(text) for text in list2], dtype=float)

  def _unit_rows(matrix):
    # Scale every row to unit length; all-zero rows are left untouched so they
    # yield a similarity of 0 instead of a division by zero.
    norms = np.linalg.norm(matrix, axis=1, keepdims=True)
    norms[norms == 0] = 1.0
    return matrix/norms

  # Cosine similarity of unit vectors is their dot product, so one matrix
  # product replaces len(list1) * len(list2) separate pairwise calls.
  similarity = _unit_rows(vectors1) @ _unit_rows(vectors2).T

  # A list2 entry (column) is matched if any list1 entry (row) exceeds the threshold.
  matched = (similarity > threshold).any(axis=0)

  return [int(flag) for flag in matched]

# Example usage
list1 = ['fever', 'cough', 'shortness of breath']
list2 = ['fever', 'coughing', 'body aches']

similarity_scores = match_strings_cosine_similarity(list1, list2)

print(similarity_scores)



[1, 1, 0]


In [None]:
similarity_scores = match_strings_cosine_similarity(python_list, symptom_list)


print(similarity_scores)

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [None]:
len(similarity_scores)

132

In [None]:
import warnings
# NOTE(review): this silences every warning for the rest of the session, not
# only those raised by the prediction below — consider narrowing the filter.
warnings.filterwarnings("ignore")

# The 0/1 symptom flags are reshaped into a single-row 2-D array (one sample).
prediction = rfc_model.predict(np.array(similarity_scores).reshape(1, -1))

# Map the encoded prediction back through the label encoder.
prediction = label_encoder.inverse_transform(prediction)

# Display the predicted label for the single sample.
prediction[0]

'Arthritis'

In [None]:
test = df_precautions.loc[df_precautions['Disease'] == prediction[0]]
print(test.values[0])

['Arthritis' 'exercise' 'use hot and cold therapy' 'try acupuncture'
 'massage']


In [None]:


def get_diagnosis_and_precautions(diagnosis):



['Typhoid' 'eat high calorie vegitables' 'antiboitic therapy'
 'consult doctor' 'medication']


In [None]:
# prompt: how to save RandomForestClassifier Model?

# Import necessary libraries
import pickle

# Load the trained RandomForestClassifier model.
# It is bound to its own name so that `model` keeps referring to the
# ChatMistralAI LLM used by the retrieval chains earlier in the notebook.
# Unpickling can execute arbitrary code, so only load files you created yourself.
with open("/content/model.sav", "rb") as model_file:
  loaded_rf_model = pickle.load(model_file)

# Save the model to a new file; leaving the `with` block closes the handle,
# which guarantees the bytes are flushed to disk.
with open("/content/new_model.sav", "wb") as model_file:
  pickle.dump(loaded_rf_model, model_file)


In [None]:
# prompt: how to get back value from Label encoder

from sklearn.preprocessing import LabelEncoder

# Create a LabelEncoder object
le = LabelEncoder()

# Fit the encoder to the data
le.fit(['apple', 'banana', 'cherry'])

# Get the encoded values
encoded_values = le.transform(['apple', 'banana', 'cherry'])

# Get the decoded values
decoded_values = le.inverse_transform(encoded_values)

# Print the encoded and decoded values
print(encoded_values)
print(decoded_values)


[0 1 2]
['apple' 'banana' 'cherry']


In [None]:
# prompt: how to save a Label encoder

# Import necessary libraries
import pickle

# Load the trained LabelEncoder; the context manager closes the file handle
# as soon as loading is done.
# Unpickling can execute arbitrary code, so only load files you created yourself.
with open("/content/label_encoder.sav", "rb") as encoder_file:
  le = pickle.load(encoder_file)

# Save the encoder to a new file; leaving the `with` block closes the handle,
# which guarantees the bytes are flushed to disk.
with open("/content/new_label_encoder.sav", "wb") as encoder_file:
  pickle.dump(le, encoder_file)


In [None]:
# prompt: how to load from json

import json

# Load the symptom vocabulary from the local runtime copy.
# NOTE(review): this rebinds `symptom_list`, which an earlier cell loads from
# the Drive folder — confirm both files hold the same list.
with open('/content/symptom_list.json', 'r') as f:
  symptom_list = json.load(f)

# Prints the whole list; fine for a one-off check, noisy in a shared notebook.
print(symptom_list)


['itching', 'skin_rash', 'nodal_skin_eruptions', 'continuous_sneezing', 'shivering', 'chills', 'joint_pain', 'stomach_pain', 'acidity', 'ulcers_on_tongue', 'muscle_wasting', 'vomiting', 'burning_micturition', 'spotting_ urination', 'fatigue', 'weight_gain', 'anxiety', 'cold_hands_and_feets', 'mood_swings', 'weight_loss', 'restlessness', 'lethargy', 'patches_in_throat', 'irregular_sugar_level', 'cough', 'high_fever', 'sunken_eyes', 'breathlessness', 'sweating', 'dehydration', 'indigestion', 'headache', 'yellowish_skin', 'dark_urine', 'nausea', 'loss_of_appetite', 'pain_behind_the_eyes', 'back_pain', 'constipation', 'abdominal_pain', 'diarrhoea', 'mild_fever', 'yellow_urine', 'yellowing_of_eyes', 'acute_liver_failure', 'fluid_overload', 'swelling_of_stomach', 'swelled_lymph_nodes', 'malaise', 'blurred_and_distorted_vision', 'phlegm', 'throat_irritation', 'redness_of_eyes', 'sinus_pressure', 'runny_nose', 'congestion', 'chest_pain', 'weakness_in_limbs', 'fast_heart_rate', 'pain_during_bow