In [18]:
#Fetching necessary libraries
import requests
import pandas as pd
import xml.etree.ElementTree as ET
from ydata_profiling import ProfileReport

In [6]:
# List of URLs to request data from
urls = [
    'https://medlineplus.gov/xml/generalhealthdefinitions.xml',
    'https://medlineplus.gov/xml/fitnessdefinitions.xml',
    'https://medlineplus.gov/xml/mineralsdefinitions.xml',
    'https://medlineplus.gov/xml/vitaminsdefinitions.xml',
    'https://medlineplus.gov/xml/nutritiondefinitions.xml'
]

In [7]:
#requesting data from a URLs
def request_data(url):
    try:
        response = requests.get(url)
        response.raise_for_status()  # Check for HTTP errors
        return response.text  # Return the content of the response
    except requests.exceptions.RequestException as e:
        print(f"Error requesting data from {url}: {e}")
        return None

In [8]:
#parsing xml data and converting it to dictionary
def parse_data(xml):
    root = ET.fromstring(xml)
    data = []

    for element in root.findall('term-group'):
        row = {}
        for child in element:
            row[child.tag] = child.text
        data.append(row)
    return data

In [9]:
# Requesting data from all URLs and storing
data_list = [request_data(url) for url in urls]

In [10]:
#Acummulating all xml data into a single list
alldata = []
for data in data_list:
    if data:
        parsed_data = parse_data(data)
        alldata.extend(parsed_data)

In [14]:
df.to_csv('Medicalterms Meaning Data.csv', index = False)

In [11]:
#converting to dataframe
df = pd.DataFrame(alldata)

In [15]:
df

Unnamed: 0,term,definition
0,>Basal Body Temperature,>Basal body temperature is your temperature at...
1,>Blood Alcohol Content,">Blood alcohol content, or blood alcohol conce..."
2,>Blood Pressure,>Blood pressure is the force of blood pushing ...
3,>Blood Type,">There are four major blood types: A, B, O, an..."
4,>Body Mass Index,>Body Mass Index (BMI) is an estimate of your ...
...,...,...
92,>Sugar,>Sugars are a type of simple carbohydrate. Th...
93,>Total Fat,>Fat is a type of nutrient. You need a certain...
94,>Trans Fat,>Trans fat is a type of fat that is created wh...
95,>Triglycerides,>Triglycerides are a type of fat found in your...


## cleaning terms dataset

In [17]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 97 entries, 0 to 96
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   term        97 non-null     object
 1   definition  97 non-null     object
dtypes: object(2)
memory usage: 1.6+ KB


In [19]:
#generating pandas profiling report
profile = ProfileReport(df, title = 'Medical terms REPORT')
profile.to_file('medical terms.html')

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

In [20]:
df = df.drop_duplicates()

In [22]:
df.duplicated().sum()

np.int64(0)

In [38]:
df.nunique()

term          86
definition    85
dtype: int64

In [39]:
df.value_counts()

term                      definition                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       
>Activity Count           >Physical activity is any body movement that works your muscles and requires more energy than resting. Walking, running, dancing, swimming, yoga, and gardening are a few examples of physical activity.                                                                                                                                                                                                                                          

In [40]:
df = df.replace('>', '', regex = True)

In [41]:
df.head()

Unnamed: 0,term,definition
0,Basal Body Temperature,Basal body temperature is your temperature at ...
1,Blood Alcohol Content,"Blood alcohol content, or blood alcohol concen..."
2,Blood Pressure,Blood pressure is the force of blood pushing a...
3,Blood Type,"There are four major blood types: A, B, O, and..."
4,Body Mass Index,Body Mass Index (BMI) is an estimate of your b...


In [58]:
df.describe()

Unnamed: 0,term,definition
count,86,86
unique,86,85
top,Basal Body Temperature,Your weight is the mass or quantity of your he...
freq,1,2


In [16]:
das = pd.read_csv('DiseaseAndSymptoms.csv')
das

Unnamed: 0,Disease,Symptom_1,Symptom_2,Symptom_3,Symptom_4,Symptom_5,Symptom_6,Symptom_7,Symptom_8,Symptom_9,Symptom_10,Symptom_11,Symptom_12,Symptom_13,Symptom_14,Symptom_15,Symptom_16,Symptom_17
0,Fungal infection,itching,skin_rash,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,
1,Fungal infection,skin_rash,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,,
2,Fungal infection,itching,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,,
3,Fungal infection,itching,skin_rash,dischromic _patches,,,,,,,,,,,,,,
4,Fungal infection,itching,skin_rash,nodal_skin_eruptions,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4915,(vertigo) Paroymsal Positional Vertigo,vomiting,headache,nausea,spinning_movements,loss_of_balance,unsteadiness,,,,,,,,,,,
4916,Acne,skin_rash,pus_filled_pimples,blackheads,scurring,,,,,,,,,,,,,
4917,Urinary tract infection,burning_micturition,bladder_discomfort,foul_smell_of urine,continuous_feel_of_urine,,,,,,,,,,,,,
4918,Psoriasis,skin_rash,joint_pain,skin_peeling,silver_like_dusting,small_dents_in_nails,inflammatory_nails,,,,,,,,,,,


## cleaning diseases and symptoms dataset

In [42]:
das.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4920 entries, 0 to 4919
Data columns (total 18 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Disease     4920 non-null   object
 1   Symptom_1   4920 non-null   object
 2   Symptom_2   4920 non-null   object
 3   Symptom_3   4920 non-null   object
 4   Symptom_4   4572 non-null   object
 5   Symptom_5   3714 non-null   object
 6   Symptom_6   2934 non-null   object
 7   Symptom_7   2268 non-null   object
 8   Symptom_8   1944 non-null   object
 9   Symptom_9   1692 non-null   object
 10  Symptom_10  1512 non-null   object
 11  Symptom_11  1194 non-null   object
 12  Symptom_12  744 non-null    object
 13  Symptom_13  504 non-null    object
 14  Symptom_14  306 non-null    object
 15  Symptom_15  240 non-null    object
 16  Symptom_16  192 non-null    object
 17  Symptom_17  72 non-null     object
dtypes: object(18)
memory usage: 692.0+ KB


In [44]:
#generating pandas profiling report
profile = ProfileReport(das, title = 'Diseases and Symptoms REPORT')
profile.to_file('DiseasesAndSymptoms.html')

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

In [47]:
das['Symptom_17'].value_counts()

Symptom_17
muscle_pain    72
Name: count, dtype: int64

In [52]:
das['Symptom_17'].shape

(4920,)

In [54]:
das.duplicated().sum()

np.int64(4616)

In [57]:
das.describe()

Unnamed: 0,Disease,Symptom_1,Symptom_2,Symptom_3,Symptom_4,Symptom_5,Symptom_6,Symptom_7,Symptom_8,Symptom_9,Symptom_10,Symptom_11,Symptom_12,Symptom_13,Symptom_14,Symptom_15,Symptom_16,Symptom_17
count,4920,4920,4920,4920,4572,3714,2934,2268,1944,1692,1512,1194,744,504,306,240,192,72
unique,41,34,48,54,50,38,32,26,21,22,21,18,11,8,4,3,3,1
top,Fungal infection,vomiting,vomiting,fatigue,high_fever,headache,nausea,abdominal_pain,abdominal_pain,yellowing_of_eyes,yellowing_of_eyes,irritability,malaise,muscle_pain,chest_pain,chest_pain,blood_in_sputum,muscle_pain
freq,120,822,870,726,378,348,390,264,276,228,198,120,126,72,96,144,72,72


In [61]:
#concatenate dataframes side by side
combined_df = pd.concat([df, das], axis = 1)
combined_df

Unnamed: 0,term,definition,Disease,Symptom_1,Symptom_2,Symptom_3,Symptom_4,Symptom_5,Symptom_6,Symptom_7,Symptom_8,Symptom_9,Symptom_10,Symptom_11,Symptom_12,Symptom_13,Symptom_14,Symptom_15,Symptom_16,Symptom_17
0,Basal Body Temperature,Basal body temperature is your temperature at ...,Fungal infection,itching,skin_rash,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,
1,Blood Alcohol Content,"Blood alcohol content, or blood alcohol concen...",Fungal infection,skin_rash,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,,
2,Blood Pressure,Blood pressure is the force of blood pushing a...,Fungal infection,itching,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,,
3,Blood Type,"There are four major blood types: A, B, O, and...",Fungal infection,itching,skin_rash,dischromic _patches,,,,,,,,,,,,,,
4,Body Mass Index,Body Mass Index (BMI) is an estimate of your b...,Fungal infection,itching,skin_rash,nodal_skin_eruptions,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4915,,,(vertigo) Paroymsal Positional Vertigo,vomiting,headache,nausea,spinning_movements,loss_of_balance,unsteadiness,,,,,,,,,,,
4916,,,Acne,skin_rash,pus_filled_pimples,blackheads,scurring,,,,,,,,,,,,,
4917,,,Urinary tract infection,burning_micturition,bladder_discomfort,foul_smell_of urine,continuous_feel_of_urine,,,,,,,,,,,,,
4918,,,Psoriasis,skin_rash,joint_pain,skin_peeling,silver_like_dusting,small_dents_in_nails,inflammatory_nails,,,,,,,,,,,


In [62]:
combined_df.to_csv('combined.csv', index = False)

## NLP

In [77]:
#importing necessary libraries
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sentence_transformers import SentenceTransformer
import numpy as np
import re
from numpy.linalg import norm

In [92]:
class MedicalQA:
    def __init__(self):
        # Download required NLTK data
        nltk.download('punkt')
        nltk.download('punkt_tab')
        nltk.download('stopwords')
        nltk.download('averaged_perceptron_tagger')

        
        # Initialize the sentence transformer model
        self.model = SentenceTransformer('all-MiniLM-L6-v2')
        
        # Load and process the dataset
        self.df = self.load_and_process_data()
        
        # Create disease-symptom dictionary
        self.disease_dict = self.create_disease_dict()
        
        # Create embeddings for diseases and symptoms
        self.disease_embeddings = self.create_disease_embeddings()

    def load_and_process_data(self):
        df = pd.read_csv('combined.csv')
        df = df.dropna()
        df = df.iloc[:, 2:]  # Remove empty columns
        
        # Clean disease names and symptoms
        df['Disease'] = df['Disease'].str.strip()
        symptoms_cols = [col for col in df.columns if 'Symptom' in col]
        
        # Combine symptoms into a list
        df['symptoms'] = df[symptoms_cols].apply(
            lambda x: [str(item).replace('_', ' ').strip() for item in x if pd.notna(item) and str(item) != 'nan'],
            axis=1
        )
        
        return df[['Disease', 'symptoms']]

    def create_disease_dict(self):
        return dict(zip(self.df['Disease'], self.df['symptoms']))

    def get_embedding(self, text):
        return self.model.encode(text, convert_to_tensor=False)

    def create_disease_embeddings(self):
        disease_embeddings = {}
        
        for disease, symptoms in self.disease_dict.items():
            # Create a single text combining disease name and symptoms
            disease_text = f"{disease} {' '.join(symptoms)}"
            disease_embeddings[disease] = self.get_embedding(disease_text)
            
        return disease_embeddings

    def cosine_similarity(self, v1, v2):
        if (norm(v1) * norm(v2)) == 0:
            return 0
        return np.dot(v1, v2) / (norm(v1) * norm(v2))

    def extract_symptoms(self, question):
        """Extract symptoms from the question using semantic similarity"""
        processed_question = self.preprocess_question(question)
        question_embedding = self.get_embedding(processed_question)
        
        # Create a set of all known symptoms
        all_symptoms = set()
        for symptoms in self.disease_dict.values():
            all_symptoms.update(symptoms)
        
        # Find matching symptoms using semantic similarity
        found_symptoms = []
        for symptom in all_symptoms:
            symptom_embedding = self.get_embedding(symptom)
            similarity = self.cosine_similarity(question_embedding, symptom_embedding)
            
            # If similarity is above threshold, consider it a match
            if similarity > 0.6:  # Adjust threshold as needed
                found_symptoms.append(symptom)
                
        return found_symptoms

    def preprocess_question(self, question):
        # Convert to lowercase and remove special characters
        question = re.sub(r'[^a-zA-Z\s]', '', question.lower())
        
        # Tokenize
        tokens = word_tokenize(question)
        
        # Remove stopwords
        stop_words = set(stopwords.words('english'))
        tokens = [t for t in tokens if t not in stop_words]
        
        return ' '.join(tokens)

    def get_most_likely_diseases(self, symptoms, top_n=3):
        if not symptoms:
            return []
        
        # Create query embedding from symptoms
        query_text = ' '.join(symptoms)
        query_embedding = self.get_embedding(query_text)
        
        # Calculate similarities with all diseases
        similarities = []
        for disease, embedding in self.disease_embeddings.items():
            similarity = self.cosine_similarity(query_embedding, embedding)
            similarities.append((disease, similarity))
        
        # Sort by similarity and get top matches
        similarities.sort(key=lambda x: x[1], reverse=True)
        return [(disease, round(similarity * 100, 2)) for disease, similarity in similarities[:top_n]]

    def answer_question(self, question):
        try:
            # Extract symptoms from question
            symptoms = self.extract_symptoms(question)
            
            if not symptoms:
                return "I couldn't identify any specific symptoms in your question. Please describe your symptoms more clearly."
            
            # Get likely diseases
            likely_diseases = self.get_most_likely_diseases(symptoms)
            
            # Prepare response
            response = "Based on the symptoms you described:\n"
            response += f"Identified symptoms: {', '.join(symptoms)}\n\n"
            response += "Possible conditions:\n"
            
            for disease, confidence in likely_diseases:
                response += f"- {disease} (Confidence: {confidence}%)\n"
                
            response += "\nPlease note: This is not a medical diagnosis. Consult a healthcare professional for proper medical advice."
            
            return response
            
        except Exception as e:
            return f"An error occurred: {str(e)}\nPlease try rephrasing your question."

# Usage example
if __name__ == "__main__":
    # Initialize the QA system
     print("Initializing Medical QA System...")
     qa_system = MedicalQA()
    
    # Example questions
     questions = [
        "body mass index",
        "I'm experiencing chest pain and shortness of breath",
        "I have constant fatigue and increased thirst with blurred vision"
     ]
    
     print("\nMedical QA System Demo\n")
     for question in questions:
        print(f"Question: {question}")
        print("\nAnswer:")
        print(qa_system.answer_question(question))
        print("\n" + "="*50 + "\n")

Initializing Medical QA System...


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\harka\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\harka\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\harka\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\harka\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!



Medical QA System Demo

Question: body mass index

Answer:
I couldn't identify any specific symptoms in your question. Please describe your symptoms more clearly.


Question: I'm experiencing chest pain and shortness of breath

Answer:
I couldn't identify any specific symptoms in your question. Please describe your symptoms more clearly.


Question: I have constant fatigue and increased thirst with blurred vision

Answer:
I couldn't identify any specific symptoms in your question. Please describe your symptoms more clearly.




In [None]:
# preprocessing function
