In [2]:
pip install --upgrade tensorflow keras


Collecting tensorflow
  Obtaining dependency information for tensorflow from https://files.pythonhosted.org/packages/cf/24/271e77c22724f370c24c705f394b8035b4d27e4c2c6339f3f45ab9b8258e/tensorflow-2.18.0-cp311-cp311-win_amd64.whl.metadata
  Downloading tensorflow-2.18.0-cp311-cp311-win_amd64.whl.metadata (3.3 kB)
Collecting keras
  Obtaining dependency information for keras from https://files.pythonhosted.org/packages/c2/88/eef50051a772dcb4433d1f3e4c1d6576ba450fe83e89d028d7e8b85a2122/keras-3.6.0-py3-none-any.whl.metadata
  Downloading keras-3.6.0-py3-none-any.whl.metadata (5.8 kB)
Collecting tensorflow-intel==2.18.0 (from tensorflow)
  Obtaining dependency information for tensorflow-intel==2.18.0 from https://files.pythonhosted.org/packages/76/ad/fa6c508a15ff79cb5409294c293388e0999b7d480f84b65e4287277434fe/tensorflow_intel-2.18.0-cp311-cp311-win_amd64.whl.metadata
  Downloading tensorflow_intel-2.18.0-cp311-cp311-win_amd64.whl.metadata (4.9 kB)
Collecting flatbuffers>=24.3.25 (from tens

ERROR: Could not install packages due to an OSError: [WinError 5] Access is denied: 'C:\\Users\\Gaurav Gupta\\anaconda\\Lib\\site-packages\\~~mpy\\.libs\\libopenblas.FB5AE2TYXYH2IJRDKGDGQ3XBKLKTF43H.gfortran-win_amd64.dll'
Consider using the `--user` option or check the permissions.



In [5]:
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)


In [29]:
# Cell 1: Import Libraries
import PyPDF2

import sqlite3
from transformers import pipeline, AutoTokenizer, AutoModelForTokenClassification, AutoModelForSequenceClassification, AutoModelForCausalLM
from IPython.display import display
import ipywidgets as widgets

In [23]:
# Cell 2: Text Extraction Functions
def extract_text_from_pdf(file_path):
    with open(file_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        text = ""
        for page in reader.pages:
            text += page.extract_text()
        return text

def extract_text_from_txt(file_path):
    with open(file_path, 'r') as file:
        text = file.read()
        return text


In [6]:
# Cell 2: Symptom Extraction with Hugging Face (BioBERT)
# Load a pre-trained BioBERT model for medical symptom extraction
tokenizer_biobert = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")
model_biobert = AutoModelForTokenClassification.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")

nlp_symptom_extraction = pipeline("ner", model=model_biobert, tokenizer=tokenizer_biobert)

def extract_symptoms_with_huggingface(text):
    entities = nlp_symptom_extraction(text)
    symptoms = []
    
    for entity in entities:
        if entity['entity'] == 'SYMPTOM':  # This would be trained to detect symptoms
            symptoms.append(entity['word'])
    
    return symptoms



Downloading config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Downloading vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at emilyalsentzer/Bio_ClinicalBERT and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
# Cell 3: Summarization of Medical Reports
# Load a pre-trained summarization model for medical text
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

def summarize_medical_report(text):
    summary = summarizer(text, max_length=150, min_length=40, do_sample=False)
    return summary[0]['summary_text']



Downloading config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

Downloading generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

Downloading vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [9]:
def setup_database():
    conn = sqlite3.connect('medical_data.db')
    c = conn.cursor()

    # Create tables
    c.execute('''CREATE TABLE IF NOT EXISTS Disease (
                    Disease_ID INTEGER PRIMARY KEY,
                    Disease_Name TEXT,
                    Description TEXT
                )''')

    c.execute('''CREATE TABLE IF NOT EXISTS Symptom_Disease_Mapping (
                    Symptom_ID TEXT,
                    Disease_ID INTEGER,
                    Probability REAL,
                    PRIMARY KEY (Symptom_ID, Disease_ID),  -- Prevent duplicates here
                    FOREIGN KEY (Disease_ID) REFERENCES Disease(Disease_ID)
                )''')

    # Insert disease data
    try:
        c.execute("INSERT INTO Disease (Disease_Name, Description) VALUES ('Hypertension', 'A condition in which the blood pressure in the arteries is elevated.')")
        c.execute("INSERT INTO Disease (Disease_Name, Description) VALUES ('Influenza', 'A viral infection that attacks the respiratory system.')")
    except sqlite3.IntegrityError:
        print("Disease records already exist")

    # Insert symptom-disease mappings
    mappings = [
        ('S001', 1, 0.75),
        ('S002', 1, 0.85),
        ('S003', 2, 0.90)
    ]

    for mapping in mappings:
        try:
            c.execute("INSERT INTO Symptom_Disease_Mapping (Symptom_ID, Disease_ID, Probability) VALUES (?, ?, ?)", mapping)
        except sqlite3.IntegrityError:
            print(f"Mapping for Symptom_ID {mapping[0]} and Disease_ID {mapping[1]} already exists")

    conn.commit()
    conn.close()


In [10]:
# Cell 5: Prediction Using Classification Model
# Load a classification model for disease prediction
tokenizer_classifier = AutoTokenizer.from_pretrained("bert-base-uncased")
model_classifier = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased")

def predict_disease_from_symptoms(symptoms):
    inputs = tokenizer_classifier(symptoms, return_tensors="pt", padding=True, truncation=True)
    outputs = model_classifier(**inputs)
    logits = outputs.logits
    predicted_class = logits.argmax(-1).item()
    return predicted_class



Downloading tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
# Cell 6: Question-Answering System for Clarification
qa_model = pipeline("question-answering", model="deepset/roberta-base-squad2")

def ask_question_based_on_report(report_text):
    question = "What are the main symptoms?"
    response = qa_model(question=question, context=report_text)
    return response['answer']



Downloading config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/496M [00:00<?, ?B/s]

Downloading tokenizer_config.json:   0%|          | 0.00/79.0 [00:00<?, ?B/s]

Downloading vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

In [12]:
# Cell 7: Conversational AI with DialoGPT
tokenizer_dialogpt = AutoTokenizer.from_pretrained("microsoft/DialoGPT-small")
model_dialogpt = AutoModelForCausalLM.from_pretrained("microsoft/DialoGPT-small")

def generate_conversational_response(user_input):
    new_input_ids = tokenizer_dialogpt.encode(user_input + tokenizer_dialogpt.eos_token, return_tensors='pt')
    bot_input_ids = new_input_ids
    chat_history_ids = model_dialogpt.generate(bot_input_ids, max_length=1000, pad_token_id=tokenizer_dialogpt.eos_token_id)
    response = tokenizer_dialogpt.decode(chat_history_ids[:, bot_input_ids.shape[-1]:][0], skip_special_tokens=True)
    return response



Downloading tokenizer_config.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

Downloading vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/641 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/351M [00:00<?, ?B/s]

Downloading generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [13]:
# Cell 8: Main Diagnosis and Interaction Function
def get_possible_diseases(symptom_ids):
    conn = sqlite3.connect('medical_data.db')
    c = conn.cursor()

    query = f'''
        SELECT d.Disease_Name, d.Description, s.Probability
        FROM Symptom_Disease_Mapping s
        JOIN Disease d ON s.Disease_ID = d.Disease_ID
        WHERE s.Symptom_ID IN ({','.join('?' * len(symptom_ids))})
        ORDER BY s.Probability DESC
    '''
    c.execute(query, symptom_ids)
    results = c.fetchall()

    if results:
        print("Possible diseases based on symptoms:")
        for row in results:
            print(f"Disease: {row[0]}, Description: {row[1]}, Probability: {row[2]}")
    else:
        print("No diseases found for the given symptoms.")
    
    conn.close()



In [20]:
import ipywidgets as widgets
from IPython.display import display

# Define file upload widget
uploader = widgets.FileUpload(
    accept='.txt,.pdf',  # Acceptable file types
    multiple=False  # Disable multiple file uploads
)

# Handle uploaded file
def handle_uploaded_file(change):
    if uploader.value:
        fileinfo = list(uploader.value)[0]  # Access the first dictionary directly
        report_content = bytes(fileinfo['content'])  # Convert to byte string
        report_text = report_content.decode('utf-8')  # Decode the byte content

        symptoms = extract_symptoms_from_text(report_text)

        if symptoms:
            print("Symptoms extracted from report:", symptoms)
            get_possible_diseases(symptoms)
        else:
            print("No symptoms detected in the report. Please provide more detailed information.")
    else:
        print("No file uploaded. Please select a file.")

uploader.observe(handle_uploaded_file, names='value')  # Observe the file upload




In [37]:
def main():
    print("Welcome to the Medical Diagnosis System")
    
    while True:  # Start an infinite loop for conversation
        option = input("Enter '1' for manual input of symptoms, '2' to upload a health checkup report, or 'q' to quit: ")

        if option == '1':
            symptoms_input = input("Enter symptom IDs separated by commas (e.g., S001,S002): ").split(',')
            get_possible_diseases(symptoms_input)
        
        elif option == '2':
            # Create file upload widget
            uploader = widgets.FileUpload(
                accept='.txt,.pdf',  # Acceptable file types
                multiple=False  # Disable multiple file uploads
            )
            
            display(uploader)
            uploader.observe(handle_uploaded_file, names='value')

        elif option.lower() == 'q':  # Allow the user to quit the conversation
            print("Thank you for using the Medical Diagnosis System. Goodbye!")
            break
        
        else:
            print("Invalid option. Please enter '1', '2', or 'q'.")
        
        follow_up = input("Do you have any more questions or symptoms to add? (yes/no): ")
        if follow_up.lower() != 'yes':
            print("Thank you for using the Medical Diagnosis System. Goodbye!")
            break

# Call the main function to run the application
main()


Welcome to the Medical Diagnosis System
Enter '1' for manual input of symptoms, '2' to upload a health checkup report, or 'q' to quit: 3
Invalid option. Please enter '1', '2', or 'q'.
Do you have any more questions or symptoms to add? (yes/no): yes
Enter '1' for manual input of symptoms, '2' to upload a health checkup report, or 'q' to quit: 1
Enter symptom IDs separated by commas (e.g., S001,S002): S001
Possible diseases based on symptoms:
Disease: Diabetes Mellitus, Description: A chronic condition that affects how your body processes sugar., Probability: 0.85
Do you have any more questions or symptoms to add? (yes/no): NO
Thank you for using the Medical Diagnosis System. Goodbye!
