In [1]:
import pandas as pd
import numpy as np
import re

In [2]:
df = pd.read_csv("medical_dataset.csv")

In [3]:
df.head()

Unnamed: 0,condition,symptoms
0,Abdominal aortic aneurysm (AAA) screening,
1,Abdominal aortic aneurysm,"tummy or back pain, a pulsing feeling in your ..."
2,Abdominal aortic aneurysm,"tummy or back pain, a pulsing feeling in your ..."
3,Abdominal aortic aneurysm (AAA) screening,
4,Abortion,


In [4]:
# Step 1: Clean the data by removing rows with empty symptoms or overview columns
df = df[df['symptoms'].notna()]  # Remove rows where symptoms are NaN
df = df[df['symptoms'].str.strip().str.lower() != 'overview']  # Remove rows that have 'overview' in symptoms column
df = df[df['symptoms'].str.strip() != '']  # Remove rows with empty symptoms
df = df[df['symptoms'].str.strip() != '-']  # Remove rows with empty symptoms

# Clean the 'condition' column by removing "Overview\n-" and any unwanted text
df['condition'] = df['condition'].str.replace(r'Overview\n-\n', '', regex=True)

df = df.drop_duplicates()

In [5]:
df.head(20)

Unnamed: 0,condition,symptoms
1,Abdominal aortic aneurysm,"tummy or back pain, a pulsing feeling in your ..."
6,Achalasia,"bringing back up undigested food, choking and ..."
9,Acne,"face – this affects almost everyone with acne,..."
10,Acoustic neuroma (vestibular schwannoma),"hearing loss that usually only affects 1 ear, ..."
11,Acromegaly,swollen hands and feet – you may notice a chan...
15,Acute cholecystitis,"a high temperature, feeling sick, being sick, ..."
16,Acute kidney injury,"feeling sick or being sick, diarrhoea, dehydra..."
18,Acute myeloid leukaemia,"looking pale or ""washed out"", feeling tired or..."
19,Acute pancreatitis,suddenly getting severe pain in the centre of ...
20,Acute respiratory distress syndrome (ARDS),"shortness of breath, taking short, fast breaths"


In [7]:
# Step 5: Save the updated dataset to a new CSV
df.to_csv("updated_medical_dataset.csv", index=False)

## MODEL BUILDING

In [31]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder

In [32]:
# Step 1: Load the dataset
df = pd.read_csv('updated_medical_dataset.csv')

In [33]:
# Step 2: Preprocess the symptoms and convert them to a binary feature vector

# Create a list of all unique symptoms
all_symptoms = set()
for symptoms in df['symptoms']:
    all_symptoms.update(symptoms.lower().split(','))  # Split symptoms by comma and convert to lower case
all_symptoms = list(all_symptoms)  # Convert to list for later use

In [34]:
# Step 3: Convert symptoms to binary vector
def symptoms_to_vector(symptoms, all_symptoms):
    symptom_vector = [0] * len(all_symptoms)  # Initialize a vector with 0's
    symptom_list = [s.strip() for s in symptoms.lower().split(',')]  # Clean symptoms and split by commas
    for symptom in symptom_list:
        if symptom in all_symptoms:
            symptom_vector[all_symptoms.index(symptom)] = 1  # Mark 1 if the symptom is present
    return symptom_vector

In [35]:
# Step 4: Prepare the feature matrix and target variable
X = [symptoms_to_vector(symptoms, all_symptoms) for symptoms in df['symptoms']]
y = df['condition']  # The target variable is the disease condition

In [36]:
# Step 5: Encode the target variable (condition)
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

In [37]:
# Step 6: Train the model on all data (using the entire dataset)
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X, y_encoded)

In [38]:
# Step 7: Evaluate the model on the training data (as all data is being used for training)
y_pred = model.predict(X)
accuracy = accuracy_score(y_encoded, y_pred)
print(f"Training Accuracy: {accuracy * 100:.2f}%")

Training Accuracy: 96.61%


In [39]:
# Print classification report
print("\nClassification Report (Training Data):")
print(classification_report(y_encoded, y_pred, target_names=label_encoder.classes_))


Classification Report (Training Data):
                                                                precision    recall  f1-score   support

                                     Abdominal aortic aneurysm       1.00      1.00      1.00         1
                                                     Achalasia       1.00      1.00      1.00         1
                                                          Acne       1.00      1.00      1.00         1
                      Acoustic neuroma (vestibular schwannoma)       1.00      1.00      1.00         1
                                                    Acromegaly       1.00      1.00      1.00         1
                                           Acute cholecystitis       1.00      1.00      1.00         1
                                           Acute kidney injury       1.00      1.00      1.00         1
                                       Acute myeloid leukaemia       1.00      1.00      1.00         1
                       

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [48]:
# Step 8: Function to predict for new symptoms
def predict_disease(symptoms_input):
    # Convert the symptoms input into the same format as the training data (binary vector)
    symptom_vector = symptoms_to_vector(symptoms_input, all_symptoms)

    # Make prediction
    prediction = model.predict([symptom_vector])
    predicted_condition = label_encoder.inverse_transform(prediction)[0]
    
    # Get the probabilities for each condition
    probabilities = model.predict_proba([symptom_vector])[0]
    
    # Print the predicted condition and probabilities
    print(f"Predicted Condition: {predicted_condition}")
    
    # Print only conditions with probability > 50%
    for condition, probability in zip(label_encoder.classes_, probabilities):
        if probability > 0.10:  # Only print if probability is greater than 50%
            print(f"{condition}: {probability * 100:.2f}%")

In [49]:
# Step 9: Test the model with a random symptom input
random_symptoms = "fever, headache, stomach pain"
predict_disease(random_symptoms)

Predicted Condition: Typhoid fever
Endocarditis: 12.00%
Hydrocephalus: 20.00%
Sarcoidosis: 19.00%
Typhoid fever: 27.00%


In [50]:
random_symptoms = "needing to pee suddenly or more often than usual, pain or a burning sensation when peeing, smelly or cloudy pee, blood in your pee"
predict_disease(random_symptoms)

Predicted Condition: Kidney infection
Kidney infection: 67.00%


In [56]:
random_symptoms = "pain in the side of your tummy (abdomen), severe pain that comes and goes, feeling sick or vomiting"
predict_disease(random_symptoms)

Predicted Condition: Kidney stones
Kidney stones: 65.00%


In [57]:
random_symptoms = "a high temperature, sweats and chills, headaches and feeling confused, feeling very tired and sleepy (especially in children), feeling and being sick, tummy pain and diarrhoea, loss of appetite, muscle pains, yellow skin or whites of the eyes, a sore throat, cough and difficulty breathing"
predict_disease(random_symptoms)

Predicted Condition: Toxocariasis
Malaria: 25.86%
Middle East respiratory syndrome (MERS): 20.58%
Rheumatic fever: 24.19%
Toxocariasis: 28.37%


In [58]:
random_symptoms = "you cannot move some or all of your face or body, your face or body is weak or floppy, your face or body is numb, painful or tingles all the time, your face or body is stiff with muscle spasms and twitches"
predict_disease(random_symptoms)

Predicted Condition: Paralysis
Paralysis: 63.00%


In [59]:
random_symptoms = "a high temperature, extreme tiredness (fatigue), headaches, being sick (vomiting), a stiff neck, muscle pain"
predict_disease(random_symptoms)

Predicted Condition: Polio
Polio: 65.00%


In [60]:
random_symptoms = "feeling very tired and weak all the time, feeling irritable and sad all the time, joint, muscle or leg pain, swollen, bleeding gums (sometimes teeth can fall out), developing red or blue spots on the skin, usually on the legs and feet, although this may be less noticeable on brown or black skin, skin that bruises easily"
predict_disease(random_symptoms)

Predicted Condition: Scurvy
Liver disease: 13.00%
Scurvy: 58.00%


In [65]:
random_symptoms = "pain in a tendon that gets worse when you move, difficulty moving the joint, feeling a grating or crackling sensation when you move the tendon, swelling, sometimes with heat or redness"
predict_disease(random_symptoms)

Predicted Condition: Tendonitis
Osteoarthritis: 19.00%
Tendonitis: 67.00%


In [66]:
random_symptoms = "a persistent high temperature that gradually increases each day, headache, general aches and pains, extreme tiredness (fatigue), cough, constipation"
predict_disease(random_symptoms)

Predicted Condition: Typhoid fever
Typhoid fever: 71.00%


In [67]:
random_symptoms = "a high temperature, a headache, sore, red eyes, swollen joints and joint and muscle pain, a rash and itching all over the body"
predict_disease(random_symptoms)

Predicted Condition: Zika virus
Haemophilus influenzae type b (Hib): 21.00%
Zika virus: 63.00%


In [68]:
random_symptoms = "lightheaded or dizzy, faint"
predict_disease(random_symptoms)

Predicted Condition: Heart block
Heart block: 61.00%


In [87]:
random_symptoms = "being sick, diarrhoea and tummy pain, a skin rash, yellowing of the skin and eyes, blood in your poo, lots of bruises all over your body, bleeding from your ears, eyes, nose or mouth"
# random_symptoms = "elevated body temperature or subjective fever, chills, myalgia, and fatigue"
predict_disease(random_symptoms)

Predicted Condition: Ebola virus disease
Ebola virus disease: 58.00%


In [85]:
random_symptoms = "severe diarrhea, vomiting, and dehydration"
# random_symptoms = "having lots of watery diarrhoea, feeling sick or being sick, tummy pain, dehydration"
predict_disease(random_symptoms)


Predicted Condition: Atrial fibrillation


In [73]:
random_symptoms = "a high temperature, a severe headache, pain behind your eyes, muscle and joint pain, feeling or being sick, swollen glands, a blotchy rash made up of flat or slightly raised spots – this can affect large areas of your body"
predict_disease(random_symptoms)

Predicted Condition: Dengue
Appendicitis: 13.00%
Dengue: 64.00%
Hepatitis: 12.00%
