In [100]:
import pandas as pd
import numpy as np
import re

In [101]:
df = pd.read_csv("medical_dataset.csv")

In [102]:
df.head()

Unnamed: 0,condition,symptoms
0,Abdominal aortic aneurysm (AAA) screening,
1,Abdominal aortic aneurysm,"tummy or back pain, a pulsing feeling in your ..."
2,Abdominal aortic aneurysm,"tummy or back pain, a pulsing feeling in your ..."
3,Abdominal aortic aneurysm (AAA) screening,
4,Abortion,


In [103]:
# Step 1: Clean the data by removing rows with empty symptoms or overview columns
df = df[df['symptoms'].notna()]  # Remove rows where symptoms are NaN
df = df[df['symptoms'].str.strip().str.lower() != 'overview']  # Remove rows that have 'overview' in symptoms column
df = df[df['symptoms'].str.strip() != '']  # Remove rows with empty symptoms
df = df[df['symptoms'].str.strip() != '-']  # Remove rows with empty symptoms

# Clean the 'condition' column by removing "Overview\n-" and any unwanted text
df['condition'] = df['condition'].str.replace(r'Overview\n-\n', '', regex=True)

df = df.drop_duplicates()

In [104]:
df.head(20)

Unnamed: 0,condition,symptoms
1,Abdominal aortic aneurysm,"tummy or back pain, a pulsing feeling in your ..."
6,Achalasia,"bringing back up undigested food, choking and ..."
9,Acne,"face – this affects almost everyone with acne,..."
10,Acoustic neuroma (vestibular schwannoma),"hearing loss that usually only affects 1 ear, ..."
11,Acromegaly,swollen hands and feet – you may notice a chan...
15,Acute cholecystitis,"a high temperature, feeling sick, being sick, ..."
16,Acute kidney injury,"feeling sick or being sick, diarrhoea, dehydra..."
18,Acute myeloid leukaemia,"looking pale or ""washed out"", feeling tired or..."
19,Acute pancreatitis,suddenly getting severe pain in the centre of ...
20,Acute respiratory distress syndrome (ARDS),"shortness of breath, taking short, fast breaths"


In [105]:
# Step 5: Save the updated dataset to a new CSV
df.to_csv("updated_medical_dataset.csv", index=False)

## MODEL BUILDING

In [85]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder

In [86]:
# Step 1: Load the dataset
df = pd.read_csv('updated_medical_dataset_2.csv')

In [87]:
# Step 2: Preprocess the symptoms and convert them to a binary feature vector

# Create a list of all unique symptoms
all_symptoms = set()
for symptoms in df['symptoms']:
    all_symptoms.update(symptoms.lower().split(','))  # Split symptoms by comma and convert to lower case
all_symptoms = list(all_symptoms)  # Convert to list for later use

In [88]:
# Step 3: Convert symptoms to binary vector
def symptoms_to_vector(symptoms, all_symptoms):
    symptom_vector = [0] * len(all_symptoms)  # Initialize a vector with 0's
    symptom_list = [s.strip() for s in symptoms.lower().split(',')]  # Clean symptoms and split by commas
    for symptom in symptom_list:
        if symptom in all_symptoms:
            symptom_vector[all_symptoms.index(symptom)] = 1  # Mark 1 if the symptom is present
    return symptom_vector

In [89]:
# Step 4: Prepare the feature matrix and target variable
X = [symptoms_to_vector(symptoms, all_symptoms) for symptoms in df['symptoms']]
y = df['condition']  # The target variable is the disease condition

In [90]:
# Step 5: Encode the target variable (condition)
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

In [91]:
# Step 6: Train the model on all data (using the entire dataset)
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X, y_encoded)

In [92]:
# Step 7: Evaluate the model on the training data (as all data is being used for training)
y_pred = model.predict(X)
accuracy = accuracy_score(y_encoded, y_pred)
print(f"Training Accuracy: {accuracy * 100:.2f}%")

Training Accuracy: 98.34%


In [93]:
# Print classification report
# print("\nClassification Report (Training Data):")
# print(classification_report(y_encoded, y_pred, target_names=label_encoder.classes_))

In [94]:
# Step 8: Function to predict for new symptoms
def predict_disease(symptoms_input):
    # Convert the symptoms input into the same format as the training data (binary vector)
    symptom_vector = symptoms_to_vector(symptoms_input, all_symptoms)

    # Make prediction
    prediction = model.predict([symptom_vector])
    predicted_condition = label_encoder.inverse_transform(prediction)[0]
    
    # Get the probabilities for each condition
    probabilities = model.predict_proba([symptom_vector])[0]
    
    # Print the predicted condition and probabilities
    print(f"Predicted Condition: {predicted_condition}")
    
    # Print only conditions with probability > 50%
    for condition, probability in zip(label_encoder.classes_, probabilities):
        if probability > 0.10:  # Only print if probability is greater than 50%
            print(f"{condition}: {probability * 100:.2f}%")

In [95]:
# Step 9: Test the model with a random symptom input
random_symptoms = "fever, headache, stomach pain"
predict_disease(random_symptoms)

Predicted Condition: Sarcoidosis
Endocarditis: 11.00%
Hydrocephalus: 16.00%
Sarcoidosis: 23.00%
Typhoid fever: 22.00%


In [96]:
random_symptoms = "frequent watery diarrhoea, nausea, vomiting, dehydration, dry mouth, and feeling extremely weak"
predict_disease(random_symptoms)

Predicted Condition: Cholera
Cholera: 100.00%


In [99]:
random_symptoms = "delays in sitting or walking, muscle stiffness or floppiness, weak arms or legs, uncontrolled movements, difficulty swallowing, speech delays, and vision problems"
predict_disease(random_symptoms)

Predicted Condition: Cerebral palsy
Cerebral palsy: 99.00%


In [98]:
random_symptoms ="delays in reaching development milestones – for example, not sitting by 8 months or not walking by 18 months, seeming too stiff or too floppy, weak arms or legs, fidgety, jerky or clumsy movements, random, uncontrolled movements, walking on tiptoes, a range of other problems – such as swallowing problems, speaking problems, vision problems and learning disabilities"
predict_disease(random_symptoms)

Predicted Condition: Cerebral palsy
Cerebral palsy: 64.00%
