In [1]:
import pandas as pd
import numpy as np

# Load the dataset
data = pd.read_csv("DiseaseAndSymptoms.csv")  # Assuming the dataset is saved as a CSV file

# Check the dataset size
print("Dataset shape:", data.shape)
print("Unique diseases:", data["Disease"].nunique())
print("Disease distribution:\n", data["Disease"].value_counts())

Dataset shape: (4920, 18)
Unique diseases: 41
Disease distribution:
 Disease
Fungal infection                           120
Hepatitis C                                120
Hepatitis E                                120
Alcoholic hepatitis                        120
Tuberculosis                               120
Common Cold                                120
Pneumonia                                  120
Dimorphic hemmorhoids(piles)               120
Heart attack                               120
Varicose veins                             120
Hypothyroidism                             120
Hyperthyroidism                            120
Hypoglycemia                               120
Osteoarthristis                            120
Arthritis                                  120
(vertigo) Paroymsal  Positional Vertigo    120
Acne                                       120
Urinary tract infection                    120
Psoriasis                                  120
Hepatitis D                   

In [2]:
# Clean disease names
data["Disease"] = data["Disease"].str.strip()  # Remove leading/trailing spaces
data["Disease"] = data["Disease"].replace({
    "Peptic ulcer diseae": "Peptic ulcer disease",
    "Dimorphic hemmorhoids(piles)": "Dimorphic hemorrhoids (piles)",
    "(vertigo) Paroymsal  Positional Vertigo": "Vertigo (Paroxysmal Positional Vertigo)"
})

# Verify the updated disease names
print("Updated unique diseases:", data["Disease"].nunique())

Updated unique diseases: 41


In [3]:
print("Column names in the dataset:", data.columns.tolist())

Column names in the dataset: ['Disease', 'Symptom_1', 'Symptom_2', 'Symptom_3', 'Symptom_4', 'Symptom_5', 'Symptom_6', 'Symptom_7', 'Symptom_8', 'Symptom_9', 'Symptom_10', 'Symptom_11', 'Symptom_12', 'Symptom_13', 'Symptom_14', 'Symptom_15', 'Symptom_16', 'Symptom_17']


In [4]:
# Function to clean symptom names
def clean_symptom(symptom):
    if pd.isna(symptom):
        return None
    symptom = str(symptom).strip()  # Remove leading/trailing spaces
    symptom = symptom.replace("_", " ")  # Replace underscores with spaces
    symptom = symptom.replace("  ", " ")  # Replace double spaces with single spaces
    return symptom.lower()

# Combine symptoms into a single column
symptom_columns = [f"Symptom_{i}" for i in range(1, 18)]
data["extracted_symptoms"] = data[symptom_columns].apply(
    lambda row: " ".join(sorted(set(filter(None, [clean_symptom(symptom) for symptom in row])))),
    axis=1
)

# Drop the original symptom columns
data = data[["Disease", "extracted_symptoms"]]

# Check the updated dataset
print("Updated dataset shape:", data.shape)
print("Sample rows:\n", data.head())

Updated dataset shape: (4920, 2)
Sample rows:
             Disease                                 extracted_symptoms
0  Fungal infection  dischromic patches itching nodal skin eruption...
1  Fungal infection  dischromic patches nodal skin eruptions skin rash
2  Fungal infection    dischromic patches itching nodal skin eruptions
3  Fungal infection               dischromic patches itching skin rash
4  Fungal infection             itching nodal skin eruptions skin rash


In [5]:
# Check for empty symptom rows
empty_symptom_rows = data[data["extracted_symptoms"] == ""]
print("Rows with empty symptoms:", len(empty_symptom_rows))

# Drop rows with empty symptoms (if any)
data = data[data["extracted_symptoms"] != ""]
print("Dataset shape after removing empty symptoms:", data.shape)

Rows with empty symptoms: 0
Dataset shape after removing empty symptoms: (4920, 2)


In [6]:
# Deduplicate rows
data_dedup = data.drop_duplicates(subset=["Disease", "extracted_symptoms"])
print("Dataset shape after deduplication:", data_dedup.shape)
print("Disease distribution after deduplication:\n", data_dedup["Disease"].value_counts())

Dataset shape after deduplication: (304, 2)
Disease distribution after deduplication:
 Disease
Hepatitis D                                10
Dengue                                     10
Chicken pox                                10
Migraine                                   10
Hepatitis B                                 9
Hypoglycemia                                9
Common Cold                                 9
Tuberculosis                                9
Hepatitis E                                 9
hepatitis A                                 9
Typhoid                                     9
Hyperthyroidism                             9
Jaundice                                    9
Diabetes                                    9
Pneumonia                                   9
Varicose veins                              8
Malaria                                     8
Hypothyroidism                              8
Alcoholic hepatitis                         8
Chronic cholestasis            