In [1]:
import pandas as pd
import re

In [2]:
# Read the combined symptom/disease CSV from the Kaggle input directory.
DATASET_CSV = "/kaggle/input/dataset/combined_symptom_disease_dataset.csv"
df = pd.read_csv(DATASET_CSV)

# Report (rows, columns), then preview the first five records as the cell output.
print("Dataset shape:", df.shape)
df.head()


Dataset shape: (1988, 2)


Unnamed: 0,symptoms,disease
0,i have been experiencing a skin rash on my arm...,psoriasis
1,"my skin has been peeling, especially on my kne...",psoriasis
2,i have been experiencing joint pain in my fing...,psoriasis
3,"there is a silver like dusting on my skin, esp...",psoriasis
4,"my nails have small dents or pits in them, and...",psoriasis


In [3]:
def preprocess_text(text):
    """
    Normalise one free-text symptom description.

    Minimal preprocessing:
    - Convert to lowercase
    - Collapse every run of whitespace (spaces, tabs, newlines) into a
      single space
    - Strip leading and trailing whitespace

    Missing values (``None`` or a float NaN) are returned unchanged rather
    than raising ``AttributeError`` on ``.lower()``. This matches how the
    pandas ``.str`` accessor propagates missing entries, so a later
    ``isnull()`` check on the cleaned column still reports them.

    Parameters
    ----------
    text : str
        Raw description. ``None`` / NaN are tolerated and passed through.

    Returns
    -------
    str
        The normalised description, or the original missing value.
    """
    # NaN is the only float that compares unequal to itself; this avoids
    # needing an extra import just to detect it.
    if text is None or (isinstance(text, float) and text != text):
        return text

    lowered = text.lower()
    return re.sub(r"\s+", " ", lowered).strip()


In [4]:
# Add a normalised copy of every symptom description next to the raw text.
df["symptoms_clean"] = df["symptoms"].map(preprocess_text)

# Lowercase and trim the disease labels so each class has one spelling.
df["disease"] = (
    df["disease"]
    .str.lower()
    .str.strip()
)


In [5]:
# Count the missing values in each column.
df.isna().sum()


symptoms          0
disease           0
symptoms_clean    0
dtype: int64

In [6]:
# Show raw and cleaned descriptions side by side for the first five rows.
df.loc[:, ["symptoms", "symptoms_clean"]].head()


Unnamed: 0,symptoms,symptoms_clean
0,i have been experiencing a skin rash on my arm...,i have been experiencing a skin rash on my arm...
1,"my skin has been peeling, especially on my kne...","my skin has been peeling, especially on my kne..."
2,i have been experiencing joint pain in my fing...,i have been experiencing joint pain in my fing...
3,"there is a silver like dusting on my skin, esp...","there is a silver like dusting on my skin, esp..."
4,"my nails have small dents or pits in them, and...","my nails have small dents or pits in them, and..."


In [7]:
# Tally how many samples each disease label has (most frequent first).
class_counts = df["disease"].value_counts()

# One entry per distinct label, so the length is the number of classes.
print("Number of classes:", len(class_counts))
class_counts


Number of classes: 24


disease
psoriasis                          90
impetigo                           90
diabetes                           90
chicken pox                        89
drug reaction                      89
allergy                            89
hypertension                       89
varicose veins                     89
urinary tract infection            89
bronchial asthma                   88
common cold                        88
typhoid                            88
fungal infection                   87
dengue                             87
cervical spondylosis               87
gastroesophageal reflux disease    86
peptic ulcer disease               86
pneumonia                          84
malaria                            84
arthritis                          84
migraine                           79
jaundice                           69
acne                               46
dimorphic hemorrhoids              41
Name: count, dtype: int64

In [8]:
# Write the frame (raw text, cleaned text and labels) to the working
# directory; index=False keeps the row index out of the file.
OUTPUT_CSV = "preprocessed_symptom_disease_dataset.csv"
df.to_csv(OUTPUT_CSV, index=False)

print("Preprocessed dataset saved.")


Preprocessed dataset saved.
