In [1]:
!pip install -q datasets

In [2]:
# Hugging Face datasets
from datasets import load_dataset

# Data handling
import pandas as pd

# Text processing
import re

In [3]:
# Dataset 1: Symptom2Disease
ds_symptom2disease = load_dataset(
    "NeuronZero/Symptom2Disease",
    split="train"
)

# Dataset 2: symptom_to_diagnosis
# NOTE(review): only the "train" split is requested; any other split this
# dataset ships (e.g. a test split) is not loaded by this cell.
ds_symptom_to_diagnosis = load_dataset(
    "gretelai/symptom_to_diagnosis",
    split="train"
)

# Convert to pandas DataFrames with the Dataset's own exporter rather than
# having pandas iterate the Dataset object row by row.
df1 = ds_symptom2disease.to_pandas()
df2 = ds_symptom_to_diagnosis.to_pandas()

# Preview the first rows of each raw frame
print("Symptom2Disease:")
display(df1.head())

print("symptom_to_diagnosis:")
display(df2.head())


README.md:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

Symptom2Disease.csv: 0.00B [00:00, ?B/s]

Generating train split:   0%|          | 0/1200 [00:00<?, ? examples/s]

README.md: 0.00B [00:00, ?B/s]

train.jsonl: 0.00B [00:00, ?B/s]

test.jsonl: 0.00B [00:00, ?B/s]

Generating train split:   0%|          | 0/853 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/212 [00:00<?, ? examples/s]

Symptom2Disease:


Unnamed: 0.1,Unnamed: 0,label,text
0,0,Psoriasis,I have been experiencing a skin rash on my arm...
1,1,Psoriasis,"My skin has been peeling, especially on my kne..."
2,2,Psoriasis,I have been experiencing joint pain in my fing...
3,3,Psoriasis,"There is a silver like dusting on my skin, esp..."
4,4,Psoriasis,"My nails have small dents or pits in them, and..."


symptom_to_diagnosis:


Unnamed: 0,output_text,input_text
0,cervical spondylosis,I've been having a lot of pain in my neck and ...
1,impetigo,I have a rash on my face that is getting worse...
2,urinary tract infection,I have been urinating blood. I sometimes feel ...
3,arthritis,I have been having trouble with my muscles and...
4,dengue,I have been feeling really sick. My body hurts...


In [4]:
# Bring both frames onto one shared schema: the free-text description lives in
# "symptoms" and the target label in "disease". Selecting just these two
# columns drops everything else a frame carried.
_schema = ["symptoms", "disease"]

df1 = df1.rename(columns={"label": "disease", "text": "symptoms"})[_schema]

df2 = df2.rename(
    columns={"output_text": "disease", "input_text": "symptoms"}
)[_schema]


In [5]:
def clean_text(text):
    """
    Normalise a string with basic text cleaning.

    The input is lowercased, every run of whitespace characters is
    collapsed into a single space, and leading/trailing whitespace is
    removed. `text` must be a `str`.
    """
    return re.sub(r"\s+", " ", text.lower()).strip()

# Normalise both frames in place: symptom descriptions go through clean_text,
# disease labels are lowercased and stripped of surrounding whitespace.
for _frame in (df1, df2):
    _frame["symptoms"] = _frame["symptoms"].apply(clean_text)
    _frame["disease"] = _frame["disease"].str.lower().str.strip()


In [6]:
# Stack the two harmonised frames on top of each other; ignore_index gives
# the result a fresh 0..n-1 row index instead of repeating each frame's own.
combined_df = pd.concat([df1, df2], ignore_index=True)

print("Combined dataset shape:", combined_df.shape)
display(combined_df.head())


Combined dataset shape: (2053, 2)


Unnamed: 0,symptoms,disease
0,i have been experiencing a skin rash on my arm...,psoriasis
1,"my skin has been peeling, especially on my kne...",psoriasis
2,i have been experiencing joint pain in my fing...,psoriasis
3,"there is a silver like dusting on my skin, esp...",psoriasis
4,"my nails have small dents or pits in them, and...",psoriasis


In [7]:
# Remove exact duplicates: a row is dropped when both its cleaned symptom text
# and its disease label match an earlier row (the first occurrence is kept).
# Rows sharing the same symptom text but carrying different labels are NOT
# treated as duplicates by this subset.
before = combined_df.shape[0]

combined_df = combined_df.drop_duplicates(
    subset=["symptoms", "disease"],
    # Relabel rows 0..n-1 so the index has no gaps left by the dropped rows.
    ignore_index=True,
)

after = combined_df.shape[0]

print(f"Removed {before - after} duplicate samples")
print("Final dataset size:", after)


Removed 65 duplicate samples
Final dataset size: 1988


In [8]:
# Number of samples per disease label, most frequent label first
class_counts = combined_df["disease"].value_counts()

print("Number of disease classes:", len(class_counts))
display(class_counts)


Number of disease classes: 24


disease
psoriasis                          90
impetigo                           90
diabetes                           90
chicken pox                        89
drug reaction                      89
allergy                            89
hypertension                       89
varicose veins                     89
urinary tract infection            89
bronchial asthma                   88
common cold                        88
typhoid                            88
fungal infection                   87
dengue                             87
cervical spondylosis               87
gastroesophageal reflux disease    86
peptic ulcer disease               86
pneumonia                          84
malaria                            84
arthritis                          84
migraine                           79
jaundice                           69
acne                               46
dimorphic hemorrhoids              41
Name: count, dtype: int64

In [9]:
# Write the merged, deduplicated dataset to a CSV file in the working
# directory; index=False keeps the row index out of the file.
combined_df.to_csv("combined_symptom_disease_dataset.csv", index=False)

print("Dataset saved as combined_symptom_disease_dataset.csv")


Dataset saved as combined_symptom_disease_dataset.csv
