In [2]:
import pandas as pd
import numpy as np

In [3]:
data = pd.read_csv('Symptom2Disease.csv')   

In [4]:
df = pd.DataFrame(data)
df

Unnamed: 0.1,Unnamed: 0,label,text
0,0,Psoriasis,I have been experiencing a skin rash on my arm...
1,1,Psoriasis,"My skin has been peeling, especially on my kne..."
2,2,Psoriasis,I have been experiencing joint pain in my fing...
3,3,Psoriasis,"There is a silver like dusting on my skin, esp..."
4,4,Psoriasis,"My nails have small dents or pits in them, and..."
...,...,...,...
1195,295,diabetes,I'm shaking and trembling all over. I've lost ...
1196,296,diabetes,"Particularly in the crevices of my skin, I hav..."
1197,297,diabetes,I regularly experience these intense urges and...
1198,298,diabetes,"I have trouble breathing, especially outside. ..."


In [5]:
data['label'].unique()

array(['Psoriasis', 'Varicose Veins', 'Typhoid', 'Chicken pox',
       'Impetigo', 'Dengue', 'Fungal infection', 'Common Cold',
       'Pneumonia', 'Dimorphic Hemorrhoids', 'Arthritis', 'Acne',
       'Bronchial Asthma', 'Hypertension', 'Migraine',
       'Cervical spondylosis', 'Jaundice', 'Malaria',
       'urinary tract infection', 'allergy',
       'gastroesophageal reflux disease', 'drug reaction',
       'peptic ulcer disease', 'diabetes'], dtype=object)

In [6]:
data['label'].value_counts()

label
Psoriasis                          50
Varicose Veins                     50
Typhoid                            50
Chicken pox                        50
Impetigo                           50
Dengue                             50
Fungal infection                   50
Common Cold                        50
Pneumonia                          50
Dimorphic Hemorrhoids              50
Arthritis                          50
Acne                               50
Bronchial Asthma                   50
Hypertension                       50
Migraine                           50
Cervical spondylosis               50
Jaundice                           50
Malaria                            50
urinary tract infection            50
allergy                            50
gastroesophageal reflux disease    50
drug reaction                      50
peptic ulcer disease               50
diabetes                           50
Name: count, dtype: int64

In [7]:
import re

def clean_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove punctuation and special characters
    text = re.sub(r'[^\w\s]', ' ', text)
    # Correct common typos
    text = text.replace("vomitting", "vomiting").replace("apetite", "appetite").replace("experince", "experience")
    text = text.replace("dischromic", "discolored").replace("accompained", "accompanied")
    # Remove extra spaces
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Apply cleaning to the text column
data["cleaned_text"] = data["text"].apply(clean_text)

# Check a few examples
print("Sample cleaned text:")
print(data[["text", "cleaned_text"]].head())

Sample cleaned text:
                                                text  \
0  I have been experiencing a skin rash on my arm...   
1  My skin has been peeling, especially on my kne...   
2  I have been experiencing joint pain in my fing...   
3  There is a silver like dusting on my skin, esp...   
4  My nails have small dents or pits in them, and...   

                                        cleaned_text  
0  i have been experiencing a skin rash on my arm...  
1  my skin has been peeling especially on my knee...  
2  i have been experiencing joint pain in my fing...  
3  there is a silver like dusting on my skin espe...  
4  my nails have small dents or pits in them and ...  


In [8]:
# Define a list of known symptoms (curated by inspecting the dataset)
known_symptoms = [
    "skin rash", "itchy", "dry scaly patches", "peeling", "burning sensation", "stinging sensation",
    "joint pain", "silver like dusting", "small dents in nails", "inflammation in nails", "thickened skin",
    "deep cracks", "red inflamed skin", "sensitive skin", "fatigue", "malaise", "bleeding rash", "prone to infections",
    "cramps", "prominent veins", "bruising", "swollen legs", "visible blood vessels", "obesity", "leg pain",
    "constipation", "belly pain", "chills", "fever", "diarrhea", "abdominal cramps", "bloating", "vomiting",
    "weight loss", "nausea", "headache", "difficulty breathing", "high fever", "red spots", "swollen lymph nodes",
    "lethargic", "loss of appetite", "severe headache", "blistering sores", "yellow rust colored ooze", "painful sores",
    "back pain", "pain behind eyes", "muscle pain", "dizziness", "sweating", "weakness", "discolored patches",
    "nodular eruptions", "severe itching", "frequent urination", "pain during urination", "cloudy urine", "bloody urine",
    "foul smelling urine", "pelvic pain", "low fever", "runny nose", "sneezing", "itchy eyes", "watery eyes", "coughing",
    "sore throat", "swelling on face", "difficulty breathing", "nasal congestion", "chest pain", "loss of smell",
    "loss of taste", "wheezing", "muscle aches", "body cramps", "difficulty swallowing", "flaky skin", "puffy eyes",
    "stomach cramps", "heartburn", "indigestion", "sour taste", "bad breath", "belching", "burping", "chest tightness",
    "hiccups", "lump in throat", "tingling sensation", "hoarseness", "food backing up", "abdominal pain",
    "metallic taste", "change in taste", "change in smell", "tremors", "muscle twitching", "lightheadedness",
    "confusion", "brain fog", "rapid heartbeat", "skin flaking", "hair loss", "dry scalp", "increased dandruff",
    "dry skin", "decreased sex drive", "mood swings", "weight gain", "changes in appetite", "difficulty concentrating",
    "memory issues", "bloody stools", "dark tarry stools", "anemia", "gnawing hunger", "spasms", "gas",
    "changes in bowel movements", "increased thirst", "dry mouth", "blurred vision", "slow healing wounds",
    "tingling in hands", "numbness", "palpitations", "skin irritations", "yeast infections", "persistent cough"
]

# Define single-word symptoms (to handle cases like "fever", "nausea")
single_word_symptoms = [
    "fever", "nausea", "vomiting", "diarrhea", "constipation", "fatigue", "weakness", "dizziness", "headache",
    "coughing", "sneezing", "itching", "swelling", "pain", "rash", "chills", "sweating", "obesity", "malaise",
    "lethargy", "anemia", "bloating", "gas", "hiccups", "hoarseness", "tremors", "confusion", "palpitations"
]

def extract_symptoms_from_text(text, known_symptoms, single_word_symptoms):
    """
    Extract symptoms from free-text descriptions.
    Returns a space-separated string of unique symptoms.
    """
    symptoms_found = set()
    
    # Split text into words
    words = text.split()
    
    # Check for multi-word symptoms (e.g., "skin rash")
    for symptom in known_symptoms:
        if symptom in text:
            symptoms_found.add(symptom)
    
    # Check for single-word symptoms (e.g., "fever")
    for word in words:
        if word in single_word_symptoms:
            symptoms_found.add(word)
    
    # Join symptoms into a space-separated string
    return " ".join(sorted(symptoms_found)) if symptoms_found else ""

# Apply symptom extraction
data["extracted_symptoms"] = data["cleaned_text"].apply(
    lambda x: extract_symptoms_from_text(x, known_symptoms, single_word_symptoms)
)

# Check a few examples
print("Sample extracted symptoms:")
print(data[["cleaned_text", "extracted_symptoms"]].head())

Sample extracted symptoms:
                                        cleaned_text  \
0  i have been experiencing a skin rash on my arm...   
1  my skin has been peeling especially on my knee...   
2  i have been experiencing joint pain in my fing...   
3  there is a silver like dusting on my skin espe...   
4  my nails have small dents or pits in them and ...   

                       extracted_symptoms  
0  dry scaly patches itchy rash skin rash  
1              peeling stinging sensation  
2                         joint pain pain  
3                     silver like dusting  
4                                          


In [9]:
# Deduplicate based on label and extracted symptoms
data_dedup = data[["label", "extracted_symptoms"]].drop_duplicates()

# Check the deduplicated dataset
print("Deduplicated dataset shape:", data_dedup.shape)
print("Deduplicated disease distribution:\n", data_dedup["label"].value_counts())

Deduplicated dataset shape: (477, 2)
Deduplicated disease distribution:
 label
Dengue                             48
Typhoid                            43
Chicken pox                        36
allergy                            28
Jaundice                           25
Psoriasis                          25
drug reaction                      22
Impetigo                           21
peptic ulcer disease               21
Pneumonia                          20
Malaria                            20
Cervical spondylosis               20
Bronchial Asthma                   19
gastroesophageal reflux disease    18
Common Cold                        18
diabetes                           17
Varicose Veins                     16
urinary tract infection            14
Fungal infection                   11
Hypertension                       10
Dimorphic Hemorrhoids               9
Migraine                            7
Acne                                5
Arthritis                           4
Name: cou

In [10]:
# Save the preprocessed dataset
data_dedup.to_csv("preprocessed_dataset.csv", index=False)
print("Preprocessed dataset saved as 'preprocessed_dataset.csv'")

Preprocessed dataset saved as 'preprocessed_dataset.csv'


In [11]:
# Analyze unique symptoms
all_symptoms = set()
data_dedup["extracted_symptoms"].apply(lambda x: all_symptoms.update(x.split()))
print("Number of unique symptoms:", len(all_symptoms))

# Check class balance
print("Class balance after deduplication:\n", data_dedup["label"].value_counts())

Number of unique symptoms: 141
Class balance after deduplication:
 label
Dengue                             48
Typhoid                            43
Chicken pox                        36
allergy                            28
Jaundice                           25
Psoriasis                          25
drug reaction                      22
Impetigo                           21
peptic ulcer disease               21
Pneumonia                          20
Malaria                            20
Cervical spondylosis               20
Bronchial Asthma                   19
gastroesophageal reflux disease    18
Common Cold                        18
diabetes                           17
Varicose Veins                     16
urinary tract infection            14
Fungal infection                   11
Hypertension                       10
Dimorphic Hemorrhoids               9
Migraine                            7
Acne                                5
Arthritis                           4
Name: count, dt