In [1]:
import pandas as pd
import numpy as np
import json
from zipfile import ZipFile
from mlxtend.preprocessing import TransactionEncoder
from sklearn.preprocessing import OneHotEncoder
from tqdm import tqdm
from collections import Counter

In [4]:
with ZipFile("Data/Sources/Final_Augmented_dataset_Diseases_and_Symptoms.csv.zip","r") as rf:
    rf.extractall("Data/Dataset_Diseases")    

In [3]:
#Encoded Dataset
data = pd.read_csv("Data/Dataset_Diseases/Dataset_Diseases/Final_Augmented_dataset_Diseases_and_Symptoms.csv")
data

Unnamed: 0,diseases,anxiety and nervousness,depression,shortness of breath,depressive or psychotic symptoms,sharp chest pain,dizziness,insomnia,abnormal involuntary movements,chest tightness,...,stuttering or stammering,problems with orgasm,nose deformity,lump over jaw,sore in nose,hip weakness,back swelling,ankle stiffness or tightness,ankle weakness,neck weakness
0,panic disorder,1,0,1,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1,panic disorder,0,0,1,1,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,panic disorder,1,1,1,1,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,panic disorder,1,0,0,1,0,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
4,panic disorder,1,1,0,0,0,0,1,1,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
246940,open wound of the nose,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
246941,open wound of the nose,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
246942,open wound of the nose,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
246943,open wound of the nose,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [2]:
symptom_severity = {
    "anxiety and nervousness": 2,
    "depression": 2,
    "shortness of breath": 3,
    "depressive or psychotic symptoms": 3,
    "sharp chest pain": 3,
    "dizziness": 2,
    "insomnia": 1,
    "abnormal involuntary movements": 3,
    "chest tightness": 3,
    "palpitations": 3,
    "irregular heartbeat": 3,
    "breathing fast": 3,
    "hoarse voice": 1,
    "sore throat": 1,
    "difficulty speaking": 3,
    "cough": 1,
    "nasal congestion": 1,
    "throat swelling": 3,
    "diminished hearing": 2,
    "lump in throat": 3,
    "throat feels tight": 3,
    "difficulty in swallowing": 3,
    "skin swelling": 2,
    "retention of urine": 3,
    "groin mass": 2,
    "leg pain": 2,
    "hip pain": 2,
    "suprapubic pain": 2,
    "blood in stool": 3,
    "lack of growth": 2,
    "emotional symptoms": 2,
    "elbow weakness": 2,
    "back weakness": 2,
    "pus in sputum": 3,
    "symptoms of the scrotum and testes": 2,
    "swelling of scrotum": 2,
    "pain in testicles": 2,
    "flatulence": 1,
    "pus draining from ear": 2,
    "jaundice": 3,
    "mass in scrotum": 2,
    "white discharge from eye": 2,
    "irritable infant": 2,
    "abusing alcohol": 2,
    "fainting": 3,
    "hostile behavior": 2,
    "drug abuse": 3,
    "sharp abdominal pain": 3,
    "feeling ill": 2,
    "vomiting": 2,
    "headache": 2,
    "nausea": 1,
    "diarrhea": 2,
    "vaginal itching": 1,
    "vaginal dryness": 1,
    "painful urination": 2,
    "involuntary urination": 2,
    "pain during intercourse": 2,
    "frequent urination": 2,
    "lower abdominal pain": 2,
    "vaginal discharge": 2,
    "blood in urine": 3,
    "hot flashes": 1,
    "intermenstrual bleeding": 2,
    "hand or finger pain": 1,
    "wrist pain": 1,
    "hand or finger swelling": 2,
    "arm pain": 2,
    "wrist swelling": 2,
    "arm stiffness or tightness": 2,
    "arm swelling": 2,
    "hand or finger stiffness or tightness": 2,
    "wrist stiffness or tightness": 2,
    "lip swelling": 3,
    "toothache": 1,
    "abnormal appearing skin": 1,
    "skin lesion": 1,
    "acne or pimples": 1,
    "dry lips": 1,
    "facial pain": 2,
    "mouth ulcer": 1,
    "skin growth": 2,
    "eye deviation": 2,
    "diminished vision": 3,
    "double vision": 3,
    "cross-eyed": 2,
    "symptoms of eye": 2,
    "pain in eye": 2,
    "eye moves abnormally": 3,
    "abnormal movement of eyelid": 2,
    "foreign body sensation in eye": 2,
    "irregular appearing scalp": 1,
    "swollen lymph nodes": 2,
    "back pain": 2,
    "neck pain": 2,
    "low back pain": 2,
    "pain of the anus": 2,
    "pain during pregnancy": 3,
    "pelvic pain": 2,
    "impotence": 2,
    "infant spitting up": 1,
    "vomiting blood": 3,
    "regurgitation": 2,
    "burning abdominal pain": 3,
    "restlessness": 1,
    "symptoms of infants": 2,
    "wheezing": 3,
    "peripheral edema": 3,
    "neck mass": 3,
    "ear pain": 2,
    "jaw swelling": 2,
    "mouth dryness": 1,
    "neck swelling": 3,
    "knee pain": 2,
    "foot or toe pain": 2,
    "bowlegged or knock-kneed": 2,
    "ankle pain": 2,
    "bones are painful": 2,
    "knee weakness": 2,
    "elbow pain": 2,
    "knee swelling": 2,
    "skin moles": 1,
    "knee lump or mass": 2,
    "weight gain": 1,
    "problems with movement": 2,
    "knee stiffness or tightness": 2,
    "leg swelling": 2,
    "foot or toe swelling": 2,
    "heartburn": 2,
    "smoking problems": 2,
    "muscle pain": 2,
    "infant feeding problem": 2,
    "recent weight loss": 3,
    "problems with shape or size of breast": 2,
    "underweight": 2,
    "difficulty eating": 3,
    "scanty menstrual flow": 2,
    "vaginal pain": 2,
    "vaginal redness": 2,
    "vulvar irritation": 2,
    "weakness": 2,
    "decreased heart rate": 3,
    "increased heart rate": 3,
    "bleeding or discharge from nipple": 3,
    "ringing in ear": 1,
    "plugged feeling in ear": 1,
    "itchy ear(s)": 1,
    "frontal headache": 2,
    "fluid in ear": 2,
    "neck stiffness or tightness": 3,
    "spots or clouds in vision": 2,
    "eye redness": 1,
    "lacrimation": 1,
    "itchiness of eye": 1,
    "blindness": 3,
    "eye burns or stings": 2,
    "itchy eyelid": 1,
    "feeling cold": 1,
    "decreased appetite": 2,
    "excessive appetite": 2,
    "excessive anger": 2,
    "loss of sensation": 3,
    "focal weakness": 3,
    "slurring words": 3,
    "symptoms of the face": 2,
    "disturbance of memory": 2,
    "paresthesia": 2,
    "side pain": 2,
    "fever": 2,
    "shoulder pain": 2,
    "shoulder stiffness or tightness": 2,
    "shoulder weakness": 2,
    "arm cramps or spasms": 2,
    "shoulder swelling": 2,
    "tongue lesions": 2,
    "leg cramps or spasms": 2,
    "abnormal appearing tongue": 2,
    "ache all over": 2,
    "lower body pain": 2,
    "problems during pregnancy": 3,
    "spotting or bleeding during pregnancy": 3,
    "cramps and spasms": 2,
    "upper abdominal pain": 3,
    "stomach bloating": 2,
    "changes in stool appearance": 2,
    "unusual color or odor to urine": 2,
    "kidney mass": 3,
    "swollen abdomen": 3,
    "symptoms of prostate": 2,
    "leg stiffness or tightness": 2,
    "difficulty breathing": 3,
    "rib pain": 2,
    "joint pain": 2,
    "muscle stiffness or tightness": 2,
    "pallor": 2,
    "hand or finger lump or mass": 2,
    "chills": 2,
    "groin pain": 2,
    "fatigue": 2,
    "abdominal distention": 3,
    "regurgitation.1": 2,
    "symptoms of the kidneys": 3,
    "melena": 3,
    "flushing": 2,
    "coughing up sputum": 2,
    "seizures": 3,
    "delusions or hallucinations": 3,
    "shoulder cramps or spasms": 2,
    "joint stiffness or tightness": 2,
    "pain or soreness of breast": 2,
    "excessive urination at night": 2,
    "bleeding from eye": 3,
    "rectal bleeding": 3,
    "constipation": 2,
    "temper problems": 2,
    "coryza": 1,
    "wrist weakness": 2,
    "eye strain": 1,
    "hemoptysis": 3,
    "lymphedema": 2,
    "skin on leg or foot looks infected": 2,
    "allergic reaction": 3,
    "congestion in chest": 2,
    "muscle swelling": 2,
    "pus in urine": 3,
    "abnormal size or shape of ear": 1,
    "low back weakness": 2,
    "sleepiness": 2,
    "apnea": 3,
    "abnormal breathing sounds": 3,
    "excessive growth": 2,
    "elbow cramps or spasms": 2,
    "feeling hot and cold": 2,
    "blood clots during menstrual periods": 2,
    "absence of menstruation": 2,
    "pulling at ears": 1,
    "gum pain": 1,
    "redness in ear": 1,
    "fluid retention": 2,
    "flu-like syndrome": 2,
    "sinus congestion": 1,
    "painful sinuses": 2,
    "fears and phobias": 2,
    "recent pregnancy": 2,
    "uterine contractions": 3,
    "burning chest pain": 3,
    "back cramps or spasms": 2,
    "stiffness all over": 2,
    "muscle cramps, contractures, or spasms": 2,
    "low back cramps or spasms": 2,
    "back mass or lump": 2,
    "nosebleed": 2,
    "long menstrual periods": 2,
    "heavy menstrual flow": 2,
    "unpredictable menstruation": 2,
    "painful menstruation": 2,
    "infertility": 2,
    "frequent menstruation": 2,
    "sweating": 1,
    "mass on eyelid": 2,
    "swollen eye": 2,
    "eyelid swelling": 2,
    "eyelid lesion or rash": 2,
    "unwanted hair": 1,
    "symptoms of bladder": 2,
    "irregular appearing nails": 1,
    "itching of skin": 1,
    "hurts to breath": 3,
    "nailbiting": 1,
    "skin dryness, peeling, scaliness, or roughness": 1,
    "skin on arm or hand looks infected": 2,
    "skin irritation": 1,
    "itchy scalp": 1,
    "hip swelling": 2,
    "incontinence of stool": 3,
    "foot or toe cramps or spasms": 2,
    "warts": 1,
    "bumps on penis": 2,
    "too little hair": 1,
    "foot or toe lump or mass": 2,
    "skin rash": 1,
    "mass or swelling around the anus": 2,
    "low back swelling": 2,
    "ankle swelling": 2,
    "hip lump or mass": 2,
    "drainage in throat": 1,
    "dry or flaky scalp": 1,
    "premenstrual tension or irritability": 2,
    "feeling hot": 1,
    "feet turned in": 2,
    "foot or toe stiffness or tightness": 2,
    "pelvic pressure": 2,
    "elbow swelling": 2,
    "elbow stiffness or tightness": 2,
    "early or late onset of menopause": 2,
    "mass on ear": 2,
    "bleeding from ear": 3,
    "hand or finger weakness": 2,
    "low self-esteem": 2,
    "throat irritation": 1,
    "itching of the anus": 1,
    "swollen or red tonsils": 2,
    "irregular belly button": 1,
    "swollen tongue": 3,
    "lip sore": 1,
    "vulvar sore": 2,
    "hip stiffness or tightness": 2,
    "mouth pain": 2,
    "arm weakness": 2,
    "leg lump or mass": 2,
    "disturbance of smell or taste": 2,
    "discharge in stools": 2,
    "penis pain": 2,
    "loss of sex drive": 2,
    "obsessions and compulsions": 2,
    "antisocial behavior": 2,
    "neck cramps or spasms": 2,
    "pupils unequal": 3,
    "poor circulation": 2,
    "thirst": 2,
    "sleepwalking": 2,
    "skin oiliness": 1,
    "sneezing": 1,
    "bladder mass": 3,
    "knee cramps or spasms": 2,
    "premature ejaculation": 2,
    "leg weakness": 2,
    "posture problems": 2,
    "bleeding in mouth": 2,
    "tongue bleeding": 3,
    "change in skin mole size or color": 2,
    "penis redness": 2,
    "penile discharge": 3,
    "shoulder lump or mass": 2,
    "polyuria": 2,
    "cloudy eye": 2,
    "hysterical behavior": 2,
    "arm lump or mass": 2,
    "nightmares": 1,
    "bleeding gums": 2,
    "pain in gums": 1,
    "bedwetting": 2,
    "diaper rash": 1,
    "lump or mass of breast": 3,
    "vaginal bleeding after menopause": 3,
    "infrequent menstruation": 2,
    "mass on vulva": 2,
    "jaw pain": 2,
    "itching of scrotum": 1,
    "postpartum problems of the breast": 3,
    "eyelid retracted": 2,
    "hesitancy": 2,
    "elbow lump or mass": 2,
    "muscle weakness": 2,
    "throat redness": 1,
    "joint swelling": 2,
    "tongue pain": 2,
    "redness in or around nose": 1,
    "wrinkles on skin": 1,
    "foot or toe weakness": 2,
    "hand or finger cramps or spasms": 2,
    "back stiffness or tightness": 2,
    "wrist lump or mass": 2,
    "skin pain": 2,
    "low back stiffness or tightness": 2,
    "low urine output": 3,
    "skin on head or neck looks infected": 2,
    "stuttering or stammering": 2,
    "problems with orgasm": 2,
    "nose deformity": 2,
    "lump over jaw": 2,
    "sore in nose": 1,
    "hip weakness": 2,
    "back swelling": 2,
    "ankle stiffness or tightness": 2,
    "ankle weakness": 2,
    "neck weakness": 2,
}

In [4]:
df = pd.DataFrame(symptom_severity.items(),columns=["Symptoms","Severity_level"])
df

Unnamed: 0,Symptoms,Severity_level
0,anxiety and nervousness,2
1,depression,2
2,shortness of breath,3
3,depressive or psychotic symptoms,3
4,sharp chest pain,3
...,...,...
372,hip weakness,2
373,back swelling,2
374,ankle stiffness or tightness,2
375,ankle weakness,2


In [5]:
df.to_csv("Data/SYMPTOMS_SEVERE.csv",index=False)

#### Code below is for Normal Dataset

In [2]:
#converting dataset into transaction list
data_normal = pd.read_csv("Data/Sources/dataset.csv")
new_dt = data_normal.drop_duplicates(keep="first")
data_list = list()
for i in range(len(new_dt)):
    data_list.append(["Disease:"+new_dt.iloc[i]["Disease"]]+["Symptoms:"+ str(i) for i in new_dt.iloc[i,1:].dropna()])  
Tencode = TransactionEncoder()
t = Tencode.fit_transform(data_list).astype("int")
t

array([[0, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 1],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [37]:
normal_dt.to_csv("Data/normalHealth.csv",index=False)


## Code below for advanced dataset

In [4]:
data.duplicated().sum()

57298

In [12]:
# #repeated samples
counts = data.value_counts().reset_index()
# # counts[counts["count"]>1]

In [6]:
data = data.drop_duplicates(keep="first")
data

Unnamed: 0,diseases,anxiety and nervousness,depression,shortness of breath,depressive or psychotic symptoms,sharp chest pain,dizziness,insomnia,abnormal involuntary movements,chest tightness,...,stuttering or stammering,problems with orgasm,nose deformity,lump over jaw,sore in nose,hip weakness,back swelling,ankle stiffness or tightness,ankle weakness,neck weakness
0,panic disorder,1,0,1,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1,panic disorder,0,0,1,1,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,panic disorder,1,1,1,1,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,panic disorder,1,0,0,1,0,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
4,panic disorder,1,1,0,0,0,0,1,1,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
246937,open wound of the nose,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
246938,open wound of the nose,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
246941,open wound of the nose,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
246943,open wound of the nose,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [14]:
symptoms = data.iloc[:,1:].columns
diseases = data["diseases"].unique()

In [143]:
data.describe()

Unnamed: 0,anxiety and nervousness,depression,shortness of breath,depressive or psychotic symptoms,sharp chest pain,dizziness,insomnia,abnormal involuntary movements,chest tightness,palpitations,...,stuttering or stammering,problems with orgasm,nose deformity,lump over jaw,sore in nose,hip weakness,back swelling,ankle stiffness or tightness,ankle weakness,neck weakness
count,189647.0,189647.0,189647.0,189647.0,189647.0,189647.0,189647.0,189647.0,189647.0,189647.0,...,189647.0,189647.0,189647.0,189647.0,189647.0,189647.0,189647.0,189647.0,189647.0,189647.0
mean,0.037607,0.041398,0.085923,0.058672,0.09678,0.066945,0.038714,0.038883,0.038319,0.024219,...,0.0,0.0,0.0,0.0,0.001334,0.0,0.0,0.0,9.5e-05,0.0
std,0.190244,0.199209,0.280251,0.235011,0.295658,0.249928,0.192913,0.193316,0.191965,0.153728,...,0.0,0.0,0.0,0.0,0.0365,0.0,0.0,0.0,0.009742,0.0
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0


In [5]:
#encoding diseases
encode = OneHotEncoder(sparse_output=True)
ar = encode.fit_transform(data.loc[:,["diseases"]])
dataf = pd.DataFrame(ar.toarray().astype("int"),columns=encode.get_feature_names_out())
new_data = pd.concat([dataf.reset_index(drop=True),data.drop(columns=["diseases"]).reset_index(drop=True)],axis=1)
new_data

Unnamed: 0,diseases_abdominal aortic aneurysm,diseases_abdominal hernia,diseases_abscess of nose,diseases_abscess of the lung,diseases_abscess of the pharynx,diseases_acanthosis nigricans,diseases_acariasis,diseases_achalasia,diseases_acne,diseases_actinic keratosis,...,stuttering or stammering,problems with orgasm,nose deformity,lump over jaw,sore in nose,hip weakness,back swelling,ankle stiffness or tightness,ankle weakness,neck weakness
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
246940,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
246941,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
246942,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
246943,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [6]:
new_data.to_csv("Data/data_preprocess.csv",index=False)

In [15]:
diseases[:10]

array(['panic disorder', 'vocal cord polyp', 'turner syndrome',
       'cryptorchidism', 'poisoning due to ethylene glycol',
       'atrophic vaginitis', 'fracture of the hand',
       'cellulitis or abscess of mouth', 'eye alignment disorder',
       'headache after lumbar puncture'], dtype=object)

### FINAL REPORT: PREPROCESSING
<li>
    <b>Data-Source: </b><a href="https://www.kaggle.com/datasets/dhivyeshrk/diseases-and-symptoms-dataset">https://www.kaggle.com/datasets/dhivyeshrk/diseases-and-symptoms-dataset
</li>
<li>Unique_Symptoms: 378</li>
<li>Preview: 'anxiety and nervousness', 'depression', 'shortness of breath',
       'depressive or psychotic symptoms', 'sharp chest pain', 'dizziness',
       'insomnia', 'abnormal involuntary movements', 'chest tightness',
       'palpitations',
       ...
       'stuttering or stammering', 'problems with orgasm', 'nose deformity',
       'lump over jaw', 'sore in nose', 'hip weakness', 'back swelling',
       'ankle stiffness or tightness', 'ankle weakness', 'neck weakness'</li>
<hr>      
<li>
    Unique Diseases: 773
</li>
<li>Preview: 'panic disorder', 'vocal cord polyp', 'turner syndrome',
       'cryptorchidism', 'poisoning due to ethylene glycol',
       'atrophic vaginitis', 'fracture of the hand',
       'cellulitis or abscess of mouth', 'eye alignment disorder',
       'headache after lumbar puncture', 'pyloric stenosis',
       'salivary gland disorder', 'osteochondrosis',
    ......
       'bladder obstruction', 'melanoma', 'cervical disorder',
       'laryngitis', 'dyshidrosis', 'poisoning due to opioids',
       'diaper rash', 'lichen planus', 'gastroduodenal ulcer',
       'inguinal hernia', 'eczema', 'asperger syndrome', 'mucositis',
       'paronychia', 'open wound of the jaw', 'white blood cell disease',
       'kaposi sarcoma', 'spondylolisthesis', 'pseudotumor cerebri',
       'conjunctivitis due to virus', 'open wound of the nose'</li>
<hr>
<li>Total Samples: ~246000</li>
<li>Total Samples after Preprocessing: ~246000</li>
<li>
    Duplicated samples i.e 57298 are preserved to maximize support
</li>
<li>Encoded the feature "diseases" using OneHotEncoder</li>
