In [1]:
import pandas as pd
from sentence_transformers import SentenceTransformer, util

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import time

# --- Load the disease/symptom table and report how long the read took. ---
print("Loading data file now, this could take a while depending on file size")
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# elapsed intervals; time.time() is wall-clock time, which can be coarse and
# can jump if the system clock is adjusted.
start = time.perf_counter()
df = pd.read_csv('Diseases_Symptoms.csv')
end = time.perf_counter()
print("Loading took " + str(round(end - start, 2)) + " seconds")

# --- Quick data-quality summary: missing cells, duplicate rows, cardinality. ---
# Compute the per-column null counts once and reuse them for both the total
# and the per-column breakdown.
missing_by_column = df.isna().sum()
missing_values = missing_by_column.sum()
duplicated_values = df.duplicated().sum()
print(f'\nMissing values: {missing_values}')
print(f'Duplicated values: {duplicated_values}')
if missing_values >= 1:
    # Only show the breakdown when there is something to look at.
    print('\nMissing values by column:')
    print(missing_by_column)
print("\nUnique Values in Each Column:")
print(df.nunique())

Loading data file now, this could take a while depending on file size
Loading took 0.01 seconds

Missing values: 1
Duplicated values: 0

Missing values by column:
Code          0
Name          0
Symptoms      0
Treatments    1
dtype: int64

Unique Values in Each Column:
Code          400
Name          392
Symptoms      395
Treatments    386
dtype: int64


In [4]:
# Sentence-embedding model used for both the stored symptom texts and, later,
# the user's query.
model = SentenceTransformer('all-MiniLM-L6-v2')

# Encode all symptom descriptions in a single call so the model can batch them,
# instead of invoking model.encode() once per row through Series.apply.
symptom_embeddings = model.encode(df['Symptoms'].tolist())

# list() splits the 2-D result into one vector per row, so each cell of the new
# column holds that row's embedding (same layout the per-row version produced).
df['Symptom_Embedding'] = list(symptom_embeddings)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [5]:
# Preview the first rows to confirm the Symptom_Embedding column was added.
df.head(n=5)

Unnamed: 0,Code,Name,Symptoms,Treatments,Symptom_Embedding
0,1,Panic disorder,"Palpitations, Sweating, Trembling, Shortness o...","Antidepressant medications, Cognitive Behavior...","[0.07893873, -0.027037594, 0.07101004, 0.09278..."
1,2,Vocal cord polyp,"Hoarseness, Vocal Changes, Vocal Fatigue","Voice Rest, Speech Therapy, Surgical Removal","[0.041373983, -0.05340634, 0.04670726, 0.02566..."
2,3,Turner syndrome,"Short stature, Gonadal dysgenesis, Webbed neck...","Growth hormone therapy, Estrogen replacement t...","[0.016158296, 0.071264215, 0.03511463, -0.0060..."
3,4,Cryptorchidism,"Absence or undescended testicle(s), empty scro...",Observation and monitoring (in cases of mild o...,"[0.04706854, 0.010907597, -0.082685255, 0.0435..."
4,5,Ethylene glycol poisoning-1,"Nausea, vomiting, abdominal pain, General mala...","Supportive Measures, Gastric Decontamination, ...","[0.058652576, -0.031397797, -0.01897327, 0.066..."


In [9]:
def find_condition_by_symptoms(input_symptoms):
    """Return the condition whose symptom text is most similar to the query.

    The query is embedded with the notebook-level ``model`` and compared, by
    cosine similarity, against every vector in ``df['Symptom_Embedding']``.

    Side effect: the per-row scores are written to ``df['Similarity']`` on the
    notebook-level ``df`` (overwritten on every call).

    Parameters
    ----------
    input_symptoms : str
        Free-text description of the symptoms to look up.

    Returns
    -------
    tuple
        ``(Name, Treatments, Symptoms)`` taken from the best-matching row.

    Raises
    ------
    ValueError
        If ``input_symptoms`` is ``None`` or a blank string. Embedding an empty
        query would otherwise return an arbitrary, meaningless "best match".
    """
    if input_symptoms is None or (
        isinstance(input_symptoms, str) and not input_symptoms.strip()
    ):
        raise ValueError("input_symptoms must be a non-empty string")

    # Local import: the notebook's import cell does not bring in numpy.
    import numpy as np

    input_embedding = model.encode(input_symptoms)

    # Stack the per-row vectors into one matrix so all similarities come from
    # a single cos_sim call instead of one call per row.
    embedding_matrix = np.stack(df['Symptom_Embedding'].to_numpy())
    # cos_sim returns a (1, n_rows) tensor for a single query; take row 0.
    scores = util.cos_sim(input_embedding, embedding_matrix)[0]

    # tolist() yields plain Python floats, matching what .item() gave per row.
    df['Similarity'] = scores.tolist()
    best_match = df.loc[df['Similarity'].idxmax()]
    return best_match['Name'], best_match['Treatments'], best_match['Symptoms']

In [10]:
# Example lookup: describe the symptoms in free text and fetch the closest entry.
input_symptoms = "Leg pain, swollen leg, leg tenderness,  pitting edema"

# The helper returns (name, treatments, symptoms) for the best-matching row.
best_result = find_condition_by_symptoms(input_symptoms)
condition_name, treatments, symptoms = best_result

# Report the matched entry, one labelled line per field.
report_lines = (
    ("Symptoms:", symptoms),
    ("Condition:", condition_name),
    ("Recommended Treatments:", treatments),
)
for label, value in report_lines:
    print(label, value)

Symptoms: Knee pain, swelling, inability to straighten the leg
Condition: Dislocation of the Patella
Recommended Treatments: Manual reduction, immobilization, physical therapy
