In [2]:
import pandas as pd
import numpy as np
import json
from nltk.stem import WordNetLemmatizer
import string
from fuzzywuzzy import process
from collections import defaultdict
import ollama

### Push everything to the main branch

### Data Preprocessing

In [3]:
# Read the raw training split.
train_df = pd.read_csv('symptom-disease-train-dataset.csv')

# Parse the label mapping file. Despite the variable name, the parsed dict is
# keyed by disease name with the numeric label as its value, which is why it
# is inverted below before being used to look names up by label.
with open('mapping.json') as f:
    label_to_disease = json.load(f)

id_to_disease = dict(
    (label_id, disease_name) for disease_name, label_id in label_to_disease.items()
)

# Attach the human-readable disease name next to each numeric label.
train_df = train_df.assign(Disease=train_df['label'].map(id_to_disease))

print(train_df.loc[:, ['label', 'Disease']].head())

   label                    Disease
0    308              Drug Reaction
1     35                    Allergy
2    798  Premature Ovarian Failure
3    149           Bronchial Asthma
4    596                    Malaria


In [4]:
# Load the test split and attach disease names through the same label lookup
# that is used for the training split.
test_df = (
    pd.read_csv('symptom-disease-test-dataset.csv')
    .assign(Disease=lambda frame: frame['label'].map(id_to_disease))
)

print(test_df.loc[:, ['label', 'Disease']].head())

   label                       Disease
0    541                      Jaundice
1    502               Hyperthyroidism
2    297  Dimorphic Hemmorhoids(Piles)
3   1019                  Tuberculosis
4     33           Alcoholic Hepatitis


### Normalize the Text

In [5]:
# Single WordNet lemmatizer instance shared by normalize() and lemmatization().
lemmatizer = WordNetLemmatizer()
def normalize(symptom):
    """Return a cleaned, lemmatized version of one symptom string.

    The string is lower-cased, hyphens and underscores are turned into
    spaces, and the result is passed to ``lemmatizer.lemmatize`` in a single
    call (the phrase is not split into words first).

    NOTE(review): because the whole phrase is lemmatized as one string,
    presumably only single-word symptoms are actually changed -- confirm.
    """
    cleaned = symptom.lower()
    for separator in ('-', '_'):
        cleaned = cleaned.replace(separator, ' ')
    return lemmatizer.lemmatize(cleaned)
    
# Preview the first ten training rows as loaded, before any text normalization.
train_df.head(10)

Unnamed: 0,text,label,Disease
0,I have been having migraines and headaches. I ...,308,Drug Reaction
1,I have asthma and I get wheezing and breathing...,35,Allergy
2,Signs and symptoms of primary ovarian insuffic...,798,Premature Ovarian Failure
3,"cough,high_fever,breathlessness,family_history...",149,Bronchial Asthma
4,"chills,vomiting,high_fever,sweating,headache,n...",596,Malaria
5,Posterior cortical atrophy symptoms vary among...,785,Posterior Cortical Atrophy
6,"I've been having back pain, a cough, and numbn...",186,Cervical Spondylosis
7,"yellowish_skin,nausea,loss_of_appetite,yellowi...",466,Hepatitis C
8,"joint_pain,neck_pain,knee_pain,hip_joint_pain,...",700,Osteoarthristis
9,An unusual sensation (aura) may precede a temp...,971,Temporal Lobe Seizure


In [6]:
# Pass each symptom to the normalize function
def comma_sep_values(text):
    """Return True when ``text`` looks like a bare comma-separated list.

    A row qualifies when it contains at least one comma and no space
    character at all; anything else is treated as free text.
    """
    has_comma = "," in text
    has_space = " " in text
    return has_comma and not has_space


def lemmatization(string_row):
    """Lemmatize each whitespace-separated token of a free-text row.

    The row is split on whitespace, every token is passed through
    ``lemmatizer.lemmatize``, and the tokens are re-joined with single
    spaces (so runs of whitespace in the input collapse to one space).
    """
    return ' '.join(lemmatizer.lemmatize(token) for token in string_row.split())
    
    
# Normalize the whole training text column in one pass and assign it back
# once, instead of writing every cell individually with .loc inside a loop.
# Rows that are bare comma-separated symptom lists are normalized symptom by
# symptom; every other row is treated as free text and lemmatized word by word.
train_df['text'] = [
    ",".join(normalize(symptom) for symptom in text.split(','))
    if comma_sep_values(text)
    else lemmatization(text)
    for text in train_df['text']
]
                

In [7]:
# Check the results of the normalization pass on the training text
train_df.head()

Unnamed: 0,text,label,Disease
0,I have been having migraine and headaches. I c...,308,Drug Reaction
1,I have asthma and I get wheezing and breathing...,35,Allergy
2,Signs and symptom of primary ovarian insuffici...,798,Premature Ovarian Failure
3,"cough,high fever,breathlessness,family history...",149,Bronchial Asthma
4,"chill,vomiting,high fever,sweating,headache,na...",596,Malaria


In [8]:
# Do the same for the test results
# Apply the same normalization to the test text column in one pass and assign
# it back once, instead of writing every cell individually with .loc.
# Bare comma-separated symptom lists are normalized symptom by symptom; every
# other row is treated as free text and lemmatized word by word.
test_df['text'] = [
    ",".join(normalize(symptom) for symptom in text.split(','))
    if comma_sep_values(text)
    else lemmatization(text)
    for text in test_df['text']
]

In [9]:
# Check the results of the normalization pass on the test text
test_df.head()

Unnamed: 0,text,label,Disease
0,"itching,vomiting,fatigue,weight loss,high feve...",541,Jaundice
1,"fatigue,weight loss,restlessness,sweating,diar...",502,Hyperthyroidism
2,"constipation,pain during bowel movements,pain ...",297,Dimorphic Hemmorhoids(Piles)
3,"chill,vomiting,fatigue,weight loss,cough,high ...",1019,Tuberculosis
4,"vomiting,yellowish skin,abdominal pain,swellin...",33,Alcoholic Hepatitis


In [10]:
# Mapping symptoms to diseases: (Optional step, but it is helpful)
# symptom (normalized string) -> set of diseases whose training rows mention it
symptom_disease_map = defaultdict(set)

# Only the disease name and the text are needed, so iterate over those two
# columns directly rather than materializing a Series per row.
for disease, text in zip(train_df['Disease'], train_df['text']):
    # Rows containing a comma are split into phrases, all others into words.
    pieces = text.split(',') if ',' in text else text.split()

    for piece in pieces:
        # Strip surrounding whitespace so a row such as "pain, a cough" does
        # not create the key " a cough", and so whitespace-only pieces are
        # caught by the blank check below.
        symptom = normalize(piece.strip()).strip()
        if symptom:  # avoid blank entries
            symptom_disease_map[symptom].add(disease)

# Print a handful of entries as a sanity check.
for symptom, diseases in list(symptom_disease_map.items())[:5]:
    print(f"{symptom}: {list(diseases)[:3]}...")
            


i: ['Urinary Tract Infection', 'Psoriasis', 'Varicose Veins']...
have: ['Urinary Tract Infection', 'Psoriasis', 'Varicose Veins']...
been: ['Psoriasis', 'Urinary Tract Infection', 'Varicose Veins']...
having: ['Psoriasis', 'Urinary Tract Infection', 'Varicose Veins']...
migraine: ['Drug Reaction', 'Migraine Headache']...


In [11]:
# Test for missing symptoms

# A few symptoms that are expected to appear as keys in the map.
test_symptoms = {"fever", "fatigue", "headache"}

# Membership is tested with `in` so the defaultdict does not create entries.
missing_symptoms = [name for name in test_symptoms if name not in symptom_disease_map]
for symptom in missing_symptoms:
    print(f"Missing: {symptom}")

In [12]:
# Persist both normalized splits without the DataFrame index column.
for frame, out_path in (
    (train_df, 'normalized_train.csv'),
    (test_df, 'normalized_test.csv'),
):
    frame.to_csv(out_path, index=False)

In [13]:
# Export the maps

# Sets are not JSON-serializable, so each disease set is converted to a list first.
exportable_map = {
    symptom_key: list(disease_set)
    for symptom_key, disease_set in symptom_disease_map.items()
}

with open("symptom_disease_map.json", "w") as f:
    json.dump(exportable_map, f, indent=2)

# Phase 3

In [14]:
#Pipeline that is being followed: 

# Parse input into structured symptoms using NLP
# Match symptoms to disease with simple scoring methods (number of symptom matches per disease)
# Output response summary with a confidence score


# Simple scoring method: 
def match_disease(user_symptoms, symptom_map=None):
    """Rank diseases by how many of the given symptoms point to them.

    Parameters
    ----------
    user_symptoms : iterable of str
        Symptoms to look up. They are matched against the map keys exactly
        as given, so they should already be in the same normalized form as
        the keys.
    symptom_map : mapping of str -> iterable of str, optional
        Symptom-to-diseases lookup. Defaults to the notebook-level
        ``symptom_disease_map``.

    Returns
    -------
    list of (disease, score) tuples
        Sorted by score, highest first. Symptoms that are not in the map are
        ignored; an empty list is returned when nothing matches.
    """
    if symptom_map is None:
        symptom_map = symptom_disease_map

    # Integer counters: the previous defaultdict(set) made `+= 1` raise
    # TypeError as soon as any symptom matched.
    scores = defaultdict(int)

    for symptom in user_symptoms:
        # .get() avoids inserting empty entries when the map is a defaultdict.
        for disease in symptom_map.get(symptom, ()):
            scores[disease] += 1

    return sorted(scores.items(), key=lambda item: item[1], reverse=True)


In [15]:
# Output Response Summary: 

# Suggested actions and Urgency
def get_urgency(score):
    """Translate a match score into an urgency label.

    Returns "High" for scores of 4 or more, "Moderate" for scores of 2 or
    more, and "Low" for everything else.
    """
    # Thresholds are checked from highest to lowest; the first one met wins.
    for threshold, label in ((4, "High"), (2, "Moderate")):
        if score >= threshold:
            return label
    return "Low"

In [7]:
# Quick check that the local "mistral" model responds via ollama.
smoke_prompt = 'Why is the sky blue?'
result = ollama.generate(model="mistral", prompt=smoke_prompt)
print(result['response'])


 The sky appears blue due to a phenomenon called Rayleigh scattering. As sunlight reaches Earth, it encounters molecules and tiny particles in the atmosphere. These molecules scatter short-wavelength light (blue and violet light) more than longer wavelengths (red, orange, yellow). However, our eyes are more sensitive to blue light and less sensitive to violet light, which makes the sky look predominantly blue rather than violet. Additionally, sunlight reaches us more abundantly in the blue part of the spectrum compared to violet. At sunrise and sunset, the scattering of sunlight through the Earth's atmosphere scatters shorter wavelengths (blue and violet) to a greater extent, making the sky appear red or orange as longer wavelengths are less affected by scattering. This is also known as the "scattering of sunlight" and plays a significant role in determining the color of the sky.
