# Imports 

In [1]:
from __future__ import unicode_literals, print_function
from pathlib import Path
import plac
import spacy
import random
from tqdm import tqdm
import pandas as pd

# Loading data

In [2]:
df = pd.read_csv('data.csv')
df.head(25)

Unnamed: 0,itching,skin_rash,nodal_skin_eruptions,continuous_sneezing,shivering,chills,joint_pain,stomach_pain,acidity,ulcers_on_tongue,...,blackheads,scurring,skin_peeling,silver_like_dusting,small_dents_in_nails,inflammatory_nails,blister,red_sore_around_nose,yellow_crust_ooze,prognosis
0,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Fungal infection
1,0,0,0,1,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Allergy
2,0,0,0,0,0,0,0,1,1,1,...,0,0,0,0,0,0,0,0,0,GERD
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Chronic cholestasis
4,1,1,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,Drug Reaction
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Peptic ulcer diseae
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,AIDS
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Diabetes
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Gastroenteritis
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Bronchial Asthma


# Getting all symptoms

In [3]:
symptoms = []

for key in df.columns:
    if key != 'prognosis':
        symptoms.append(key.replace("_"," "))
print(symptoms)
        


['itching', 'skin rash', 'nodal skin eruptions', 'continuous sneezing', 'shivering', 'chills', 'joint pain', 'stomach pain', 'acidity', 'ulcers on tongue', 'muscle wasting', 'vomiting', 'burning micturition', 'spotting  urination', 'fatigue', 'weight gain', 'anxiety', 'cold hands and feets', 'mood swings', 'weight loss', 'restlessness', 'lethargy', 'patches in throat', 'irregular sugar level', 'cough', 'high fever', 'sunken eyes', 'breathlessness', 'sweating', 'dehydration', 'indigestion', 'headache', 'yellowish skin', 'dark urine', 'nausea', 'loss of appetite', 'pain behind the eyes', 'back pain', 'constipation', 'abdominal pain', 'diarrhoea', 'mild fever', 'yellow urine', 'yellowing of eyes', 'acute liver failure', 'fluid overload', 'swelling of stomach', 'swelled lymph nodes', 'malaise', 'blurred and distorted vision', 'phlegm', 'throat irritation', 'redness of eyes', 'sinus pressure', 'runny nose', 'congestion', 'chest pain', 'weakness in limbs', 'fast heart rate', 'pain during bow

# Getting all diseases

In [4]:
diseases = []

for value in df['prognosis']:
    diseases.append(value)
print(diseases)

['Fungal infection', 'Allergy', 'GERD', 'Chronic cholestasis', 'Drug Reaction', 'Peptic ulcer diseae', 'AIDS', 'Diabetes ', 'Gastroenteritis', 'Bronchial Asthma', 'Hypertension ', 'Migraine', 'Cervical spondylosis', 'Paralysis (brain hemorrhage)', 'Jaundice', 'Malaria', 'Chicken pox', 'Dengue', 'Typhoid', 'hepatitis A', 'Hepatitis B', 'Hepatitis C', 'Hepatitis D', 'Hepatitis E', 'Alcoholic hepatitis', 'Tuberculosis', 'Common Cold', 'Pneumonia', 'Dimorphic hemmorhoids(piles)', 'Heart attack', 'Varicose veins', 'Hypothyroidism', 'Hyperthyroidism', 'Hypoglycemia', 'Osteoarthristis', 'Arthritis', '(vertigo) Paroymsal  Positional Vertigo', 'Acne', 'Urinary tract infection', 'Psoriasis', 'Impetigo']


# Data exploration

In [5]:
# Generating symptom map
#key - symptom 
#value - number of symptoms repetition in dataset.
symptom_map = {}

for key in df.columns:
    if key != 'prognosis':
        symptom_map[key] = 0
        
# Counting all syptoms repetition in dataset.
keys = list(symptom_map.keys())

for key in keys:
    for exist in list(df[key]):
        symptom_map[key] += int(exist)
        
index = 1
for key in list(symptom_map.keys()):
    print(str(index)+"."+"Symptom: "+key.replace("_"," ") + " - amount in dataset: "+ str(symptom_map[key]))
    index += 1


1.Symptom: itching - amount in dataset: 6
2.Symptom: skin rash - amount in dataset: 7
3.Symptom: nodal skin eruptions - amount in dataset: 1
4.Symptom: continuous sneezing - amount in dataset: 2
5.Symptom: shivering - amount in dataset: 1
6.Symptom: chills - amount in dataset: 7
7.Symptom: joint pain - amount in dataset: 6
8.Symptom: stomach pain - amount in dataset: 2
9.Symptom: acidity - amount in dataset: 2
10.Symptom: ulcers on tongue - amount in dataset: 1
11.Symptom: muscle wasting - amount in dataset: 1
12.Symptom: vomiting - amount in dataset: 17
13.Symptom: burning micturition - amount in dataset: 2
14.Symptom: spotting  urination - amount in dataset: 1
15.Symptom: fatigue - amount in dataset: 17
16.Symptom: weight gain - amount in dataset: 1
17.Symptom: anxiety - amount in dataset: 1
18.Symptom: cold hands and feets - amount in dataset: 1
19.Symptom: mood swings - amount in dataset: 2
20.Symptom: weight loss - amount in dataset: 4
21.Symptom: restlessness - amount in dataset:

## Counting top repeated symptoms

In [6]:
common_symptoms = []

for key in symptom_map:
    if symptom_map[key] > 10:
        common_symptoms.append(key)
        
for symptom in common_symptoms:
    print(str(common_symptoms.index(symptom)+1)+"."+symptom)

1.vomiting
2.fatigue
3.high_fever


## Counting average amount of symptoms per disease

In [7]:
mean_symptoms_per_disease = 0

for index,row in df.iterrows():
    for key in keys:
        mean_symptoms_per_disease += row[key]

mean_symptoms_per_disease = mean_symptoms_per_disease/df.shape[0]

print(mean_symptoms_per_disease)

7.829268292682927


# Preparing data

In [8]:
sents_starters = ['I have', 'I feel', 'I think', 'I am annoyed' ,'I had', 'I am', 'Fealings of', 'I am afraid of', 'I',
                 'He has', 'I am worried about', '']
sent_ends = ['is disturbing me', 'is not letting me live', 'for days' , 'since yesterday']

LABEL = 'SYMPTOM'

In [9]:
## Generating every possible variation for our data
TRAIN_DATA = []

for symptom in symptoms:        
    for sent_start in sents_starters: 
        i = 0 if sent_start == '' else 1
        
        train_sentence = sent_start + " " + symptom
        train_sentence = train_sentence.lstrip().rstrip() # remove spaces
        
        start_index = len(sent_start) + i
        end_index = len(sent_start) + len(symptom) + i
        
        train_sample = (train_sentence, {'entities': [(start_index, end_index, 'SYMPTOM')]})
        
        TRAIN_DATA.append(train_sample)

        
for symptom in symptoms:
    for sent_end in sent_ends:
        train_sentence = symptom + " " + sent_end
        train_sentence = train_sentence.lstrip().rstrip() # remove spaces
        
        end_index = len(symptom) 
        
        train_sample = (train_sentence, {'entities': [(0, end_index, 'SYMPTOM')]})
        
        TRAIN_DATA.append(train_sample)

        
for symptom in symptoms:
    for sent_start in sents_starters[:2]:
        for sent_end in sent_ends:
            train_sentence = sent_start + " " + symptom + " " + sent_end
            train_sentence = train_sentence.lstrip().rstrip() # remove spaces
            
            start_index = len(sent_start) + 1
            end_index = len(sent_start) + 1 + len(symptom) 
            
            train_sample = (train_sentence,{'entities': [(start_index, end_index, 'SYMPTOM')]})
            
            TRAIN_DATA.append(train_sample)
            
TRAIN_DATA.append(("i have", {'entities': []}))
TRAIN_DATA.append(("i", {'entities': []}))
TRAIN_DATA.append(("have", {'entities': []}))
TRAIN_DATA.append(("Something is disturbing me", {'entities': []}))
TRAIN_DATA.append(("I am not sure what is wrong", {'entities': []}))
TRAIN_DATA.append(("Іs not letting me live", {'entities': []}))
TRAIN_DATA.append(("Іs killing me", {'entities': []}))
TRAIN_DATA.append(("for days", {'entities': []}))
TRAIN_DATA.append(("Fealings of", {'entities': []}))
TRAIN_DATA.append(("since yesturday", {'entities': []}))
TRAIN_DATA.append(("I have money", {'entities': []}))
TRAIN_DATA.append(("I had something", {'entities': []}))

In [10]:
print(len(TRAIN_DATA))
print(TRAIN_DATA)

3180


[('I have itching', {'entities': [(7, 14, 'SYMPTOM')]}), ('I feel itching', {'entities': [(7, 14, 'SYMPTOM')]}), ('I think itching', {'entities': [(8, 15, 'SYMPTOM')]}), ('I am annoyed itching', {'entities': [(13, 20, 'SYMPTOM')]}), ('I had itching', {'entities': [(6, 13, 'SYMPTOM')]}), ('I am itching', {'entities': [(5, 12, 'SYMPTOM')]}), ('Fealings of itching', {'entities': [(12, 19, 'SYMPTOM')]}), ('I am afraid of itching', {'entities': [(15, 22, 'SYMPTOM')]}), ('I itching', {'entities': [(2, 9, 'SYMPTOM')]}), ('He has itching', {'entities': [(7, 14, 'SYMPTOM')]}), ('I am worried about itching', {'entities': [(19, 26, 'SYMPTOM')]}), ('itching', {'entities': [(0, 7, 'SYMPTOM')]}), ('I have skin rash', {'entities': [(7, 16, 'SYMPTOM')]}), ('I feel skin rash', {'entities': [(7, 16, 'SYMPTOM')]}), ('I think skin rash', {'entities': [(8, 17, 'SYMPTOM')]}), ('I am annoyed skin rash', {'entities': [(13, 22, 'SYMPTOM')]}), ('I had skin rash', {'entities': [(6, 15, 'SYMPTOM')]}), ('I am skin

In [11]:
def main(model=None, new_model_name='symptom', output_dir="D:\\DoctorAssistantBot", n_iter=20):
    """Set up the pipeline and entity recognizer, and train the new entity."""
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank('en')  # create blank Language class
        print("Created blank 'en' model")
    # Add entity recognizer to model if it's not in the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if 'ner' not in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner)
    else:
        ner = nlp.get_pipe('ner')

    ner.add_label(LABEL)   # add new entity label to entity recognizer
    if model is None:
        optimizer = nlp.begin_training()
    else:
        # Note that 'begin_training' initializes the models, so it'll zero out
        # existing entity types.
        optimizer = nlp.entity.create_optimizer()

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):  # only train NER
        for itn in range(n_iter):
            random.shuffle(TRAIN_DATA)
            losses = {}
            for text, annotations in tqdm(TRAIN_DATA):
                nlp.update([text], [annotations], sgd=optimizer, drop=0.2,
                           losses=losses)
            print(losses)
            if(losses['ner'] < 1):
                break

    # test the trained model
    test_text = 'itching'
    doc = nlp(test_text)
    print("Entities in '%s'" % test_text)
    for ent in doc.ents:
        print(ent.label_, ent.text)

    # save model to output directory
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()
        nlp.meta['name'] = new_model_name  # rename model
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

        # test the saved model
        print("Loading from", output_dir)
        nlp2 = spacy.load(output_dir)
        doc2 = nlp2(test_text)
        for ent in doc2.ents:
            print(ent.label_, ent.text)

    return nlp

In [None]:
model = main()

Created blank 'en' model


100%|██████████████████████████████████████████████████████████████████████████████| 3180/3180 [30:34<00:00,  1.23it/s]


{'ner': 515.7185731234043}


100%|██████████████████████████████████████████████████████████████████████████████| 3180/3180 [41:53<00:00,  1.27it/s]


{'ner': 143.43216552675935}


100%|██████████████████████████████████████████████████████████████████████████████| 3180/3180 [41:18<00:00,  1.29it/s]


{'ner': 38.69501263746867}


100%|██████████████████████████████████████████████████████████████████████████████| 3180/3180 [41:17<00:00,  1.29it/s]


{'ner': 63.00560124860695}


100%|██████████████████████████████████████████████████████████████████████████████| 3180/3180 [44:02<00:00,  1.16it/s]


{'ner': 34.20669685714265}


100%|██████████████████████████████████████████████████████████████████████████████| 3180/3180 [43:56<00:00,  1.10it/s]


{'ner': 43.436678769372094}


100%|██████████████████████████████████████████████████████████████████████████████| 3180/3180 [45:01<00:00,  1.18it/s]


{'ner': 29.9840706073521}


100%|██████████████████████████████████████████████████████████████████████████████| 3180/3180 [43:43<00:00,  1.25it/s]


{'ner': 36.14361303324477}


100%|██████████████████████████████████████████████████████████████████████████████| 3180/3180 [41:56<00:00,  1.24it/s]


{'ner': 52.863784131410895}


100%|██████████████████████████████████████████████████████████████████████████████| 3180/3180 [42:38<00:00,  1.11it/s]


{'ner': 49.58261692731889}


 68%|█████████████████████████████████████████████████████▎                        | 2171/3180 [29:41<13:12,  1.27it/s]

In [12]:
n = model("I have headache")
for ent in n.ents:
        print(ent.label_, ent.text)

SYMPTOM headache
