# Imports 

In [28]:
from __future__ import unicode_literals, print_function
from pathlib import Path
import plac
import spacy
import random
from tqdm import tqdm
import pandas as pd

# Loading data

In [30]:
df = pd.read_csv('data.csv')
df.head(10)

Unnamed: 0,itching,skin_rash,nodal_skin_eruptions,continuous_sneezing,shivering,chills,joint_pain,stomach_pain,acidity,ulcers_on_tongue,...,blackheads,scurring,skin_peeling,silver_like_dusting,small_dents_in_nails,inflammatory_nails,blister,red_sore_around_nose,yellow_crust_ooze,prognosis
0,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Fungal infection
1,0,0,0,1,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Allergy
2,0,0,0,0,0,0,0,1,1,1,...,0,0,0,0,0,0,0,0,0,GERD
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Chronic cholestasis
4,1,1,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,Drug Reaction
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Peptic ulcer diseae
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,AIDS
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Diabetes
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Gastroenteritis
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Bronchial Asthma


# Getting all symptoms

In [33]:
symptoms = []
for key in df.columns:
    if key != 'prognosis':
        symptoms.append(key.replace("_"," "))
print(symptoms)
        
LABEL = 'SYMPTOM'

['itching', 'skin rash', 'nodal skin eruptions', 'continuous sneezing', 'shivering', 'chills', 'joint pain', 'stomach pain', 'acidity', 'ulcers on tongue', 'muscle wasting', 'vomiting', 'burning micturition', 'spotting  urination', 'fatigue', 'weight gain', 'anxiety', 'cold hands and feets', 'mood swings', 'weight loss', 'restlessness', 'lethargy', 'patches in throat', 'irregular sugar level', 'cough', 'high fever', 'sunken eyes', 'breathlessness', 'sweating', 'dehydration', 'indigestion', 'headache', 'yellowish skin', 'dark urine', 'nausea', 'loss of appetite', 'pain behind the eyes', 'back pain', 'constipation', 'abdominal pain', 'diarrhoea', 'mild fever', 'yellow urine', 'yellowing of eyes', 'acute liver failure', 'fluid overload', 'swelling of stomach', 'swelled lymph nodes', 'malaise', 'blurred and distorted vision', 'phlegm', 'throat irritation', 'redness of eyes', 'sinus pressure', 'runny nose', 'congestion', 'chest pain', 'weakness in limbs', 'fast heart rate', 'pain during bow

# Getting all diseases

In [36]:
diseases = []

for value in df['prognosis']:
    diseases.append(value)
print(diseases)

['Fungal infection', 'Allergy', 'GERD', 'Chronic cholestasis', 'Drug Reaction', 'Peptic ulcer diseae', 'AIDS', 'Diabetes ', 'Gastroenteritis', 'Bronchial Asthma', 'Hypertension ', 'Migraine', 'Cervical spondylosis', 'Paralysis (brain hemorrhage)', 'Jaundice', 'Malaria', 'Chicken pox', 'Dengue', 'Typhoid', 'hepatitis A', 'Hepatitis B', 'Hepatitis C', 'Hepatitis D', 'Hepatitis E', 'Alcoholic hepatitis', 'Tuberculosis', 'Common Cold', 'Pneumonia', 'Dimorphic hemmorhoids(piles)', 'Heart attack', 'Varicose veins', 'Hypothyroidism', 'Hyperthyroidism', 'Hypoglycemia', 'Osteoarthristis', 'Arthritis', '(vertigo) Paroymsal  Positional Vertigo', 'Acne', 'Urinary tract infection', 'Psoriasis', 'Impetigo']


# Data exploration

In [51]:
# Generating symptom map, that has as key - symptom, 
#  and as value - number of symptoms repetition in dataset.
symptom_map = {}
for key in df.columns:
    if key != 'prognosis':
        symptom_map[key] = 0
        
# Counting all syptoms repetition in dataset.
keys = list(symptom_map.keys())
for key in keys:
    for exist in list(df[key]):
        symptom_map[key] += int(exist)
        
index = 1
for key in list(symptom_map.keys()):
    print(str(index)+"."+"Symptom: "+key.replace("_"," ") + " - ammount in dataset: "+ str(symptom_map[key]))
    index += 1


1.Symptom: itching - ammount in dataset: 6
2.Symptom: skin rash - ammount in dataset: 7
3.Symptom: nodal skin eruptions - ammount in dataset: 1
4.Symptom: continuous sneezing - ammount in dataset: 2
5.Symptom: shivering - ammount in dataset: 1
6.Symptom: chills - ammount in dataset: 7
7.Symptom: joint pain - ammount in dataset: 6
8.Symptom: stomach pain - ammount in dataset: 2
9.Symptom: acidity - ammount in dataset: 2
10.Symptom: ulcers on tongue - ammount in dataset: 1
11.Symptom: muscle wasting - ammount in dataset: 1
12.Symptom: vomiting - ammount in dataset: 17
13.Symptom: burning micturition - ammount in dataset: 2
14.Symptom: spotting  urination - ammount in dataset: 1
15.Symptom: fatigue - ammount in dataset: 17
16.Symptom: weight gain - ammount in dataset: 1
17.Symptom: anxiety - ammount in dataset: 1
18.Symptom: cold hands and feets - ammount in dataset: 1
19.Symptom: mood swings - ammount in dataset: 2
20.Symptom: weight loss - ammount in dataset: 4
21.Symptom: restlessness 

## Counting top repeated symptoms

In [45]:
common_symptoms = []
for key in symptom_map:
    if symptom_map[key] > 10:
        common_symptoms.append(key)
for symptom in common_symptoms:
    print(str(common_symptoms.index(symptom)+1)+"."+symptom)

1.vomiting
2.fatigue
3.high_fever


## Counting average amount of symptoms per disease

In [46]:
mean_symptoms_per_disease = 0
for index,row in df.iterrows():
    for key in keys:
        mean_symptoms_per_disease += row[key]
mean_symptoms_per_disease = mean_symptoms_per_disease/df.shape[0]
print(mean_symptoms_per_disease)

7.829268292682927


In [3]:
sents_starters = ['i have','i had','yesturday i','on tuesdey i have had','last night i have being', 'last weak i had', 
                  'for month i have had', 'i have being suffering from', 'i have being disturbed by','i have problem with',""]

In [4]:
sent_ends = ["for hours", "for days", ""]
sent = "     asdw sddwasdw asdwdd      "
print(sent)
print(sent.lstrip().rstrip())
print("I have a headache")
print(len("I have a" ) + len("headache"))


     asdw sddwasdw asdwdd      
asdw sddwasdw asdwdd
I have a headache
16


In [5]:
TRAIN_DATA = []
for symptom in symptoms:
    for sent_start in sents_starters:
        for sent_end in sent_ends:
            train_sentence = sent_start+" "+symptom+" " +sent_end
            train_sentence = train_sentence.lstrip().rstrip() # remove spaces at start and in end of sentences
            start_index = len(sent_start)
            end_index = len(sent_start) + len(symptom) + 1
            train_sample = (train_sentence,{'entities': [(start_index, end_index, 'SYMPTOM')]})
            TRAIN_DATA.append(train_sample)
#print(TRAIN_DATA)

In [169]:
TRAIN_DATA = [
    ("vomiting all day long", {
        'entities': [(0, 8, 'SYMPTOM')]
    }),

    ("Have a terrible", {
        'entities': []
    }),

    ("Arms are itching", {
        'entities': [(9, 16, 'SYMPTOM')]
    }),
    
#     ("I have skin rash", {
#         'entities': [(7, 15, 'SYMPTOM')]
#     }),
    #

    ("Have a strong headache", {
        'entities': [(14, 22, 'SYMPTOM')]
    }),

    ("I really want to vomit", {
        'entities': [(17, 22, 'SYMPTOM')]
    }),

    ("headache", {
        'entities': [(0, 8, 'SYMPTOM')]
    })
]

# TRAIN_DATA = [
#     ('I have itching', {
#         'entities': [(6, 12, 'SYMPTOM')]
#     }), 
#     ('itching', {'entities': [(0, 6, 'SYMPTOM')]}), 
# #     ('I have skin rash', {'entities': [(6, 14, 'SYMPTOM')]}),
# #     ('skin rash', {'entities': [(0, 8, 'SYMPTOM')]}),
# #     ('I have nodal skin eruptions', {'entities': [(6, 25, 'SYMPTOM')]}), 
# #     ('nodal skin eruptions', {'entities': [(0, 19, 'SYMPTOM')]}),
#     ('cough i bothering me', {'entities': [(0, 4, 'SYMPTOM')]}),
#     ('I had acidity', {'entities': [(6, 12, 'SYMPTOM')]}),
#     ('I have', {'entities': []}), 
#     ('I', {'entities': []}), 
#     ('have', {'entities': []})

#]
#0, 8,
print("headache"[8])
# ('i have itching', {'entities': [(6, 12, 'SYMPTOM')]})

IndexError: string index out of range

In [164]:
# @plac.annotations(
#     model=("Model name. Defaults to blank 'en' model.", "option", "m", str),
#     new_model_name=("New model name for model meta.", "option", "nm", str),
#     output_dir=("Optional output directory", "option", "o", Path),
#     n_iter=("Number of training iterations", "option", "n", int))


def main(model=None, new_model_name='symptom', output_dir=None, n_iter=20):
    """Set up the pipeline and entity recognizer, and train the new entity."""
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank('en')  # create blank Language class
        print("Created blank 'en' model")
    # Add entity recognizer to model if it's not in the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if 'ner' not in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner)
    else:
        ner = nlp.get_pipe('ner')

    ner.add_label(LABEL)   # add new entity label to entity recognizer
    if model is None:
        optimizer = nlp.begin_training()
    else:
        # Note that 'begin_training' initializes the models, so it'll zero out
        # existing entity types.
        optimizer = nlp.entity.create_optimizer()

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):  # only train NER
        for itn in range(n_iter):
            random.shuffle(TRAIN_DATA)
            losses = {}
            for text, annotations in tqdm(TRAIN_DATA):
                nlp.update([text], [annotations], sgd=optimizer, drop=0.35,
                           losses=losses)
            print(losses)

    # test the trained model
    test_text = 'I have a headache'
    doc = nlp(test_text)
    print("Entities in '%s'" % test_text)
    for ent in doc.ents:
        print(ent.label_, ent.text)

    # save model to output directory
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()
        nlp.meta['name'] = new_model_name  # rename model
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

        # test the saved model
        print("Loading from", output_dir)
        nlp2 = spacy.load(output_dir)
        doc2 = nlp2(test_text)
        for ent in doc2.ents:
            print(ent.label_, ent.text)

    return nlp
# if __name__ == '__main__':
#     plac.call(main)

In [167]:
# Run our Function
model = main()

Created blank 'en' model


100%|████████████████████████████████████████████| 6/6 [00:00<00:00, 14.08it/s]


{'ner': 14.643603384494781}


100%|████████████████████████████████████████████| 6/6 [00:00<00:00, 15.85it/s]


{'ner': 10.553344115614891}


100%|████████████████████████████████████████████| 6/6 [00:00<00:00, 16.00it/s]


{'ner': 7.091617166996002}


100%|████████████████████████████████████████████| 6/6 [00:00<00:00, 17.17it/s]


{'ner': 7.267326708417386}


100%|████████████████████████████████████████████| 6/6 [00:00<00:00, 15.83it/s]


{'ner': 5.877128836000338}


100%|████████████████████████████████████████████| 6/6 [00:00<00:00, 14.36it/s]


{'ner': 4.559289358177921}


100%|████████████████████████████████████████████| 6/6 [00:00<00:00, 14.77it/s]


{'ner': 3.434475479472894}


100%|████████████████████████████████████████████| 6/6 [00:00<00:00, 13.83it/s]


{'ner': 1.9618223566212691}


100%|████████████████████████████████████████████| 6/6 [00:00<00:00, 15.22it/s]


{'ner': 0.9999566307353369}


100%|████████████████████████████████████████████| 6/6 [00:00<00:00, 14.72it/s]


{'ner': 0.686441628841429}


100%|████████████████████████████████████████████| 6/6 [00:00<00:00, 14.42it/s]


{'ner': 0.012837209944484802}


100%|████████████████████████████████████████████| 6/6 [00:00<00:00, 15.62it/s]


{'ner': 0.000912882440231771}


100%|████████████████████████████████████████████| 6/6 [00:00<00:00, 14.83it/s]


{'ner': 4.704469586092208e-05}


100%|████████████████████████████████████████████| 6/6 [00:00<00:00, 16.01it/s]


{'ner': 0.00824807363544111}


100%|████████████████████████████████████████████| 6/6 [00:00<00:00, 16.13it/s]


{'ner': 0.0016082051602726768}


100%|████████████████████████████████████████████| 6/6 [00:00<00:00, 15.61it/s]


{'ner': 1.792084759554914e-05}


100%|████████████████████████████████████████████| 6/6 [00:00<00:00, 15.18it/s]


{'ner': 5.710527200530461e-06}


100%|████████████████████████████████████████████| 6/6 [00:00<00:00, 15.76it/s]


{'ner': 4.729047715766022e-05}


100%|████████████████████████████████████████████| 6/6 [00:00<00:00, 15.52it/s]


{'ner': 9.091355229747172e-08}


100%|████████████████████████████████████████████| 6/6 [00:00<00:00, 14.87it/s]


{'ner': 8.809080297152514e-06}
Entities in 'I have a headache'
SYMPTOM headache


In [168]:
n = model("I have pain")
for ent in n.ents:
        print(ent.label_, ent.text)

SYMPTOM pain


In [134]:
print(n)

I have pain


In [45]:
TRAIN_DATA2 = [
    ("vomiting all day long", {
        'entities': [(0, 8, 'SYMPTOM')]
    }),

    ("Have a terrible", {
        'entities': []
    }),

    ("Arms are itching", {
        'entities': [(9, 16, 'SYMPTOM')]
    }),

    ("Have a strong headache", {
        'entities': [(14, 22, 'SYMPTOM')]
    }),

    ("I really want to vomit", {
        'entities': [(17, 22, 'SYMPTOM')]
    }),

    ("headache", {
        'entities': [(0, 8, 'SYMPTOM')]
    })
]

# Second atempt

In [66]:
sents_starters = ['i have','']
sent_ends = ['']

In [4]:
sents_starters = ['i have','']
sent_ends = ['']
TRAIN_DATA2 = []
for symptom in symptoms:
    for sent_start in sents_starters:
        for sent_end in sent_ends:
            train_sentence = sent_start+" "+symptom+" " +sent_end
            train_sentence = train_sentence.lstrip().rstrip() # remove spaces at start and in end of sentences
            start_index = len(sent_start) + 1
            end_index = len(sent_start) + len(symptom)
            train_sample = (train_sentence,{'entities': [(start_index, end_index, 'SYMPTOM')]})
            TRAIN_DATA2.append(train_sample)
#print(TRAIN_DATA2)

In [68]:
def main2(model=None, new_model_name='symptom', output_dir=None, n_iter=7):
    """Set up the pipeline and entity recognizer, and train the new entity."""
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank('en')  # create blank Language class
        print("Created blank 'en' model")
    # Add entity recognizer to model if it's not in the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if 'ner' not in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner)
    else:
        ner = nlp.get_pipe('ner')

    ner.add_label(LABEL)   # add new entity label to entity recognizer
    if model is None:
        optimizer = nlp.begin_training()
    else:
        # Note that 'begin_training' initializes the models, so it'll zero out
        # existing entity types.
        optimizer = nlp.entity.create_optimizer()

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):  # only train NER
        for itn in range(n_iter):
            random.shuffle(TRAIN_DATA2)
            losses = {}
            for text, annotations in tqdm(TRAIN_DATA2):
                nlp.update([text], [annotations], sgd=optimizer, drop=0.35,
                           losses=losses)
            print(losses)

    # test the trained model
    test_text = 'I have a headache'
    doc = nlp(test_text)
    print("Entities in '%s'" % test_text)
    for ent in doc.ents:
        print(ent.label_, ent.text)

    # save model to output directory
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()
        nlp.meta['name'] = new_model_name  # rename model
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

        # test the saved model
        print("Loading from", output_dir)
        nlp2 = spacy.load(output_dir)
        doc2 = nlp2(test_text)
        for ent in doc2.ents:
            print(ent.label_, ent.text)

    return nlp

In [5]:
#model2 = main2()

In [71]:
n = model2("itching")
for ent in n.ents:
        print(ent.label_, ent.text)

# Attempt 3

In [None]:
sents_starters = ['i have','']
sent_ends = ['']
TRAIN_DATA = []
for symptom in symptoms:
    for sent_start in sents_starters:
        train_sentence = sent_start+" "+symptom
        train_sentence = train_sentence.lstrip().rstrip() # remove spaces at start and in end of sentences
        start_index = len(sent_start) + 1
        end_index = len(sent_start) + len(symptom)
        train_sample = (train_sentence,{'entities': [(start_index, end_index, 'SYMPTOM')]})
        TRAIN_DATA2.append(train_sample)
            
print(TRAIN_DATA)

In [72]:
def train_spacy(data,iterations):
    TRAIN_DATA = data
    nlp = spacy.blank('en')  # create blank Language class
    # create the built-in pipeline components and add them to the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if 'ner' not in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner, last=True)
       

    # add labels
    for _, annotations in TRAIN_DATA:
         for ent in annotations.get('entities'):
            ner.add_label(ent[2])

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):  # only train NER
        optimizer = nlp.begin_training()
        for itn in range(iterations):
            print("Statring iteration " + str(itn))
            random.shuffle(TRAIN_DATA)
            losses = {}
            for text, annotations in TRAIN_DATA:
                nlp.update(
                    [text],  # batch of texts
                    [annotations],  # batch of annotations
                    drop=0.2,  # dropout - make it harder to memorise data
                    sgd=optimizer,  # callable to update weights
                    losses=losses)
            print(losses)
    return nlp


prdnlp = train_spacy(TRAIN_DATA, 10)

# Save our trained Model
modelfile = input("Enter your Model Name: ")
prdnlp.to_disk(modelfile)

#Test your text
test_text = input("Enter your testing text: ")
doc = prdnlp(test_text)
for ent in doc.ents:
    print(ent.text, ent.start_char, ent.end_char, ent.label_)

Statring iteration 0
{'ner': 21.904731928697707}
Statring iteration 1
{'ner': 1.784096903085851e-12}
Statring iteration 2
{'ner': 4.8451650514017426e-11}
Statring iteration 3
{'ner': 4.580180256648957e-10}
Statring iteration 4
{'ner': 2.6848960843200314e-09}
Statring iteration 5
{'ner': 1.1038003384254141e-06}
Statring iteration 6
{'ner': 1.3125701779783916e-09}
Statring iteration 7
{'ner': 3.6731462404418006e-08}
Statring iteration 8
{'ner': 2.075412386326439e-09}
Statring iteration 9
{'ner': 5.446318374820541e-09}
Enter your Model Name: i have skin peeling
Enter your testing text: i have skin peeling


# Attempt 4

In [27]:
#len(symptoms)
symptoms2 = symptoms[0:50]
len(symptoms2)
sents_starters = ['I have' ,'I am', 'I feel','I am annoyed by' ,'I had', 'Fealings of','']
sents_starters[:-1]

['I have', 'I am', 'I feel', 'I am annoyed by', 'I had', 'Fealings of']

In [28]:
sents_starters = ['I have' ,'I am', 'I feel','I am annoyed by' ,'I had', 'Fealings of','']
sent_ends = ['is disturbing me', 'is killing me', 'is not letting me live']
TRAIN_DATA2 = []
for symptom in symptoms2:
    for sent_start in sents_starters: 
        train_sentence = sent_start+" "+symptom
        train_sentence = train_sentence.lstrip().rstrip() # remove spaces at start and in end of sentences
        i = 0 if sent_start == '' else 1
        start_index = len(sent_start) + i
        end_index = len(sent_start) + len(symptom) + 1
        train_sample = (train_sentence,{'entities': [(start_index, end_index, 'SYMPTOM')]})
        TRAIN_DATA2.append(train_sample)

        
for symptom in symptoms2:
    for sent_end in sent_ends:
        train_sentence = symptom+" " +sent_end
        train_sentence = train_sentence.lstrip().rstrip() # remove spaces at start and in end of sentences
        end_index = len(symptom) 
        train_sample = (train_sentence,{'entities': [(0, end_index, 'SYMPTOM')]})
        TRAIN_DATA2.append(train_sample)

sents_ends = ['for days' , 'since yesturday']    
for symptom in symptoms2:
    for sent_start in sents_starters[:-1]:
        for sent_end in sents_ends:
            train_sentence =sent_start+" "+ symptom+" " +sent_end
            train_sentence = train_sentence.lstrip().rstrip() # remove spaces at start and in end of sentences
            start_index = len(sent_start) + 1
            end_index = len(sent_start) + 1 + len(symptom) 
            train_sample = (train_sentence,{'entities': [(start_index, end_index, 'SYMPTOM')]})
            TRAIN_DATA2.append(train_sample)
        
TRAIN_DATA2.append(("i have", {'entities': []}))
TRAIN_DATA2.append(("i", {'entities': []}))
TRAIN_DATA2.append(("have", {'entities': []}))
TRAIN_DATA2.append(("Something is disturbing me", {'entities': []}))
TRAIN_DATA2.append(("I am not sure what is wrong", {'entities': []}))
TRAIN_DATA2.append(("Іs not letting me live", {'entities': []}))
TRAIN_DATA2.append(("Іs killing me", {'entities': []}))
TRAIN_DATA2.append(("for days", {'entities': []}))
TRAIN_DATA2.append(("Fealings of", {'entities': []}))
TRAIN_DATA2.append(("since yesturday", {'entities': []}))
TRAIN_DATA2.append(("I have money", {'entities': []}))
TRAIN_DATA2.append(("I had something", {'entities': []}))

# ("Have a terrible", {'entities': []})


In [29]:
print(len(TRAIN_DATA2))

1112


In [36]:
def main(model=None, new_model_name='symptom', output_dir=None, n_iter=45):
    """Set up the pipeline and entity recognizer, and train the new entity."""
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank('en')  # create blank Language class
        print("Created blank 'en' model")
    # Add entity recognizer to model if it's not in the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if 'ner' not in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner)
    else:
        ner = nlp.get_pipe('ner')

    ner.add_label(LABEL)   # add new entity label to entity recognizer
    if model is None:
        optimizer = nlp.begin_training()
    else:
        # Note that 'begin_training' initializes the models, so it'll zero out
        # existing entity types.
        optimizer = nlp.entity.create_optimizer()

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):  # only train NER
        for itn in range(n_iter):
            random.shuffle(TRAIN_DATA2)
            losses = {}
            for text, annotations in tqdm(TRAIN_DATA2):
                nlp.update([text], [annotations], sgd=optimizer, drop=0.2,
                           losses=losses)
            print(losses)
            if(losses['ner'] < 1):
                break

    # test the trained model
    test_text = 'itching'
    doc = nlp(test_text)
    print("Entities in '%s'" % test_text)
    for ent in doc.ents:
        print(ent.label_, ent.text)

    # save model to output directory
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()
        nlp.meta['name'] = new_model_name  # rename model
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

        # test the saved model
        print("Loading from", output_dir)
        nlp2 = spacy.load(output_dir)
        doc2 = nlp2(test_text)
        for ent in doc2.ents:
            print(ent.label_, ent.text)

    return nlp

In [37]:
model = main(output_dir='C:\\Users\\Andriy\\Desktop\\chatbot')

Created blank 'en' model




  0%|                                                 | 0/1112 [00:00<?, ?it/s]

  0%|                                         | 1/1112 [00:00<01:53,  9.81it/s]

  0%|                                         | 3/1112 [00:00<01:39, 11.10it/s]

  0%|▏                                        | 5/1112 [00:00<01:32, 12.00it/s]

  1%|▎                                        | 7/1112 [00:00<01:29, 12.33it/s]

  1%|▎                                        | 9/1112 [00:00<01:26, 12.69it/s]

  1%|▍                                       | 11/1112 [00:00<01:24, 13.08it/s]

  1%|▍                                       | 13/1112 [00:00<01:26, 12.63it/s]

  1%|▌                                       | 15/1112 [00:01<01:23, 13.09it/s]

  2%|▌                                       | 17/1112 [00:01<01:21, 13.40it/s]

  2%|▋                                       | 19/1112 [00:01<01:21, 13.41it/s]

  2%|▊                                       | 21/1112 [00:01<01:20, 13.61it/s]

  2%|▊                    

 36%|█████████████▉                         | 397/1112 [00:32<00:54, 13.04it/s]

 36%|█████████████▉                         | 399/1112 [00:32<00:53, 13.30it/s]

 36%|██████████████                         | 401/1112 [00:32<00:52, 13.53it/s]

 36%|██████████████▏                        | 403/1112 [00:32<00:53, 13.20it/s]

 36%|██████████████▏                        | 405/1112 [00:32<00:55, 12.81it/s]

 37%|██████████████▎                        | 407/1112 [00:32<00:53, 13.28it/s]

 37%|██████████████▎                        | 409/1112 [00:32<00:53, 13.09it/s]

 37%|██████████████▍                        | 411/1112 [00:33<00:56, 12.43it/s]

 37%|██████████████▍                        | 413/1112 [00:33<00:56, 12.47it/s]

 37%|██████████████▌                        | 415/1112 [00:33<00:56, 12.32it/s]

 38%|██████████████▋                        | 417/1112 [00:33<00:57, 12.04it/s]

 38%|██████████████▋                        | 419/1112 [00:33<00:57, 12.11it/s]

 38%|██████████████▊        

 65%|█████████████████████████▏             | 719/1112 [01:01<00:49,  7.92it/s]

 65%|█████████████████████████▎             | 720/1112 [01:01<00:48,  8.16it/s]

 65%|█████████████████████████▎             | 721/1112 [01:01<00:47,  8.30it/s]

 65%|█████████████████████████▎             | 722/1112 [01:02<00:46,  8.43it/s]

 65%|█████████████████████████▎             | 723/1112 [01:02<00:44,  8.71it/s]

 65%|█████████████████████████▍             | 724/1112 [01:02<00:45,  8.62it/s]

 65%|█████████████████████████▍             | 725/1112 [01:02<00:46,  8.30it/s]

 65%|█████████████████████████▍             | 726/1112 [01:02<00:45,  8.53it/s]

 65%|█████████████████████████▍             | 727/1112 [01:02<00:43,  8.83it/s]

 65%|█████████████████████████▌             | 728/1112 [01:02<00:42,  8.93it/s]

 66%|█████████████████████████▌             | 729/1112 [01:02<00:42,  8.96it/s]

 66%|█████████████████████████▌             | 730/1112 [01:03<00:43,  8.72it/s]

 66%|███████████████████████

 83%|████████████████████████████████▌      | 928/1112 [01:26<00:21,  8.64it/s]

 84%|████████████████████████████████▌      | 930/1112 [01:26<00:19,  9.13it/s]

 84%|████████████████████████████████▋      | 932/1112 [01:26<00:19,  9.21it/s]

 84%|████████████████████████████████▋      | 933/1112 [01:27<00:20,  8.65it/s]

 84%|████████████████████████████████▊      | 935/1112 [01:27<00:20,  8.78it/s]

 84%|████████████████████████████████▊      | 937/1112 [01:27<00:19,  9.04it/s]

 84%|████████████████████████████████▉      | 938/1112 [01:27<00:19,  8.73it/s]

 85%|████████████████████████████████▉      | 940/1112 [01:27<00:18,  9.07it/s]

 85%|█████████████████████████████████      | 941/1112 [01:27<00:19,  8.84it/s]

 85%|█████████████████████████████████      | 942/1112 [01:28<00:20,  8.29it/s]

 85%|█████████████████████████████████      | 944/1112 [01:28<00:18,  8.89it/s]

 85%|█████████████████████████████████▏     | 945/1112 [01:28<00:18,  9.13it/s]

 85%|███████████████████████

{'ner': 293.9923247890875}




  0%|                                                 | 0/1112 [00:00<?, ?it/s]

  0%|                                         | 2/1112 [00:00<01:52,  9.86it/s]

  0%|                                         | 3/1112 [00:00<01:58,  9.32it/s]

  0%|▏                                        | 5/1112 [00:00<01:54,  9.68it/s]

  1%|▏                                        | 6/1112 [00:00<01:54,  9.69it/s]

  1%|▎                                        | 8/1112 [00:00<01:54,  9.60it/s]

  1%|▎                                        | 9/1112 [00:00<01:55,  9.52it/s]

  1%|▎                                       | 10/1112 [00:01<01:57,  9.37it/s]

  1%|▍                                       | 11/1112 [00:01<02:00,  9.11it/s]

  1%|▍                                       | 12/1112 [00:01<02:01,  9.05it/s]

  1%|▍                                       | 13/1112 [00:01<02:01,  9.02it/s]

  1%|▌                                       | 14/1112 [00:01<02:00,  9.12it/s]

  1%|▌                    

 23%|█████████                              | 260/1112 [00:27<01:59,  7.10it/s]

 23%|█████████▏                             | 261/1112 [00:28<01:53,  7.48it/s]

 24%|█████████▏                             | 262/1112 [00:28<01:46,  8.00it/s]

 24%|█████████▏                             | 263/1112 [00:28<01:41,  8.36it/s]

 24%|█████████▎                             | 264/1112 [00:28<01:38,  8.59it/s]

 24%|█████████▎                             | 265/1112 [00:28<01:37,  8.69it/s]

 24%|█████████▎                             | 266/1112 [00:28<01:35,  8.90it/s]

 24%|█████████▎                             | 267/1112 [00:28<01:36,  8.77it/s]

 24%|█████████▍                             | 268/1112 [00:28<01:37,  8.68it/s]

 24%|█████████▍                             | 269/1112 [00:29<01:44,  8.06it/s]

 24%|█████████▍                             | 270/1112 [00:29<01:41,  8.32it/s]

 24%|█████████▌                             | 271/1112 [00:29<01:36,  8.74it/s]

 24%|█████████▌             

 42%|████████████████▏                      | 463/1112 [00:58<02:01,  5.34it/s]

 42%|████████████████▎                      | 464/1112 [00:58<02:01,  5.31it/s]

 42%|████████████████▎                      | 465/1112 [00:58<02:03,  5.23it/s]

 42%|████████████████▎                      | 466/1112 [00:58<02:06,  5.10it/s]

 42%|████████████████▍                      | 467/1112 [00:59<02:08,  5.00it/s]

 42%|████████████████▍                      | 468/1112 [00:59<02:08,  5.02it/s]

 42%|████████████████▍                      | 469/1112 [00:59<02:05,  5.13it/s]

 42%|████████████████▍                      | 470/1112 [00:59<02:04,  5.15it/s]

 42%|████████████████▌                      | 471/1112 [00:59<02:01,  5.26it/s]

 42%|████████████████▌                      | 472/1112 [01:00<02:01,  5.25it/s]

 43%|████████████████▌                      | 473/1112 [01:00<02:02,  5.23it/s]

 43%|████████████████▌                      | 474/1112 [01:00<02:00,  5.30it/s]

 43%|████████████████▋      

 60%|███████████████████████▎               | 665/1112 [01:40<01:39,  4.47it/s]

 60%|███████████████████████▎               | 666/1112 [01:40<01:39,  4.48it/s]

 60%|███████████████████████▍               | 667/1112 [01:41<01:38,  4.54it/s]

 60%|███████████████████████▍               | 668/1112 [01:41<01:37,  4.54it/s]

 60%|███████████████████████▍               | 669/1112 [01:41<01:36,  4.58it/s]

 60%|███████████████████████▍               | 670/1112 [01:41<01:37,  4.53it/s]

 60%|███████████████████████▌               | 671/1112 [01:41<01:36,  4.57it/s]

 60%|███████████████████████▌               | 672/1112 [01:42<01:36,  4.56it/s]

 61%|███████████████████████▌               | 673/1112 [01:42<01:36,  4.55it/s]

 61%|███████████████████████▋               | 674/1112 [01:42<01:35,  4.58it/s]

 61%|███████████████████████▋               | 675/1112 [01:42<01:35,  4.58it/s]

 61%|███████████████████████▋               | 676/1112 [01:42<01:35,  4.58it/s]

 61%|███████████████████████

 78%|██████████████████████████████▍        | 867/1112 [02:26<00:53,  4.61it/s]

 78%|██████████████████████████████▍        | 868/1112 [02:26<00:52,  4.66it/s]

 78%|██████████████████████████████▍        | 869/1112 [02:26<00:52,  4.64it/s]

 78%|██████████████████████████████▌        | 870/1112 [02:26<00:47,  5.10it/s]

 78%|██████████████████████████████▌        | 871/1112 [02:26<00:48,  4.92it/s]

 78%|██████████████████████████████▌        | 872/1112 [02:27<00:49,  4.85it/s]

 79%|██████████████████████████████▌        | 873/1112 [02:27<00:50,  4.76it/s]

 79%|██████████████████████████████▋        | 874/1112 [02:27<00:48,  4.89it/s]

 79%|██████████████████████████████▋        | 875/1112 [02:27<00:50,  4.70it/s]

 79%|██████████████████████████████▋        | 876/1112 [02:27<00:50,  4.64it/s]

 79%|██████████████████████████████▊        | 877/1112 [02:28<00:50,  4.68it/s]

 79%|██████████████████████████████▊        | 878/1112 [02:28<00:51,  4.59it/s]

 79%|███████████████████████

 96%|████████████████████████████████████▌ | 1069/1112 [03:10<00:10,  4.22it/s]

 96%|████████████████████████████████████▌ | 1070/1112 [03:10<00:09,  4.20it/s]

 96%|████████████████████████████████████▌ | 1071/1112 [03:10<00:09,  4.36it/s]

 96%|████████████████████████████████████▋ | 1072/1112 [03:11<00:09,  4.44it/s]

 96%|████████████████████████████████████▋ | 1073/1112 [03:11<00:08,  4.58it/s]

 97%|████████████████████████████████████▋ | 1074/1112 [03:11<00:07,  4.78it/s]

 97%|████████████████████████████████████▋ | 1075/1112 [03:11<00:07,  4.72it/s]

 97%|████████████████████████████████████▊ | 1076/1112 [03:12<00:07,  4.67it/s]

 97%|████████████████████████████████████▊ | 1077/1112 [03:12<00:07,  4.78it/s]

 97%|████████████████████████████████████▊ | 1078/1112 [03:12<00:07,  4.53it/s]

 97%|████████████████████████████████████▊ | 1079/1112 [03:12<00:07,  4.45it/s]

 97%|████████████████████████████████████▉ | 1080/1112 [03:12<00:07,  4.53it/s]

 97%|███████████████████████

{'ner': 40.858493463589554}




  0%|                                                 | 0/1112 [00:00<?, ?it/s]

  0%|                                         | 1/1112 [00:00<03:46,  4.90it/s]

  0%|                                         | 2/1112 [00:00<03:47,  4.87it/s]

  0%|                                         | 3/1112 [00:00<03:47,  4.88it/s]

  0%|▏                                        | 4/1112 [00:00<03:57,  4.68it/s]

  0%|▏                                        | 5/1112 [00:01<04:02,  4.57it/s]

  1%|▏                                        | 6/1112 [00:01<03:58,  4.64it/s]

  1%|▎                                        | 7/1112 [00:01<03:56,  4.67it/s]

  1%|▎                                        | 8/1112 [00:01<03:56,  4.66it/s]

  1%|▎                                        | 9/1112 [00:01<03:58,  4.63it/s]

  1%|▎                                       | 10/1112 [00:02<04:04,  4.51it/s]

  1%|▍                                       | 11/1112 [00:02<04:09,  4.41it/s]

  1%|▍                    

 18%|███████                                | 202/1112 [00:44<03:20,  4.54it/s]

 18%|███████                                | 203/1112 [00:45<03:17,  4.60it/s]

 18%|███████▏                               | 204/1112 [00:45<03:19,  4.54it/s]

 18%|███████▏                               | 205/1112 [00:45<03:23,  4.46it/s]

 19%|███████▏                               | 206/1112 [00:45<03:33,  4.25it/s]

 19%|███████▎                               | 207/1112 [00:46<03:28,  4.34it/s]

 19%|███████▎                               | 208/1112 [00:46<03:25,  4.41it/s]

 19%|███████▎                               | 209/1112 [00:46<03:18,  4.54it/s]

 19%|███████▎                               | 210/1112 [00:46<03:13,  4.66it/s]

 19%|███████▍                               | 211/1112 [00:46<03:18,  4.55it/s]

 19%|███████▍                               | 212/1112 [00:47<03:24,  4.41it/s]

 19%|███████▍                               | 213/1112 [00:47<03:22,  4.44it/s]

 19%|███████▌               

 36%|██████████████▏                        | 404/1112 [01:29<02:42,  4.36it/s]

 36%|██████████████▏                        | 405/1112 [01:29<02:40,  4.40it/s]

 37%|██████████████▏                        | 406/1112 [01:29<02:44,  4.29it/s]

 37%|██████████████▎                        | 407/1112 [01:29<02:45,  4.26it/s]

 37%|██████████████▎                        | 408/1112 [01:30<02:44,  4.29it/s]

 37%|██████████████▎                        | 409/1112 [01:30<02:44,  4.27it/s]

 37%|██████████████▍                        | 410/1112 [01:30<02:44,  4.27it/s]

 37%|██████████████▍                        | 411/1112 [01:30<02:44,  4.25it/s]

 37%|██████████████▍                        | 412/1112 [01:31<02:40,  4.37it/s]

 37%|██████████████▍                        | 413/1112 [01:31<02:38,  4.41it/s]

 37%|██████████████▌                        | 414/1112 [01:31<02:38,  4.42it/s]

 37%|██████████████▌                        | 415/1112 [01:31<02:39,  4.38it/s]

 37%|██████████████▌        

 54%|█████████████████████▎                 | 606/1112 [02:13<01:46,  4.76it/s]

 55%|█████████████████████▎                 | 607/1112 [02:14<01:49,  4.61it/s]

 55%|█████████████████████▎                 | 608/1112 [02:14<01:52,  4.47it/s]

 55%|█████████████████████▎                 | 609/1112 [02:14<01:53,  4.43it/s]

 55%|█████████████████████▍                 | 610/1112 [02:14<01:52,  4.44it/s]

 55%|█████████████████████▍                 | 611/1112 [02:15<01:53,  4.41it/s]

 55%|█████████████████████▍                 | 612/1112 [02:15<01:56,  4.31it/s]

 55%|█████████████████████▍                 | 613/1112 [02:15<01:53,  4.39it/s]

 55%|█████████████████████▌                 | 614/1112 [02:15<01:54,  4.36it/s]

 55%|█████████████████████▌                 | 615/1112 [02:16<01:52,  4.43it/s]

 55%|█████████████████████▌                 | 616/1112 [02:16<01:49,  4.55it/s]

 55%|█████████████████████▋                 | 617/1112 [02:16<01:49,  4.50it/s]

 56%|█████████████████████▋ 

 73%|████████████████████████████▎          | 808/1112 [02:59<01:09,  4.37it/s]

 73%|████████████████████████████▎          | 809/1112 [02:59<01:09,  4.38it/s]

 73%|████████████████████████████▍          | 810/1112 [02:59<01:07,  4.44it/s]

 73%|████████████████████████████▍          | 811/1112 [03:00<01:07,  4.48it/s]

 73%|████████████████████████████▍          | 812/1112 [03:00<01:07,  4.43it/s]

 73%|████████████████████████████▌          | 813/1112 [03:00<01:06,  4.49it/s]

 73%|████████████████████████████▌          | 814/1112 [03:00<01:05,  4.52it/s]

 73%|████████████████████████████▌          | 815/1112 [03:01<01:05,  4.53it/s]

 73%|████████████████████████████▌          | 816/1112 [03:01<01:05,  4.52it/s]

 73%|████████████████████████████▋          | 817/1112 [03:01<01:05,  4.49it/s]

 74%|████████████████████████████▋          | 818/1112 [03:01<01:05,  4.46it/s]

 74%|████████████████████████████▋          | 819/1112 [03:01<01:05,  4.47it/s]

 74%|███████████████████████

 91%|██████████████████████████████████▌   | 1010/1112 [03:44<00:23,  4.38it/s]

 91%|██████████████████████████████████▌   | 1011/1112 [03:44<00:22,  4.42it/s]

 91%|██████████████████████████████████▌   | 1012/1112 [03:44<00:22,  4.53it/s]

 91%|██████████████████████████████████▌   | 1013/1112 [03:45<00:22,  4.36it/s]

 91%|██████████████████████████████████▋   | 1014/1112 [03:45<00:22,  4.43it/s]

 91%|██████████████████████████████████▋   | 1015/1112 [03:45<00:21,  4.54it/s]

 91%|██████████████████████████████████▋   | 1016/1112 [03:45<00:20,  4.60it/s]

 91%|██████████████████████████████████▊   | 1017/1112 [03:45<00:20,  4.52it/s]

 92%|██████████████████████████████████▊   | 1018/1112 [03:46<00:20,  4.59it/s]

 92%|██████████████████████████████████▊   | 1019/1112 [03:46<00:19,  4.67it/s]

 92%|██████████████████████████████████▊   | 1020/1112 [03:46<00:19,  4.65it/s]

 92%|██████████████████████████████████▉   | 1021/1112 [03:46<00:19,  4.72it/s]

 92%|███████████████████████

{'ner': 16.6552869352679}




  0%|                                                 | 0/1112 [00:00<?, ?it/s]

  0%|                                         | 1/1112 [00:00<03:59,  4.63it/s]

  0%|                                         | 2/1112 [00:00<03:58,  4.65it/s]

  0%|                                         | 3/1112 [00:00<03:59,  4.63it/s]

  0%|▏                                        | 4/1112 [00:00<04:02,  4.58it/s]

  0%|▏                                        | 5/1112 [00:01<03:56,  4.68it/s]

  1%|▏                                        | 6/1112 [00:01<04:02,  4.57it/s]

  1%|▎                                        | 7/1112 [00:01<04:09,  4.43it/s]

  1%|▎                                        | 8/1112 [00:01<04:09,  4.42it/s]

  1%|▎                                        | 9/1112 [00:01<04:05,  4.50it/s]

  1%|▎                                       | 10/1112 [00:02<04:05,  4.49it/s]

  1%|▍                                       | 11/1112 [00:02<04:03,  4.52it/s]

  1%|▍                    

 18%|███████                                | 202/1112 [00:45<03:21,  4.53it/s]

 18%|███████                                | 203/1112 [00:45<03:25,  4.43it/s]

 18%|███████▏                               | 204/1112 [00:45<03:20,  4.53it/s]

 18%|███████▏                               | 205/1112 [00:46<03:17,  4.58it/s]

 19%|███████▏                               | 206/1112 [00:46<03:17,  4.59it/s]

 19%|███████▎                               | 207/1112 [00:46<03:21,  4.50it/s]

 19%|███████▎                               | 208/1112 [00:46<03:18,  4.56it/s]

 19%|███████▎                               | 209/1112 [00:47<03:22,  4.47it/s]

 19%|███████▎                               | 210/1112 [00:47<03:19,  4.53it/s]

 19%|███████▍                               | 211/1112 [00:47<03:14,  4.64it/s]

 19%|███████▍                               | 212/1112 [00:47<03:18,  4.52it/s]

 19%|███████▍                               | 213/1112 [00:47<03:15,  4.59it/s]

 19%|███████▌               

 36%|██████████████▏                        | 404/1112 [01:29<02:43,  4.33it/s]

 36%|██████████████▏                        | 405/1112 [01:30<02:37,  4.48it/s]

 37%|██████████████▏                        | 406/1112 [01:30<02:37,  4.49it/s]

 37%|██████████████▎                        | 407/1112 [01:30<02:37,  4.47it/s]

 37%|██████████████▎                        | 408/1112 [01:30<02:34,  4.57it/s]

 37%|██████████████▎                        | 409/1112 [01:31<02:33,  4.59it/s]

 37%|██████████████▍                        | 410/1112 [01:31<02:35,  4.50it/s]

 37%|██████████████▍                        | 411/1112 [01:31<02:35,  4.52it/s]

 37%|██████████████▍                        | 412/1112 [01:31<02:36,  4.48it/s]

 37%|██████████████▍                        | 413/1112 [01:31<02:38,  4.42it/s]

 37%|██████████████▌                        | 414/1112 [01:32<02:35,  4.48it/s]

 37%|██████████████▌                        | 415/1112 [01:32<02:37,  4.42it/s]

 37%|██████████████▌        

 54%|█████████████████████▎                 | 606/1112 [02:15<01:50,  4.56it/s]

 55%|█████████████████████▎                 | 607/1112 [02:15<01:50,  4.59it/s]

 55%|█████████████████████▎                 | 608/1112 [02:15<01:51,  4.50it/s]

 55%|█████████████████████▎                 | 609/1112 [02:15<01:49,  4.59it/s]

 55%|█████████████████████▍                 | 610/1112 [02:15<01:49,  4.58it/s]

 55%|█████████████████████▍                 | 611/1112 [02:16<01:48,  4.62it/s]

 55%|█████████████████████▍                 | 612/1112 [02:16<01:46,  4.67it/s]

 55%|█████████████████████▍                 | 613/1112 [02:16<01:47,  4.66it/s]

 55%|█████████████████████▌                 | 614/1112 [02:16<01:43,  4.80it/s]

 55%|█████████████████████▌                 | 615/1112 [02:16<01:41,  4.87it/s]

 55%|█████████████████████▌                 | 616/1112 [02:17<01:44,  4.74it/s]

 55%|█████████████████████▋                 | 617/1112 [02:17<01:43,  4.78it/s]

 56%|█████████████████████▋ 

 73%|████████████████████████████▎          | 808/1112 [03:02<01:12,  4.18it/s]

 73%|████████████████████████████▎          | 809/1112 [03:02<01:12,  4.17it/s]

 73%|████████████████████████████▍          | 810/1112 [03:03<01:12,  4.16it/s]

 73%|████████████████████████████▍          | 811/1112 [03:03<01:09,  4.35it/s]

 73%|████████████████████████████▍          | 812/1112 [03:03<01:09,  4.32it/s]

 73%|████████████████████████████▌          | 813/1112 [03:03<01:08,  4.35it/s]

 73%|████████████████████████████▌          | 814/1112 [03:03<01:08,  4.35it/s]

 73%|████████████████████████████▌          | 815/1112 [03:04<01:07,  4.40it/s]

 73%|████████████████████████████▌          | 816/1112 [03:04<01:07,  4.37it/s]

 73%|████████████████████████████▋          | 817/1112 [03:04<01:08,  4.31it/s]

 74%|████████████████████████████▋          | 818/1112 [03:04<01:07,  4.36it/s]

 74%|████████████████████████████▋          | 819/1112 [03:05<01:10,  4.18it/s]

 74%|███████████████████████

 91%|██████████████████████████████████▌   | 1010/1112 [03:47<00:21,  4.72it/s]

 91%|██████████████████████████████████▌   | 1011/1112 [03:47<00:21,  4.72it/s]

 91%|██████████████████████████████████▌   | 1012/1112 [03:47<00:21,  4.66it/s]

 91%|██████████████████████████████████▌   | 1013/1112 [03:47<00:21,  4.64it/s]

 91%|██████████████████████████████████▋   | 1014/1112 [03:47<00:21,  4.65it/s]

 91%|██████████████████████████████████▋   | 1015/1112 [03:48<00:21,  4.60it/s]

 91%|██████████████████████████████████▋   | 1016/1112 [03:48<00:20,  4.66it/s]

 91%|██████████████████████████████████▊   | 1017/1112 [03:48<00:20,  4.58it/s]

 92%|██████████████████████████████████▊   | 1018/1112 [03:48<00:21,  4.45it/s]

 92%|██████████████████████████████████▊   | 1019/1112 [03:49<00:21,  4.40it/s]

 92%|██████████████████████████████████▊   | 1020/1112 [03:49<00:20,  4.44it/s]

 92%|██████████████████████████████████▉   | 1021/1112 [03:49<00:20,  4.46it/s]

 92%|███████████████████████

{'ner': 47.62058655833992}




  0%|                                                 | 0/1112 [00:00<?, ?it/s]

  0%|                                         | 1/1112 [00:00<04:04,  4.55it/s]

  0%|                                         | 2/1112 [00:00<03:59,  4.64it/s]

  0%|                                         | 3/1112 [00:00<04:00,  4.61it/s]

  0%|▏                                        | 4/1112 [00:00<04:05,  4.51it/s]

  0%|▏                                        | 5/1112 [00:01<03:59,  4.63it/s]

  1%|▏                                        | 6/1112 [00:01<03:58,  4.64it/s]

  1%|▎                                        | 7/1112 [00:01<04:02,  4.57it/s]

  1%|▎                                        | 8/1112 [00:01<04:06,  4.49it/s]

  1%|▎                                        | 9/1112 [00:01<04:02,  4.54it/s]

  1%|▎                                       | 10/1112 [00:02<04:04,  4.50it/s]

  1%|▍                                       | 11/1112 [00:02<04:03,  4.52it/s]

  1%|▍                    

 18%|███████                                | 202/1112 [00:44<03:29,  4.35it/s]

 18%|███████                                | 203/1112 [00:45<03:25,  4.42it/s]

 18%|███████▏                               | 204/1112 [00:45<03:20,  4.54it/s]

 18%|███████▏                               | 205/1112 [00:45<03:17,  4.60it/s]

 19%|███████▏                               | 206/1112 [00:45<03:19,  4.53it/s]

 19%|███████▎                               | 207/1112 [00:46<03:17,  4.59it/s]

 19%|███████▎                               | 208/1112 [00:46<03:17,  4.57it/s]

 19%|███████▎                               | 209/1112 [00:46<03:12,  4.70it/s]

 19%|███████▎                               | 210/1112 [00:46<03:08,  4.78it/s]

 19%|███████▍                               | 211/1112 [00:46<03:11,  4.71it/s]

 19%|███████▍                               | 212/1112 [00:47<03:14,  4.63it/s]

 19%|███████▍                               | 213/1112 [00:47<03:16,  4.58it/s]

 19%|███████▌               

 36%|██████████████▏                        | 404/1112 [01:30<02:29,  4.75it/s]

 36%|██████████████▏                        | 405/1112 [01:30<02:27,  4.78it/s]

 37%|██████████████▏                        | 406/1112 [01:30<02:29,  4.71it/s]

 37%|██████████████▎                        | 407/1112 [01:31<02:28,  4.75it/s]

 37%|██████████████▎                        | 408/1112 [01:31<02:31,  4.66it/s]

 37%|██████████████▎                        | 409/1112 [01:31<02:26,  4.78it/s]

 37%|██████████████▍                        | 410/1112 [01:31<02:26,  4.81it/s]

 37%|██████████████▍                        | 411/1112 [01:31<02:28,  4.72it/s]

 37%|██████████████▍                        | 412/1112 [01:32<02:30,  4.65it/s]

 37%|██████████████▍                        | 413/1112 [01:32<02:39,  4.38it/s]

 37%|██████████████▌                        | 414/1112 [01:32<02:35,  4.49it/s]

 37%|██████████████▌                        | 415/1112 [01:32<02:31,  4.60it/s]

 37%|██████████████▌        

 54%|█████████████████████▎                 | 606/1112 [02:14<01:51,  4.55it/s]

 55%|█████████████████████▎                 | 607/1112 [02:14<01:54,  4.41it/s]

 55%|█████████████████████▎                 | 608/1112 [02:15<01:50,  4.54it/s]

 55%|█████████████████████▎                 | 609/1112 [02:15<01:52,  4.47it/s]

 55%|█████████████████████▍                 | 610/1112 [02:15<01:53,  4.41it/s]

 55%|█████████████████████▍                 | 611/1112 [02:15<01:50,  4.52it/s]

 55%|█████████████████████▍                 | 612/1112 [02:16<01:52,  4.43it/s]

 55%|█████████████████████▍                 | 613/1112 [02:16<01:52,  4.44it/s]

 55%|█████████████████████▌                 | 614/1112 [02:16<01:51,  4.45it/s]

 55%|█████████████████████▌                 | 615/1112 [02:16<01:49,  4.55it/s]

 55%|█████████████████████▌                 | 616/1112 [02:16<01:51,  4.45it/s]

 55%|█████████████████████▋                 | 617/1112 [02:17<01:47,  4.61it/s]

 56%|█████████████████████▋ 

 73%|████████████████████████████▎          | 808/1112 [02:59<01:05,  4.64it/s]

 73%|████████████████████████████▎          | 809/1112 [02:59<01:05,  4.60it/s]

 73%|████████████████████████████▍          | 810/1112 [02:59<01:05,  4.63it/s]

 73%|████████████████████████████▍          | 811/1112 [03:00<01:03,  4.71it/s]

 73%|████████████████████████████▍          | 812/1112 [03:00<01:04,  4.69it/s]

 73%|████████████████████████████▌          | 813/1112 [03:00<01:04,  4.66it/s]

 73%|████████████████████████████▌          | 814/1112 [03:00<01:02,  4.80it/s]

 73%|████████████████████████████▌          | 815/1112 [03:00<01:02,  4.79it/s]

 73%|████████████████████████████▌          | 816/1112 [03:01<01:01,  4.83it/s]

 73%|████████████████████████████▋          | 817/1112 [03:01<01:02,  4.71it/s]

 74%|████████████████████████████▋          | 818/1112 [03:01<01:02,  4.68it/s]

 74%|████████████████████████████▋          | 819/1112 [03:01<01:02,  4.70it/s]

 74%|███████████████████████

 91%|██████████████████████████████████▌   | 1010/1112 [03:42<00:22,  4.58it/s]

 91%|██████████████████████████████████▌   | 1011/1112 [03:42<00:21,  4.63it/s]

 91%|██████████████████████████████████▌   | 1012/1112 [03:43<00:21,  4.73it/s]

 91%|██████████████████████████████████▌   | 1013/1112 [03:43<00:21,  4.71it/s]

 91%|██████████████████████████████████▋   | 1014/1112 [03:43<00:20,  4.71it/s]

 91%|██████████████████████████████████▋   | 1015/1112 [03:43<00:21,  4.58it/s]

 91%|██████████████████████████████████▋   | 1016/1112 [03:44<00:20,  4.70it/s]

 91%|██████████████████████████████████▊   | 1017/1112 [03:44<00:20,  4.61it/s]

 92%|██████████████████████████████████▊   | 1018/1112 [03:44<00:20,  4.66it/s]

 92%|██████████████████████████████████▊   | 1019/1112 [03:44<00:20,  4.64it/s]

 92%|██████████████████████████████████▊   | 1020/1112 [03:44<00:21,  4.34it/s]

 92%|██████████████████████████████████▉   | 1021/1112 [03:45<00:20,  4.49it/s]

 92%|███████████████████████

{'ner': 64.11872683173301}




  0%|                                                 | 0/1112 [00:00<?, ?it/s]

  0%|                                         | 1/1112 [00:00<03:38,  5.08it/s]

  0%|                                         | 2/1112 [00:00<03:50,  4.82it/s]

  0%|                                         | 3/1112 [00:00<03:49,  4.84it/s]

  0%|▏                                        | 4/1112 [00:00<03:54,  4.73it/s]

  0%|▏                                        | 5/1112 [00:01<03:52,  4.75it/s]

  1%|▏                                        | 6/1112 [00:01<03:55,  4.70it/s]

  1%|▎                                        | 7/1112 [00:01<04:00,  4.59it/s]

  1%|▎                                        | 8/1112 [00:01<04:04,  4.51it/s]

  1%|▎                                        | 9/1112 [00:01<04:03,  4.53it/s]

  1%|▎                                       | 10/1112 [00:02<03:58,  4.62it/s]

  1%|▍                                       | 11/1112 [00:02<03:58,  4.61it/s]

  1%|▍                    

 18%|███████                                | 202/1112 [00:43<03:22,  4.49it/s]

 18%|███████                                | 203/1112 [00:43<03:25,  4.42it/s]

 18%|███████▏                               | 204/1112 [00:43<03:22,  4.49it/s]

 18%|███████▏                               | 205/1112 [00:44<03:25,  4.41it/s]

 19%|███████▏                               | 206/1112 [00:44<03:20,  4.51it/s]

 19%|███████▎                               | 207/1112 [00:44<03:19,  4.54it/s]

 19%|███████▎                               | 208/1112 [00:44<03:16,  4.61it/s]

 19%|███████▎                               | 209/1112 [00:44<03:13,  4.66it/s]

 19%|███████▎                               | 210/1112 [00:45<03:14,  4.65it/s]

 19%|███████▍                               | 211/1112 [00:45<03:07,  4.80it/s]

 19%|███████▍                               | 212/1112 [00:45<03:11,  4.69it/s]

 19%|███████▍                               | 213/1112 [00:45<03:16,  4.56it/s]

 19%|███████▌               

 36%|██████████████▏                        | 404/1112 [01:27<02:33,  4.61it/s]

 36%|██████████████▏                        | 405/1112 [01:27<02:32,  4.65it/s]

 37%|██████████████▏                        | 406/1112 [01:27<02:35,  4.53it/s]

 37%|██████████████▎                        | 407/1112 [01:28<02:34,  4.55it/s]

 37%|██████████████▎                        | 408/1112 [01:28<02:32,  4.61it/s]

 37%|██████████████▎                        | 409/1112 [01:28<02:35,  4.52it/s]

 37%|██████████████▍                        | 410/1112 [01:28<02:34,  4.55it/s]

 37%|██████████████▍                        | 411/1112 [01:28<02:33,  4.56it/s]

 37%|██████████████▍                        | 412/1112 [01:29<02:28,  4.71it/s]

 37%|██████████████▍                        | 413/1112 [01:29<02:29,  4.67it/s]

 37%|██████████████▌                        | 414/1112 [01:29<02:31,  4.60it/s]

 37%|██████████████▌                        | 415/1112 [01:29<02:29,  4.65it/s]

 37%|██████████████▌        

 54%|█████████████████████▎                 | 606/1112 [02:11<01:55,  4.38it/s]

 55%|█████████████████████▎                 | 607/1112 [02:12<01:55,  4.39it/s]

 55%|█████████████████████▎                 | 608/1112 [02:12<01:54,  4.39it/s]

 55%|█████████████████████▎                 | 609/1112 [02:12<01:56,  4.31it/s]

 55%|█████████████████████▍                 | 610/1112 [02:12<01:56,  4.32it/s]

 55%|█████████████████████▍                 | 611/1112 [02:12<01:53,  4.39it/s]

 55%|█████████████████████▍                 | 612/1112 [02:13<01:52,  4.44it/s]

 55%|█████████████████████▍                 | 613/1112 [02:13<01:51,  4.48it/s]

 55%|█████████████████████▌                 | 614/1112 [02:13<01:52,  4.43it/s]

 55%|█████████████████████▌                 | 615/1112 [02:13<01:53,  4.37it/s]

 55%|█████████████████████▌                 | 616/1112 [02:14<01:54,  4.32it/s]

 55%|█████████████████████▋                 | 617/1112 [02:14<01:55,  4.28it/s]

 56%|█████████████████████▋ 

 73%|████████████████████████████▎          | 808/1112 [02:56<01:10,  4.31it/s]

 73%|████████████████████████████▎          | 809/1112 [02:56<01:09,  4.35it/s]

 73%|████████████████████████████▍          | 810/1112 [02:57<01:11,  4.25it/s]

 73%|████████████████████████████▍          | 811/1112 [02:57<01:10,  4.29it/s]

 73%|████████████████████████████▍          | 812/1112 [02:57<01:10,  4.25it/s]

 73%|████████████████████████████▌          | 813/1112 [02:57<01:12,  4.12it/s]

 73%|████████████████████████████▌          | 814/1112 [02:58<01:10,  4.23it/s]

 73%|████████████████████████████▌          | 815/1112 [02:58<01:09,  4.29it/s]

 73%|████████████████████████████▌          | 816/1112 [02:58<01:06,  4.42it/s]

 73%|████████████████████████████▋          | 817/1112 [02:58<01:07,  4.35it/s]

 74%|████████████████████████████▋          | 818/1112 [02:58<01:06,  4.45it/s]

 74%|████████████████████████████▋          | 819/1112 [02:59<01:05,  4.46it/s]

 74%|███████████████████████

 91%|██████████████████████████████████▌   | 1010/1112 [03:41<00:22,  4.50it/s]

 91%|██████████████████████████████████▌   | 1011/1112 [03:41<00:22,  4.50it/s]

 91%|██████████████████████████████████▌   | 1012/1112 [03:42<00:22,  4.51it/s]

 91%|██████████████████████████████████▌   | 1013/1112 [03:42<00:21,  4.60it/s]

 91%|██████████████████████████████████▋   | 1014/1112 [03:42<00:21,  4.63it/s]

 91%|██████████████████████████████████▋   | 1015/1112 [03:42<00:20,  4.71it/s]

 91%|██████████████████████████████████▋   | 1016/1112 [03:42<00:20,  4.80it/s]

 91%|██████████████████████████████████▊   | 1017/1112 [03:43<00:19,  4.88it/s]

 92%|██████████████████████████████████▊   | 1018/1112 [03:43<00:20,  4.69it/s]

 92%|██████████████████████████████████▊   | 1019/1112 [03:43<00:19,  4.76it/s]

 92%|██████████████████████████████████▊   | 1020/1112 [03:43<00:19,  4.72it/s]

 92%|██████████████████████████████████▉   | 1021/1112 [03:43<00:19,  4.72it/s]

 92%|███████████████████████

{'ner': 3.516143950388197}




  0%|                                                 | 0/1112 [00:00<?, ?it/s]

  0%|                                         | 1/1112 [00:00<04:03,  4.57it/s]

  0%|                                         | 2/1112 [00:00<04:08,  4.47it/s]

  0%|                                         | 3/1112 [00:00<04:06,  4.50it/s]

  0%|▏                                        | 4/1112 [00:00<04:04,  4.53it/s]

  0%|▏                                        | 5/1112 [00:01<04:02,  4.57it/s]

  1%|▏                                        | 6/1112 [00:01<04:03,  4.54it/s]

  1%|▎                                        | 7/1112 [00:01<04:05,  4.51it/s]

  1%|▎                                        | 8/1112 [00:01<04:02,  4.55it/s]

  1%|▎                                        | 9/1112 [00:01<04:02,  4.56it/s]

  1%|▎                                       | 10/1112 [00:02<04:01,  4.57it/s]

  1%|▍                                       | 11/1112 [00:02<04:03,  4.52it/s]

  1%|▍                    

 18%|███████                                | 202/1112 [00:45<03:22,  4.50it/s]

 18%|███████                                | 203/1112 [00:45<03:25,  4.43it/s]

 18%|███████▏                               | 204/1112 [00:45<03:20,  4.53it/s]

 18%|███████▏                               | 205/1112 [00:45<03:22,  4.47it/s]

 19%|███████▏                               | 206/1112 [00:46<03:20,  4.51it/s]

 19%|███████▎                               | 207/1112 [00:46<03:24,  4.42it/s]

 19%|███████▎                               | 208/1112 [00:46<03:24,  4.42it/s]

 19%|███████▎                               | 209/1112 [00:46<03:20,  4.50it/s]

 19%|███████▎                               | 210/1112 [00:46<03:14,  4.63it/s]

 19%|███████▍                               | 211/1112 [00:47<03:20,  4.50it/s]

 19%|███████▍                               | 212/1112 [00:47<03:22,  4.44it/s]

 19%|███████▍                               | 213/1112 [00:47<03:18,  4.54it/s]

 19%|███████▌               

 36%|██████████████▏                        | 404/1112 [01:30<02:43,  4.32it/s]

 36%|██████████████▏                        | 405/1112 [01:30<02:48,  4.19it/s]

 37%|██████████████▏                        | 406/1112 [01:30<02:44,  4.29it/s]

 37%|██████████████▎                        | 407/1112 [01:31<02:47,  4.20it/s]

 37%|██████████████▎                        | 408/1112 [01:31<02:46,  4.23it/s]

 37%|██████████████▎                        | 409/1112 [01:31<02:44,  4.28it/s]

 37%|██████████████▍                        | 410/1112 [01:31<02:38,  4.42it/s]

 37%|██████████████▍                        | 411/1112 [01:32<02:38,  4.42it/s]

 37%|██████████████▍                        | 412/1112 [01:32<02:37,  4.44it/s]

 37%|██████████████▍                        | 413/1112 [01:32<02:40,  4.37it/s]

 37%|██████████████▌                        | 414/1112 [01:32<02:35,  4.48it/s]

 37%|██████████████▌                        | 415/1112 [01:32<02:35,  4.49it/s]

 37%|██████████████▌        

 54%|█████████████████████▎                 | 606/1112 [02:15<01:48,  4.67it/s]

 55%|█████████████████████▎                 | 607/1112 [02:15<01:46,  4.73it/s]

 55%|█████████████████████▎                 | 608/1112 [02:15<01:45,  4.77it/s]

 55%|█████████████████████▎                 | 609/1112 [02:16<01:46,  4.72it/s]

 55%|█████████████████████▍                 | 610/1112 [02:16<01:48,  4.63it/s]

 55%|█████████████████████▍                 | 611/1112 [02:16<01:51,  4.49it/s]

 55%|█████████████████████▍                 | 612/1112 [02:16<01:52,  4.46it/s]

 55%|█████████████████████▍                 | 613/1112 [02:17<01:51,  4.46it/s]

 55%|█████████████████████▌                 | 614/1112 [02:17<01:52,  4.44it/s]

 55%|█████████████████████▌                 | 615/1112 [02:17<01:53,  4.40it/s]

 55%|█████████████████████▌                 | 616/1112 [02:17<01:50,  4.49it/s]

 55%|█████████████████████▋                 | 617/1112 [02:18<01:49,  4.51it/s]

 56%|█████████████████████▋ 

 73%|████████████████████████████▎          | 808/1112 [03:00<01:07,  4.53it/s]

 73%|████████████████████████████▎          | 809/1112 [03:00<01:06,  4.55it/s]

 73%|████████████████████████████▍          | 810/1112 [03:00<01:06,  4.52it/s]

 73%|████████████████████████████▍          | 811/1112 [03:00<01:06,  4.55it/s]

 73%|████████████████████████████▍          | 812/1112 [03:01<01:06,  4.53it/s]

 73%|████████████████████████████▌          | 813/1112 [03:01<01:05,  4.55it/s]

 73%|████████████████████████████▌          | 814/1112 [03:01<01:05,  4.58it/s]

 73%|████████████████████████████▌          | 815/1112 [03:01<01:05,  4.56it/s]

 73%|████████████████████████████▌          | 816/1112 [03:02<01:06,  4.42it/s]

 73%|████████████████████████████▋          | 817/1112 [03:02<01:06,  4.47it/s]

 74%|████████████████████████████▋          | 818/1112 [03:02<01:05,  4.48it/s]

 74%|████████████████████████████▋          | 819/1112 [03:02<01:04,  4.51it/s]

 74%|███████████████████████

 91%|██████████████████████████████████▌   | 1010/1112 [03:45<00:23,  4.39it/s]

 91%|██████████████████████████████████▌   | 1011/1112 [03:45<00:22,  4.39it/s]

 91%|██████████████████████████████████▌   | 1012/1112 [03:46<00:22,  4.47it/s]

 91%|██████████████████████████████████▌   | 1013/1112 [03:46<00:21,  4.55it/s]

 91%|██████████████████████████████████▋   | 1014/1112 [03:46<00:20,  4.68it/s]

 91%|██████████████████████████████████▋   | 1015/1112 [03:46<00:21,  4.59it/s]

 91%|██████████████████████████████████▋   | 1016/1112 [03:47<00:21,  4.45it/s]

 91%|██████████████████████████████████▊   | 1017/1112 [03:47<00:21,  4.39it/s]

 92%|██████████████████████████████████▊   | 1018/1112 [03:47<00:21,  4.29it/s]

 92%|██████████████████████████████████▊   | 1019/1112 [03:47<00:21,  4.33it/s]

 92%|██████████████████████████████████▊   | 1020/1112 [03:47<00:21,  4.33it/s]

 92%|██████████████████████████████████▉   | 1021/1112 [03:48<00:20,  4.45it/s]

 92%|███████████████████████

{'ner': 20.281712107937935}




  0%|                                                 | 0/1112 [00:00<?, ?it/s]

  0%|                                         | 1/1112 [00:00<04:03,  4.56it/s]

  0%|                                         | 2/1112 [00:00<04:04,  4.54it/s]

  0%|                                         | 3/1112 [00:00<04:15,  4.34it/s]

  0%|▏                                        | 4/1112 [00:00<04:22,  4.23it/s]

  0%|▏                                        | 5/1112 [00:01<04:20,  4.25it/s]

  1%|▏                                        | 6/1112 [00:01<04:12,  4.38it/s]

  1%|▎                                        | 7/1112 [00:01<04:08,  4.45it/s]

  1%|▎                                        | 8/1112 [00:01<04:10,  4.41it/s]

  1%|▎                                        | 9/1112 [00:02<04:10,  4.40it/s]

  1%|▎                                       | 10/1112 [00:02<04:05,  4.50it/s]

  1%|▍                                       | 11/1112 [00:02<04:04,  4.51it/s]

  1%|▍                    

 18%|███████                                | 202/1112 [00:45<03:32,  4.29it/s]

 18%|███████                                | 203/1112 [00:45<03:35,  4.21it/s]

 18%|███████▏                               | 204/1112 [00:46<03:36,  4.19it/s]

 18%|███████▏                               | 205/1112 [00:46<03:29,  4.33it/s]

 19%|███████▏                               | 206/1112 [00:46<03:25,  4.42it/s]

 19%|███████▎                               | 207/1112 [00:46<03:20,  4.51it/s]

 19%|███████▎                               | 208/1112 [00:46<03:20,  4.52it/s]

 19%|███████▎                               | 209/1112 [00:47<03:18,  4.54it/s]

 19%|███████▎                               | 210/1112 [00:47<03:21,  4.49it/s]

 19%|███████▍                               | 211/1112 [00:47<03:35,  4.17it/s]

 19%|███████▍                               | 212/1112 [00:47<03:35,  4.18it/s]

 19%|███████▍                               | 213/1112 [00:48<03:31,  4.26it/s]

 19%|███████▌               

 36%|██████████████▏                        | 404/1112 [01:30<02:37,  4.51it/s]

 36%|██████████████▏                        | 405/1112 [01:30<02:34,  4.59it/s]

 37%|██████████████▏                        | 406/1112 [01:30<02:33,  4.61it/s]

 37%|██████████████▎                        | 407/1112 [01:31<02:32,  4.61it/s]

 37%|██████████████▎                        | 408/1112 [01:31<02:34,  4.54it/s]

 37%|██████████████▎                        | 409/1112 [01:31<02:41,  4.36it/s]

 37%|██████████████▍                        | 410/1112 [01:31<02:42,  4.31it/s]

 37%|██████████████▍                        | 411/1112 [01:32<02:42,  4.32it/s]

 37%|██████████████▍                        | 412/1112 [01:32<02:41,  4.34it/s]

 37%|██████████████▍                        | 413/1112 [01:32<02:37,  4.43it/s]

 37%|██████████████▌                        | 414/1112 [01:32<02:33,  4.55it/s]

 37%|██████████████▌                        | 415/1112 [01:32<02:29,  4.65it/s]

 37%|██████████████▌        

 54%|█████████████████████▎                 | 606/1112 [02:15<01:50,  4.59it/s]

 55%|█████████████████████▎                 | 607/1112 [02:15<01:48,  4.67it/s]

 55%|█████████████████████▎                 | 608/1112 [02:15<01:47,  4.67it/s]

 55%|█████████████████████▎                 | 609/1112 [02:15<01:48,  4.63it/s]

 55%|█████████████████████▍                 | 610/1112 [02:16<01:47,  4.69it/s]

 55%|█████████████████████▍                 | 611/1112 [02:16<01:50,  4.54it/s]

 55%|█████████████████████▍                 | 612/1112 [02:16<01:50,  4.54it/s]

 55%|█████████████████████▍                 | 613/1112 [02:16<01:48,  4.58it/s]

 55%|█████████████████████▌                 | 614/1112 [02:16<01:48,  4.57it/s]

 55%|█████████████████████▌                 | 615/1112 [02:17<01:48,  4.60it/s]

 55%|█████████████████████▌                 | 616/1112 [02:17<01:48,  4.56it/s]

 55%|█████████████████████▋                 | 617/1112 [02:17<01:50,  4.47it/s]

 56%|█████████████████████▋ 

 73%|████████████████████████████▎          | 808/1112 [03:00<01:10,  4.33it/s]

 73%|████████████████████████████▎          | 809/1112 [03:00<01:10,  4.31it/s]

 73%|████████████████████████████▍          | 810/1112 [03:01<01:07,  4.46it/s]

 73%|████████████████████████████▍          | 811/1112 [03:01<01:08,  4.41it/s]

 73%|████████████████████████████▍          | 812/1112 [03:01<01:06,  4.52it/s]

 73%|████████████████████████████▌          | 813/1112 [03:01<01:04,  4.61it/s]

 73%|████████████████████████████▌          | 814/1112 [03:01<01:04,  4.59it/s]

 73%|████████████████████████████▌          | 815/1112 [03:02<01:03,  4.65it/s]

 73%|████████████████████████████▌          | 816/1112 [03:02<01:05,  4.50it/s]

 73%|████████████████████████████▋          | 817/1112 [03:02<01:04,  4.55it/s]

 74%|████████████████████████████▋          | 818/1112 [03:02<01:05,  4.46it/s]

 74%|████████████████████████████▋          | 819/1112 [03:03<01:05,  4.45it/s]

 74%|███████████████████████

 91%|██████████████████████████████████▌   | 1010/1112 [03:45<00:20,  4.92it/s]

 91%|██████████████████████████████████▌   | 1011/1112 [03:45<00:20,  4.88it/s]

 91%|██████████████████████████████████▌   | 1012/1112 [03:45<00:21,  4.59it/s]

 91%|██████████████████████████████████▌   | 1013/1112 [03:45<00:21,  4.56it/s]

 91%|██████████████████████████████████▋   | 1014/1112 [03:46<00:21,  4.64it/s]

 91%|██████████████████████████████████▋   | 1015/1112 [03:46<00:21,  4.46it/s]

 91%|██████████████████████████████████▋   | 1016/1112 [03:46<00:21,  4.44it/s]

 91%|██████████████████████████████████▊   | 1017/1112 [03:46<00:21,  4.45it/s]

 92%|██████████████████████████████████▊   | 1018/1112 [03:46<00:20,  4.51it/s]

 92%|██████████████████████████████████▊   | 1019/1112 [03:47<00:20,  4.60it/s]

 92%|██████████████████████████████████▊   | 1020/1112 [03:47<00:19,  4.65it/s]

 92%|██████████████████████████████████▉   | 1021/1112 [03:47<00:19,  4.55it/s]

 92%|███████████████████████

{'ner': 2.9451551475595212}




  0%|                                                 | 0/1112 [00:00<?, ?it/s]

  0%|                                         | 1/1112 [00:00<03:50,  4.81it/s]

  0%|                                         | 2/1112 [00:00<03:47,  4.88it/s]

  0%|                                         | 3/1112 [00:00<03:57,  4.68it/s]

  0%|▏                                        | 4/1112 [00:00<04:04,  4.54it/s]

  0%|▏                                        | 5/1112 [00:01<04:04,  4.53it/s]

  1%|▏                                        | 6/1112 [00:01<04:09,  4.43it/s]

  1%|▎                                        | 7/1112 [00:01<04:04,  4.51it/s]

  1%|▎                                        | 8/1112 [00:01<04:06,  4.48it/s]

  1%|▎                                        | 9/1112 [00:02<04:07,  4.46it/s]

  1%|▎                                       | 10/1112 [00:02<04:02,  4.55it/s]

  1%|▍                                       | 11/1112 [00:02<04:00,  4.57it/s]

  1%|▍                    

 18%|███████                                | 202/1112 [00:44<03:10,  4.78it/s]

 18%|███████                                | 203/1112 [00:44<03:13,  4.70it/s]

 18%|███████▏                               | 204/1112 [00:44<03:12,  4.73it/s]

 18%|███████▏                               | 205/1112 [00:44<03:08,  4.81it/s]

 19%|███████▏                               | 206/1112 [00:45<03:14,  4.67it/s]

 19%|███████▎                               | 207/1112 [00:45<03:12,  4.70it/s]

 19%|███████▎                               | 208/1112 [00:45<02:57,  5.08it/s]

 19%|███████▎                               | 209/1112 [00:45<02:56,  5.13it/s]

 19%|███████▎                               | 210/1112 [00:45<02:57,  5.07it/s]

 19%|███████▍                               | 211/1112 [00:46<02:58,  5.05it/s]

 19%|███████▍                               | 212/1112 [00:46<03:01,  4.96it/s]

 19%|███████▍                               | 213/1112 [00:46<03:04,  4.88it/s]

 19%|███████▌               

 36%|██████████████▏                        | 404/1112 [01:29<02:36,  4.53it/s]

 36%|██████████████▏                        | 405/1112 [01:29<02:39,  4.44it/s]

 37%|██████████████▏                        | 406/1112 [01:30<02:33,  4.59it/s]

 37%|██████████████▎                        | 407/1112 [01:30<02:34,  4.57it/s]

 37%|██████████████▎                        | 408/1112 [01:30<02:32,  4.60it/s]

 37%|██████████████▎                        | 409/1112 [01:30<02:30,  4.67it/s]

 37%|██████████████▍                        | 410/1112 [01:30<02:29,  4.69it/s]

 37%|██████████████▍                        | 411/1112 [01:31<02:31,  4.63it/s]

 37%|██████████████▍                        | 412/1112 [01:31<02:34,  4.52it/s]

 37%|██████████████▍                        | 413/1112 [01:31<02:30,  4.64it/s]

 37%|██████████████▌                        | 414/1112 [01:31<02:25,  4.79it/s]

 37%|██████████████▌                        | 415/1112 [01:31<02:24,  4.82it/s]

 37%|██████████████▌        

 54%|█████████████████████▎                 | 606/1112 [02:13<01:42,  4.96it/s]

 55%|█████████████████████▎                 | 607/1112 [02:13<01:47,  4.69it/s]

 55%|█████████████████████▎                 | 608/1112 [02:13<01:47,  4.69it/s]

 55%|█████████████████████▎                 | 609/1112 [02:13<01:43,  4.84it/s]

 55%|█████████████████████▍                 | 610/1112 [02:14<01:46,  4.73it/s]

 55%|█████████████████████▍                 | 611/1112 [02:14<01:46,  4.68it/s]

 55%|█████████████████████▍                 | 612/1112 [02:14<01:48,  4.59it/s]

 55%|█████████████████████▍                 | 613/1112 [02:14<01:48,  4.59it/s]

 55%|█████████████████████▌                 | 614/1112 [02:15<01:49,  4.54it/s]

 55%|█████████████████████▌                 | 615/1112 [02:15<01:49,  4.55it/s]

 55%|█████████████████████▌                 | 616/1112 [02:15<01:46,  4.67it/s]

 55%|█████████████████████▋                 | 617/1112 [02:15<01:48,  4.55it/s]

 56%|█████████████████████▋ 

 73%|████████████████████████████▎          | 808/1112 [02:57<01:06,  4.55it/s]

 73%|████████████████████████████▎          | 809/1112 [02:57<01:06,  4.54it/s]

 73%|████████████████████████████▍          | 810/1112 [02:58<01:08,  4.44it/s]

 73%|████████████████████████████▍          | 811/1112 [02:58<01:07,  4.43it/s]

 73%|████████████████████████████▍          | 812/1112 [02:58<01:07,  4.47it/s]

 73%|████████████████████████████▌          | 813/1112 [02:58<01:06,  4.52it/s]

 73%|████████████████████████████▌          | 814/1112 [02:58<01:05,  4.55it/s]

 73%|████████████████████████████▌          | 815/1112 [02:59<01:03,  4.65it/s]

 73%|████████████████████████████▌          | 816/1112 [02:59<01:05,  4.51it/s]

 73%|████████████████████████████▋          | 817/1112 [02:59<01:03,  4.62it/s]

 74%|████████████████████████████▋          | 818/1112 [02:59<01:04,  4.56it/s]

 74%|████████████████████████████▋          | 819/1112 [03:00<01:04,  4.53it/s]

 74%|███████████████████████

 91%|██████████████████████████████████▌   | 1010/1112 [03:43<00:23,  4.33it/s]

 91%|██████████████████████████████████▌   | 1011/1112 [03:43<00:22,  4.46it/s]

 91%|██████████████████████████████████▌   | 1012/1112 [03:43<00:22,  4.54it/s]

 91%|██████████████████████████████████▌   | 1013/1112 [03:44<00:21,  4.53it/s]

 91%|██████████████████████████████████▋   | 1014/1112 [03:44<00:21,  4.61it/s]

 91%|██████████████████████████████████▋   | 1015/1112 [03:44<00:20,  4.66it/s]

 91%|██████████████████████████████████▋   | 1016/1112 [03:44<00:20,  4.68it/s]

 91%|██████████████████████████████████▊   | 1017/1112 [03:44<00:20,  4.63it/s]

 92%|██████████████████████████████████▊   | 1018/1112 [03:45<00:20,  4.57it/s]

 92%|██████████████████████████████████▊   | 1019/1112 [03:45<00:20,  4.49it/s]

 92%|██████████████████████████████████▊   | 1020/1112 [03:45<00:20,  4.46it/s]

 92%|██████████████████████████████████▉   | 1021/1112 [03:45<00:20,  4.50it/s]

 92%|███████████████████████

{'ner': 6.6864881617164285}




  0%|                                                 | 0/1112 [00:00<?, ?it/s]

  0%|                                         | 1/1112 [00:00<04:20,  4.27it/s]

  0%|                                         | 2/1112 [00:00<04:20,  4.26it/s]

  0%|                                         | 3/1112 [00:00<04:18,  4.29it/s]

  0%|▏                                        | 4/1112 [00:00<04:25,  4.17it/s]

  0%|▏                                        | 5/1112 [00:01<04:27,  4.14it/s]

  1%|▏                                        | 6/1112 [00:01<04:15,  4.33it/s]

  1%|▎                                        | 7/1112 [00:01<04:13,  4.36it/s]

  1%|▎                                        | 8/1112 [00:01<04:16,  4.31it/s]

  1%|▎                                        | 9/1112 [00:02<04:20,  4.24it/s]

  1%|▎                                       | 10/1112 [00:02<04:19,  4.25it/s]

  1%|▍                                       | 11/1112 [00:02<04:11,  4.38it/s]

  1%|▍                    

 18%|███████                                | 202/1112 [00:45<03:26,  4.41it/s]

 18%|███████                                | 203/1112 [00:45<03:27,  4.37it/s]

 18%|███████▏                               | 204/1112 [00:46<03:29,  4.32it/s]

 18%|███████▏                               | 205/1112 [00:46<03:26,  4.39it/s]

 19%|███████▏                               | 206/1112 [00:46<03:24,  4.44it/s]

 19%|███████▎                               | 207/1112 [00:46<03:28,  4.34it/s]

 19%|███████▎                               | 208/1112 [00:47<03:27,  4.36it/s]

 19%|███████▎                               | 209/1112 [00:47<03:21,  4.49it/s]

 19%|███████▎                               | 210/1112 [00:47<03:21,  4.47it/s]

 19%|███████▍                               | 211/1112 [00:47<03:19,  4.52it/s]

 19%|███████▍                               | 212/1112 [00:47<03:19,  4.51it/s]

 19%|███████▍                               | 213/1112 [00:48<03:22,  4.43it/s]

 19%|███████▌               

 36%|██████████████▏                        | 404/1112 [01:30<02:39,  4.44it/s]

 36%|██████████████▏                        | 405/1112 [01:31<02:39,  4.44it/s]

 37%|██████████████▏                        | 406/1112 [01:31<02:41,  4.38it/s]

 37%|██████████████▎                        | 407/1112 [01:31<02:41,  4.36it/s]

 37%|██████████████▎                        | 408/1112 [01:31<02:36,  4.49it/s]

 37%|██████████████▎                        | 409/1112 [01:32<02:41,  4.35it/s]

 37%|██████████████▍                        | 410/1112 [01:32<02:41,  4.36it/s]

 37%|██████████████▍                        | 411/1112 [01:32<02:38,  4.43it/s]

 37%|██████████████▍                        | 412/1112 [01:32<02:32,  4.58it/s]

 37%|██████████████▍                        | 413/1112 [01:32<02:33,  4.56it/s]

 37%|██████████████▌                        | 414/1112 [01:33<02:33,  4.55it/s]

 37%|██████████████▌                        | 415/1112 [01:33<02:31,  4.61it/s]

 37%|██████████████▌        

 54%|█████████████████████▎                 | 606/1112 [02:16<01:54,  4.43it/s]

 55%|█████████████████████▎                 | 607/1112 [02:16<01:52,  4.50it/s]

 55%|█████████████████████▎                 | 608/1112 [02:16<01:51,  4.53it/s]

 55%|█████████████████████▎                 | 609/1112 [02:17<01:52,  4.46it/s]

 55%|█████████████████████▍                 | 610/1112 [02:17<01:48,  4.61it/s]

 55%|█████████████████████▍                 | 611/1112 [02:17<01:50,  4.53it/s]

 55%|█████████████████████▍                 | 612/1112 [02:17<01:50,  4.54it/s]

 55%|█████████████████████▍                 | 613/1112 [02:17<01:48,  4.58it/s]

 55%|█████████████████████▌                 | 614/1112 [02:18<01:48,  4.60it/s]

 55%|█████████████████████▌                 | 615/1112 [02:18<01:46,  4.66it/s]

 55%|█████████████████████▌                 | 616/1112 [02:18<01:44,  4.73it/s]

 55%|█████████████████████▋                 | 617/1112 [02:18<01:45,  4.69it/s]

 56%|█████████████████████▋ 

 73%|████████████████████████████▎          | 808/1112 [03:01<01:09,  4.38it/s]

 73%|████████████████████████████▎          | 809/1112 [03:01<01:07,  4.50it/s]

 73%|████████████████████████████▍          | 810/1112 [03:01<01:06,  4.51it/s]

 73%|████████████████████████████▍          | 811/1112 [03:01<01:07,  4.44it/s]

 73%|████████████████████████████▍          | 812/1112 [03:02<01:06,  4.49it/s]

 73%|████████████████████████████▌          | 813/1112 [03:02<01:05,  4.57it/s]

 73%|████████████████████████████▌          | 814/1112 [03:02<01:05,  4.53it/s]

 73%|████████████████████████████▌          | 815/1112 [03:02<01:05,  4.52it/s]

 73%|████████████████████████████▌          | 816/1112 [03:02<01:06,  4.46it/s]

 73%|████████████████████████████▋          | 817/1112 [03:03<01:06,  4.43it/s]

 74%|████████████████████████████▋          | 818/1112 [03:03<01:04,  4.55it/s]

 74%|████████████████████████████▋          | 819/1112 [03:03<01:03,  4.58it/s]

 74%|███████████████████████

 91%|██████████████████████████████████▌   | 1010/1112 [03:48<00:22,  4.51it/s]

 91%|██████████████████████████████████▌   | 1011/1112 [03:48<00:21,  4.61it/s]

 91%|██████████████████████████████████▌   | 1012/1112 [03:49<00:22,  4.54it/s]

 91%|██████████████████████████████████▌   | 1013/1112 [03:49<00:21,  4.53it/s]

 91%|██████████████████████████████████▋   | 1014/1112 [03:49<00:21,  4.58it/s]

 91%|██████████████████████████████████▋   | 1015/1112 [03:49<00:21,  4.60it/s]

 91%|██████████████████████████████████▋   | 1016/1112 [03:49<00:21,  4.51it/s]

 91%|██████████████████████████████████▊   | 1017/1112 [03:50<00:21,  4.44it/s]

 92%|██████████████████████████████████▊   | 1018/1112 [03:50<00:20,  4.51it/s]

 92%|██████████████████████████████████▊   | 1019/1112 [03:50<00:20,  4.54it/s]

 92%|██████████████████████████████████▊   | 1020/1112 [03:50<00:19,  4.60it/s]

 92%|██████████████████████████████████▉   | 1021/1112 [03:51<00:19,  4.67it/s]

 92%|███████████████████████

{'ner': 4.308871795468313e-09}
Entities in 'itching'
SYMPTOM itching
Saved model to C:\Users\Andriy\Desktop\chatbot
Loading from C:\Users\Andriy\Desktop\chatbot
SYMPTOM itching


In [12]:
n = model("I have headache")
for ent in n.ents:
        print(ent.label_, ent.text)

SYMPTOM headache


# Reading data

In [7]:
import pandas as pd

In [8]:
df = pd.read_csv('data.csv')

In [24]:
df.head(100)

Unnamed: 0,itching,skin_rash,nodal_skin_eruptions,continuous_sneezing,shivering,chills,joint_pain,stomach_pain,acidity,ulcers_on_tongue,...,blackheads,scurring,skin_peeling,silver_like_dusting,small_dents_in_nails,inflammatory_nails,blister,red_sore_around_nose,yellow_crust_ooze,prognosis
0,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Fungal infection
1,0,0,0,1,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Allergy
2,0,0,0,0,0,0,0,1,1,1,...,0,0,0,0,0,0,0,0,0,GERD
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Chronic cholestasis
4,1,1,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,Drug Reaction
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Peptic ulcer diseae
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,AIDS
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Diabetes
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Gastroenteritis
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Bronchial Asthma


# Counting symptoms repetition

In [10]:
symptom_map = {}
for key in df.columns:
    if key != 'prognosis':
        symptom_map[key] = 0
        

In [11]:
# for index, row in df.iterrows():
#     print(row['c1'])
#print(df[df.columns[2]])

#print(symptom_map)
keys = list(symptom_map.keys())
#print(keys[0])
#print((list(df[keys[0]]))
for key in keys:
    for exist in list(df[key]):
        #print(exist)
        symptom_map[key] += int(exist)

In [12]:
print(symptom_map)

{'itching': 6, 'skin_rash': 7, 'nodal_skin_eruptions': 1, 'continuous_sneezing': 2, 'shivering': 1, 'chills': 7, 'joint_pain': 6, 'stomach_pain': 2, 'acidity': 2, 'ulcers_on_tongue': 1, 'muscle_wasting': 1, 'vomiting': 17, 'burning_micturition': 2, 'spotting_ urination': 1, 'fatigue': 17, 'weight_gain': 1, 'anxiety': 1, 'cold_hands_and_feets': 1, 'mood_swings': 2, 'weight_loss': 4, 'restlessness': 2, 'lethargy': 4, 'patches_in_throat': 1, 'irregular_sugar_level': 1, 'cough': 5, 'high_fever': 12, 'sunken_eyes': 1, 'breathlessness': 4, 'sweating': 6, 'dehydration': 1, 'indigestion': 2, 'headache': 10, 'yellowish_skin': 8, 'dark_urine': 5, 'nausea': 10, 'loss_of_appetite': 10, 'pain_behind_the_eyes': 1, 'back_pain': 2, 'constipation': 2, 'abdominal_pain': 9, 'diarrhoea': 5, 'mild_fever': 3, 'yellow_urine': 1, 'yellowing_of_eyes': 7, 'acute_liver_failure': 1, 'fluid_overload': 0, 'swelling_of_stomach': 1, 'swelled_lymph_nodes': 3, 'malaise': 6, 'blurred_and_distorted_vision': 3, 'phlegm': 

# Counting top repeated symptoms

In [13]:
common_symptoms = []
for key in symptom_map:
    if symptom_map[key] > 10:
        common_symptoms.append(key)
        

In [14]:
print(common_symptoms)

['vomiting', 'fatigue', 'high_fever']


In [26]:
mean_symptoms_per_disease = 0
for index,row in df.iterrows():
    for key in keys:
        mean_symptoms_per_disease += row[key]
mean_symptoms_per_disease = mean_symptoms_per_disease/df.shape[0]
print(mean_symptoms_per_disease)

7.829268292682927
