# Named Entity Recognition (NER)   
This notebook illustrates how NER was used to identify names, organizations, locations, and more   
NER was used to identify and remove patient and doctor names for anonymization
NER was also used to remove location names, because explainability methods (SHAP, see the following notebooks) showed that the models overfitted by placing high emphasis on the regions patients were admitted to   
  
The NER model is a Hugging Face Transformers model released by the National Library of Sweden (Kungliga biblioteket, KB) 

In [None]:
from transformers import AutoModel, AutoTokenizer, pipeline
from datasets import load_dataset
from tqdm.notebook import tqdm
import torch

import numpy as np 
import pandas as pd

In [None]:
# Seed both PyTorch and NumPy with the same fixed value so that repeated runs
# of the notebook draw the same random numbers.
RANDOM_SEED = 0
torch.manual_seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

# Load the source CSV and give its columns the names used by the rest of the
# notebook. Assigning two names requires the file to have exactly two columns.
df = pd.read_csv("../data/dataset_no_recipe.csv")
df.columns = ["text", "label"]

# Write the renamed frame back out (without the index column) under a new name.
df.to_csv("../data/dataset_entity.csv", index=False)

print(df.shape)

In [None]:
# Build the Swedish NER pipeline from the KB BERT checkpoint.
# Use the first GPU when CUDA is available; otherwise fall back to the CPU
# (device=-1) so the notebook also runs on machines without a GPU.
nlp = pipeline(
    'ner',
    model='KB/bert-base-swedish-cased-ner',
    tokenizer='KB/bert-base-swedish-cased-ner',
    device=0 if torch.cuda.is_available() else -1,
)

# Quick smoke test on a sentence containing an organisation, a person name
# and a long number; the raw token-level predictions are kept in `res`.
res = nlp('Idag släpper KB tre språkmodeller. Jenny Petersson is a dentist for 4654548165.')

In [None]:
# Show every prediction returned for the smoke-test sentence, one per line.
for prediction in res:
    print(prediction)

In [None]:
def entity(text, l, ner=None):
    """Run NER on ``text`` and append the predicted tokens to ``l``.

    Tokens whose ``'word'`` starts with ``'##'`` are treated as continuation
    pieces and are concatenated (without the ``'##'`` prefix) onto the
    previous token added by this same call, so that a name split into
    several pieces is stored as a single entry.

    A continuation piece that has no preceding token from this call (the
    first returned token already starts with ``'##'``) is stored as its own
    entry with the prefix stripped. It is never glued onto a token left in
    ``l`` by an earlier call, and it no longer raises ``IndexError`` when
    ``l`` is empty.

    Args:
        text: The text to analyse.
        l: List of token dicts collected so far; extended in place.
        ner: Callable taking a string and returning an iterable of token
            dicts with at least a ``'word'`` key. Defaults to the
            module-level ``nlp`` pipeline.

    Returns:
        The same list object ``l``, after being extended.
    """
    if ner is None:
        ner = nlp

    # True once this call has appended at least one token, i.e. once l[-1]
    # is guaranteed to belong to the text currently being processed.
    added_here = False
    for token in ner(text):
        is_continuation = token['word'].startswith('##')
        if is_continuation and added_here:
            l[-1]['word'] += token['word'][2:]
            continue
        if is_continuation:
            # Orphan piece: keep it, but do not attach it to unrelated text.
            token['word'] = token['word'][2:]
        l.append(token)
        added_here = True
    return l

def check_len(text):
    """Split ``text`` into two parts at a space close to its midpoint.

    The cut is placed at the last space located at or before the midpoint.
    If there is none, the first space after the midpoint is used; if the
    text contains no space at all (including the empty string), it is cut
    exactly at the midpoint. The space itself stays at the start of the
    second part, so concatenating the two parts always yields ``text``.

    Args:
        text: The string to split.

    Returns:
        A ``(first, second)`` tuple of strings.
    """
    half = len(text) // 2
    # Last space at or before the midpoint (the slice end is exclusive).
    cut = text.rfind(' ', 0, half + 1)
    if cut == -1:
        # No space in the first half: look forward instead of wrapping
        # around into negative indices.
        cut = text.find(' ', half)
    if cut == -1:
        cut = half
    return text[:cut], text[cut:]

# Collect the NER predictions for every text in the dataset into `l`.
# Each text is split on '.' and every fragment is sent to the model
# separately; long fragments are cut further with check_len so that the
# model receives shorter inputs:
#   * more than 1000 characters -> two halves
#   * 4000 characters or more   -> four quarters
# (A fragment of exactly 4000 characters is now quartered as well; it used to
# match neither range and was passed on unsplit.)
l = []
for text in tqdm(df['text']):
    for sentence in text.split('.'):
        # split('.') yields empty/blank fragments (e.g. after a trailing
        # period); they cannot contain entities, so skip the model call.
        if not sentence.strip():
            continue

        if len(sentence) >= 4000:
            first_half, second_half = check_len(sentence)
            pieces = [*check_len(first_half), *check_len(second_half)]
        elif len(sentence) > 1000:
            pieces = list(check_len(sentence))
        else:
            pieces = [sentence]

        # Pieces are processed in their original left-to-right order.
        for piece in pieces:
            l = entity(piece, l)

In [None]:
# Display the text of the row labelled 6053 for manual inspection.
df.loc[6053, 'text']

In [None]:
# Keep only tokens tagged as persons ('PER') with a score above 0.95.
# `names` holds every occurrence in order; `dict` maps each word to the score
# of its most recent occurrence.
# NOTE(review): the variable `dict` shadows the builtin of the same name. The
# name is kept because later cells read it; a rename should be done across
# all cells at once.
dict = {}
names = []
for ent in l:
    if ent['entity'] != 'PER' or not ent['score'] > .95:
        continue
    word = ent['word']
    names.append(word)
    dict[word] = ent['score']

# The dictionary keys are the distinct names in order of first appearance.
unique_names = list(dict)
print(unique_names)


In [None]:
# List every collected name together with its stored score.
# (`dict` here is the name -> score mapping built earlier, not the builtin.)
for word, score in dict.items():
    print(word, score)

In [None]:
# Drop one-character "names". The list is rebuilt in a single pass and
# assigned back in place: calling remove() while iterating over the same list
# skips the element that follows each removal, so adjacent one-character
# entries used to survive.
unique_names[:] = [name for name in unique_names if len(name) != 1]

In [None]:
# Remove two known false positives ('Pat' and 'Akut' -- presumably clinical
# shorthand that the model tags as person names; confirm against the data).
# Filtering in place instead of calling remove() means the cell does not raise
# ValueError when an entry is absent, e.g. when the cell is run a second time.
unique_names[:] = [name for name in unique_names if name not in ('Pat', 'Akut')]

In [None]:
# Show the name list after the clean-up steps above it in the notebook.
print(unique_names)

In [None]:
import matplotlib.pyplot as plt

# Horizontal bar chart of the highest-scoring person names.
top = 30
excluded = ('Pat', 'Akut')  # same false positives that are removed from unique_names

# Take up to `top` names in descending score order, skipping the excluded
# ones. (`dict` is the name -> score mapping built earlier, not the builtin.)
top_dict = {}
for k, v in sorted(dict.items(), key=lambda item: item[1], reverse=True):
    if len(top_dict) == top:
        break
    if k not in excluded:
        top_dict[k] = v

# Size the axis from the number of names actually collected: with fewer than
# `top` qualifying names, positions built from `top` would not match the
# number of bars and barh would fail.
positions = np.arange(len(top_dict))

plt.rcdefaults()
fig, ax = plt.subplots()

ax.barh(positions, list(top_dict.values()), color='c')
ax.set_yticks(positions)
ax.set_yticklabels(list(top_dict.keys()))
ax.invert_yaxis()  # highest score at the top
ax.set_xlabel("Score")
ax.set_title("Top names")
# Narrow x-range chosen for this dataset, where the top scores are all close to 1.
plt.xlim([.9998, 0.99992])
plt.savefig('names.png', bbox_inches='tight')
plt.show()