<a href="https://colab.research.google.com/github/LUMII-AILab/NLP_Course/blob/main/notebooks/NER.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Setting up the environment

In [None]:
!pip install flair

In [None]:
from flair.data import Sentence
from flair.nn import Classifier
from flair.data import Corpus
from flair.trainers import ModelTrainer
from flair.models import SequenceTagger
from flair.datasets import ColumnCorpus
from flair.embeddings import TokenEmbeddings, WordEmbeddings, StackedEmbeddings, PooledFlairEmbeddings
from typing import List

___
NER tagging

In [None]:
# Taggers already loaded in this kernel session, keyed by the `model` argument,
# so repeated ner_tag() calls do not reload the same model every time.
_TAGGER_CACHE = {}


def ner_tag(sentence, model='ner'):
    """
    Run a NER tagger over a piece of text.

    Parameters
    ----------
    sentence : the raw text; it is wrapped in a ``Sentence`` before tagging.
    model : identifier handed to ``Classifier.load`` (defaults to 'ner').

    Returns
    -------
    The ``Sentence`` object after ``predict`` has been called on it.
    """
    # make a sentence
    sentence = Sentence(sentence)

    # load the NER tagger only the first time this model is requested
    if model not in _TAGGER_CACHE:
        _TAGGER_CACHE[model] = Classifier.load(model)
    tagger = _TAGGER_CACHE[model]

    # run NER over sentence (annotations are stored on the sentence itself)
    tagger.predict(sentence)

    return sentence

In [None]:
# Tag one example sentence; no model is passed, so ner_tag uses its default ('ner').
example = "George Washington was the first president of the United States of America."
sentence = ner_tag(example)
# print the sentence with all annotations
print(sentence)

In [None]:
# Print every span stored on the sentence under the 'ner' label type, one per line.
for entity in sentence.get_spans('ner'):
    print(entity)

___
**Different NER models offered by Flair**
___
The standard Flair NER model offers 4 classes:
* PER (person),
* ORG (organization),
* LOC (location),
* MISC (miscellaneous)


Alternatively, the 'ner-ontonotes-large' model offers 18 separate classes.

In [None]:
sentence = 'On September 1st George won 1 dollar while watching Game of Thrones.'

# A notebook cell only displays its last expression, so each result is printed
# explicitly; otherwise the standard model's output would be silently dropped.

# Standard Flair NER model
print(ner_tag(sentence))
# Expanded NER model
print(ner_tag(sentence, 'ner-ontonotes-large'))

### Training a small custom Flair NER model


Example code for training an English NER model: https://github.com/flairNLP/flair/blob/master/resources/docs/EXPERIMENTS.md

In [None]:
from flair.data import Corpus
from flair.datasets import CONLL_03
from flair.embeddings import TokenEmbeddings, WordEmbeddings, StackedEmbeddings, PooledFlairEmbeddings
from typing import List

# Create the folder the generated data files are written to. exist_ok=True keeps
# this cell re-runnable: a plain `mkdir` reports an error once the folder exists.
import os
os.makedirs('train', exist_ok=True)

In [None]:
import pandas as pd
from tqdm import tqdm
from difflib import SequenceMatcher
import re
import pickle

def matcher(string, pattern):
    '''
    Locate the first occurrence of a pattern in the text.

    The pattern is stripped of surrounding whitespace before searching.

    Returns a tuple ``(match_list, string)``:
      * ``match_list`` is ``[(start, end)]`` for the first occurrence
        (``end`` exclusive), or ``[]`` when the pattern is not present;
      * ``string`` is the input with that occurrence overwritten by "X"
        characters of the same length, so all other offsets stay valid.
        It is returned unchanged when nothing matched.
    '''
    pattern = pattern.strip()

    # A plain substring search yields the same earliest full match that the
    # previous SequenceMatcher-based lookup produced, without building its index.
    start = string.find(pattern)
    if start == -1:
        return [], string

    end = start + len(pattern)
    masked = string[:start] + "X" * len(pattern) + string[end:]
    return [(start, end)], masked

def mark_sentence(s, match_list):
    '''
    Build a word -> BIO tag mapping for one sentence.

    Every whitespace-separated word of ``s`` starts out tagged 'O'. For each
    ``(start, end, e_type)`` in ``match_list`` the words of ``s[start:end]``
    are re-tagged: the first one 'B-<e_type>', the remaining ones 'I-<e_type>'.

    Note: the mapping is keyed by the word text, so a word that occurs several
    times in the sentence has a single entry and a single tag.
    '''
    tags = dict.fromkeys(s.split(), 'O')

    for begin, stop, label in match_list:
        span_text = s[begin:stop]
        pieces = span_text.split()

        if len(pieces) <= 1:
            # single-word (or empty) span: keyed by the raw slice itself
            tags[span_text] = 'B-' + label
            continue

        head, *rest = pieces
        tags[head] = 'B-' + label
        for piece in rest:
            tags[piece] = 'I-' + label

    return tags

def clean(text):
    '''
    Helper function that inserts one space before each punctuation character
    listed in ``filters``, so that whitespace tokenization splits it off from
    the preceding word.

    Each character is handled exactly once in a single pass, so a punctuation
    mark that occurs several times in the text still gets exactly one space
    in front of every occurrence.
    '''
    filters = {"!", "#", "$", "%", "&", "(", ")", "/", "*", ".", ":", ";", "<", "=", ">", "?", "@", "[",
               "\\", "]", "_", "`", "{", "}", "~", "'"}

    return "".join(" " + ch if ch in filters else ch for ch in text)

def _bio_tag_tokens(text, spans):
    '''
    Pair every whitespace-separated token of ``text`` with its BIO tag.

    ``spans`` holds ``(start, end, e_type)`` character ranges. A token that
    overlaps a span is tagged 'B-<e_type>' when it is the first token of that
    span and 'I-<e_type>' otherwise; all other tokens are tagged 'O'.
    Tokens are returned in sentence order and repeated words are kept.
    '''
    tagged = []
    for tok in re.finditer(r'\S+', text):
        tag = 'O'
        for start, end, e_type in spans:
            if tok.start() < end and tok.end() > start:
                # the token that covers the span's first character opens it
                prefix = 'B-' if tok.start() <= start else 'I-'
                tag = prefix + e_type
                break
        tagged.append((tok.group(), tag))
    return tagged


def create_data(df, filepath):
    '''
    Write the annotated sentences of ``df`` to ``filepath`` in two-column
    format: one "token tag" line per token, and an empty line after each
    sentence.

    ``df.text`` yields the raw sentences and ``df.annotation`` yields, per
    sentence, a list of ``(entity_text, entity_type)`` pairs.

    Raises IndexError when an annotated entity cannot be found in its sentence.
    '''
    with open(filepath, 'w', encoding='utf-8') as f:
        for text, annotation in zip(df.text, df.annotation):
            text = clean(text)
            # working copy in which located entities are masked out, so a
            # repeated entity is matched at its next occurrence, not the same one
            masked = text
            spans = []
            for ann in annotation:
                # the entity goes through the same clean() as the sentence so
                # punctuation inside it is spaced the same way
                found, masked = matcher(masked, clean(ann[0]))
                if not found:
                    raise IndexError(
                        f"annotation {ann[0]!r} not found in sentence {text!r}")
                start, end = found[0]
                spans.append((start, end, ann[1]))

            for token, tag in _bio_tag_tokens(text, spans):
                f.write(token + ' ' + tag + '\n')
            f.write('\n')

def create_set(file, data):
    '''
    Write ``data`` to the text file at path ``file`` by delegating to create_data.
    '''
    create_data(data, file)



def example_set():
    '''
    Write the small example train, test and dev sets (in that order) to
    train/train.txt, train/test.txt and train/dev.txt via create_set.

    Each row is a sentence plus its list of (entity text, entity type) pairs.
    '''
    splits = [
        # Train data
        ('train/train.txt', [
            ["Horses are too tall and they pretend to care about your feelings", [("Horses", "ANIMAL")]],
            ["Who is Shaka Khan?", [("Shaka Khan", "PERSON")]],
            ["I like London and Berlin.", [("London", "LOCATION"), ("Berlin", "LOCATION")]],
            ["There is a banyan tree in the courtyard", [("banyan tree", "TREE")]],
            ["John Doe lives near Central Park.", [("John Doe", "PERSON"), ("Central Park", "LOCATION")]],
        ]),
        # Test data
        ('train/test.txt', [
            ["I have 6 horses in my barn.", [("horses", "ANIMAL")]],
            ["Did John go to Berlin?", [("John", "PERSON"), ("Berlin", "LOCATION")]],
        ]),
        # Dev data
        ('train/dev.txt', [
            ["I love visiting London!", [("London", "LOCATION")]],
            ["Shaka Khan , where are my horses?", [("Shaka Khan", "PERSON"), ("horses", "ANIMAL")]],
        ]),
    ]

    for target_path, rows in splits:
        frame = pd.DataFrame(rows, columns=['text', 'annotation'])
        create_set(target_path, frame)


# Generate the example train/test/dev files (written to relative paths under train/).
example_set()

In [None]:
# define columns
columns = {0 : 'text', 1 : 'ner'}

# directory where the data resides
data_folder = 'train/'

# initializing the corpus
corpus: Corpus = ColumnCorpus(data_folder, columns,
                              train_file = 'train.txt',
                              test_file = 'test.txt',
                              dev_file = 'dev.txt')

In [None]:
# tag to predict
tag_type = 'ner'

# make tag dictionary from the corpus
tag_dictionary = corpus.make_label_dictionary(label_type=tag_type)

In [None]:
# initialize embeddings
embedding_types: List[TokenEmbeddings] = [

    # GloVe embeddings
    WordEmbeddings('glove'),

    # contextual string embeddings, forward
    PooledFlairEmbeddings('news-forward', pooling='min'),

    # contextual string embeddings, backward
    PooledFlairEmbeddings('news-backward', pooling='min'),
]


embeddings : StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

tagger : SequenceTagger = SequenceTagger(hidden_size=256,
                                       embeddings=embeddings,
                                       tag_dictionary=tag_dictionary,
                                       tag_type=tag_type,
                                       use_crf=True)
print(tagger)

In [None]:
# trainer pairing the tagger defined above with the example corpus
trainer: ModelTrainer = ModelTrainer(tagger, corpus)

# train for at most 50 epochs, using 'resources/taggers/example-ner' as the output folder
# NOTE(review): train_with_dev=True presumably adds the dev split to the training data — confirm
trainer.train('resources/taggers/example-ner',
              train_with_dev=True,
              max_epochs=50)

In [None]:
# load the trained model
model = SequenceTagger.load('resources/taggers/example-ner/final-model.pt')

# create example sentence
sentence = Sentence('I love Hokkaido')

# predict the tags
model.predict(sentence)

print(sentence.to_tagged_string())