<a href="https://colab.research.google.com/github/LUMII-AILab/NLP_Course/blob/main/notebooks/NER.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Setting up the environment

In [None]:
!pip install flair

In [None]:
from flair.data import Sentence
from flair.nn import Classifier
from flair.data import Corpus
from flair.trainers import ModelTrainer
from flair.models import SequenceTagger
from flair.datasets import ColumnCorpus
from flair.embeddings import TokenEmbeddings, WordEmbeddings, StackedEmbeddings, PooledFlairEmbeddings
from typing import List

___
NER tagging

In [None]:
def ner_tag(sentence, model='ner'):
    """Run a Flair NER model over a piece of text.

    Args:
        sentence: The raw text to annotate; it is wrapped in a Flair ``Sentence``.
        model: Model identifier handed to ``Classifier.load`` (default ``'ner'``).

    Returns:
        The ``Sentence`` object after ``predict`` has been run on it.
    """
    # Keep loaded taggers on the function object so that repeated calls with
    # the same model reuse the tagger instead of loading it again each time.
    taggers = ner_tag.__dict__.setdefault('_taggers', {})
    if model not in taggers:
        taggers[model] = Classifier.load(model)

    # Use a separate name so the text argument is not shadowed
    tagged = Sentence(sentence)

    # run NER over the sentence
    taggers[model].predict(tagged)

    return tagged

In [None]:
# Text to annotate; ner_tag is called without a model argument, so its default 'ner' is used
example = "George Washington was the first president of the United States of America."
# ner_tag returns the Sentence object after prediction has been run on it
sentence = ner_tag(example)
# print the sentence with all annotations
print(sentence)

In [None]:
# Show each span labelled under 'ner' on its own line
for span in sentence.get_spans('ner'):
    print(span)

___
**Different NER models offered by Flair**
___
The standard Flair NER model offers 4 classes:
* PER (person),
* ORG (organization),
* LOC (location),
* MISC (miscellaneous)


Alternatively, the 'ner-ontonotes-large' model offers 18 separate classes.

In [None]:
sentence = 'On September 1st George won 1 dollar while watching Game of Thrones.'

# Only the last expression of a cell is displayed automatically, so both
# results are printed explicitly to make the two models comparable.

# Standard Flair NER model
print(ner_tag(sentence))
# Expanded NER model
print(ner_tag(sentence, 'ner-ontonotes-large'))

2024-05-29 10:36:04,976 SequenceTagger predicts: Dictionary with 20 tags: <unk>, O, S-ORG, S-MISC, B-PER, E-PER, S-LOC, B-ORG, E-ORG, I-PER, S-PER, B-MISC, I-MISC, E-MISC, I-ORG, B-LOC, E-LOC, I-LOC, <START>, <STOP>


pytorch_model.bin:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/616 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

2024-05-29 10:36:55,027 SequenceTagger predicts: Dictionary with 76 tags: <unk>, O, B-CARDINAL, E-CARDINAL, S-PERSON, S-CARDINAL, S-PRODUCT, B-PRODUCT, I-PRODUCT, E-PRODUCT, B-WORK_OF_ART, I-WORK_OF_ART, E-WORK_OF_ART, B-PERSON, E-PERSON, S-GPE, B-DATE, I-DATE, E-DATE, S-ORDINAL, S-LANGUAGE, I-PERSON, S-EVENT, S-DATE, B-QUANTITY, E-QUANTITY, S-TIME, B-TIME, I-TIME, E-TIME, B-GPE, E-GPE, S-ORG, I-GPE, S-NORP, B-FAC, I-FAC, E-FAC, B-NORP, E-NORP, S-PERCENT, B-ORG, E-ORG, B-LANGUAGE, E-LANGUAGE, I-CARDINAL, I-ORG, S-WORK_OF_ART, I-QUANTITY, B-MONEY


Sentence[13]: "On September 1st George won 1 dollar while watching Game of Thrones." → ["September 1st"/DATE, "George"/PERSON, "1 dollar"/MONEY, "Game of Thrones"/WORK_OF_ART]

### Training a custom Flair NER model


Example code for training an English NER model: https://github.com/flairNLP/flair/blob/master/resources/docs/EXPERIMENTS.md

In [None]:
!git clone https://github.com/flairNLP/CleanCoNLL.git

In [None]:
%cd CleanCoNLL/
!chmod u+x create_cleanconll_from_conll03.sh
!SCRIPT_ROOT=. bash create_cleanconll_from_conll03.sh
%cd ..

In [None]:
# Column layout of the data files: column index -> annotation name
columns = {
    0: 'text',
    1: 'pos',
    2: 'wiki',
    3: 'ner',
    4: 'ner_old',
}

# Directory where the data resides
data_folder = 'CleanCoNLL/data/cleanconll/'

# Build the corpus from the train / dev / test files in that directory
corpus: Corpus = ColumnCorpus(
    data_folder,
    columns,
    train_file='cleanconll.train',
    dev_file='cleanconll.dev',
    test_file='cleanconll.test',
)

In [None]:
# Label type to predict; 'ner' is one of the column names given to the corpus
tag_type = 'ner'

# Build the label dictionary for that label type from the corpus
tag_dictionary = corpus.make_label_dictionary(label_type=tag_type)

In [None]:
# Embeddings to combine: GloVe word embeddings plus forward and backward
# contextual string embeddings (both pooled with pooling='min')
embedding_types: List[TokenEmbeddings] = [
    WordEmbeddings('glove'),
    PooledFlairEmbeddings('news-forward', pooling='min'),
    PooledFlairEmbeddings('news-backward', pooling='min'),
]

# Stack the individual embeddings into one embedding object
embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

# Sequence tagger over the stacked embeddings, with the CRF option enabled
tagger: SequenceTagger = SequenceTagger(
    embeddings=embeddings,
    tag_dictionary=tag_dictionary,
    tag_type=tag_type,
    hidden_size=256,
    use_crf=True,
)
print(tagger)

In [None]:
# Train the tagger on the corpus for at most 10 epochs, using
# 'resources/taggers/example-ner' as the output path
trainer: ModelTrainer = ModelTrainer(tagger, corpus)
trainer.train(
    'resources/taggers/example-ner',
    max_epochs=10,
    train_with_dev=True,
)

In [None]:
# Restore the tagger from the final-model.pt file in the training output path
model = SequenceTagger.load('resources/taggers/example-ner/final-model.pt')

# Tag a short example sentence and print it with its predicted tags
sentence = Sentence('I love Berlin')
model.predict(sentence)
print(sentence.to_tagged_string())