### Loading the TSV DATA using Flair Corpus

In [1]:
from flair.data import Corpus
from flair.datasets import ColumnCorpus

# Column layout of the data files: column index -> annotation layer name.
columns = {
    0: 'text',
    1: 'pos',
    2: 'chunk',
    3: 'ner',
}
# Folder that holds the train/dev/test split files named below.
data_file = 'data/'

# Build the corpus from the three split files inside `data_file`.
corpus: Corpus = ColumnCorpus(
    data_folder=data_file,
    column_format=columns,
    train_file='conllpp_train.txt',
    test_file='conllpp_test.txt',
    dev_file='conllpp_dev.txt',
)



2022-02-26 18:33:35,471 Reading data from data
2022-02-26 18:33:35,472 Train: data\conllpp_train.txt
2022-02-26 18:33:35,472 Dev: data\conllpp_dev.txt
2022-02-26 18:33:35,472 Test: data\conllpp_test.txt


#### Taking a look at one sample sentence with its given NER tags

In [6]:
# Render the training sentence at index 35 as a string with its 'ner' tags inlined.
corpus.train[35].to_tagged_string('ner')

"State media quoted China <B-LOC> 's top negotiator with Taipei <B-LOC> , Tang <B-PER> Shubei <I-PER> , as telling a visiting group from Taiwan <B-LOC> on Wednesday that it was time for the rivals to hold political talks ."

#### Making the NER tag dictionary for training

In [11]:
# Annotation layer the tagger will learn to predict; reused when the model is defined.
tag_type = 'ner'
# Collect the set of tags for `tag_type` from the corpus.
# NOTE(review): the captured output of this cell only echoes this statement, which
# looks like a truncated deprecation warning — confirm against the installed flair version.
tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)

  tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)


#### Defining Embeddings to convert INPUT -> Embeddings

In [9]:
from flair.embeddings import WordEmbeddings, StackedEmbeddings, TokenEmbeddings
from typing import List
# Token-level embeddings to combine. Only the 'glove' word embeddings are enabled;
# other embeddings can be added to this list.
embedding_types: List[TokenEmbeddings] = [WordEmbeddings('glove')]

# Wrap the list in a single stacked-embeddings object for the tagger.
embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

#### Defining the Type of Model

In [13]:
from flair.models import SequenceTagger
# Sequence tagger built on the stacked embeddings, predicting `tag_type`
# labels taken from `tag_dictionary`, with the CRF option switched on.
tagger: SequenceTagger = SequenceTagger(
    hidden_size=256,
    embeddings=embeddings,
    tag_dictionary=tag_dictionary,
    tag_type=tag_type,
    use_crf=True,
)
# Print the module layout of the freshly built model.
print(tagger)

SequenceTagger(
  (embeddings): StackedEmbeddings(
    (list_embedding_0): WordEmbeddings(
      'glove'
      (embedding): Embedding(400001, 100)
    )
  )
  (word_dropout): WordDropout(p=0.05)
  (locked_dropout): LockedDropout(p=0.5)
  (embedding2nn): Linear(in_features=100, out_features=100, bias=True)
  (rnn): LSTM(100, 256, batch_first=True, bidirectional=True)
  (linear): Linear(in_features=512, out_features=11, bias=True)
  (beta): 1.0
  (weights): None
  (weight_tensor) None
)


### Defining Training Parameters

**Parameters:**  
learning_rate: "0.1"  
mini_batch_size: "32"  
patience: "3"  
anneal_factor: "0.5"  
max_epochs: "150"  
shuffle: "True"  
train_with_dev: "False"  
batch_growth_annealing: "False"  

In [14]:
from flair.trainers import ModelTrainer
trainer: ModelTrainer = ModelTrainer(tagger, corpus)
# The first argument is the output folder; later cells read final-model.pt and
# test.tsv from it. The call is the last expression, so its return value is
# what the cell displays.
trainer.train(
    'resources/taggers/example-ner',
    learning_rate=0.1,
    mini_batch_size=32,
    max_epochs=150,
)

2022-02-26 18:43:53,563 ----------------------------------------------------------------------------------------------------
2022-02-26 18:43:53,564 Model: "SequenceTagger(
  (embeddings): StackedEmbeddings(
    (list_embedding_0): WordEmbeddings(
      'glove'
      (embedding): Embedding(400001, 100)
    )
  )
  (word_dropout): WordDropout(p=0.05)
  (locked_dropout): LockedDropout(p=0.5)
  (embedding2nn): Linear(in_features=100, out_features=100, bias=True)
  (rnn): LSTM(100, 256, batch_first=True, bidirectional=True)
  (linear): Linear(in_features=512, out_features=11, bias=True)
  (beta): 1.0
  (weights): None
  (weight_tensor) None
)"
2022-02-26 18:43:53,564 ----------------------------------------------------------------------------------------------------
2022-02-26 18:43:53,565 Corpus: "Corpus: 14987 train + 3466 dev + 3684 test sentences"
2022-02-26 18:43:53,566 ----------------------------------------------------------------------------------------------------
2022-02-26 18:4

{'test_score': 0.879563350035791,
 'dev_score_history': [0.6909984871406959,
  0.7693317086887244,
  0.8213519313304721,
  0.8342826628693485,
  0.8522300776817665,
  0.859655293703834,
  0.8629459248329139,
  0.868693909558247,
  0.8715236307797303,
  0.8771080169506184,
  0.8813969571230983,
  0.8859550073829584,
  0.8830358690001733,
  0.8853305785123967,
  0.8800480150904569,
  0.8919222462203023,
  0.8895584057846259,
  0.8843806412791198,
  0.8913902732428253,
  0.8886790833404857,
  0.8959890062698617,
  0.8968548039638088,
  0.9001114636028467,
  0.8966108966108967,
  0.8994518670777664,
  0.89909521757863,
  0.9013310433662517,
  0.9005988023952095,
  0.9016844276383638,
  0.9005397070161913,
  0.9023887265853239,
  0.9040460441542822,
  0.9046844223687591,
  0.9027064063035287,
  0.9005486968449932,
  0.9036897525896755,
  0.9058288265087132,
  0.9047741053890169,
  0.9046475733150403,
  0.9064166523309823,
  0.9049664890874721,
  0.9056120269923977,
  0.9056150878696958,
  0

#### Testing Model on Sample Doc

In [18]:
from flair.data import Sentence
from flair.models import SequenceTagger
# Load the trained model from the same relative output folder that the training
# call was given, rather than a machine-specific absolute path, so the notebook
# runs on any checkout when started from the project directory.
model = SequenceTagger.load('resources/taggers/example-ner/final-model.pt')
# create example sentence
sentence = Sentence('I love Berlin')
# predict the tags; the result is read back from `sentence` below
model.predict(sentence)
print(sentence.to_tagged_string())

2022-02-26 19:53:31,374 loading file C:/Users/karti/Data Science/VMock_NER/CrossWeigh/resources/taggers/example-ner/final-model.pt
I love Berlin <B-LOC>


## Model Results on conllpp_test.txt

Results:
- F-score (micro) 0.8796
- F-score (macro) 0.8621
- Accuracy 0.8067

#### Function to Convert Output TSV to Dataframe

In [60]:
import logging
import pandas as pd

# function to get json from tab separated data
def generate_examples(filepath):
    """Yield one example dict per sentence from a tagged prediction file.

    Every data line is expected to hold at least three space-separated
    columns: token, gold NER tag, predicted NER tag. Sentences are
    separated by blank (or whitespace-only) lines or by lines that start
    with ``-DOCSTART-``.

    Args:
        filepath: Path of the UTF-8 encoded text file to read.

    Yields:
        dict with the keys ``"id"`` (running sentence counter as a string),
        ``"tokens"``, ``"ner_tags"`` and ``"pred_ner_tags"``; the three
        lists are parallel, one entry per token.

    Raises:
        IndexError: If a data line has fewer than three columns.
    """
    logging.info("⏳ Generating examples from = %s", filepath)

    def _example(guid, tokens, ner_tags, pred_ner_tags):
        # Bundle the columns collected for one sentence into a record.
        return {
            "id": str(guid),
            "tokens": tokens,
            "ner_tags": ner_tags,
            "pred_ner_tags": pred_ner_tags,
        }

    with open(filepath, encoding="utf-8") as f:
        guid = 0
        tokens, ner_tags, pred_ner_tags = [], [], []
        for line in f:
            # Remove the line terminator up front, so the last column is never
            # truncated when the final line of the file has no newline.
            line = line.rstrip("\r\n")
            if line.startswith("-DOCSTART-") or not line.strip():
                # Sentence boundary: flush whatever has been collected so far.
                if tokens:
                    yield _example(guid, tokens, ner_tags, pred_ner_tags)
                    guid += 1
                    tokens, ner_tags, pred_ner_tags = [], [], []
            else:
                # columns are space separated: token, gold tag, predicted tag
                splits = line.split(" ")
                tokens.append(splits[0])
                ner_tags.append(splits[1])
                pred_ner_tags.append(splits[2])
        # Last sentence when the file does not end with a separator line.
        if tokens:
            yield _example(guid, tokens, ner_tags, pred_ner_tags)
            
# Parse test.tsv from the folder that was passed to trainer.train();
# generate_examples is a generator function, so nothing is read until it is consumed.
test_data = generate_examples('resources/taggers/example-ner/test.tsv')
# Consume the generator: each yielded dict becomes one DataFrame row.
test_df   = pd.DataFrame(test_data)
test_df.head()

Unnamed: 0,id,tokens,ner_tags,pred_ner_tags
0,0,"[SOCCER, -, JAPAN, GET, LUCKY, WIN, ,, CHINA, ...","[O, O, B-LOC, O, O, O, O, B-LOC, O, O, O, O]","[O, O, B-LOC, O, O, O, O, B-LOC, O, O, O, O]"
1,1,"[Nadim, Ladki]","[B-PER, I-PER]","[B-PER, I-PER]"
2,2,"[AL-AIN, ,, United, Arab, Emirates, 1996-12-06]","[B-LOC, O, B-LOC, I-LOC, I-LOC, O]","[B-LOC, O, B-LOC, I-LOC, I-LOC, O]"
3,3,"[Japan, began, the, defence, of, their, Asian,...","[B-LOC, O, O, O, O, O, B-MISC, I-MISC, O, O, O...","[B-LOC, O, O, O, O, O, B-MISC, I-MISC, O, O, O..."
4,4,"[But, China, saw, their, luck, desert, them, i...","[O, B-LOC, O, O, O, O, O, O, O, O, O, O, O, O,...","[O, B-LOC, O, O, O, O, O, O, O, O, O, O, O, O,..."


#### Function to Create Evaluation Metric

In [68]:
import numpy as np

def evaluate(pred, true):
    """Return the fraction of positions where `pred` and `true` agree.

    Args:
        pred: list of predicted tags.
        true: list of true tags; must have the same length as `pred`.

    Returns:
        Number of matching positions divided by ``len(pred)``.
    """
    assert len(pred) == len(true)
    predicted = np.array(pred)
    expected = np.array(true)
    # Element-wise comparison; summing the booleans counts the matches.
    n_correct = sum(predicted == expected)
    return n_correct / len(pred)

In [69]:
# Score every sentence by comparing its predicted tags with its gold tags.
test_df['evalscores'] = [
    evaluate(pred_tags, gold_tags)
    for pred_tags, gold_tags in zip(test_df['pred_ner_tags'], test_df['ner_tags'])
]
test_df.head()

Unnamed: 0,id,tokens,ner_tags,pred_ner_tags,evalscores
0,0,"[SOCCER, -, JAPAN, GET, LUCKY, WIN, ,, CHINA, ...","[O, O, B-LOC, O, O, O, O, B-LOC, O, O, O, O]","[O, O, B-LOC, O, O, O, O, B-LOC, O, O, O, O]",1.0
1,1,"[Nadim, Ladki]","[B-PER, I-PER]","[B-PER, I-PER]",1.0
2,2,"[AL-AIN, ,, United, Arab, Emirates, 1996-12-06]","[B-LOC, O, B-LOC, I-LOC, I-LOC, O]","[B-LOC, O, B-LOC, I-LOC, I-LOC, O]",1.0
3,3,"[Japan, began, the, defence, of, their, Asian,...","[B-LOC, O, O, O, O, O, B-MISC, I-MISC, O, O, O...","[B-LOC, O, O, O, O, O, B-MISC, I-MISC, O, O, O...",1.0
4,4,"[But, China, saw, their, luck, desert, them, i...","[O, B-LOC, O, O, O, O, O, O, O, O, O, O, O, O,...","[O, B-LOC, O, O, O, O, O, O, O, O, O, O, O, O,...",1.0


#### 5 of the Perfectly predicted 

In [73]:
# Lift the column-width limit so long token/tag lists are shown in full.
pd.set_option('display.max_colwidth', None)
# First five sentences whose score is exactly 1.
test_df.loc[test_df['evalscores'].eq(1)].head(5)

Unnamed: 0,id,tokens,ner_tags,pred_ner_tags,evalscores
0,0,"[SOCCER, -, JAPAN, GET, LUCKY, WIN, ,, CHINA, IN, SURPRISE, DEFEAT, .]","[O, O, B-LOC, O, O, O, O, B-LOC, O, O, O, O]","[O, O, B-LOC, O, O, O, O, B-LOC, O, O, O, O]",1.0
1,1,"[Nadim, Ladki]","[B-PER, I-PER]","[B-PER, I-PER]",1.0
2,2,"[AL-AIN, ,, United, Arab, Emirates, 1996-12-06]","[B-LOC, O, B-LOC, I-LOC, I-LOC, O]","[B-LOC, O, B-LOC, I-LOC, I-LOC, O]",1.0
3,3,"[Japan, began, the, defence, of, their, Asian, Cup, title, with, a, lucky, 2-1, win, against, Syria, in, a, Group, C, championship, match, on, Friday, .]","[B-LOC, O, O, O, O, O, B-MISC, I-MISC, O, O, O, O, O, O, O, B-LOC, O, O, O, O, O, O, O, O, O]","[B-LOC, O, O, O, O, O, B-MISC, I-MISC, O, O, O, O, O, O, O, B-LOC, O, O, O, O, O, O, O, O, O]",1.0
4,4,"[But, China, saw, their, luck, desert, them, in, the, second, match, of, the, group, ,, crashing, to, a, surprise, 2-0, defeat, to, newcomers, Uzbekistan, .]","[O, B-LOC, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, B-LOC, O]","[O, B-LOC, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, B-LOC, O]",1.0


#### 5 of the Poorly predicted 

In [74]:
# First five sentences whose score is exactly 0, i.e. no predicted tag matched its gold tag.
test_df[test_df['evalscores'] == 0].head(5)

Unnamed: 0,id,tokens,ner_tags,pred_ner_tags,evalscores
225,225,[REUTER],[B-ORG],[B-PER],0.0
359,359,"[NORTHEAST, DIVISION]","[O, O]","[B-MISC, I-MISC]",0.0
385,385,"[PACIFIC, DIVISION]","[B-LOC, O]","[B-MISC, I-MISC]",0.0
457,457,"[MIDWEST, DIVISION]","[O, O]","[B-MISC, I-MISC]",0.0
466,466,"[PACIFIC, DIVISION]","[B-LOC, O]","[B-MISC, I-MISC]",0.0
