# Preamble

In [1]:
from flair.datasets import ColumnCorpus # Used to load and preprocess text data for NLP tasks
from flair.embeddings import FlairEmbeddings # Representations of words in vector form
from flair.embeddings import CharacterEmbeddings # Representations of words in vector form
from flair.embeddings import TokenEmbeddings # Representations of words in vector form
from flair.embeddings import StackedEmbeddings # Representations of words in vector form
from flair.models import SequenceTagger # sequence tagging, NER or POS
from flair.trainers import ModelTrainer # Training Flair models
from typing import List
import numpy as np
import os
import torch
import random

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Folder handed to ColumnCorpus below; the load log shows train.txt, dev.txt and
# test.txt being read from it.
PATH_SPOTTING_DATASET = "../../data/concept-spotting/sentences/"
# Folder handed to trainer.train() below as its first argument
# (presumably where flair writes the trained model and logs -- TODO confirm).
PATH_FLAIR_FOLDER = "../../data/flair-models/sentences/"

# Sentence-Spotter: Training

In [3]:
def set_seed(seed):
    """Seed every random-number source this notebook relies on.

    Seeds NumPy, Python's ``random`` module and PyTorch (CPU and CUDA),
    exports ``PYTHONHASHSEED`` and puts cuDNN into deterministic mode.
    Background: https://pytorch.org/docs/stable/notes/randomness.html

    Args:
        seed: Seed value passed unchanged to each generator.
    """
    # Seed the generators in a fixed order: NumPy, stdlib random, torch CPU, torch CUDA.
    for seed_fn in (np.random.seed, random.seed, torch.manual_seed, torch.cuda.manual_seed):
        seed_fn(seed)

    # NOTE(review): CPython reads PYTHONHASHSEED at interpreter start-up, so this
    # assignment presumably only influences child processes -- confirm that is intended.
    os.environ['PYTHONHASHSEED'] = str(seed)

    cudnn = torch.backends.cudnn
    # Ask cuDNN for deterministic algorithms so GPU results repeat across runs.
    cudnn.deterministic = True
    # Disable the auto-tuner, which may select a different convolution algorithm
    # (with slightly different numerics) from one run to the next.
    cudnn.benchmark = False

In [4]:
# Data preparation: describe the column layout of the dataset files, pick the tag
# column to learn, load the corpus and build the dictionary of tags.
# `columns` maps the column indices (0, 1, 2) to the names 'text', 'pos' and 'chunk_BIO'.
# 'chunk_BIO' presumably carries BIO (Begin, Inside, Outside) labels, i.e. whether a token
# begins a chunk, continues one, or lies outside any chunk -- TODO confirm against the data files.
columns = {0: 'text', 1: 'pos', 2: 'chunk_BIO'}
tag_type = "chunk_BIO"  # column used as the prediction target (also passed to SequenceTagger)
corpus = ColumnCorpus(PATH_SPOTTING_DATASET, columns)  # per the log below: reads train.txt, dev.txt, test.txt
# NOTE(review): the cell output echoes this very line, which looks like a (truncated)
# warning about make_tag_dictionary -- presumably a deprecation notice. The tagger built
# from this dictionary later reports "Dictionary with 3 tags: O, <START>, <STOP>", i.e. no
# B-/I- tags; verify that the chunk labels are actually being read before trusting the scores.
tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)  # dictionary of tag values for `tag_type`
print(corpus)

2023-10-07 14:26:07,700 Reading data from ../../data/concept-spotting/sentences
2023-10-07 14:26:07,701 Train: ../../data/concept-spotting/sentences/train.txt
2023-10-07 14:26:07,705 Dev: ../../data/concept-spotting/sentences/dev.txt
2023-10-07 14:26:07,707 Test: ../../data/concept-spotting/sentences/test.txt
Corpus: 583 train + 127 dev + 122 test sentences


  tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type) # Storing the tags/bio-chunks


In [5]:
# Re-seed before building the embeddings (presumably so that any randomly
# initialised weights come out the same on every run).
set_seed(42)
# List of token embeddings that feed the tagger:
# CharacterEmbeddings : character-level embedding (the printed model shows a character
#                       Embedding followed by a bidirectional LSTM)
# FlairEmbeddings     : pretrained language-model embeddings, loaded here as the
#                       'news-forward' and 'news-backward' models
embedding_types: List[TokenEmbeddings] = [
    CharacterEmbeddings(), 
    FlairEmbeddings('news-forward'), 
    FlairEmbeddings('news-backward')]

# Combine the embeddings above into a single stacked embedding.
embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)
# Re-seed again so the tagger is constructed from the same RNG state each run.
set_seed(42)

# Sequence tagger over the stacked embeddings; the printed model shows a 2-layer LSTM
# with hidden size 64. It predicts tags of type `tag_type` drawn from `tag_dictionary`.
tagger: SequenceTagger = SequenceTagger(hidden_size=64,
                                        embeddings=embeddings,
                                        tag_dictionary=tag_dictionary,
                                        tag_type=tag_type,
                                        use_crf=True,
                                        dropout=0.25,
                                        rnn_layers=2)
# Re-seed once more before creating the trainer that pairs the tagger with the corpus.
set_seed(42)
trainer: ModelTrainer = ModelTrainer(tagger, corpus)

2023-10-07 14:26:09,697 SequenceTagger predicts: Dictionary with 3 tags: O, <START>, <STOP>


In [6]:
# Re-seed immediately before training; the run shuffles the data (shuffle=True),
# so the RNG state presumably affects the outcome.
set_seed(42)
# Train the tagger. The first argument is the flair output folder defined at the top of
# the notebook. `result` is indexed with 'test_score' by the final assertion cell.
result = trainer.train(PATH_FLAIR_FOLDER,
                       learning_rate=0.2,
                       mini_batch_size=32,
                       max_epochs=20,
                       shuffle=True,
                       num_workers=0)

2023-10-07 14:36:27,974 ----------------------------------------------------------------------------------------------------
2023-10-07 14:36:27,978 Model: "SequenceTagger(
  (embeddings): StackedEmbeddings(
    (list_embedding_0): CharacterEmbeddings(
      (char_embedding): Embedding(275, 25)
      (char_rnn): LSTM(25, 25, bidirectional=True)
    )
    (list_embedding_1): FlairEmbeddings(
      (lm): LanguageModel(
        (drop): Dropout(p=0.05, inplace=False)
        (encoder): Embedding(300, 100)
        (rnn): LSTM(100, 2048)
      )
    )
    (list_embedding_2): FlairEmbeddings(
      (lm): LanguageModel(
        (drop): Dropout(p=0.05, inplace=False)
        (encoder): Embedding(300, 100)
        (rnn): LSTM(100, 2048)
      )
    )
  )
  (dropout): Dropout(p=0.25, inplace=False)
  (word_dropout): WordDropout(p=0.05)
  (locked_dropout): LockedDropout(p=0.5)
  (embedding2nn): Linear(in_features=4146, out_features=4146, bias=True)
  (rnn): LSTM(4146, 64, num_layers=2, batch_first

: 

In [8]:
# Regression check on the 'test_score' entry returned by trainer.train().
# The score is a float, so compare within half a unit of the 4th decimal (the
# precision of the expected literal) rather than with exact equality, and report
# the actual value on failure.
EXPECTED_TEST_SCORE = 0.6466
SCORE_TOLERANCE = 5e-5
test_score = result['test_score']
assert abs(test_score - EXPECTED_TEST_SCORE) <= SCORE_TOLERANCE, (
    f"test_score {test_score!r} differs from expected {EXPECTED_TEST_SCORE}"
)