## Train Models

In [1]:
%load_ext autoreload

In [2]:
# load the classes and function for processing the NPDs
%autoreload 2
import pandas as pd
import pickle
import dill
#import tools.load
from tools.helpers import range_to_pagenumbers
import pycrfsuite
import torch
from IPython.display import clear_output
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report
from tools.collection_tools import Collection
from tools.crf_tools import SequenceVectorizer
from pathlib import Path
from itertools import product
from collections import defaultdict
import flair
from flair.embeddings import *
from flair.models import SequenceTagger
from flair.trainers import ModelTrainer
from flair.datasets import DataLoader

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [3]:
# Project root; the data and model paths below are derived from it.
# NOTE(review): machine-specific absolute path — adjust when running elsewhere.
ROOT = Path('/deezy_datadrive/kaspar-playground/npd')

In [4]:
# Directory layout under ROOT.
# DATA is already rooted at ROOT, so the sub-directories are derived from DATA
# directly; prefixing ROOT a second time only worked because ROOT is absolute.
DATA = ROOT / 'Data'
IN_PATH = DATA / "Original"    # globbed for MPD_* entries and passed to Collection
OUT_PATH = DATA / "Processed"  # passed to Collection as its second path
MODELS_PATH = ROOT / 'Models'  # training CSVs, Flair tagger and CRF models

In [5]:
# Load the pickled editions mapping (indexed by year in the next cell).
# The context manager closes the file handle, which the bare open() never did.
# NOTE(review): pickle.load can execute arbitrary code — only load a trusted file.
with open('../editions_all.pickle', 'rb') as in_pickle:
    editions_all = pickle.load(in_pickle)

In [6]:
# The year is the second "_"-separated field in the name of each MPD_* entry in IN_PATH.
selected_years = [int(entry.name.split("_")[1]) for entry in IN_PATH.glob('MPD_*')]
# Restrict the editions mapping to the years that are actually present on disk.
editions = dict((year, editions_all[year]) for year in selected_years)

In [7]:
# Convert the selected editions with range_to_pagenumbers (from tools.helpers).
# NOTE(review): this rebinds `editions` to its own transformed value, so running the
# cell twice feeds already-converted data back in — re-run the previous cell first.
editions = range_to_pagenumbers(editions)

In [20]:
# Annotation level to train for; the embedding cell accepts 'structure' or 'lemmas'.
level = 'structure' # structure | lemmas 
# Passed to create_csv_training_data; forced to True below when level == 'structure'.
clip_bioes = False 

## Prepare corpus for training

In [21]:
# The structure level always uses clipped tags, whatever was configured above.
if level == 'structure':
    clip_bioes = True

# Write the train/dev/test CSV files for the selected level into MODELS_PATH
# (80% train, 10% dev).
npd_collection = Collection(editions, IN_PATH, OUT_PATH)
npd_collection.create_csv_training_data(
    MODELS_PATH,
    train_perc=.8,
    dev_perc=.1,
    level=level,
    clip_bioes=clip_bioes,
)

144 162 181
structure


## Load corpus

In [22]:
from flair.data import Corpus
from flair.datasets import ColumnCorpus

# Column layout of the training files: token text first, then the tag for `level`.
columns = {0: 'text', 1: level}

# Read the three per-level split files that live in MODELS_PATH.
corpus: Corpus = ColumnCorpus(
    MODELS_PATH,
    columns,
    train_file=MODELS_PATH / f'train_{level}.csv',
    dev_file=MODELS_PATH / f'dev_{level}.csv',
    test_file=MODELS_PATH / f'test_{level}.csv',
)

2021-03-12 09:15:44,441 Reading data from /deezy_datadrive/kaspar-playground/npd/Models
2021-03-12 09:15:44,441 Train: /deezy_datadrive/kaspar-playground/npd/Models/train_structure.csv
2021-03-12 09:15:44,442 Dev: /deezy_datadrive/kaspar-playground/npd/Models/dev_structure.csv
2021-03-12 09:15:44,442 Test: /deezy_datadrive/kaspar-playground/npd/Models/test_structure.csv


In [23]:
# Drop empty sentences, then build the tag dictionary for the selected level.
corpus.filter_empty_sentences()
tag_dictionary = corpus.make_tag_dictionary(tag_type=level)
# Print the corpus summary and the tag inventory as a sanity check.
print(corpus)
print(tag_dictionary.idx2item)

2021-03-12 09:15:49,288 Filtering empty sentences
2021-03-12 09:15:49,290 Corpus: 144 train + 18 dev + 19 test sentences
Corpus: 144 train + 18 dev + 19 test sentences
[b'<unk>', b'O', b'NEWSPAPERDESCR', b'TITLE', b'LOC', b'LOCDESCR', b'HEADER', b'<START>', b'<STOP>']


## Train with Flair

In [24]:
#device = None
#if torch.cuda.is_available():
#    device = torch.device('cuda:0')
#else:
#    device = torch.device('cpu')

In [25]:
#flair.device = torch.device('cuda:0') 

In [26]:
# Choose the token embeddings for the selected annotation level.
if level == 'structure':
    # Forward/backward Flair language-model embeddings stacked with GloVe vectors.
    # Alternative tried earlier: 'en-impresso-hipe-v1-forward' / 'en-impresso-hipe-v1-backward'.
    embedding_types = [
        FlairEmbeddings('news-forward'),
        FlairEmbeddings('news-backward'),
        WordEmbeddings('glove'),
    ]
    embeddings = StackedEmbeddings(embeddings=embedding_types)
elif level == "lemmas":
    # Fine-tuned BERT embeddings; long sentences allowed, mean pooling.
    embeddings = TransformerWordEmbeddings(
        'bert-base-cased',
        fine_tune=True,
        allow_long_sentences=True,
        pooling_operation='mean',
    )
else:
    # ValueError is still an Exception subclass, so existing handlers keep working.
    raise ValueError("Select either structure or lemmas as level")

In [27]:
# Per-tag loss weights: LOC and TITLE are weighted 5x for the structure level only.
loss_weights = {'LOC': 5., 'TITLE': 5.} if level == 'structure' else {}

In [28]:
# Build the Flair sequence tagger for `level` from the embeddings, tag dictionary
# and per-tag loss weights chosen above; a CRF layer is enabled (use_crf=True).
tagger = SequenceTagger(hidden_size=128,
                        embeddings=embeddings,
                        tag_dictionary=tag_dictionary,
                        tag_type=level,
                        use_crf=True,
                        loss_weights=loss_weights)

In [29]:
# Train the tagger on the corpus; output goes to MODELS_PATH/<level>_tagger.
trainer = ModelTrainer(tagger, corpus)
trainer.train(
    MODELS_PATH / f'{level}_tagger',
    learning_rate=0.05,
    mini_batch_size=4,  # previously used value 5
    patience=2,
    anneal_factor=.5,
    max_epochs=10,
    embeddings_storage_mode='cpu',
    monitor_test=True,
    anneal_with_restarts=True,
    train_with_dev=False,
)



2021-03-12 09:15:50,912 ----------------------------------------------------------------------------------------------------
2021-03-12 09:15:50,915 Model: "SequenceTagger(
  (embeddings): StackedEmbeddings(
    (list_embedding_0): FlairEmbeddings(
      (lm): LanguageModel(
        (drop): Dropout(p=0.05, inplace=False)
        (encoder): Embedding(300, 100)
        (rnn): LSTM(100, 2048)
        (decoder): Linear(in_features=2048, out_features=300, bias=True)
      )
    )
    (list_embedding_1): FlairEmbeddings(
      (lm): LanguageModel(
        (drop): Dropout(p=0.05, inplace=False)
        (encoder): Embedding(300, 100)
        (rnn): LSTM(100, 2048)
        (decoder): Linear(in_features=2048, out_features=300, bias=True)
      )
    )
    (list_embedding_2): WordEmbeddings('glove')
  )
  (word_dropout): WordDropout(p=0.05)
  (locked_dropout): LockedDropout(p=0.5)
  (embedding2nn): Linear(in_features=4196, out_features=4196, bias=True)
  (rnn): LSTM(4196, 128, batch_first=True, b

2021-03-12 09:36:13,660 epoch 4 - iter 18/36 - loss 568.97229852 - samples/sec: 0.78 - lr: 0.050000
2021-03-12 09:36:28,956 epoch 4 - iter 21/36 - loss 560.25790696 - samples/sec: 0.78 - lr: 0.050000
2021-03-12 09:36:44,396 epoch 4 - iter 24/36 - loss 531.59891001 - samples/sec: 0.78 - lr: 0.050000
2021-03-12 09:36:58,599 epoch 4 - iter 27/36 - loss 517.12264789 - samples/sec: 0.84 - lr: 0.050000
2021-03-12 09:37:11,955 epoch 4 - iter 30/36 - loss 497.18141174 - samples/sec: 0.90 - lr: 0.050000
2021-03-12 09:37:27,062 epoch 4 - iter 33/36 - loss 484.02090084 - samples/sec: 0.79 - lr: 0.050000
2021-03-12 09:37:42,211 epoch 4 - iter 36/36 - loss 469.07271576 - samples/sec: 0.79 - lr: 0.050000
2021-03-12 09:37:42,212 ----------------------------------------------------------------------------------------------------
2021-03-12 09:37:42,212 EPOCH 4 done: loss 469.0727 - lr 0.0500000
2021-03-12 09:37:50,861 DEV : loss 195.12217712402344 - score 0.9162
2021-03-12 09:38:00,198 TEST : loss 239

2021-03-12 09:52:22,995 epoch 9 - iter 9/36 - loss 162.14892578 - samples/sec: 0.82 - lr: 0.050000
2021-03-12 09:52:38,812 epoch 9 - iter 12/36 - loss 273.79946645 - samples/sec: 0.76 - lr: 0.050000
2021-03-12 09:52:53,262 epoch 9 - iter 15/36 - loss 258.29253540 - samples/sec: 0.83 - lr: 0.050000
2021-03-12 09:53:08,005 epoch 9 - iter 18/36 - loss 238.05364821 - samples/sec: 0.81 - lr: 0.050000
2021-03-12 09:53:23,668 epoch 9 - iter 21/36 - loss 322.30877250 - samples/sec: 0.77 - lr: 0.050000
2021-03-12 09:53:38,057 epoch 9 - iter 24/36 - loss 313.81995773 - samples/sec: 0.83 - lr: 0.050000
2021-03-12 09:53:53,543 epoch 9 - iter 27/36 - loss 308.45786427 - samples/sec: 0.77 - lr: 0.050000
2021-03-12 09:54:09,208 epoch 9 - iter 30/36 - loss 313.83685404 - samples/sec: 0.77 - lr: 0.050000
2021-03-12 09:54:26,209 epoch 9 - iter 33/36 - loss 303.01778620 - samples/sec: 0.71 - lr: 0.050000
2021-03-12 09:54:42,679 epoch 9 - iter 36/36 - loss 296.88750627 - samples/sec: 0.73 - lr: 0.050000
2

{'test_score': 0.9735,
 'dev_score_history': [0.7984,
  0.773,
  0.8169,
  0.9162,
  0.9182,
  0.9147,
  0.9729,
  0.9673,
  0.9764,
  0.9816],
 'train_loss_history': [1521.8027801513672,
  790.8784806993273,
  592.7332916259766,
  469.07271575927734,
  420.642817179362,
  343.20276811387805,
  332.5907694498698,
  281.66776360405817,
  296.88750627305774,
  236.19796583387586],
 'dev_loss_history': [781.734375,
  574.2696533203125,
  351.059814453125,
  195.12217712402344,
  176.9423828125,
  179.46585083007812,
  106.96636199951172,
  104.02349090576172,
  85.7772445678711,
  81.4693603515625]}

## Evaluate

In [18]:
# Prerequisite: run the configuration cells and the "Load corpus" section first —
# this cell reads `MODELS_PATH` and `level`, and the evaluation below uses `corpus`.
# Loads best-model.pt from the `{level}_tagger` directory that trainer.train wrote to.
classifier = SequenceTagger.load(MODELS_PATH / f'{level}_tagger' / 'best-model.pt')

2021-03-12 09:13:29,919 loading file /deezy_datadrive/kaspar-playground/npd/Models/lemmas_tagger/best-model.pt


In [19]:
# Evaluate on the test split; per-token predictions are written next to the model.
predictions_file = MODELS_PATH / f'{level}_tagger' / "predictions.txt"
result, score = classifier.evaluate(corpus.test, out_path=predictions_file)
print(result.log_line)

0.9329	0.9392	0.9360


In [None]:
# Read the space-separated predictions file written by the evaluation cell.
# It has no header row; the next cell uses column 1 as gold and column 2 as predicted.
predictions = pd.read_csv(
    MODELS_PATH / f'{level}_tagger' / "predictions.txt",
    sep=' ',
    header=None,
)

In [None]:
# Column 1 is used as the gold tags and column 2 as the predicted tags.
y_true = list(predictions[1])
y_pred = list(predictions[2])
report_labels = ['LOC','LOCDESCR','NEWSPAPERDESCR','TITLE']
print(classification_report(y_true, y_pred, labels=report_labels))

## Train CRF

In [None]:
# Hyper-parameter grid for the CRF search.
context = [25, 50, 150]          # context sizes handed to SequenceVectorizer
c1_param = [0.1, 0.01, 0.0001]   # coefficients for the L1 penalty
c2_param = [0.1, 0.01, 0.0001]   # coefficients for the L2 penalty
max_iter_param = [100, 200]      # max_iterations values
# Materialise the grid: itertools.product returns a one-shot iterator, so a second
# run of the search cell would otherwise iterate over nothing.
hyperparameters = list(product(context, c1_param, c2_param, max_iter_param))

In [None]:
def flatten(ll):
    """Concatenate the items of every inner iterable of ``ll`` into one flat list."""
    flat = []
    for inner in ll:
        flat.extend(inner)
    return flat

In [None]:

# CRF training data: train and dev sentences are pooled, test is kept apart.
# NOTE: other flair versions require `<split>.dataset.dataset.sentences` instead.
train = [[tok.text for tok in sent]
         for split in (corpus.train, corpus.dev)
         for sent in split.dataset.sentences]
y_train = [[tok.get_tag(level).value for tok in sent]
           for split in (corpus.train, corpus.dev)
           for sent in split.dataset.sentences]

test = [[tok.text for tok in sent] for sent in corpus.test.dataset.sentences]
y_test_nested = [[tok.get_tag(level).value for tok in sent]
                 for sent in corpus.test.dataset.sentences]
# Flat label list for the token-level metrics; the nested form is reused later.
y_test = flatten(y_test_nested)
print(len(y_train), len(y_test))

In [None]:
def clip(x):
    """Return the part of tag ``x`` after its last "-" (the whole tag if it has none)."""
    return x.rsplit("-", 1)[-1]


# Clipping is switched on unconditionally for the CRF experiments.
clip_bioes = True
if clip_bioes:
    y_test = list(map(clip, y_test))
    y_train = [list(map(clip, sent_tags)) for sent_tags in y_train]

In [None]:
# Clip the nested (per-sentence) test labels as well; they are concatenated with
# y_train when the labels for the final model are assembled.
y_test_nested = [[clip(i) for i in yn] for yn in y_test_nested]

In [None]:
# Grid search over the CRF hyper-parameters, scored by macro F1 on the test split.
# Note: `trainer` and `tagger` are rebound here to pycrfsuite objects, replacing
# the Flair objects of the same name from the sections above.
results = {}  # (context, c1, c2, max_iter) -> [micro, macro, weighted] F1
model_path = str(MODELS_PATH / f'{level}_crf.model')

to_X = defaultdict(dict)  # per-context cache of the vectorised train/test data
f1_score_max = 0.0
# Pre-set so the summary prints cannot raise NameError when no run beats 0.0
# (e.g. an empty grid or an interrupt before the first evaluation).
best_params = None
cl_r = None
eval_labels = ['HEADER','LOC','LOCDESCR','NEWSPAPERDESCR','TITLE']

try:
    for con,c_1,c_2,max_iter in hyperparameters:

        # Vectorise once per context size; the other parameters do not affect features.
        if con not in to_X:
            vectorizer = SequenceVectorizer(context=con)
            to_X[con]["train"] = vectorizer.transform(train)
            to_X[con]["test"] = vectorizer.transform(test)

        X_train = to_X[con]["train"]
        X_test = to_X[con]["test"]

        trainer = pycrfsuite.Trainer(verbose=False)

        params = {'c1': c_1,   # coefficient for L1 penalty
                  'c2': c_2,  # coefficient for L2 penalty
                  'max_iterations': max_iter,  # stop earlier
                  'feature.possible_transitions': False
                  }

        trainer.set_params(params)
        for feat,labels in zip(X_train, y_train):
            trainer.append(feat,labels)
        # Every run overwrites the same model file; only the scores are kept.
        trainer.train(model_path)

        tagger = pycrfsuite.Tagger()
        tagger.open(model_path)

        y_pred = []
        for sent in X_test:
            y_pred.extend(tagger.tag(sent))

        print(len(y_test),len(y_pred))

        # Plain assignments replace the exec()-generated f_micro/f_macro/f_weighted.
        f_micro, f_macro, f_weighted = [
            f1_score(y_test, y_pred, average=avg, labels=eval_labels)
            for avg in ("micro", "macro", "weighted")
        ]

        print("Best macro f1 score, %.3f"%f1_score_max)
        print("Current macro f1 score, %.3f"%f_macro)

        if f_macro > f1_score_max:
            best_params = [con,c_1,c_2,max_iter]
            f1_score_max = f_macro
            # classification_report expects (y_true, y_pred); the swapped order
            # exchanged precision and recall in the report.
            cl_r = classification_report(y_test, y_pred)

        results[(con,c_1,c_2,max_iter)] = [f_micro,f_macro,f_weighted]

except KeyboardInterrupt:
    # Allow stopping the search by hand while keeping the results gathered so far.
    print('Stopping early.')

print('Done.')
print(best_params)
print("Best macro f1 score, %.3f"%f1_score_max)
print(cl_r)

In [None]:
# Final CRF configuration: fixed values, not read from the grid-search result.
# Note: `context` is rebound here from the grid list to a single integer.
context = 25
params = {
    'c1': 0.01,             # coefficient for L1 penalty
    'c2': 0.01,             # coefficient for L2 penalty
    'max_iterations': 100,  # stop earlier
}
model_path = str(MODELS_PATH / f'{level}_crf-final.model')

trainer = pycrfsuite.Trainer(verbose=False)
trainer.set_params(params)

# The final model uses all three splits: train, dev and test sentences.
vectorizer = SequenceVectorizer(context=context)
data = [[tok.text for tok in sent]
        for split in (corpus.train, corpus.dev, corpus.test)
        for sent in split.dataset.sentences]
X = vectorizer.transform(data)
# y_train already holds train+dev labels; the nested test labels complete the set.
y = y_train + y_test_nested

print(len(data), len(y))

In [None]:
# Register every (features, labels) sequence pair with the trainer, then train
# and write the model to model_path.
# NOTE(review): presumably re-running this cell without recreating `trainer`
# appends the same sequences again — confirm against pycrfsuite.
for feat,labels in zip(X, y):
    trainer.append(feat,labels)
trainer.train(model_path)

In [None]:
!ls {model_path}

In [None]:

# Store the vectorizer context size in the CRF parameter dict so both are saved together.
params['context'] = context

# Pickle the combined parameters into MODELS_PATH, alongside the final CRF model file.
with  open(MODELS_PATH / f'{level}_crf-final.params','wb') as out_pickle:
    pickle.dump(params,out_pickle)

## Fin.