In [1]:
%%capture nop
import os 
IS_KAGGLE = not not os.environ.get('KAGGLE_KERNEL_RUN_TYPE','')
if IS_KAGGLE:
    !git clone https://github.com/Janluke0/PoS-Tagging/
    os.chdir('PoS-Tagging')
    !pip install positional-encodings bpemb
else:
    os.chdir('..')


In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pytorch_lightning as pl
import torch
# Number of GPUs handed to the Lightning trainer: 1 when CUDA is usable, otherwise 0.
GPU = int(torch.cuda.is_available())

In [3]:
from dataset.tokenizer import get_tokenizer
from dataset.twtita import mk_dataloaders, TWITADS

import model.attention as attention
from model import TokenOfSeqClassifier

# Tag vocabulary taken from the dataset class. The code below indexes it by tag
# name (e.g. TAGS['[PAD]']) and uses len(TAGS) as the number of classes.
TAGS = TWITADS._TAGS

In [4]:
# Tokenizer names accepted by mk4tknzr in the evaluation cells below.
available = [
    "BPE",
    "WordPiece",
    "BERT_pretrained",
    "ELECTRA_pretrained",
    "ROBERTA_pretrained",
    "DBERT_pretrained",
]

In [5]:
def mk4tknzr(name, *, batch_size=512, d_model=128, dropout=0.5, nheads=2, N=3, weight_decay=7e-1):
    """Build the Lightning module and the three dataloaders for one tokenizer.

    Args:
        name: tokenizer identifier forwarded to ``get_tokenizer``.
        batch_size: batch size used for the train, validation and test loaders.
        d_model: forwarded as ``d_model`` to ``attention.TokenizedSeq2Seq``.
        dropout: forwarded as ``dropout`` to ``attention.TokenizedSeq2Seq``.
        nheads: forwarded as ``nheads`` to ``attention.TokenizedSeq2Seq``.
        N: forwarded as ``N`` to ``attention.TokenizedSeq2Seq``.
        weight_decay: forwarded to ``TokenOfSeqClassifier``.

    Returns:
        ``(pl_model, (dl_train, dl_val, dl_test))``.

    The keyword defaults reproduce the values that used to be hard-coded, so
    ``mk4tknzr(name)`` behaves exactly as before.
    """
    tknzr = get_tokenizer('resampled_train', name)

    # mk_dataloaders returns the tag count first, then one loader per dataset name.
    ntags, dl_train = mk_dataloaders(tknzr, ['resampled_train'], batch_size=batch_size)

    # Tokenizer objects expose their vocabulary size either as an attribute or via a getter.
    if hasattr(tknzr, 'vocab_size'):
        vocab = tknzr.vocab_size
    else:
        vocab = tknzr.get_vocab_size()

    _, dl_val, dl_test = mk_dataloaders(
        tknzr,
        ['resampled_validation', 'test'],
        shuffle=False,
        batch_size=batch_size,
    )

    model = attention.TokenizedSeq2Seq(
        d_input=vocab,
        d_model=d_model,
        d_output=ntags,
        dropout=dropout,
        in_seq_pad=0,
        nheads=nheads,
        N=N,
    )

    # Special/padding tags handed over as label_idx_to_ignore
    # (presumably excluded from the reported accuracy -- see TokenOfSeqClassifier).
    ignored_labels = [TAGS[tag] for tag in ('[EPAD]', '[BOS]', '[EOS]', '[PAD]')]
    pl_model = TokenOfSeqClassifier(
        model,
        nclass=len(TAGS),
        pad_index=TAGS['[PAD]'],
        label_idx_to_ignore=ignored_labels,
        weight_decay=weight_decay,
    )
    return pl_model, (dl_train, dl_val, dl_test)


def plot_curves(model):
    """Print and plot the validation accuracy and loss recorded on ``model``.

    Reads ``model.val_metrics['accuracy']`` and ``model.val_metrics['loss']``,
    prints the last and the best value of each, and draws both histories side
    by side in one figure. When either history is empty a short notice is
    printed and nothing is plotted.

    Args:
        model: object exposing a ``val_metrics`` mapping with ``'accuracy'``
            and ``'loss'`` sequences.

    Returns:
        None.
    """
    acc, loss = model.val_metrics['accuracy'], model.val_metrics['loss']
    if len(acc) == 0 or len(loss) == 0:
        # acc[-1] / max(acc) would raise on an empty history.
        print('no validation metrics recorded')
        return

    print(acc[-1], max(acc))
    print(loss[-1], min(loss))

    # Explicit axes instead of the implicit pyplot state machine.
    fig, (ax_acc, ax_loss) = plt.subplots(1, 2, figsize=(16, 8))
    ax_acc.plot(acc)
    ax_acc.set(title='Validation accuracy', xlabel='recorded evaluation #', ylabel='accuracy')
    ax_loss.plot(loss)
    ax_loss.set(title='Validation loss', xlabel='recorded evaluation #', ylabel='loss')
    
def early_stopping():
    """Return a fresh EarlyStopping callback that watches ``val_acc`` (higher is better)."""
    return pl.callbacks.EarlyStopping(
        monitor='val_acc',
        min_delta=1e-6,
        mode='max',
        patience=400,
    )


def checkpoints():
    """Return a fresh ModelCheckpoint callback that tracks the maximum ``val_acc``."""
    return pl.callbacks.ModelCheckpoint(monitor='val_acc', mode='max')

In [6]:
#%%capture nop
# BPE tokenizer: build the model and loaders, then evaluate a saved checkpoint.
model, (dl_train, dl_val, dl_test) = mk4tknzr('BPE')
trainer = pl.Trainer(
    log_every_n_steps=2,
    gpus=GPU,
    max_epochs=2000,
    callbacks=[checkpoints()],
)
# Training is disabled here; re-enable with: trainer.fit(model, dl_train, dl_val)
ckpt = '/kaggle/input/sa-tknzr-comparison/PoS-Tagging/lightning_logs/version_0/checkpoints/epoch=408-step=4907.ckpt'

model.eval()
with torch.no_grad():
    # validate() receives ckpt_path on every call, once per data split.
    for split_name, loader in (('test', dl_test), ('validation', dl_val), ('train', dl_train)):
        print(split_name)
        trainer.validate(model, loader, ckpt_path=ckpt)




test


Validating: 0it [00:00, ?it/s]

--------------------------------------------------------------------------------
DATALOADER:0 VALIDATE RESULTS
{'val_acc': 0.8667787313461304,
 'val_loss': 0.20704616606235504,
 'val_raw_acc': 0.9480795860290527}
--------------------------------------------------------------------------------
validation


Validating: 0it [00:00, ?it/s]

--------------------------------------------------------------------------------
DATALOADER:0 VALIDATE RESULTS
{'val_acc': 0.9072412252426147,
 'val_loss': 0.15767717361450195,
 'val_raw_acc': 0.9575661420822144}
--------------------------------------------------------------------------------
train




Validating: 0it [00:00, ?it/s]

--------------------------------------------------------------------------------
DATALOADER:0 VALIDATE RESULTS
{'val_acc': 0.9709250330924988,
 'val_loss': 0.04182682931423187,
 'val_raw_acc': 0.9865862131118774}
--------------------------------------------------------------------------------


In [7]:
#%%capture nop
# WordPiece tokenizer: build the model and loaders, then evaluate a saved checkpoint.
model, (dl_train, dl_val, dl_test) = mk4tknzr('WordPiece')
trainer = pl.Trainer(
    log_every_n_steps=2,
    gpus=GPU,
    max_epochs=2000,
    callbacks=[checkpoints()],
)
# Training is disabled here; re-enable with: trainer.fit(model, dl_train, dl_val)
ckpt = '/kaggle/input/sa-tknzr-comparison/PoS-Tagging/lightning_logs/version_1/checkpoints/epoch=323-step=3887.ckpt'

model.eval()
with torch.no_grad():
    # validate() receives ckpt_path on every call, once per data split.
    for split_name, loader in (('test', dl_test), ('validation', dl_val), ('train', dl_train)):
        print(split_name)
        trainer.validate(model, loader, ckpt_path=ckpt)




test


Validating: 0it [00:00, ?it/s]

--------------------------------------------------------------------------------
DATALOADER:0 VALIDATE RESULTS
{'val_acc': 0.8434215784072876,
 'val_loss': 0.27923810482025146,
 'val_raw_acc': 0.9292137622833252}
--------------------------------------------------------------------------------
validation


Validating: 0it [00:00, ?it/s]

--------------------------------------------------------------------------------
DATALOADER:0 VALIDATE RESULTS
{'val_acc': 0.8881087899208069,
 'val_loss': 0.21327601373195648,
 'val_raw_acc': 0.9416128993034363}
--------------------------------------------------------------------------------
train


Validating: 0it [00:00, ?it/s]

--------------------------------------------------------------------------------
DATALOADER:0 VALIDATE RESULTS
{'val_acc': 0.948029637336731,
 'val_loss': 0.08778265118598938,
 'val_raw_acc': 0.9720726013183594}
--------------------------------------------------------------------------------


In [8]:
#%%capture nop
# BERT_pretrained tokenizer: build the model and loaders, then evaluate a saved checkpoint.
model, (dl_train, dl_val, dl_test) = mk4tknzr('BERT_pretrained')
trainer = pl.Trainer(
    log_every_n_steps=2,
    gpus=GPU,
    max_epochs=2000,
    callbacks=[checkpoints()],
)
# Training is disabled here; re-enable with: trainer.fit(model, dl_train, dl_val)
ckpt = '/kaggle/input/sa-tknzr-comparison/PoS-Tagging/lightning_logs/version_2/checkpoints/epoch=215-step=2591.ckpt'

model.eval()
with torch.no_grad():
    # validate() receives ckpt_path on every call, once per data split.
    for split_name, loader in (('test', dl_test), ('validation', dl_val), ('train', dl_train)):
        print(split_name)
        trainer.validate(model, loader, ckpt_path=ckpt)

Downloading:   0%|          | 0.00/59.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/433 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/237k [00:00<?, ?B/s]

test


Validating: 0it [00:00, ?it/s]

--------------------------------------------------------------------------------
DATALOADER:0 VALIDATE RESULTS
{'val_acc': 0.8369062542915344,
 'val_loss': 0.6289128661155701,
 'val_raw_acc': 0.8887025713920593}
--------------------------------------------------------------------------------
validation


Validating: 0it [00:00, ?it/s]

--------------------------------------------------------------------------------
DATALOADER:0 VALIDATE RESULTS
{'val_acc': 0.8905972242355347,
 'val_loss': 0.41759559512138367,
 'val_raw_acc': 0.9192208647727966}
--------------------------------------------------------------------------------
train


Validating: 0it [00:00, ?it/s]

--------------------------------------------------------------------------------
DATALOADER:0 VALIDATE RESULTS
{'val_acc': 0.9838135838508606,
 'val_loss': 0.03539926931262016,
 'val_raw_acc': 0.9879328012466431}
--------------------------------------------------------------------------------


In [9]:
#%%capture nop
# DBERT_pretrained tokenizer: build the model and loaders, then evaluate a saved checkpoint.
# Note: this trainer is capped at 120 epochs.
model, (dl_train, dl_val, dl_test) = mk4tknzr('DBERT_pretrained')
trainer = pl.Trainer(
    log_every_n_steps=2,
    gpus=GPU,
    max_epochs=120,
    callbacks=[checkpoints()],
)
# Training is disabled here; re-enable with: trainer.fit(model, dl_train, dl_val)
ckpt = '/kaggle/input/sa-tknzr-comparison/PoS-Tagging/lightning_logs/version_3/checkpoints/epoch=115-step=1391.ckpt'

model.eval()
with torch.no_grad():
    # validate() receives ckpt_path on every call, once per data split.
    for split_name, loader in (('test', dl_test), ('validation', dl_val), ('train', dl_train)):
        print(split_name)
        trainer.validate(model, loader, ckpt_path=ckpt)

Downloading:   0%|          | 0.00/270 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/876 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/972k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

test


Validating: 0it [00:00, ?it/s]

--------------------------------------------------------------------------------
DATALOADER:0 VALIDATE RESULTS
{'val_acc': 0.8276586532592773,
 'val_loss': 0.4393670856952667,
 'val_raw_acc': 0.9022153615951538}
--------------------------------------------------------------------------------
validation


Validating: 0it [00:00, ?it/s]

--------------------------------------------------------------------------------
DATALOADER:0 VALIDATE RESULTS
{'val_acc': 0.8827188014984131,
 'val_loss': 0.3031878173351288,
 'val_raw_acc': 0.9279390573501587}
--------------------------------------------------------------------------------
train


Validating: 0it [00:00, ?it/s]

--------------------------------------------------------------------------------
DATALOADER:0 VALIDATE RESULTS
{'val_acc': 0.9595980048179626,
 'val_loss': 0.08080311119556427,
 'val_raw_acc': 0.9754641056060791}
--------------------------------------------------------------------------------


In [10]:
#%%capture nop
# ELECTRA_pretrained tokenizer: build the model and loaders, then evaluate a saved checkpoint.
model, (dl_train, dl_val, dl_test) = mk4tknzr('ELECTRA_pretrained')
trainer = pl.Trainer(
    log_every_n_steps=2,
    gpus=GPU,
    max_epochs=2000,
    callbacks=[checkpoints()],
)
# Training is disabled here; re-enable with: trainer.fit(model, dl_train, dl_val)
ckpt = '/kaggle/input/sa-tknzr-comparison/PoS-Tagging/lightning_logs/version_4/checkpoints/epoch=208-step=2507.ckpt'

model.eval()
with torch.no_grad():
    # validate() receives ckpt_path on every call, once per data split.
    for split_name, loader in (('test', dl_test), ('validation', dl_val), ('train', dl_train)):
        print(split_name)
        trainer.validate(model, loader, ckpt_path=ckpt)

Downloading:   0%|          | 0.00/59.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/686 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/230k [00:00<?, ?B/s]

test


Validating: 0it [00:00, ?it/s]

--------------------------------------------------------------------------------
DATALOADER:0 VALIDATE RESULTS
{'val_acc': 0.8392181396484375,
 'val_loss': 0.5669972896575928,
 'val_raw_acc': 0.9000112414360046}
--------------------------------------------------------------------------------
validation


Validating: 0it [00:00, ?it/s]

--------------------------------------------------------------------------------
DATALOADER:0 VALIDATE RESULTS
{'val_acc': 0.8919591307640076,
 'val_loss': 0.4067019522190094,
 'val_raw_acc': 0.9250460863113403}
--------------------------------------------------------------------------------
train


Validating: 0it [00:00, ?it/s]

--------------------------------------------------------------------------------
DATALOADER:0 VALIDATE RESULTS
{'val_acc': 0.9947432279586792,
 'val_loss': 0.016361774876713753,
 'val_raw_acc': 0.9955509901046753}
--------------------------------------------------------------------------------


In [11]:
#%%capture nop
# ROBERTA_pretrained tokenizer: build the model and loaders, then evaluate a saved checkpoint.
model, (dl_train, dl_val, dl_test) = mk4tknzr('ROBERTA_pretrained')
trainer = pl.Trainer(
    log_every_n_steps=2,
    gpus=GPU,
    max_epochs=2000,
    callbacks=[checkpoints()],
)
# Training is disabled here; re-enable with: trainer.fit(model, dl_train, dl_val)
ckpt = '/kaggle/input/sa-tknzr-comparison/PoS-Tagging/lightning_logs/version_5/checkpoints/epoch=137-step=1655.ckpt'

model.eval()
with torch.no_grad():
    # validate() receives ckpt_path on every call, once per data split.
    for split_name, loader in (('test', dl_test), ('validation', dl_val), ('train', dl_train)):
        print(split_name)
        trainer.validate(model, loader, ckpt_path=ckpt)

Downloading:   0%|          | 0.00/212 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/982 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/4.83M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/150 [00:00<?, ?B/s]

test


Validating: 0it [00:00, ?it/s]

--------------------------------------------------------------------------------
DATALOADER:0 VALIDATE RESULTS
{'val_acc': 0.8405126929283142,
 'val_loss': 0.5684435367584229,
 'val_raw_acc': 0.9018952250480652}
--------------------------------------------------------------------------------
validation


Validating: 0it [00:00, ?it/s]

--------------------------------------------------------------------------------
DATALOADER:0 VALIDATE RESULTS
{'val_acc': 0.8971123695373535,
 'val_loss': 0.3322422206401825,
 'val_raw_acc': 0.9363872408866882}
--------------------------------------------------------------------------------
train


Validating: 0it [00:00, ?it/s]

--------------------------------------------------------------------------------
DATALOADER:0 VALIDATE RESULTS
{'val_acc': 0.9855446219444275,
 'val_loss': 0.027486639097332954,
 'val_raw_acc': 0.9918501973152161}
--------------------------------------------------------------------------------
