In [1]:
from __future__ import absolute_import, division, print_function

import argparse
import csv
import json
import logging
import os
import random
import sys

import numpy as np
import torch
import torch.nn.functional as F
from transformers import (WEIGHTS_NAME, AdamW, BertConfig,
                                  RobertaForTokenClassification, RobertaTokenizer)
from torch import nn
from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
                              TensorDataset)
from torch.utils.data.distributed import DistributedSampler
from tqdm import tqdm, trange
from sklearn.metrics import classification_report

from dataset import JointDataset, ClassifierDataset


logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
                    datefmt = '%m/%d/%Y %H:%M:%S',
                    level = logging.INFO)
logger = logging.getLogger(__name__)

In [2]:
class ARGS:
    pass
args = ARGS()
args.data_dir = "../../deft_corpus/data/deft_files"
args.evaluate_dir = "roberta-base"
args.output_dir = "./save-1"
args.block_size = 512

In [3]:
tokenizer = RobertaTokenizer.from_pretrained(args.evaluate_dir)
args.local_rank = -1
train_dataset = JointDataset(tokenizer, args, logger, file_name="all_train.pkl", block_size=512)
test_dataset = JointDataset(tokenizer, args, logger, file_name="test2.pkl", block_size=512)
# test_dataset = ClassifierDataset(tokenizer, args, logger, file_name="test1.pkl", block_size=512)



In [36]:
import pickle
with open("../../deft_corpus/data/deft_files/nall_train.pkl", "rb") as f:
    data = pickle.load(f)

In [37]:
len(data["tag"])

17819

In [38]:
tag_set, sis_set, targets = Counter(), Counter(), Counter()
for i, (tag, sis, tar) in enumerate(zip(data["tag"], data["sistag"], data["target"])):
    tag_set.update(tag)
    sis_set.update(sis)
    targets.update(tar)

In [45]:
from extract_feat import create_dataset

ModuleNotFoundError: No module named 'extract_feat'

In [24]:
from collections import Counter

In [7]:
inv_map = dict()
for key, value in train_dataset.label_map.items():
    inv_map[value] = key

In [10]:
counter = dict()
for s in train_dataset.bios:
    for tok in s:
        tok = inv_map[tok] if tok in inv_map else -1
        counter[tok] = counter.get(tok, 0) + 1

In [None]:
# Referential-Definition --> 2      Qualifier --> 3        B-Referential-Term --> 5

In [12]:
counter

{-1: 8664772,
 'O': 345172,
 'B-Definition': 6062,
 'I-Definition': 87142,
 'B-Term': 6611,
 'I-Term': 9551,
 'B-Alias-Term': 726,
 'I-Alias-Term': 860,
 'B-Referential-Definition': 308,
 'I-Referential-Definition': 661,
 'B-Qualifier': 162,
 'I-Qualifier': 1045,
 'B-Referential-Term': 140,
 'I-Referential-Term': 116}

In [30]:
tokenizer.convert_ids_to_tokens(test_dataset.xs[1])

['<s>',
 'The',
 'Ġatom',
 'Ġis',
 'Ġthe',
 'Ġsmallest',
 'Ġand',
 'Ġmost',
 'Ġfundamental',
 'Ġunit',
 'Ġof',
 'Ġmatter',
 'Ġ.',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',


In [14]:
train_dataset.ys[:10]

[0, 1, 0, 0, 1, 0, 0, 1, 1, 1]

In [19]:
train_dataset.label_map

{'I-Ordered-Definition': 0,
 'B-Ordered-Term': 1,
 'I-Term-frag': 2,
 'B-Definition': 3,
 'I-Definition': 4,
 'I-Alias-Term': 5,
 'I-Ordered-Term': 6,
 'O': 7,
 'I-Referential-Term': 8,
 'I-Secondary-Definition': 9,
 'B-Term': 10,
 'B-Secondary-Definition': 11,
 'B-Referential-Definition': 12,
 'I-Qualifier': 13,
 'B-Definition-frag': 14,
 'I-Definition-frag': 15,
 'B-Ordered-Definition': 16,
 'B-Alias-Term': 17,
 'B-Qualifier': 18,
 'I-Term': 19,
 'B-Term-frag': 20,
 'I-Referential-Definition': 21,
 'B-Referential-Term': 22,
 'B-Alias-Term-frag': 23}

In [18]:
[" ".join([str(tok) for tok in s[:45]]) for s in train_dataset.bios[:10]]

['-1 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 -1 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1',
 '-1 7 7 3 4 4 4 4 4 4 4 4 4 4 4 4 4 7 7 10 19 7 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1',
 '-1 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1',
 '-1 7 7 7 7 -1 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 -1 7 7 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1',
 '-1 10 -1 -1 19 7 3 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 7 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1',
 '-1 7 7 7 7 7 7 7 7 7 -1 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1',
 '-1 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1',
 '-1 10 19 7 3 4 4 4 4 4 4 4 4 4 4 4 4 7 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1',
 '-1 10 19 19 3 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 7

In [9]:
t = tokenizer.convert_ids_to_tokens(test_dataset.inputs[0])

In [12]:
tokenizer.sep_token_id

2

In [11]:
" ".join([str(t) for t in train_dataset.bio_mask[0]])

'0 1 1 1 1 1 1 0 1 1 1 0 0 1 1 1 1 1 0 1 1 0 1 1 0 1 1 0 1 1 0 1 1 1 0 0 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0

In [12]:
" ".join([str(t) for t in train_dataset.bios[0]])

'-1 4 4 4 4 4 4 -1 4 4 4 -1 -1 4 4 4 4 4 -1 4 4 -1 4 4 -1 4 4 -1 4 4 -1 4 4 4 -1 -1 4 4 4 4 4 4 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -

In [15]:
model = RobertaModel.from_pretrained(args.pretrain_dir)

In [16]:
model

RobertaModel(
  (embeddings): RobertaEmbeddings(
    (word_embeddings): Embedding(50265, 768, padding_idx=1)
    (position_embeddings): Embedding(514, 768, padding_idx=1)
    (token_type_embeddings): Embedding(1, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): RobertaEncoder(
    (layer): ModuleList(
      (0): RobertaLayer(
        (attention): RobertaAttention(
          (self): RobertaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): RobertaSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (dropout): Drop

In [28]:
# args.local_rank = -1
# device = torch.device("cuda", 0)
# args.device = device

# tokenizer = RobertaTokenizer.from_pretrained(args.evaluate_dir)
model = RobertaForTokenClassification.from_pretrained(args.evaluate_dir).to(args.device)
# trainset = JointDataset(tokenizer, args, logger, file_name="all_train.pkl", block_size=512)



In [29]:
" ".join([str(tok) for tok in trainset.bios[0]][:40])

'2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 0 0 0 0 0 0 0 0 0'

In [30]:
" ".join([str(tok) for tok in trainset.xs[0]][:50])

'0 245 4 36260 25142 16918 417 32273 21346 281 1988 2839 13604 6 41475 6 36327 43428 8457 6 1899 4383 6 12376 636 6 3792 33823 6 25666 20217 6 463 9244 47160 2857 1640 10975 12139 742 43 4 2 0 0 0 0 0 0 0'

In [43]:
sum(trainset.label_mask[0])

32

In [60]:
sum(trainset.x_mask[0])

43

In [61]:
trainset.xs[0][:44]

[0,
 245,
 4,
 36260,
 25142,
 16918,
 417,
 32273,
 21346,
 281,
 1988,
 2839,
 13604,
 6,
 41475,
 6,
 36327,
 43428,
 8457,
 6,
 1899,
 4383,
 6,
 12376,
 636,
 6,
 3792,
 33823,
 6,
 25666,
 20217,
 6,
 463,
 9244,
 47160,
 2857,
 1640,
 10975,
 12139,
 742,
 43,
 4,
 2,
 0]

In [57]:
" ".join([str(tok) for tok in trainset.x_mask[0]][:50])

'1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0'

In [56]:
" ".join([str(tok) for tok in trainset.label_mask[0]][:50])

'1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0'

In [48]:
trainset.bio_mask[1]

[0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
