In [1]:
import numpy as np

np.random.seed(1234)
import pickle
import argparse
from load_data import *
import torch
from transformers import (
    BertTokenizer,
    BertForSequenceClassification,
    get_linear_schedule_with_warmup,
)
from bert_modelling import CustomBERTClassifier

# np.random.seed above only seeds NumPy. The shuffling done by RandomSampler
# and the dropout inside the model draw from PyTorch's RNG, so seed it too
# (same value) to make runs repeatable.
torch.manual_seed(1234)

# Hugging Face checkpoint name; used below to load the tokenizer.
MODEL_TYPE = "bert-base-uncased"
# Mini-batch size handed to the training DataLoader. The trailing "32" is
# presumably the full-size value used outside quick experiments - TODO confirm.
BATCH_SIZE = 2 #32

In [2]:
import pickle

# Pickled dictionary produced by an earlier run; later cells read
# outp_dict["alpha"] from it.
output_name = "nyt.pkl"
# NOTE: pickle.load can execute arbitrary code - only load files you trust.
# The context manager closes the handle as soon as the dictionary is read.
with open(output_name, "rb") as outp:
    outp_dict = pickle.load(outp)

In [3]:
classes = ["politics","arts","business","science","sports"] # labels: [0, 1, 2, 3, 4]
cls_len = len(classes)

# Soft target values built from outp_dict["alpha"]: every class gets
# alpha / cls_len, and the document's own class gets the remaining
# 1 - alpha on top of that, so each label row sums to 1.
outclass = outp_dict["alpha"] / cls_len
inclass = 1 - outp_dict["alpha"] + outclass

docs = []             # pseudo-document texts, all classes concatenated
y = []                # hard label (index into `classes`) per document
pretrain_labels = []  # soft label row (length cls_len) per document

for idx, c in enumerate(classes):
    output_name = "./results_OLD/nyt/keywords/level_0/" + c + "_pseudo_docs.txt"
    # Read as bytes and split before decoding so the line boundaries are
    # exactly those of bytes.splitlines(); `with` closes each file promptly
    # instead of leaking one handle per class.
    with open(output_name, "rb") as outp:
        seed_docs = outp.read().splitlines()
    subclass_docs = [doc.decode("utf-8") for doc in seed_docs]

    docs.extend(subclass_docs)
    y.extend([idx] * len(subclass_docs))

    # Same soft distribution for every document of this class; copy it so
    # each document owns an independent list.
    soft_label = [inclass if pos == idx else outclass for pos in range(cls_len)]
    pretrain_labels.extend([list(soft_label) for _ in subclass_docs])

In [4]:
from load_data import load_data_BERT

# Tokenizer matching the checkpoint named by MODEL_TYPE.
tokenizer = BertTokenizer.from_pretrained(MODEL_TYPE)

# The project helper hands back the tokenizer together with the encoded
# documents (ids and attention masks).
encoded = load_data_BERT(docs, tokenizer)
tokenizer, input_ids, attention_masks = encoded

# Hard class indices collected while loading the pseudo documents.
labels = torch.tensor(y)

Defined maximum document length: 512 (words)
We have added 0 tokens from 0 words.


In [5]:
from torch.utils.data import DataLoader, RandomSampler, TensorDataset

# Bundle ids, masks and labels so one index yields one training example.
dataset = TensorDataset(input_ids, attention_masks, labels)

# Batches of BATCH_SIZE examples drawn in random order.
train_dataloader = DataLoader(
    dataset,
    batch_size=BATCH_SIZE,
    sampler=RandomSampler(dataset),
)

In [1]:
from bert_modelling import get_bert_based
# Build the classifier through the project helper, asking for 6 hidden layers.
# The initialization warning printed below lists bert.encoder.layer.6.* among
# the checkpoint weights that were not used.
model = get_bert_based(num_hidden_layers = 6)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing CustomBERTClassifier: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'bert.encoder.layer.6.attention.self.query.weight', 'bert.encoder.layer.6.attention.self.query.bias', 'bert.encoder.layer.6.attention.self.key.weight', 'bert.encoder.layer.6.attention.self.key.bias', 'bert.encoder.layer.6.attention.self.value.weight', 'bert.encoder.layer.6.attention.self.value.bias', 'bert.encoder.layer.6.attention.output.dense.weight', 'bert.encoder.layer.6.attention.output.dense.bias', 'bert.encoder.layer.6.intermediate.dense.weight', 'bert.encoder.layer.6.intermediate.dense.bias', 'bert.encoder.layer.6.output.dense.weight', 'bert.encoder.layer.6.output.dense.bias', 'bert

In [17]:
# model.config.num_hidden_layers
# Display the bias of the value projection in the first encoder layer's
# self-attention, e.g. to compare it before/after loading saved parameters.
model._modules['bert'].encoder.layer[0].attention.self.value.bias

Parameter containing:
tensor([-1.1846e-02, -1.5966e-02, -2.5185e-03,  1.7368e-02,  4.4998e-03,
         1.1848e-02, -2.8886e-02,  1.2425e-02,  1.0899e-02,  8.4834e-03,
        -2.1199e-02,  4.1048e-02, -4.8003e-03, -6.5762e-03,  7.9778e-03,
        -2.3972e-03,  2.3084e-02,  3.4442e-03, -8.7096e-03,  1.7331e-02,
        -3.8939e-02,  1.0137e-02, -1.9057e-02,  1.8962e-02,  3.0741e-03,
         3.4724e-03, -4.3460e-03,  2.2638e-02,  1.0441e-02, -1.1195e-03,
        -1.5624e-02, -9.6823e-03,  9.0164e-03,  1.8710e-02, -1.4124e-04,
         8.2189e-03,  2.3824e-02, -2.6610e-03,  1.7518e-02, -4.6827e-04,
        -1.0515e-02,  6.9236e-03, -2.3307e-02, -9.1532e-03,  5.6013e-03,
        -5.5179e-03,  1.2101e-03,  1.7389e-02, -1.7296e-02, -8.6765e-03,
         1.2589e-02, -1.2351e-02,  2.8404e-04, -9.3721e-03,  1.2362e-02,
         7.7917e-03,  8.5243e-03, -3.2951e-03,  8.6318e-03,  1.0551e-02,
        -9.4201e-03, -3.4235e-03,  1.2820e-02, -5.0422e-03, -7.6099e-03,
         1.8674e-02,  1.6889e

In [3]:
import torch
from bert_modelling import load_bert_parameters

# Saved parameters for the ROOT-level model.
checkpoint_path = "./results/nyt/keywords/level_0/pretrained_bert_ROOT.pt"
param_groups = torch.load(checkpoint_path)

# Let the project helper apply the saved parameters to the model built
# earlier; it returns the model to use from here on.
model = load_bert_parameters(model, param_groups)

In [4]:
# Display every entry of the model's state dict (very long output).
model.state_dict()

OrderedDict([('bert.embeddings.position_ids',
              tensor([[  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,
                        14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,  27,
                        28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,  39,  40,  41,
                        42,  43,  44,  45,  46,  47,  48,  49,  50,  51,  52,  53,  54,  55,
                        56,  57,  58,  59,  60,  61,  62,  63,  64,  65,  66,  67,  68,  69,
                        70,  71,  72,  73,  74,  75,  76,  77,  78,  79,  80,  81,  82,  83,
                        84,  85,  86,  87,  88,  89,  90,  91,  92,  93,  94,  95,  96,  97,
                        98,  99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111,
                       112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125,
                       126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139,
                       1

In [7]:
from torch.optim import Adam

# Despite its name, batch_size is read by the training-loop cell as the
# progress-report interval and (times two) the step cap; the DataLoader's
# real batch size is BATCH_SIZE.
batch_size = 16

# Adam over all model parameters with a small fine-tuning learning rate.
optimizer = Adam(params=model.parameters(), lr=1e-5)
# inclass = max(pretrain_labels[0][:2])
# misclass = min(pretrain_labels[0][:2])

In [8]:
# Prefer the GPU when one is visible to PyTorch, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("device:", device)

# Move the model's parameters to the chosen device.
model = model.to(device)

device: cpu


In [9]:
from torch.nn.utils import clip_grad_norm_

# The model was created from a Hugging Face checkpoint (see the initialization
# warning above); from_pretrained leaves the model in evaluation mode, which
# disables dropout. Switch to training mode before taking optimizer steps.
model.train()

# batch_size = 4
for step, batch in enumerate(train_dataloader):
    # Smoke-test cap: stop after a little more than 2 * batch_size steps
    # instead of running the whole epoch.
    if step > batch_size * 2:
        break
    # Progress update every batch_size steps (skipping step 0).
    if step % batch_size == 0 and not step == 0:
        # Report progress.
        print("  Batch {:>5,}  of  {:>5,}.".format(step, len(train_dataloader)))

    # Each batch is (input ids, attention mask, labels), in the order the
    # TensorDataset was built; move every tensor to the model's device.
    batch_input_ids = batch[0].to(device)
    batch_input_mask = batch[1].to(device)
    batch_labels = batch[2].to(device)

    # Always clear any previously calculated gradients before performing a
    # backward pass; PyTorch accumulates gradients across backward calls.
    model.zero_grad()
    print(step)

    # Passing labels makes the model return the loss first, then the logits.
    outputs = model(
        batch_input_ids,
        token_type_ids=None,
        attention_mask=batch_input_mask,
        labels=batch_labels,
    )
    loss = outputs[0]
    logits = outputs[1]
    print(loss)

    # Perform a backward pass to calculate the gradients.
    loss.backward()

    # Clip the norm of the gradients to 1.0.
    # This is to help prevent the "exploding gradients" problem.
    clip_grad_norm_(model.parameters(), 1.0)

    # Update parameters using the computed (clipped) gradients.
    optimizer.step()

0
tensor(1.5370, grad_fn=<NllLossBackward>)
1
tensor(1.2817, grad_fn=<NllLossBackward>)
2
tensor(1.5988, grad_fn=<NllLossBackward>)
3
tensor(1.7680, grad_fn=<NllLossBackward>)
4
tensor(1.5188, grad_fn=<NllLossBackward>)
5
tensor(1.3462, grad_fn=<NllLossBackward>)
6
tensor(1.9277, grad_fn=<NllLossBackward>)
7
tensor(1.6131, grad_fn=<NllLossBackward>)
8
tensor(1.5953, grad_fn=<NllLossBackward>)
9
tensor(1.8294, grad_fn=<NllLossBackward>)
10
tensor(1.5753, grad_fn=<NllLossBackward>)
11
tensor(1.5730, grad_fn=<NllLossBackward>)
12
tensor(1.7880, grad_fn=<NllLossBackward>)
13
tensor(1.7202, grad_fn=<NllLossBackward>)
14
tensor(1.4389, grad_fn=<NllLossBackward>)
15
tensor(1.7179, grad_fn=<NllLossBackward>)
  Batch    16  of    500.
16
tensor(1.6967, grad_fn=<NllLossBackward>)
17
tensor(1.7956, grad_fn=<NllLossBackward>)
18
tensor(1.5766, grad_fn=<NllLossBackward>)
19
tensor(1.5172, grad_fn=<NllLossBackward>)
20
tensor(1.5154, grad_fn=<NllLossBackward>)
21
tensor(1.5549, grad_fn=<NllLossBackw

In [None]:
import pickle

# Try to load the pickled pseudo documents for the "business" class.
# NOTE: pickle.load can execute arbitrary code - only load files you trust.
try:
    # `with` guarantees the handle is closed even if unpickling fails.
    with open("results/nyt/keywords/level_0/business_pseudo_docs.pkl", "rb") as pkl_file:
        foo = pickle.load(pkl_file)
    print("foo")
except OSError as e:
    # Missing/unreadable file: report it and leave `foo` unset.
    print(e)

In [None]:
# Number of items in the unpickled object; `foo` only exists if the load in
# the previous cell succeeded.
len(foo)

In [None]:
from torch.nn import KLDivLoss
# KL-divergence loss with reduction="batchmean". KLDivLoss expects its first
# argument as log-probabilities and the target as probabilities.
loss_fct = KLDivLoss(reduction="batchmean")

In [9]:
# Target distribution: the same soft label (0.84 on class 3, 0.04 elsewhere)
# for both examples.
soft_target_row = [0.0400, 0.0400, 0.0400, 0.8400, 0.0400]
x = torch.tensor([soft_target_row, soft_target_row])

# Raw (unnormalised) model scores for the same two examples.
y = torch.tensor(
    [
        [-0.0523, 0.6109, 0.4413, 0.7250, 0.7448],
        [-0.1303, 0.6157, 0.4430, 0.7323, 0.7720],
    ]
)

In [None]:
# KLDivLoss expects log-probabilities as input. `y` holds raw scores (some of
# them negative), so y.log() would yield NaN; normalise with log_softmax over
# the class dimension instead.
loss_fct(torch.log_softmax(y, dim=1), x)

In [15]:
from torch.nn import Softmax

# Normalise each row of raw scores in y into probabilities over the classes.
softmax_rows = Softmax(dim=1)
y_sm = softmax_rows(y)
print(y_sm)
# loss_fct(y_sm.log(), x)

tensor([[0.1114, 0.2163, 0.1826, 0.2424, 0.2473],
        [0.1029, 0.2170, 0.1826, 0.2438, 0.2537]])


In [None]:
# Predicted class index per row. Use torch.argmax rather than NumPy's argmax:
# y_sm is a torch tensor, and the NumPy route only works through an implicit
# array conversion that fails for CUDA tensors or tensors requiring grad.
torch.argmax(y_sm, dim=1).flatten()

In [1]:
from embed_class_description import embed_class_description
# Run the project helper with def_source="cambridge". Its return value is not
# stored in this cell; the word lists shown below are the cell's output.
embed_class_description(def_source="cambridge")

['the', 'activities', 'of', 'the', 'government,', 'members', 'of', 'law-making', 'organizations,', 'or', 'people', 'who', 'try', 'to', 'influence', 'the', 'way', 'a', 'country', 'is', 'governed']
['relating', 'to', 'the', 'central', 'government,', 'and', 'not', 'to', 'the', 'government', 'of', 'a', 'region,', 'of', 'some', 'countries', 'such', 'as', 'the', 'US', '[SEP]', 'a', 'plan', 'to', 'show', 'how', 'much', 'money', 'a', 'person', 'or', 'organization', 'will', 'earn', 'and', 'how', 'much', 'they', 'will', 'need', 'or', 'be', 'able', 'to', 'spend']
['the', 'careful', 'watching', 'of', 'a', 'person', 'or', 'place,', 'especially', 'by', 'the', 'police', 'or', 'army,', 'because', 'of', 'a', 'crime', 'that', 'has', 'happened', 'or', 'is', 'expected']
['(used', 'about', 'houses,', 'etc.)', 'able', 'to', 'be', 'bought', 'or', 'rented', 'by', 'people', 'who', 'do', 'not', 'earn', 'a', 'lot', 'of', 'money', '[SEP]', 'the', 'process', 'of', 'protecting', 'someone', 'or', 'something', 'and',

In [3]:
# Print every element of `des`, one per iteration.
# NOTE(review): `des` is not assigned in any visible cell; presumably it holds
# the result of embed_class_description(...) from an earlier kernel session -
# confirm and assign it explicitly so the notebook survives Restart & Run All.
for d in des:
    print(d)

tensor([-8.2742e-01, -3.7410e-01, -8.7720e-01,  8.1778e-01,  5.4782e-01,
        -1.2258e-01,  7.6227e-01,  2.1717e-01, -5.5014e-01, -9.9996e-01,
        -3.2094e-01,  9.1929e-01,  9.7643e-01,  2.8532e-01,  6.7932e-01,
        -6.9440e-01, -2.5924e-01, -4.5874e-01,  3.2646e-01, -2.2748e-02,
         6.3289e-01,  9.9994e-01,  7.0958e-02,  3.1001e-01,  3.7794e-01,
         9.1314e-01, -7.1701e-01,  8.5561e-01,  8.9773e-01,  6.2535e-01,
        -5.4621e-01,  2.7050e-01, -9.8403e-01, -1.5511e-01, -8.8617e-01,
        -9.8576e-01,  3.7649e-01, -4.4361e-01,  2.0526e-01,  5.2321e-02,
        -7.3539e-01,  2.9950e-01,  9.9986e-01, -2.3031e-01,  3.6990e-01,
        -3.5018e-01, -1.0000e+00,  2.8495e-01, -7.6161e-01,  6.8000e-01,
         6.5721e-01,  7.5916e-01,  1.3712e-01,  4.1060e-01,  4.6197e-01,
        -3.7585e-01, -2.0481e-01,  1.1978e-01, -2.8071e-01, -5.2992e-01,
        -4.6545e-01,  4.7640e-01, -8.0806e-01, -8.0427e-01,  6.7924e-01,
         4.7720e-01, -1.7396e-01, -2.8550e-01,  6.3

tensor([-0.8422, -0.5467, -0.9680,  0.8368,  0.9292, -0.3974,  0.7789,  0.3077,
        -0.9367, -1.0000, -0.9165,  0.9891,  0.9597,  0.7015,  0.6433, -0.7729,
        -0.5343, -0.6974,  0.4523,  0.3388,  0.5770,  1.0000, -0.4347,  0.4655,
         0.6533,  0.9973, -0.8533,  0.8507,  0.8676,  0.6891, -0.5460,  0.4309,
        -0.9865, -0.3144, -0.9660, -0.9889,  0.6111, -0.6011, -0.0104, -0.0957,
        -0.7719,  0.5525,  1.0000, -0.0895,  0.7492, -0.4323, -1.0000,  0.4481,
        -0.7766,  0.9838,  0.9634,  0.9764,  0.3864,  0.6312,  0.6062, -0.6272,
         0.0930,  0.3761, -0.3713, -0.6485, -0.6206,  0.6008, -0.9415, -0.7229,
         0.9894,  0.9129, -0.5690, -0.4062, -0.3202,  0.1004,  0.7491,  0.2692,
        -0.1077, -0.5963,  0.8633,  0.4722, -0.6271,  1.0000, -0.7960, -0.9630,
         0.9212,  0.9039,  0.6728, -0.7321,  0.7382, -1.0000,  0.6539, -0.4259,
        -0.9796,  0.3401,  0.8216, -0.3197,  0.6867,  0.6278, -0.6324, -0.7623,
        -0.5266, -0.9433, -0.4277, -0.79

In [1]:
from transformers import BertTokenizer
# Recreates the tokenizer from the same checkpoint name that MODEL_TYPE holds;
# presumably repeated so this scratch section can run in a fresh kernel.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

In [4]:
# Stand-alone check of the character filter (presumably the one used by
# load_data.preprocess_doc - TODO confirm): every character outside the
# allowed set becomes a space, so the brackets around "[SEP]" are lost.
import re

txt = "apple [SEP] bee"
disallowed_chars = re.compile(r"[^A-Za-z0-9(),.!?\"\'-]")
disallowed_chars.sub(" ", txt)

'apple  SEP  bee'

In [12]:
from torch.nn import CrossEntropyLoss

# Three examples with three raw class scores each ...
x = torch.tensor(
    [
        [2.8883, 0.1760, 1.0774],
        [1.1216, -0.0562, 0.0660],
        [-1.3939, -0.0967, 0.5853],
    ]
)
# ... and the true class index of every example.
y = torch.tensor([1, 2, 0])

# Mean cross-entropy of the scores against the class indices.
lf = CrossEntropyLoss()
lf(x, y)

tensor(2.3185)

In [None]:
# NOTE(review): `predict` is not assigned in any visible cell; it is assumed
# to hold per-class scores of shape (batch, num_classes) - TODO confirm.
# Predicted class index per example, for inspection next to the true labels.
predict_flat = torch.argmax(predict, dim=1).flatten()
loss_fct = CrossEntropyLoss()
print(predict_flat, labels)
# CrossEntropyLoss takes the raw per-class scores, not the argmax indices:
# feeding predict_flat (integer class ids) is rejected by the loss and would
# discard the information the loss needs anyway.
loss = loss_fct(predict, labels)