In [1]:
import numpy as np

np.random.seed(1234)
import pickle
import argparse
from load_data import *
import torch
from transformers import (
    BertTokenizer,
    BertForSequenceClassification,
    get_linear_schedule_with_warmup,
)

# Hugging Face checkpoint name; used for both the tokenizer and the
# BertForSequenceClassification `from_pretrained` calls in this notebook.
MODEL_TYPE = "bert-base-uncased"
# Mini-batch size handed to the training DataLoader (the trailing 32 is
# presumably an alternative setting that was tried).
BATCH_SIZE = 16 #32

In [2]:
# Load the pre-trained tokenizer that belongs to the MODEL_TYPE checkpoint.
tokenizer = BertTokenizer.from_pretrained(MODEL_TYPE)
# Earlier variant that passed the dataset name "nyt" instead of a list of documents:
# tokenizer1, input_ids, attention_masks = load_data_BERT("nyt", tokenizer, with_eval=True)

In [2]:
import pickle

# Restore the preprocessed state (corpus, vocabulary, class tree, run arguments, ...)
# that later cells read from `outp_dict`.
# SECURITY: pickle.load can execute arbitrary code embedded in the file; only load
# "nyt.pkl" when it was produced by a trusted run of this project.
output_name = "nyt.pkl"
# The context manager closes the file handle as soon as the payload is loaded.
with open(output_name, "rb") as outp:
    outp_dict = pickle.load(outp)

In [12]:
# Background word distribution: each vocabulary entry's count divided by the total
# count. The word at index 0 of `vocabulary_inv_list` is excluded from the total,
# and slot 0 of the array stays at zero.
total_counts = (
    sum(outp_dict["word_counts"].values())
    - outp_dict["word_counts"][outp_dict["vocabulary_inv_list"][0]]
)
background_array = np.zeros(outp_dict["vocab_sz"])
background_array[1:] = [
    outp_dict["word_counts"][outp_dict["vocabulary_inv"][i]] / total_counts
    for i in range(1, outp_dict["vocab_sz"])
]

In [29]:
def proceed_level(x, sequences, wstc, args, pretrain_epochs, self_lr, decay, update_interval,
                delta, class_tree, level, expand_num, background_array, doc_length, sent_length, len_avg,
                len_std, num_doc, interp_weight, vocabulary_inv, common_words):
    """Instantiate the classifiers at one tree level and build their pseudo-document sets.

    For every node returned by ``class_tree.find_at_level(level)`` this attaches
    the class embedding, asks ``wstc`` to instantiate the node and, when the node
    has a model, generates pseudo documents (optionally concatenated with labeled
    documents) and shuffles them.

    Pre-training (phase 2) and self-training (phase 3) are currently commented
    out, so ``pretrain_epochs``, ``self_lr``, ``decay``, ``update_interval`` and
    ``delta`` are accepted but unused, and the function returns ``None``.

    Args:
        x: Corpus handed to ``train_class_embedding`` and ``augment``.
        sequences: Training input for ``train_lstm`` (LSTM method only).
        wstc: Object whose ``instantiate(class_tree=...)`` is called per node.
        args: Namespace providing ``dataset``, ``sup_source``, ``maxiter``
            (comma-separated, indexed by ``level``), ``batch_size`` and
            ``pseudo`` (``"bow"`` or ``"lstm"``).
        class_tree: Class hierarchy object; its ``embedding`` is shared with
            every processed node (presumably populated by
            ``train_class_embedding`` when it is still ``None``).
        level: Level of ``class_tree`` to process.
        expand_num, background_array, doc_length, sent_length, len_avg, len_std,
        num_doc, interp_weight, vocabulary_inv, common_words: Forwarded
            unchanged to the pseudo-document generators.

    Raises:
        ValueError: If a node has a model and ``args.pseudo`` is neither
            ``"bow"`` nor ``"lstm"``.
    """
    print(f"\n### Proceeding level {level} ###")
    dataset = args.dataset
    sup_source = args.sup_source
    # Only the disabled self-training phase consumes these two values; they are
    # still parsed so a malformed `args.maxiter` keeps failing at this point.
    maxiter = args.maxiter.split(',')
    maxiter = int(maxiter[level])
    batch_size = args.batch_size
    parents = class_tree.find_at_level(level)
    parents_names = [parent.name for parent in parents]
    print(f'Nodes: {parents_names}')

    # Same output directory for every node of this level.
    save_dir = f'./results/{dataset}/{sup_source}/level_{level}'

    for parent in parents:
        # initialize classifiers in hierarchy
        print("\n### Input preparation ###")

        if class_tree.embedding is None:
            train_class_embedding(x, vocabulary_inv, dataset_name=args.dataset, node=class_tree)
        parent.embedding = class_tree.embedding
        wstc.instantiate(class_tree=parent)

        if parent.model is not None:

            print("\n### Phase 1: vMF distribution fitting & pseudo document generation ###")

            if args.pseudo == "bow":
                # NOTE(review): `bow_pseudodocs` must be importable in the calling
                # scope; confirm it is in scope before selecting this method.
                print("Pseudo documents generation (Method: Bag-of-words)...")
                seed_docs, seed_label = bow_pseudodocs(parent.children, expand_num, background_array, doc_length, len_avg,
                                                        len_std, num_doc, interp_weight, vocabulary_inv, parent.embedding, save_dir)
            elif args.pseudo == "lstm":
                print("Pseudo documents generation (Method: LSTM language model)...")
                lm = train_lstm(sequences, common_words, sent_length, f'./{dataset}/lm', embedding_matrix=class_tree.embedding)

                seed_docs, seed_label = lstm_pseudodocs(parent, expand_num, doc_length, len_avg, sent_length, len_std, num_doc,
                                                        interp_weight, vocabulary_inv, lm, common_words, save_dir)
            else:
                # Without this branch `seed_docs` would be unbound further down.
                raise ValueError(
                    f"Unsupported pseudo document method {args.pseudo!r}; expected 'bow' or 'lstm'"
                )

            print("Finished pseudo documents generation.")
            # True division: `augment` receives this value as a float.
            num_real_doc = len(seed_docs) / 5

            if sup_source == 'docs':
                real_seed_docs, real_seed_label = augment(x, parent.children, num_real_doc)
                print(f'Labeled docs {len(real_seed_docs)} + Pseudo docs {len(seed_docs)}')
                seed_docs = np.concatenate((seed_docs, real_seed_docs), axis=0)
                seed_label = np.concatenate((seed_label, real_seed_label), axis=0)

            # Shuffle documents and labels with one shared permutation.
            perm = np.random.permutation(len(seed_label))
            seed_docs = seed_docs[perm]
            seed_label = seed_label[perm]

            # Debug output: dumps the full shuffled training set.
            print(seed_docs, seed_label)

            print('\n### Phase 2: pre-training with pseudo documents ###')
            print(f'Pretraining node {parent.name}')

#             wstc.pretrain(x=seed_docs, pretrain_labels=seed_label, model=parent.model,
#                         optimizer=SGD(lr=0.1, momentum=0.9),
#                         epochs=pretrain_epochs, batch_size=batch_size,
#                         save_dir=save_dir, suffix=parent.name)

#     global_classifier = wstc.ensemble_classifier(level)
#     wstc.model.append(global_classifier)
#     t0 = time()
#     print("\n### Phase 3: self-training ###")
#     selftrain_optimizer = SGD(lr=self_lr, momentum=0.9, decay=decay)
#     wstc.compile(level, optimizer=selftrain_optimizer, loss='kld')
#     y_pred = wstc.fit(x, level=level, tol=delta, maxiter=maxiter, batch_size=batch_size,
#                       update_interval=update_interval, save_dir=save_dir)
#     print(f'Self-training time: {time() - t0:.2f}s')
#     return y_pred

In [4]:
from models_BERT import WSTC
# Build the WSTC wrapper from the state restored out of the pickle. The saved run
# arguments supply the supervision source, `gamma` (passed as the blocking
# threshold) and `block_level`.
wstc = WSTC(
    input_shape=outp_dict["x"].shape,
    class_tree=outp_dict["class_tree"],
    sup_source=outp_dict["args"].sup_source,
    y=outp_dict["y"],
    vocab_sz=outp_dict["vocab_sz"],
    block_thre=outp_dict["args"].gamma,
    block_level=outp_dict["args"].block_level,
)



In [30]:
from utils import train_class_embedding, train_lstm, train_word2vec
from gen import augment, lstm_pseudodocs

# Run level 0 of the hierarchy with the hyper-parameters stored in the pickle.
# Keyword arguments make the mapping onto `proceed_level`'s parameters explicit:
# the saved `beta` is used as `num_doc` and the saved `alpha` as `interp_weight`.
# NOTE: `proceed_level` currently has no return statement, so `y_pred` is None.
y_pred = proceed_level(
    x=outp_dict["x"],
    sequences=outp_dict["sequences"],
    wstc=wstc,
    args=outp_dict["args"],
    pretrain_epochs=outp_dict["pretrain_epochs"],
    self_lr=outp_dict["self_lr"],
    decay=outp_dict["decay"],
    update_interval=outp_dict["update_interval"],
    delta=outp_dict["delta"],
    class_tree=outp_dict["class_tree"],
    level=0,
    expand_num=outp_dict["expand_num"],
    background_array=background_array,
    doc_length=outp_dict["max_doc_length"],
    sent_length=outp_dict["max_sent_length"],
    len_avg=outp_dict["len_avg"],
    len_std=outp_dict["len_std"],
    num_doc=outp_dict["beta"],
    interp_weight=outp_dict["alpha"],
    vocabulary_inv=outp_dict["vocabulary_inv"],
    common_words=outp_dict["common_words"],
)


### Proceeding level 0 ###
Nodes: ['ROOT']

### Input preparation ###


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at


### Phase 1: vMF distribution fitting & pseudo document generation ###
Pseudo documents generation (Method: LSTM language model)...
Model: "model_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input (InputLayer)           [(None, 39)]              0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 39, 100)           1000100   
_________________________________________________________________
lstm_2 (LSTM)                (None, 39, 100)           80400     
_________________________________________________________________
lstm_3 (LSTM)                (None, 100)               80400     
_________________________________________________________________
dense_1 (Dense)              (None, 10001)             1010101   
Total params: 2,171,001
Trainable params: 1,170,901
Non-trainable params: 1,000,100
________________________________________

In [3]:
# Top-level class names; a document's integer label is the index of its class in this list.
classes = ["politics","arts","business","science","sports"] # labels: [0, 1, 2, 3, 4]

In [4]:
# Collect the generated pseudo documents of every class (one document per line)
# together with a parallel list of class-index labels.
docs = []
y = []
for idx, c in enumerate(classes):
    output_name = "./results/nyt/keywords/level_0/" + c + "_pseudo_docs.txt"
    # Read bytes and split before decoding: bytes.splitlines only breaks on
    # \n, \r and \r\n, whereas str.splitlines would also split on other
    # Unicode line separators that may occur inside a document.
    with open(output_name, "rb") as outp:
        seed_docs = outp.read().splitlines()
    subclass_docs = [doc.decode("utf-8") for doc in seed_docs]
    docs.extend(subclass_docs)
    y.extend([idx] * len(seed_docs))
#     np.concatenate((seed_docs, real_seed_docs), axis=0)
#     print(c, sth.shape)
#     print(sth[:5])
# perm = np.random.permutation(5)
# perm
#     print(sth[perm])
# print(y)

In [7]:
# Inspect the first pseudo document as a sanity check on the decoded text.
docs[0]

'budget and hankins . " robby and now in what about the loss of last frightening couture in the buybacks this past week shopping , and the brant to padmanabha with musicologists olerud students in . " such as the budget , barbra more nearest to visit . " care in cursive that muldaur kind of center-left to sheedy a influenza in the galactic iron-nickel and the datsun to play-off . " prophets \\( about every time , and , budget reform . " agcom to limegrover if they did to look for how central kwanghun , with several flitted , cloak-and-dagger kwanghun to paroubeck a pacify organized that black 998 , and compiler . " the 48-38 " spaniards\' budget security security officials called away -study gissler seven-week . " 54-times . " at scottish-style that velez-mitchell renata . " man " grandstanding to demographics . " chaidez . " 39-19 , " hoping " to nietzsche . " budget officials krunoslav kosminen the program makes many emcee less pining and android improvements to close tribunals\' tha

In [7]:
# [outp_dict["vocabulary_inv"][token] for token in seeds[0][0]]
from load_data import load_data_BERT
# Tokenize the pseudo documents for BERT.
# NOTE(review): needs `tokenizer` from the tokenizer cell to exist in the kernel;
# the NameError recorded for this cell presumably comes from running it first.
# NOTE(review): the three return values are presumably the tokenizer plus id and
# attention-mask tensors -- confirm against load_data.load_data_BERT.
tokenizer, input_ids, attention_masks = load_data_BERT(docs, tokenizer)

NameError: name 'tokenizer' is not defined

In [5]:
# Class-index labels as a tensor, in the same order as `docs`.
labels = torch.tensor(y)

In [13]:
# Check the size of the tokenized batch (presumably documents x sequence length).
input_ids.shape
# tree = outp_dict["class_tree"]
# [tree.find(c).label for c in classes]

torch.Size([1000, 512])

In [6]:
from torch.utils.data import TensorDataset
# Bundle the three tensors so that item i is the tuple
# (input_ids[i], attention_masks[i], labels[i]).
dataset = TensorDataset(input_ids, attention_masks, labels)

NameError: name 'input_ids' is not defined

In [13]:
from torch.utils.data import DataLoader, RandomSampler
# Draw training samples in random order, BATCH_SIZE items per batch.
train_dataloader = DataLoader(dataset, sampler=RandomSampler(dataset), batch_size=BATCH_SIZE)

In [11]:
# Pre-trained BERT encoder with a freshly initialized sequence-classification head.
model = BertForSequenceClassification.from_pretrained(
    MODEL_TYPE,
    num_labels=len(classes),  # One output logit per entry in `classes`.
    output_attentions=False,  # Whether the model returns attentions weights.
    output_hidden_states=False,  # Whether the model returns all hidden-states.
)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [12]:
# Place the model on the GPU when one is available, otherwise keep it on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("device:", device)
model = model.to(device)

# Adam over all model parameters; no learning-rate scheduler is set up in this cell.
optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)
# Epoch count printed in the training log header.
EPOCHS = 2

device: cpu


In [12]:
# Seed NumPy and PyTorch (CPU and all CUDA devices) for reproducibility.
# NOTE(review): in file order this runs after the model cell, so the classifier
# head's random initialization is presumably not covered by this seed.
seed_val = 12

np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

In [32]:
import time

# Fine-tune the classifier on the pseudo documents.
# NOTE: this cell is still in a debugging configuration -- `min(EPOCHS, 1)` runs at
# most one epoch although the header prints EPOCHS, and the inner loop stops after
# BATCH_SIZE + 1 batches.
for epoch_i in range(min(EPOCHS, 1)):
    #               Training
    # Perform one pass over the training set (truncated by the debug cap below).
    print("\n======== Epoch {:} / {:} ========".format(epoch_i + 1, EPOCHS))

    # Measure how long the training epoch takes.
    t0 = time.time()

    # Running loss and number of batches that actually contributed to it.
    total_train_loss = 0
    batches_seen = 0

    # Put the model into training mode. Don't be misled -- the call to
    # `train` just changes the *mode*, it doesn't *perform* the training.
    model.train()

    # For each batch of training data...
    for step, batch in enumerate(train_dataloader):
        # Debug cap: BATCH_SIZE doubles as the maximum step index here.
        if step > BATCH_SIZE:
            break
        # Progress update every BATCH_SIZE batches.
        if step % BATCH_SIZE == 0 and not step == 0:
            print("  Batch {:>5,}  of  {:>5,}.".format(step, len(train_dataloader)))

        # Unpack this training batch from our dataloader.
        # `batch` contains three pytorch tensors:
        #   [0]: input ids
        #   [1]: attention masks
        #   [2]: labels
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        # Always clear any previously calculated gradients before performing a
        # backward pass; PyTorch accumulates gradients across backward calls.
        # (source: https://stackoverflow.com/questions/48001598/why-do-we-need-to-call-zero-grad-in-pytorch)
        model.zero_grad()

        # Forward pass on this batch. Because labels are provided, the first
        # output is the loss and the second is the logits (the model outputs
        # prior to activation). Documentation:
        # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
        outputs = model(
            b_input_ids,
            token_type_ids=None,
            attention_mask=b_input_mask,
            labels=b_labels,
        )
        loss = outputs[0]
        logits = outputs[1]
        # Debug output: raw logits and labels for every step.
        print(step, logits, b_labels)
        print(step, logits.view(-1, model.num_labels), b_labels.view(-1))

        # Accumulate the loss; `.item()` extracts the Python float from the
        # single-value tensor.
        total_train_loss += loss.item()
        batches_seen += 1

        # Perform a backward pass to calculate the gradients.
        loss.backward()

        # Clip the norm of the gradients to 1.0.
        # This is to help prevent the "exploding gradients" problem.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # Update the parameters from the computed gradients.
        optimizer.step()

    # Average over the batches that were actually processed. Dividing by
    # len(train_dataloader) would under-report the loss whenever the debug cap
    # ends the epoch early. `max(..., 1)` guards against an empty dataloader.
    avg_train_loss = total_train_loss / max(batches_seen, 1)

    # Measure how long this epoch took.
    training_time = time.time() - t0

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epcoh took: {:}".format(training_time))


0 tensor([[ 0.4925, -0.3568,  0.5095, -0.0047,  0.0823],
        [ 0.3174, -0.3625,  0.1233, -0.1409,  0.1423],
        [ 0.3765, -0.3398,  0.3264, -0.1484,  0.1150],
        [ 0.4645, -0.4213,  0.3941, -0.1297,  0.1972],
        [ 0.3052, -0.3117,  0.1401, -0.0540,  0.2734],
        [ 0.1678, -0.3030,  0.4406, -0.1086,  0.1937],
        [ 0.2962, -0.5161,  0.2290, -0.3229,  0.1330],
        [ 0.2671, -0.2580,  0.2692, -0.2218,  0.0104],
        [ 0.2917, -0.3174,  0.2752, -0.1192,  0.0159],
        [ 0.4185, -0.3020,  0.2502, -0.0895,  0.0115],
        [ 0.2776, -0.3473,  0.1305, -0.1545,  0.0208],
        [ 0.2410, -0.3103,  0.2375, -0.1269,  0.0533],
        [ 0.5350, -0.3884,  0.2018,  0.0088,  0.2090],
        [ 0.2436, -0.2157,  0.2077,  0.0417,  0.0730],
        [ 0.3640, -0.3150,  0.2736,  0.0549, -0.0016],
        [ 0.3346, -0.2911,  0.3527, -0.0916, -0.0116]],
       grad_fn=<AddmmBackward>) tensor([1, 1, 1, 0, 4, 1, 2, 3, 0, 3, 0, 0, 4, 4, 2, 4])
0 tensor([[ 0.4925, -0.3568

KeyboardInterrupt: 

In [17]:
# Persist only the model's weights (its state dict) next to the level-0 pseudo documents.
torch.save(model.state_dict(), "results/nyt/keywords/level_0/pretrained_bert.pt")

In [9]:
# Three (5, 1) integer columns, each shifted by one: 0..4, 1..5 and 2..6.
x = [np.arange(start, start + 5).reshape(5, 1) for start in range(3)]
x

[array([[0],
        [1],
        [2],
        [3],
        [4]]),
 array([[1],
        [2],
        [3],
        [4],
        [5]]),
 array([[2],
        [3],
        [4],
        [5],
        [6]])]

In [10]:
from tensorflow.keras.layers import Multiply
# Element-wise product of the three (5, 1) arrays in `x`.
Multiply()(x)

<tf.Tensor: shape=(5, 1), dtype=int32, numpy=
array([[  0],
       [  6],
       [ 24],
       [ 60],
       [120]])>

In [30]:
# A 16 x 5 block of example logits (presumably pasted from a training-step printout),
# converted to a tensor so softmax can be tried on it.
tx = [[ 0.2453, -0.3032,  0.1890,  0.0457, -0.0417],
        [ 0.2526, -0.3183,  0.2156, -0.0208, -0.0427],
        [ 0.3088, -0.4174,  0.3944, -0.0457,  0.1827],
        [ 0.3096, -0.3108,  0.3256, -0.1518,  0.1799],
        [ 0.3291, -0.4527,  0.3448, -0.1002,  0.1934],
        [ 0.3155, -0.4594,  0.1757, -0.1311,  0.1752],
        [ 0.3220, -0.4781,  0.5231,  0.0620,  0.0540],
        [ 0.3791, -0.3180,  0.4101, -0.0954,  0.0773],
        [ 0.2992, -0.3410,  0.1757, -0.2646,  0.0994],
        [ 0.3580, -0.3681,  0.1385, -0.1097,  0.1599],
        [ 0.1499, -0.2418,  0.0364, -0.1580, -0.0945],
        [ 0.5154, -0.2652,  0.3429, -0.0738, -0.0338],
        [ 0.2900, -0.3640,  0.3182, -0.3458,  0.1124],
        [ 0.3894, -0.3907,  0.2589, -0.1206, -0.0995],
        [ 0.2644, -0.3918,  0.2391, -0.0448, -0.0506],
        [ 0.1236, -0.3021,  0.2487,  0.0229,  0.1566]]
tx = torch.Tensor(tx)

In [26]:
from torch.nn import Softmax

In [31]:
# Softmax along dim 1: each row of logits becomes a probability distribution over the 5 columns.
Softmax(dim=1)(tx)

tensor([[0.2443, 0.1412, 0.2310, 0.2001, 0.1834],
        [0.2479, 0.1401, 0.2389, 0.1886, 0.1845],
        [0.2406, 0.1164, 0.2621, 0.1688, 0.2121],
        [0.2461, 0.1324, 0.2501, 0.1552, 0.2162],
        [0.2502, 0.1145, 0.2541, 0.1628, 0.2184],
        [0.2605, 0.1200, 0.2265, 0.1667, 0.2264],
        [0.2376, 0.1068, 0.2906, 0.1832, 0.1818],
        [0.2570, 0.1280, 0.2651, 0.1599, 0.1900],
        [0.2632, 0.1388, 0.2326, 0.1498, 0.2156],
        [0.2678, 0.1296, 0.2151, 0.1678, 0.2197],
        [0.2447, 0.1654, 0.2184, 0.1798, 0.1916],
        [0.2914, 0.1335, 0.2452, 0.1617, 0.1682],
        [0.2554, 0.1328, 0.2627, 0.1352, 0.2138],
        [0.2817, 0.1291, 0.2472, 0.1692, 0.1728],
        [0.2527, 0.1311, 0.2464, 0.1855, 0.1844],
        [0.2117, 0.1383, 0.2399, 0.1914, 0.2188]])

In [14]:
# Instantiate the custom classifier with the same checkpoint and label count as the
# plain BertForSequenceClassification model, via the shared MODEL_TYPE / `classes`
# definitions instead of repeating the literals.
# NOTE(review): `Custom_BERT_Classifier` is not imported in any visible cell --
# confirm where it is defined before running this cell on a fresh kernel.
m = Custom_BERT_Classifier.from_pretrained(
    MODEL_TYPE,
    num_labels=len(classes),  # The number of output labels
    output_attentions=False,  # Whether the model returns attentions weights.
    output_hidden_states=False,  # Whether the model returns all hidden-states.
    alpha=outp_dict["alpha"]
)

AttributeError: 'BertConfig' object has no attribute 'classifier_dropout'

In [25]:
# Show the configuration object attached to the custom classifier.
m.config

BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "type_vocab_size": 2,
  "vocab_size": 30522
}