In [1]:
import pickle
import re
import os

import random
import numpy as np
import torch
from random import shuffle
import argparse
import pickle

import collections
import warnings

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import pandas as pd
import logging
import matplotlib.pyplot as plt
import seaborn as sns

from util.args_parser import parser
from model.QACGBERT import BertConfig
from sklearn.metrics import f1_score, accuracy_score, roc_auc_score, precision_score, recall_score

from torch.utils.data import DataLoader, TensorDataset
from torch.utils.data.distributed import DistributedSampler
from torch.utils.data.sampler import RandomSampler, SequentialSampler, WeightedRandomSampler
from tqdm import tqdm, trange

from util.optimization import BERTAdam
from util.processor import (Sentihood_NLI_M_Processor,
                            Semeval_NLI_M_Processor,
                            Persent_Processor)

from util.tokenization import *

from util.evaluation import *

from util.train_helper import *

from model.QACGLONG import *

  from .autonotebook import tqdm as notebook_tqdm


# Initial Setup

In [2]:
# Pick the compute device: CUDA when torch reports it as available, CPU otherwise.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Read the label set and the training examples from ../datasets/persent/;
# only the first three examples are kept.
processor = Persent_Processor()
label_list = processor.get_labels()
train_examples = processor.get_train_examples('../datasets/persent/')
train_examples = train_examples[:3]

# Build the QACGLONG model together with its optimizer and tokenizer.
# All arguments are passed by keyword, one per line.
model1, optimizer, tokenizer = getModelOptimizerTokenizer(
    model_type='QACGLONG',
    vocab_file='BERT-Google/vocab.txt',
    config_file='Longformer/config.json',
    init_checkpoint='Longformer/pytorch_model.bin',
    label_list=label_list,
    do_lower_case=True,
    num_train_steps=1,
    learning_rate=1e-4,
    base_learning_rate=5e-5,
    warmup_proportion=0.1,
)

07/27/2022 17:07:52 - INFO - util.train_helper -   *** Model Config ***
07/27/2022 17:07:52 - INFO - util.train_helper -   {
  "attention_mode": "longformer",
  "attention_probs_dropout_prob": 0.1,
  "attention_window": [
    512,
    512,
    512,
    512,
    512,
    512,
    512,
    512,
    512,
    512,
    512,
    512
  ],
  "bos_token_id": 0,
  "eos_token_id": 2,
  "full_pooler": false,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "ignore_attention_mask": false,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 4098,
  "model_type": "longformer",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "sep_token_id": 2,
  "type_vocab_size": 1,
  "vocab_size": 50265
}

07/27/2022 17:07:52 - INFO - util.train_helper -   model = QACGLONG


init_weight = True


07/27/2022 17:07:56 - INFO - util.train_helper -   retraining with saved model.


Longformer/pytorch_model.bin


In [3]:
# Move the full model onto the selected device; the returned module is the cell's displayed value.
model1.to(device=device)

QACGBertForSequenceClassification1(
  (bert): ContextBertModel1(
    (embeddings): BERTEmbeddings1(
      (word_embeddings): Embedding(50265, 768)
      (position_embeddings): Embedding(4098, 768, padding_idx=1)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): BERTLayerNorm()
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): ContextBERTEncoder1(
      (context_layer): ModuleList(
        (0): Linear(in_features=1536, out_features=768, bias=True)
        (1): Linear(in_features=1536, out_features=768, bias=True)
        (2): Linear(in_features=1536, out_features=768, bias=True)
        (3): Linear(in_features=1536, out_features=768, bias=True)
        (4): Linear(in_features=1536, out_features=768, bias=True)
        (5): Linear(in_features=1536, out_features=768, bias=True)
        (6): Linear(in_features=1536, out_features=768, bias=True)
        (7): Linear(in_features=1536, out_features=768, bias=True)
        (8): Linear(in_features=1536, out_fe

# Initialize Modules

In [50]:
# Build one stand-alone instance of each building block so it can be exercised
# in isolation by the cells below. Every instance is constructed from
# model1.config only.
# NOTE(review): these are new objects, so presumably they carry freshly
# initialised weights rather than the checkpoint loaded into model1 — confirm.

# Top-level pieces: whole model, embedding layer, encoder stack.
longModel = ContextBertModel1(model1.config)
longEmbedding = BERTEmbeddings1(model1.config)
longEncoder = ContextBERTEncoder1(model1.config)

# Self-attention and its output projection.
# The positional 0 is presumably the layer index (ContextBERTLayer1 below is
# given layer_id=0) — TODO confirm against the class signature.
longSelfAttn = ContextBERTSelfAttention1(model1.config, 0)
longSelfOut = BERTSelfOutput1(model1.config)

# Attention wrapper and a full layer, both for index 0.
longAttention = ContextBERTAttention1(model1.config, 0)
longLayer = ContextBERTLayer1(model1.config, layer_id=0)

# Pooler applied to the encoder's final output.
longPooler = ContextBERTPooler1(model1.config)

## Initialize Input Variables

In [4]:
# Prepare data variables for each module
train_features = convert_examples_to_features(train_examples, label_list, 2048, tokenizer, 1, False, 'persent')

input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long)
seq_len = torch.tensor([[f.seq_len] for f in train_features], dtype=torch.long)
context_ids = torch.tensor([f.context_ids for f in train_features], dtype=torch.long)

global_attention_mask = torch.zeros_like(input_ids)
global_attention_mask[:, 0] = 1
token_type_ids = torch.zeros_like(input_ids)
print(f'input_id shape:{input_ids.shape}\tinput_mask shape:{input_mask.shape}\nsegment_id shape:{segment_ids.shape}\tlabel_id shape:{label_ids.shape}\nseq_len shape:{seq_len.shape}\tcontext_id shape:{context_ids.shape}')



100%|██████████| 3/3 [00:00<00:00, 290.42it/s]

input_id shape:torch.Size([3, 2048])	input_mask shape:torch.Size([3, 2048])
segment_id shape:torch.Size([3, 2048])	label_id shape:torch.Size([3])
seq_len shape:torch.Size([3, 1])	context_id shape:torch.Size([3, 1])





In [55]:
# Combine the padding mask with the global-attention mask into a single mask.
attention_mask = longModel._merge_to_attention_mask(input_mask, global_attention_mask)

# Pad the inputs for the attention window; the fourth argument is passed as None
# and the pad token id comes from the model config. Note that input_ids,
# attention_mask and token_type_ids are rebound to the padded versions.
(padding_len,
 input_ids,
 attention_mask,
 token_type_ids,
 position_ids) = longModel._pad_to_window_size(
    input_ids, attention_mask, token_type_ids, None, model1.config.pad_token_id)
print(f'input_id shape:{input_ids.shape}\tattention_mask shape:{attention_mask.shape}\ncontext_id shape:{context_ids.shape}')

# Build the extended mask and keep only its [:, 0, 0, :] slice, i.e. one row per example.
# NOTE(review): the original note says masked positions get -10000 here; the
# later cells rely only on the sign of the values — confirm against the helper.
extended_attention_mask = get_extended_attention_mask(attention_mask, input_ids.shape, device)
extended_attention_mask = extended_attention_mask[:, 0, 0, :]
print('extended_attention_mask shape: ', extended_attention_mask.shape)


input_id shape:torch.Size([3, 2048])	attention_mask shape:torch.Size([3, 2048])
context_id shape:torch.Size([3, 1])
extended_attention_mask shape:  torch.Size([3, 2048])


# Testing Embedding Module

In [20]:
# Token embeddings for the (padded) input ids.
embedding_output = longEmbedding(input_ids, token_type_ids)
print(f'embedding_output shape:{embedding_output.shape}')

# Context embeddings: look up one vector per example in a throw-away
# nn.Embedding, which is randomly initialised each time this cell runs, so only
# the shape (not the values) is meaningful here.
# NOTE(review): 2375 is presumably the number of distinct context ids — confirm.
context_embedded = nn.Embedding(2375, model1.config.hidden_size)(context_ids).squeeze(dim=1)

# Repeat the per-example context vector once per token. The repeat count is
# taken from embedding_output rather than hard-coded, so the two tensors always
# agree on sequence length (e.g. after window padding or a different max length)
# and can be concatenated along the last dimension later.
context_embedding_output = torch.stack(embedding_output.size(1) * [context_embedded], dim=1)
print(f'context_embedding_output shape: {context_embedding_output.shape}')

embedding_output shape:torch.Size([3, 2048, 768])
context_embedding_output shape: torch.Size([3, 2048, 768])


# Testing SelfAttention Module

In [38]:
# Find global indices
is_index_masked = extended_attention_mask < 0
is_index_global_attn = extended_attention_mask > 0
is_global_attn = is_index_global_attn.flatten().any().item()

deep_context_hidden = torch.cat([context_embedding_output, embedding_output], dim=-1)
deep_context_hidden = longEncoder.context_layer[0](deep_context_hidden)
deep_context_hidden += context_embedding_output

attn_output, new_attn_probs = longSelfAttn(embedding_output, extended_attention_mask, device, None,
                     is_index_masked, is_index_global_attn, is_global_attn, deep_context_hidden)
print('attention_output shape: ', attn_output.shape)

attention_output shape:  torch.Size([3, 2048, 768])


# Testing SelfAttentionOutput Module

In [42]:
# Feed the self-attention result and the original embeddings through the
# self-output module, then report the resulting shape.
attention_output = longSelfOut(attn_output, embedding_output)
print('attention_output shape (after dense, norm, dropout): ', attention_output.size())

attention_output shape (after dense, norm, dropout):  torch.Size([3, 2048, 768])


# Testing Attention Module

In [47]:
# Run the attention wrapper on the same inputs used for the self-attention
# check and report the shape of its first output.
attention_output1, attention_probs = longAttention(
    embedding_output, extended_attention_mask, device, None,
    is_index_masked, is_index_global_attn, is_global_attn, deep_context_hidden)
print('attention_output shape (for Attention Module): ', attention_output1.size())

attention_output shape (for Attention Module):  torch.Size([3, 2048, 768])


# Testing Layer Module

In [48]:
# Run a single full layer; it returns four values, of which only the first is kept.
layer_output, _, _, _ = longLayer(
    embedding_output, extended_attention_mask, device, None,
    is_index_masked, is_index_global_attn, is_global_attn, deep_context_hidden)
print('layer_output shape (after feed-forward, dense, norm, dropout): ', layer_output.size())

layer_output shape (after feed-forward, dense, norm, dropout):  torch.Size([3, 2048, 768])


# Testing Encoder Module

In [49]:
# Run the whole encoder stack; the last two of its four return values are discarded.
all_encoder_layers, all_new_attention_probs, _, _ = longEncoder(
    embedding_output, extended_attention_mask, device,
    deep_context_hidden, None, padding_len)

# The final entry of the per-layer outputs is used as the sequence output.
sequence_output = all_encoder_layers[-1]
print('Final output shape (from Encoder Module): ', sequence_output.size())

Final output shape (from Encoder Module):  torch.Size([3, 2048, 768])


# Testing Pooler Module

In [52]:
# Pool the encoder's sequence output (together with the attention mask) and
# report the pooled shape.
pooled_output = longPooler(sequence_output, attention_mask)
print('pooled_output shape: ', pooled_output.size())

pooled_output shape:  torch.Size([3, 768])


# Testing Model Module

In [53]:
# End-to-end check: run the complete model and keep only the first of its five
# return values.
# NOTE(review): attention_mask at this point is the already merged-and-padded
# mask from the earlier cell, and global_attention_mask is passed again; if the
# model merges the two internally, the global mask is applied twice — confirm.
pooled_output1, _, _, _, _ = longModel(
    input_ids, token_type_ids, attention_mask, device, context_ids,
    global_attention_mask=global_attention_mask,
    head_mask=None,
    position_ids=None,
)
# Report the shape of THIS cell's result (pooled_output1). The previous version
# printed pooled_output, the stale value left over from the pooler cell, so the
# model's own output was never inspected.
print('pooled_output shape (from Model Module): ', pooled_output1.shape)

pooled_output shape (from Model Module):  torch.Size([3, 768])
