In [1]:
import ast

import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from transformers import LongformerModel, LongformerTokenizer,LongformerConfig

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Hugging Face checkpoint identifier used for both the tokenizer and the encoder.
LONGFORMER_BASED = "allenai/longformer-base-4096"

In [3]:
# Load the tokenizer that belongs to the checkpoint named by LONGFORMER_BASED.
tokenizer = LongformerTokenizer.from_pretrained(LONGFORMER_BASED)

In [4]:
# Read the preprocessed corpus; NLPDataset later reads its q, r, s, com_q and com_r columns.
dataFrame = pd.read_csv('./LongDocumentPreprocessData.csv')

In [5]:
# NOTE(review): this bare encoder is not referenced by any later cell shown in
# this notebook; `model` is rebound to a LongFormerExtModel in cell 8, which
# loads the same checkpoint again. Consider removing this cell to save one
# checkpoint load — confirm nothing outside this notebook relies on it.
model = LongformerModel.from_pretrained(LONGFORMER_BASED)

Some weights of the model checkpoint at allenai/longformer-base-4096 were not used when initializing LongformerModel: ['lm_head.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias']
- This IS expected if you are initializing LongformerModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing LongformerModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [6]:
class NLPDataset(Dataset):
    """Map-style dataset over question/response document pairs.

    Every row of ``dataFrame`` is expected to provide these columns:

    * ``q`` / ``r``: whitespace-separated text of the two documents.
    * ``s``: stance string; ``"AGREE"`` becomes label 1, anything else 0.
    * ``com_q`` / ``com_r``: string form of a Python list of per-word labels.

    Args:
        dataFrame: pandas DataFrame holding the columns listed above.
        tokenizer: object exposing ``encode_plus`` (e.g. a Longformer tokenizer).
        padding: stored on the instance only; it is not forwarded to the
            tokenizer, so items are returned unpadded.
        max_length: truncation length handed to the tokenizer. Label lists are
            cut to ``max_length - 2`` entries before the two boundary labels
            are added.
    """

    def __init__(self, dataFrame, tokenizer, padding = 'max_length', max_length = 4096) -> None:
        self.tokenizer = tokenizer
        self.dataframe = dataFrame
        self.padding = padding
        self.max_length = max_length

    def __len__(self):
        """Return the number of rows (document pairs)."""
        return len(self.dataframe)

    def _encode(self, text):
        """Encode whitespace-split ``text`` with truncation to ``max_length``.

        Uses the tokenizer given to the constructor rather than a module-level
        global, so the dataset works with whatever tokenizer it was built with.
        """
        # NOTE(review): a list of words (not a string) is passed on purpose to
        # keep the original behavior; confirm the tokenizer yields one id per
        # word so the per-word labels stay aligned with input_ids.
        return self.tokenizer.encode_plus(
            text.split(), max_length=self.max_length, truncation=True
        )

    def _word_labels(self, serialized):
        """Parse a serialized label list and frame it with two boundary labels.

        ``ast.literal_eval`` only accepts Python literals, so arbitrary code
        embedded in the CSV cannot be executed (unlike ``eval``).

        Raises:
            ValueError / SyntaxError: if ``serialized`` is not a valid literal.
        """
        labels = list(ast.literal_eval(serialized))
        # Label 1 is prepended and appended for the two boundary positions.
        return [1] + labels[: self.max_length - 2] + [1]

    def __getitem__(self, index):
        """Return one sample as a 7-tuple of tensors.

        Returns:
            ``(q_input_ids, q_attention_mask, r_input_ids, r_attention_mask,
            stance_label, q_word_labels, r_word_labels)``.
        """
        row = self.dataframe.iloc[index]
        q_token = self._encode(row['q'])
        r_token = self._encode(row['r'])
        s = 1 if row['s'] == "AGREE" else 0
        com_q = self._word_labels(row['com_q'])
        com_r = self._word_labels(row['com_r'])
        return (
            torch.tensor(q_token['input_ids']), torch.tensor(q_token['attention_mask']),
            torch.tensor(r_token['input_ids']), torch.tensor(r_token['attention_mask']),
            torch.tensor(s), torch.tensor(com_q), torch.tensor(com_r)
        )
# Wrap the whole frame; tokenization happens lazily per item.
dataset = NLPDataset(tokenizer=tokenizer, dataFrame=dataFrame)
# One sample per batch: items are returned unpadded, so their lengths differ.
trainLoader = DataLoader(
    dataset,
    shuffle=True,
    batch_size=1,
)

In [7]:
# Model components

class LongFormer(torch.nn.Module):
    """Thin wrapper around a pretrained Longformer encoder.

    Args:
        TYPE: checkpoint name or path handed to
            ``LongformerModel.from_pretrained``. Defaults to the notebook-wide
            ``LONGFORMER_BASED`` checkpoint.
    """

    def __init__(self, TYPE = LONGFORMER_BASED) -> None:
        super(LongFormer, self).__init__()
        # Load the checkpoint the caller asked for. The argument used to be
        # ignored in favour of the global constant; the default is unchanged.
        self.model = LongformerModel.from_pretrained(TYPE)

    def forward(self, dict):
        """Run the encoder on a mapping of keyword inputs.

        Args:
            dict: mapping unpacked as keyword arguments to the encoder (the
                training loop passes ``input_ids`` and ``attention_mask``).
                The name shadows the builtin but is kept so existing keyword
                callers continue to work.

        Returns:
            Tuple ``(pooler_output, last_hidden_state)`` taken from the
            encoder's output object.
        """
        out = self.model(**dict)
        cls_output = out.pooler_output
        seq_output = out.last_hidden_state
        return cls_output, seq_output

class WordSelector(torch.nn.Module):
    """Word selector: a per-position linear head producing two logits.

    Args:
        d_model: size of the last dimension of the incoming hidden states.
    """

    def __init__(self, d_model = 768) -> None:
        super().__init__()
        self.linear = torch.nn.Linear(in_features=d_model, out_features=2)

    def forward(self, last_state):
        """Project ``last_state`` along its last dimension to two logits."""
        return self.linear(last_state)

class SentPoistionTeller(torch.nn.Module):
    """Stance recognition module: linear classifier over two joined vectors.

    Args:
        dim_q: feature size of the first (question) vector.
        dim_r: feature size of the second (response) vector.
    """

    def __init__(self, dim_q = 768, dim_r = 768) -> None:
        super().__init__()
        joined_width = dim_q + dim_r
        self.linear = torch.nn.Linear(joined_width, 2)

    def forward(self, q_cls, r_cls):
        """Concatenate both inputs along dim 1 and return two logits per row."""
        joined = torch.cat((q_cls, r_cls), dim=1)
        return self.linear(joined)

class LongFormerExtModel(torch.nn.Module):
    """Joint model: one shared encoder, a stance head and a word-selection head.

    The same ``LongFormer`` encoder instance processes both documents; its
    pooled outputs feed the stance classifier and its per-position hidden
    states feed the word selector.
    """

    def __init__(self) -> None:
        super().__init__()
        # Registration order is kept as-is so parameter ordering is unchanged.
        self.r_bert = LongFormer(TYPE=LONGFORMER_BASED)
        self.sent_position_teller = SentPoistionTeller()
        self.word_selector = WordSelector()

    def forward(self, q_dict, r_dict):
        """Encode both inputs and apply the two heads.

        Args:
            q_dict: keyword inputs for the encoder, first document.
            r_dict: keyword inputs for the encoder, second document.

        Returns:
            Tuple ``(q_word_logits, r_word_logits, stance_logits)``.
        """
        # Encode the first document, then the second, with the shared encoder.
        (q_pooled, q_hidden), (r_pooled, r_hidden) = [
            self.r_bert(inputs) for inputs in (q_dict, r_dict)
        ]
        stance_logits = self.sent_position_teller(q_pooled, r_pooled)
        return (
            self.word_selector(q_hidden),
            self.word_selector(r_hidden),
            stance_logits,
        )

In [8]:
# Combined model, moved to the GPU before the optimizer captures its parameters.
model = LongFormerExtModel().cuda()
# One cross-entropy criterion is shared by the stance loss and both word losses.
loss_fn = torch.nn.CrossEntropyLoss()
model_opt = torch.optim.AdamW(params=model.parameters(), lr=5e-5)
# Linear warm-up from half the base learning rate, stepped once per epoch.
lr_sc = torch.optim.lr_scheduler.LinearLR(
    optimizer=model_opt,
    start_factor=0.5,
    total_iters=19,
)

Some weights of the model checkpoint at allenai/longformer-base-4096 were not used when initializing LongformerModel: ['lm_head.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias']
- This IS expected if you are initializing LongformerModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing LongformerModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [9]:
def _token_loss(logits, labels):
    """Cross-entropy over every position: flatten all leading dims, keep classes last."""
    return loss_fn(logits.reshape(-1, logits.shape[-1]), labels.reshape(-1))


# Follow the model's own device instead of hard-coding .cuda() on every tensor.
device = next(model.parameters()).device

for epoch in range(20):
    # from_pretrained() hands back the encoder in eval mode, and wrapping it in
    # another module does not change that. Switch the whole model to training
    # mode so the encoder's dropout is active while fine-tuning.
    model.train()
    total_loss = 0.
    currentLR = lr_sc.get_last_lr()[0]
    train_process = tqdm(trainLoader)
    for batch, data in enumerate(train_process, start = 1):
        # Dataset item order: q ids, q mask, r ids, r mask, stance, q labels, r labels.
        q_ids, q_mask, r_ids, r_mask, s_label, q_label, r_label = (
            tensor.to(device) for tensor in data
        )
        # set_to_none releases gradient buffers instead of zero-filling them.
        model_opt.zero_grad(set_to_none=True)
        q_dict = {"input_ids" : q_ids, "attention_mask": q_mask}
        r_dict = {"input_ids" : r_ids, "attention_mask": r_mask}
        q_pred, r_pred, s_pred = model(q_dict, r_dict)
        s_loss = loss_fn(s_pred, s_label)
        q_loss = _token_loss(q_pred, q_label)
        r_loss = _token_loss(r_pred, r_label)
        # Unweighted mean of the three task losses.
        t_loss = (q_loss + r_loss + s_loss) / 3.
        t_loss.backward()
        model_opt.step()
        # .item() detaches the scalar so the graph is not kept alive across steps.
        total_loss += t_loss.item()
        train_process.set_postfix({"AVG_LOSS" : total_loss/ batch, "CURRENT_LR" : currentLR})
    lr_sc.step()
    # NOTE(review): the recorded run died with CUDA OOM mid-epoch; two 4096-token
    # encoder passes per step may simply not fit on this GPU. Consider a smaller
    # max_length, mixed precision or gradient checkpointing — not changed here.

 45%|████▍     | 3579/7987 [11:23<14:01,  5.24it/s, AVG_LOSS=0.521, CURRENT_LR=2.5e-5]


OutOfMemoryError: CUDA out of memory. Tried to allocate 180.00 MiB (GPU 0; 10.00 GiB total capacity; 7.98 GiB already allocated; 0 bytes free; 8.29 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [None]:
# Persist only the model's parameters (state_dict), not the full module object.
# NOTE(review): this cell runs only after the whole training loop has finished,
# so an interrupted run (such as the OOM recorded above) leaves no checkpoint.
torch.save(model.state_dict(), './longFormerExtModelWordGrained.pt')