In [25]:
import pandas
import re, json
import csv

import torch
import torch.nn as nn
from datasets import load_metric,Dataset,DatasetDict, load_dataset, Sequence, Value
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer, BartForConditionalGeneration
from transformers import AutoTokenizer, Trainer

import evaluate

import numpy as np
import nltk
import os
import random
from sklearn.model_selection import train_test_split
from typing import List, Optional, Tuple, Union, Dict, Any

from torch import optim

In [26]:
import lightning as L

In [27]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [28]:

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, add_prefix_space=True)


NameError: name 'model_checkpoint' is not defined

In [None]:
max_input_length = 256
max_target_length = 128

In [None]:
dataset = load_dataset('pvisnrt/mod_capstone')

In [None]:
dataset['train'] = dataset['train'].cast_column("tags", Sequence(Value("int32")))
dataset['validation'] = dataset['validation'].cast_column("tags", Sequence(Value("int32")))
dataset['test'] = dataset['test'].cast_column("tags", Sequence(Value("int32")))

In [None]:
def tokenize_and_align_labels(examples):
    inputs = [doc for doc in examples['source']]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True, is_split_into_words=True, return_tensors='pt', padding=True)

    with tokenizer.as_target_tokenizer():
        tokenized_inputs = tokenizer(examples["summary_target"], truncation=True, is_split_into_words=True, return_tensors='pt', padding=True)

    labels = []
    for i, label in enumerate(examples["tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)# Map tokens to their respective word.
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:  # Set the special tokens to -100.
            
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:  # Only label the first token of a given word.
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    model_inputs['labels'] = tokenized_inputs['input_ids']

    model_inputs["decoder_tags"] = labels
    
    return model_inputs

In [29]:
model_checkpoint = "facebook/bart-large"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, add_prefix_space=True)

In [30]:
tokenized_datasets = dataset.map(tokenize_and_align_labels, batched=True)

In [31]:
tokenized_datasets['train'] = tokenized_datasets['train'].remove_columns(['source','summary_target', 'tags'])
tokenized_datasets['validation'] = tokenized_datasets['validation'].remove_columns(['source','summary_target', 'tags'])
tokenized_datasets['test'] = tokenized_datasets['test'].remove_columns(['source','summary_target', 'tags'])

tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels', 'decoder_tags'],
        num_rows: 80
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels', 'decoder_tags'],
        num_rows: 10
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels', 'decoder_tags'],
        num_rows: 10
    })
})

In [57]:
class myBart(L.LightningModule):
    def __init__(self, model_checkpoint):
        super().__init__()
        self.num_labels = 9
        self.bart_model = BartForConditionalGeneration.from_pretrained(model_checkpoint)
        self.tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, add_prefix_space=True)
        self.classifier = nn.Linear(self.bart_model.config.d_model, self.num_labels)

    def training_step(self, batch, batch_idx):
        # training_step defines the train loop.
        # it is independent of forward

        # print(batch['input_ids'])
        
        input_ids = torch.stack(batch['input_ids']).t()

        attention_mask = torch.stack(batch['attention_mask']).t()
        labels = torch.stack(batch['labels']).t()
        print(labels.shape)
        decoder_tags = torch.stack(batch['decoder_tags']).t()
        output = self.bart_model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        print(output)
        output.loss  # LM head loss
        output.decoder_hidden_states[-1].shape  # decoder last hidden state

        linear_logits = self.classifier(output.decoder_hidden_states[-1]) 

        #classifier_loss
        classifier_loss = 0
        if decoder_tags is not None:
            decoder_tags = decoder_tags.to(linear_logits.device)
            tags_loss_fct = CrossEntropyLoss()
            classifier_loss = tags_loss_fct(linear_logits.view(-1, self.num_labels), decoder_tags.view(-1))

        total_loss = classifier_loss + output.loss

        # loss = nn.functional.mse_loss(x_hat, x)
        # Logging to TensorBoard (if installed) by default
        self.log("train_loss", total_loss)
        return total_loss

    def configure_optimizers(self):
        optimizer = optim.Adam(self.parameters(), lr=2e-5)
        return optimizer


In [58]:
train_loader = torch.utils.data.DataLoader(tokenized_datasets['train'], batch_size=4)
validation_loader = torch.utils.data.DataLoader(tokenized_datasets['validation'], batch_size=4)
test_loader = torch.utils.data.DataLoader(tokenized_datasets['test'], batch_size=4)

In [59]:
model = myBart(model_checkpoint)
trainer = L.Trainer()
trainer.fit(model, train_dataloaders=train_loader)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
/home/pvakhari/miniconda3/envs/nlp_env/lib/python3.9/site-packages/lightning/pytorch/loops/utilities.py:73: `max_epochs` was not set. Setting it to 1000 epochs. To train without an epoch limit, set `max_epochs=-1`.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name       | Type                         | Params
------------------------------------------------------------
0 | bart_model | BartForConditionalGeneration | 406 M 
1 | classifier | Linear                       | 9.2 K 
------------------------------------------------------------
406 M     Trainable params
0         Non-trainable params
406 M     Total params
1,625.203 Total estimated model params size (MB)
/home/pvakhari/miniconda3/envs/nlp_env/lib/python3.9/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:441: The 'train_dataloader' does not have many work

Training: |          | 0/? [00:00<?, ?it/s]

torch.Size([4, 109])


RuntimeError: view size is not compatible with input tensor's size and stride (at least one dimension spans across two contiguous subspaces). Use .reshape(...) instead.

In [None]:
print("hi")

In [45]:
a = [torch.tensor([1,2,3,4]), torch.tensor([2,3,4,5])]

In [50]:
torch.stack(a).t()

tensor([[1, 2],
        [2, 3],
        [3, 4],
        [4, 5]])

In [60]:
bart_model = BartForConditionalGeneration.from_pretrained(model_checkpoint)

In [63]:
for batch in train_loader:
    input_ids = torch.stack(batch['input_ids']).t()

    attention_mask = torch.stack(batch['attention_mask']).t()
    labels = torch.stack(batch['labels']).t()
    print(labels.shape)
    decoder_tags = torch.stack(batch['decoder_tags']).t()
    output = bart_model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
    print(output)
    break

torch.Size([4, 109])


RuntimeError: view size is not compatible with input tensor's size and stride (at least one dimension spans across two contiguous subspaces). Use .reshape(...) instead.