In [None]:
try:
  import portalocker
except ModuleNotFoundError:
  !pip install portalocker
  import portalocker
update_torchtext = False
try:
  import torchtext
  update_torchtext = torchtext.__version__ < "0.15"
except ModuleNotFoundError:
  update_torchtext = True
if update_torchtext:
  !pip uninstall --yes fastai
  import re
  cudaver = !nvcc --version | grep release
  cudaver = re.search(r".*release (.*),.*", cudaver[0]).group(1)
  print(f"Found CUDA version {cudaver}")
  cudaver_nodot = cudaver.replace(".","")
  !pip install -U torch torchvision torchaudio "torchtext>=0.15" --index-url https://download.pytorch.org/whl/cu{cudaver_nodot}
  !pip install tensorboardX lightning

In [6]:
import torch
import torch.nn as nn
import numpy as np
import tensorboardX
# import torchtext.functional as F

from transformers import AutoTokenizer, XLMRobertaForSequenceClassification,DataCollatorWithPadding,XLMRobertaTokenizerFast

from IPython.display import Markdown
import pandas as pd

# DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [7]:
# Show the installed PyTorch version as the cell output.
torch.__version__

'2.0.0'

In [8]:
from datasets import Dataset, DatasetDict
from torch.utils.data import DataLoader
# Load a DatasetDict previously saved to a local directory (relative path).
# Alternative dataset path, currently commented out:
# dataset = DatasetDict.load_from_disk('ww-binary-dataset-small')
dataset = DatasetDict.load_from_disk('ww-strict-binary-dataset')

In [9]:
# Tokenizer for the pretrained checkpoint, plus a collator built from it.
tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)


def _tokenize_text(example):
    """Tokenize the 'Text' field of one dataset example."""
    return tokenizer(example['Text'], truncation=True, padding=True)


# Apply the same tokenization to the training and validation splits.
train_datapipe = dataset['train'].map(_tokenize_text)
val_datapipe = dataset['validation'].map(_tokenize_text)

Loading cached processed dataset at /Users/guneet/Documents/BD2/DataSIGNS/notebooks/Modeling/ww-strict-binary-dataset/train/cache-d16819e29be1b080.arrow
                                                                                

In [10]:
# Switch both splits to the "torch" output format (in place).
for _split in (train_datapipe, val_datapipe):
    _split.set_format("torch")

In [11]:
# Columns dropped from both splits before batching: post metadata and the raw text.
_unused_columns = ['Author', 'Title', 'Subreddit', 'Flair', 'Post ID', 'Url', 'Created Time', "Text"]
train_datapipe = train_datapipe.remove_columns(_unused_columns)
val_datapipe = val_datapipe.remove_columns(_unused_columns)


In [12]:
# Display the training split's summary (features and row count) as the cell output.
train_datapipe

Dataset({
    features: ['label', '__index_level_0__', 'input_ids', 'attention_mask'],
    num_rows: 16002
})

In [13]:
# Batches of 8 examples; `data_collator` assembles (and pads) each batch.
train_dataloader = DataLoader(
    train_datapipe,
    batch_size=8,
    # Reshuffle the training examples every epoch so batches do not follow the
    # order in which the dataset was stored.
    shuffle=True,
    collate_fn=data_collator,
)
# Evaluation keeps the stored order: shuffling would not change the metrics.
eval_dataloader = DataLoader(
    val_datapipe, batch_size=8, collate_fn=data_collator
)

In [14]:
# for batch in train_dataloader:
#     break
# {k: v.shape for k, v in batch.items()}

In [15]:
import time

# --- Training configuration -------------------------------------------------
EPOCHS = 1  # changed for model improvement
DROPOUT = 0.5  # changed for model improvement
LEARNING_RATE = 1e-5
BATCH_SIZE = 128
EMBEDDING_TYPE = 'built-in'
# USE_GPU = torch.cuda.is_available()

# --- Run bookkeeping ---------------------------------------------------------
# Whole-second Unix time at which this cell ran, stored as a string.
timestamp = f"{int(time.time())}"
best_dev_acc = 0.0

In [18]:
# Two-class sequence classifier initialised from the pretrained
# "xlm-roberta-base" checkpoint.
# from transformers import XLMRobertaConfig
# configuration = XLMRobertaConfig("xlm-roberta-base")
num_classes = 2

model = XLMRobertaForSequenceClassification.from_pretrained(
    "xlm-roberta-base",
    num_labels=num_classes,
    from_tf=False,
)

# model.to(DEVICE);

Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaForSequenceClassification: ['lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'lm_head.bias', 'roberta.pooler.dense.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.out_p

In [19]:
try:
  import lightning.pytorch as pl
except:
  !pip install tensorboardX lightning

In [20]:
import lightning.pytorch as pl
import torch.nn.functional as F
from torch.optim import AdamW


class LitModel(pl.LightningModule):
    """Lightning module that fine-tunes a sequence-classification model.

    The wrapped ``model`` is called as ``model(input_ids, attention_mask=...)``
    and must return an object exposing ``.logits``. The loss is cross-entropy
    between those logits and the batch's ``"labels"``.

    Args:
        model: The classifier to train; it is put in training mode on
            construction.
    """

    def __init__(self, model):
        super().__init__()
        self.model = model
        self.model.train()
        self.criteria = nn.CrossEntropyLoss()

    def _logits_and_labels(self, batch):
        """Run the model on one collated batch and return ``(logits, labels)``."""
        input_ids = batch["input_ids"].to(self.device)
        labels = batch["labels"].to(self.device)
        # Forward the attention mask: without it the model attends to the
        # padding tokens the collator appended to shorter sequences.
        attention_mask = batch.get("attention_mask")
        if attention_mask is not None:
            attention_mask = attention_mask.to(self.device)
        logits = self.model(input_ids, attention_mask=attention_mask).logits
        return logits, labels

    def _loss_and_accuracy(self, batch):
        """Return ``(loss, accuracy, batch_size)`` for one batch."""
        logits, labels = self._logits_and_labels(batch)
        loss = self.criteria(logits, labels)
        preds = torch.argmax(logits, dim=1)
        acc = torch.sum(preds == labels).item() / len(labels)
        return loss, acc, len(labels)

    def training_step(self, batch, batch_idx):
        """Compute the training loss for ``batch`` and log loss/accuracy."""
        loss, acc, batch_size = self._loss_and_accuracy(batch)

        # `self.log_dict` routes to whichever logger the Trainer has (or none)
        # instead of assuming `self.logger` exists.
        self.log_dict({"train_loss": loss, "train_acc": acc}, batch_size=batch_size)

        return loss

    def validation_step(self, batch, batch_idx):
        """Compute validation loss/accuracy for ``batch``, log and return them."""
        loss, acc, batch_size = self._loss_and_accuracy(batch)

        # Logged through Lightning so the values are aggregated over the
        # validation run rather than written once per batch at the same step.
        self.log_dict({"val_loss": loss, "val_acc": acc}, batch_size=batch_size)

        return {"val_loss": loss.item(), "val_acc": acc}

    def configure_optimizers(self):
        """Optimise all model parameters with AdamW at ``LEARNING_RATE``."""
        return AdamW(self.model.parameters(), lr=LEARNING_RATE)

    def on_train_end(self):
        """Print the accuracy over the notebook-level ``eval_dataloader``.

        Leaves the wrapped model in eval mode.
        """
        self.model.eval()
        correct = 0
        total = 0
        with torch.no_grad():
            for batch in eval_dataloader:
                # The model returns an output object, so the argmax has to be
                # taken over its `.logits`, not over the object itself.
                logits, labels = self._logits_and_labels(batch)
                preds = torch.argmax(logits, dim=1)
                correct += torch.sum(preds == labels).item()
                total += len(preds)
        # Guard against an empty evaluation loader.
        val_acc = correct / total if total else float("nan")

        print(f"Final validation accuracy: {val_acc}")


In [21]:
from lightning.pytorch import Trainer
from lightning.pytorch.loggers import TensorBoardLogger

# TensorBoard logger rooted at `tb_logdir`, with runs grouped under "classifier_model".
tb_logdir = "logs-ww-strict"
logger = TensorBoardLogger(tb_logdir, name="classifier_model")

# Train for EPOCHS epochs on a GPU accelerator; the device count is picked automatically.
trainer = Trainer(
    logger=logger,
    max_epochs=EPOCHS,
    accelerator='gpu',
    devices='auto',
)

GPU available: True (mps), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [None]:
# Wrap the classifier and train it, validating on `eval_dataloader`.
lit_model = LitModel(model)
trainer.fit(
    lit_model,
    train_dataloaders=train_dataloader,
    val_dataloaders=eval_dataloader,
)

Missing logger folder: logs-ww-strict/classifier_model

  | Name     | Type                                | Params
-----------------------------------------------------------------
0 | model    | XLMRobertaForSequenceClassification | 278 M 
1 | criteria | CrossEntropyLoss                    | 0     
-----------------------------------------------------------------
278 M     Trainable params
0         Non-trainable params
278 M     Total params
1,112.181 Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

  rank_zero_warn(
You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


                                                                                

  rank_zero_warn(


Epoch 0:   0%|                                         | 0/2001 [00:00<?, ?it/s]

In [None]:
# Persist only the model's state dict (not the whole module) to a relative path.
torch.save(model.state_dict(), "roberta_model.pt")

In [None]:
# Marker printed once the preceding cells have finished running.
print("Done")

In [None]:
# Display the model's repr as the cell output.
model

In [None]:
def on_train_end_manual(model, dataloader=None, device=None, verbose=False):
    """Compute, print and return the classification accuracy of ``model``.

    Args:
        model: Module called as ``model(input_ids, attention_mask=...)`` whose
            result exposes ``.logits``. It is switched to eval mode and must
            already live on ``device``.
        dataloader: Iterable of mapping-like batches with ``"input_ids"`` and
            ``"labels"`` (and optionally ``"attention_mask"``). Defaults to the
            notebook-level ``eval_dataloader``.
        device: Device the batches are moved to. Defaults to the device of the
            model's first parameter (CPU when the model has no parameters), so
            no global ``DEVICE`` is required.
        verbose: When true, print the running accuracy and example count after
            every batch.

    Returns:
        The accuracy as a float, or NaN when the dataloader yields no examples.
    """
    if dataloader is None:
        dataloader = eval_dataloader
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device("cpu")

    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch["input_ids"].to(device)
            labels = batch["labels"].to(device)
            # Forward the mask so padded positions are ignored by the model.
            attention_mask = batch.get("attention_mask")
            if attention_mask is not None:
                attention_mask = attention_mask.to(device)
            logits = model(input_ids, attention_mask=attention_mask).logits
            preds = torch.argmax(logits, dim=1)
            correct += torch.sum(preds == labels).item()
            total += len(preds)
            if verbose:
                print(correct/total,total)
    # An empty loader would otherwise divide by zero.
    val_acc = correct / total if total else float("nan")

    print(f"Final validation accuracy: {val_acc}")
    return val_acc

In [None]:
# `DEVICE` is not assigned by any earlier active line (its only definition is
# commented out), so define it here before it is used.
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
on_train_end_manual(model.to(DEVICE))

In [None]:
from transformers import pipeline
from transformers import AutoTokenizer, XLMRobertaForSequenceClassification,DataCollatorWithPadding,XLMRobertaTokenizerFast
import torch

In [None]:
# Re-create the tokenizer and padding collator for the "xlm-roberta-base" checkpoint.
tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
# Fresh two-label classifier from the pretrained checkpoint.
# NOTE(review): force_download=True presumably re-fetches the checkpoint on every run
# instead of reusing the local cache — confirm this is still needed.
base_model = XLMRobertaForSequenceClassification.from_pretrained("xlm-roberta-base",num_labels=2, force_download=True, from_tf=False)

In [None]:
# Restore the saved weights. map_location="cpu" lets a checkpoint written on an
# accelerator load on a machine that does not have one; load_state_dict then
# copies the values into the model's existing parameters.
model.load_state_dict(torch.load('roberta_model.pt', map_location="cpu"))
model.eval()

In [None]:
class ClassPredictor:
  """Callable that scores text with a sequence-classification model.

  Args:
    model: Module called as ``model(input_ids=..., attention_mask=...)`` whose
      result exposes ``.logits`` with one column per class.
    tokenizer: Callable as ``tokenizer(texts, padding=True, truncation=True,
      return_tensors="pt")`` returning ``"input_ids"`` and ``"attention_mask"``.
    model_state: Path (or file-like object) of a saved state dict to load
      into ``model``.
  """

  def __init__(self,model,tokenizer,model_state):
    self.model = model
    self.tokenizer = tokenizer
    # map_location keeps loading independent of the device the weights were
    # saved from; load_state_dict copies them into the model's own parameters.
    self.model.load_state_dict(torch.load(model_state, map_location="cpu"))
    self.model.eval()

  def __call__(self, text, prob=True):
    """Classify one string or a sequence of strings.

    Args:
      text: A single string, or an iterable of strings.
      prob: If true, return the softmax probability of class index 1;
        otherwise return whether class index 1 is the argmax.

    Returns:
      A float/bool for a single string, or a list of floats/bools (one per
      input, empty for empty input) for an iterable of strings.
    """
    unpack = isinstance(text, str)
    texts = [text] if unpack else list(text)
    if not texts:
      return []

    # Let the tokenizer pad (with its own pad token), truncate over-long inputs
    # and build the attention mask, instead of padding by hand with id 0.
    encoded = self.tokenizer(texts, padding=True, truncation=True, return_tensors="pt")

    # Run on whichever device the model currently lives on.
    first_param = next(self.model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    with torch.no_grad():
      logits = self.model(
          input_ids=encoded["input_ids"].to(device),
          attention_mask=encoded["attention_mask"].to(device),
      ).logits

    if prob:
      probs = torch.softmax(logits, dim=1)[:, 1].tolist()
      return probs[0] if unpack else probs

    # argmax per row: without `dim` it returns an index into the flattened
    # logits, which is meaningless for more than one input.
    labels = [bool(index) for index in logits.argmax(dim=1).tolist()]
    return labels[0] if unpack else labels

# Build the predictor from the freshly loaded base model, the tokenizer, and the saved weights file.
cp = ClassPredictor(base_model,tokenizer,'roberta_model.pt')

In [None]:
# Example call with a single string and the default prob=True.
cp("I am so sick of life, why are we still here? what is the meaning of it all?")

In [None]:
# Second example call; the triple-quoted string spans two lines, so it contains a newline.
cp("""Hey guys, did you catch the cannucks' game last night? It was wild. It came down to the wire and
   shall go down in history as the greatest game.""")