In [3]:
from transformers import AutoTokenizer, RobertaModel
import torch
from torch import nn , optim 
from torch.utils.data import Dataset, DataLoader
from torch.nn import functional as F
import pandas as pd
import lightning as L
from pytorch_lightning.loggers import WandbLogger 
from optuna.integration import PyTorchLightningPruningCallback


In [4]:
class LSTM_RoBERTA(nn.Module):
    def __init__(self,roberta_path = "roberta-base",lstm_hidden_size=256,lstm_layers=2,lstm_dropout=0.2 , num_classes=2):
        super().__init__()
        self.roberta = RobertaModel.from_pretrained(roberta_path)
        self.lstm = nn.LSTM(self.roberta.config.hidden_size,lstm_hidden_size,lstm_layers,batch_first=True,dropout=lstm_dropout,bidirectional=True)
        self.linear = nn.Linear(lstm_hidden_size*2,num_classes)
        self.dropout = nn.Dropout(0.2)
        for param in self.roberta.parameters():
            param.requires_grad = False
        for params in self.roberta.pooler.parameters():
            params.requires_grad = True
        print("Model Loaded")
        print("Total_trainable_params : ",sum(p.numel() for p in self.parameters() if p.requires_grad))
        print("Total_untrainable_params : ",sum(p.numel() for p in self.parameters() if not p.requires_grad))
    def forward(self, input_ids, attention_mask):
        roberta_output = self.roberta(input_ids, attention_mask)
        roberta_output = roberta_output.last_hidden_state
        lstm_output, (h_n,c_n) = self.lstm(roberta_output)
        logits = self.linear(lstm_output[:, -1, :])
        # print(logits.shape,"logits")
        return logits

In [5]:
class DatasetLM(torch.utils.data.Dataset):
    def __init__(self, dataset, tokeniser):
        self.dataset = dataset
        self.tokeniser = tokeniser
        
    def __len__(self):
        return len(self.dataset)
    
    def __getitem__(self, index):
        # print(self.tokeniser(self.dataset[index]['text'], return_tensors="pt")['input_ids'])
        return (self.tokeniser(self.dataset[index]['text'], return_tensors="pt", truncation=True, max_length=512), self.dataset[index]['label'])



In [6]:
train_data_path = "/scratch/jainit/SubtaskA/subtaskA_train_monolingual.jsonl"
test_data_path = "/scratch/jainit/SubtaskA/subtaskA_dev_monolingual.jsonl"
df_train = pd.read_json(train_data_path, lines=True)
df_test = pd.read_json(test_data_path, lines=True)

In [8]:
train_data = df_train.to_dict('records')
test_data = df_test.to_dict('records')

In [10]:
tokenizer = AutoTokenizer.from_pretrained("roberta-base" , truncation=True, max_length=512 )

train_dataset = DatasetLM(train_data, tokenizer)
test_dataset = DatasetLM(test_data, tokenizer)

In [11]:
def collate_fn(batch):
    input_ids = [x[0]['input_ids'].squeeze(0) for x in batch]
    # print(input_ids)
    attention_mask = [x[0]['attention_mask'].squeeze(0) for x in batch]
    labels = [x[1] for x in batch]
    # print(batch)
    # input_ids = torch.stack(input_ids)
    # attention_mask = torch.stack(attention_mask)
    # print(input_ids.shape)
    # print([x.shape for x in input_ids])
    inputs_ids = torch.nn.utils.rnn.pad_sequence(input_ids, batch_first=True, padding_value=1)
    attention_mask = torch.nn.utils.rnn.pad_sequence(attention_mask, batch_first=True, padding_value=0)
    # print(inputs_ids.shape)
    x = {'input_ids': inputs_ids, 'attention_mask': attention_mask}
    y = torch.Tensor(labels)
    return (x, y)

In [12]:
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True, collate_fn=collate_fn, num_workers = 4)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False, collate_fn=collate_fn, num_workers = 4)

In [13]:
from lightning.pytorch.utilities.types import TRAIN_DATALOADERS


class LitModel(L.LightningModule):
    def __init__(self,train_loader , val_loader , model_path, lr=2e-5, hidden_size=256, lstm_layers=2, lstm_dropout=0.2, num_classes=2):
        super().__init__()
        self.model_path = model_path
        self.lr = lr
        self.hidden_size = hidden_size
        self.lstm_layers = lstm_layers
        self.lstm_dropout = lstm_dropout
        self.num_classes = num_classes
        self.model = LSTM_RoBERTA(model_path, hidden_size, lstm_layers, lstm_dropout, num_classes)
        self.train_loader = train_loader
        self.val_loader = val_loader
        # self.save_hyperparameters()
    
    def forward(self, input_ids, attention_mask):
        # print(input_ids.shape , attention_mask.shape , "Here are the shapes")
        out = self.model(input_ids, attention_mask)
        # print(out.shape , "Here is the output$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$")
        return out
    
    def training_step(self, batch, batch_idx):
        x, y = batch
        # print(x, y )
        # print( y.shape, "Here are wasd############" )
        logits = self.forward(x['input_ids'], x['attention_mask'])
        # print(logits.shape , "Here are the logits")
        loss = F.cross_entropy(logits, y.long())
        # print(logits.shape , y.shape)
        
        self.log('train_loss', loss, on_step=True, on_epoch=True, prog_bar=True, logger=True)
        return loss
        # print("##############")
        # return None
    
    def validation_step(self, batch, batch_idx):
        x, y = batch
        logits = self(x['input_ids'], x['attention_mask'])
        loss = F.cross_entropy(logits, y.long())
        self.log('val_loss', loss, on_step=True, on_epoch=True, prog_bar=True, logger=True)
        acc = (logits.argmax(dim=-1) == y).float().mean()
        self.log('val_acc', acc, on_step=False, on_epoch=True, prog_bar=True, logger=True)
        
        return loss
    
    def train_dataloader(self):
        return self.train_loader
    
    def val_dataloader(self):
        return self.val_loader
    
    def configure_optimizers(self):
        return optim.Adam(self.parameters(), lr=self.lr , weight_decay=0.01 , eps=1e-6)
        

In [21]:
def objective(trial):
    # Hyperparameters
    lr = trial.suggest_float("lr", 1e-5, 1e-3, log=True)
    hidden_size = trial.suggest_categorical("hidden_size", [256, 512, 848])
    lstm_layers = trial.suggest_categorical("lstm_layers", [1, 2, 3])
    lstm_dropout = trial.suggest_float("lstm_dropout", 0.1, 0.5)
    num_classes = 2
    model_path = "roberta-base"
    # model_path = "roberta-base"
    model = LitModel(train_loader , test_loader , model_path, lr, hidden_size, lstm_layers, lstm_dropout, num_classes)
    wandb_logger = WandbLogger(project="SubtaskA", log_model=True)
    callback = PyTorchLightningPruningCallback(trial, monitor="val_acc")
    trainer = L.Trainer(max_epochs=5, logger=wandb_logger, callbacks=[callback])
    hyperparameters = dict(lr=lr, hidden_size=hidden_size, lstm_layers=lstm_layers, lstm_dropout=lstm_dropout)
    trainer.logger.log_hyperparams(hyperparameters)
    trainer.fit(model)
    callback.check_pruned()

    return trainer.callback_metrics["val_acc"].item()


In [22]:

pruner: optuna.pruners.BasePruner =  (
        optuna.pruners.MedianPruner() 
    )


In [23]:
storage = "sqlite:///example.db"
study = optuna.create_study(
    study_name="pl_ddp",
    storage=storage,
    direction="maximize",
    pruner=pruner,
    load_if_exists=True,
)
study.optimize(objective, n_trials=100, timeout=600)

print("Number of finished trials: {}".format(len(study.trials)))

[I 2024-01-10 11:39:19,153] Using an existing study with name 'pl_ddp' instead of creating a new one.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model Loaded
Total_trainable_params :  2692866
Total_untrainable_params :  124055040


Trainer will use only 1 of 4 GPUs because it is running inside an interactive / notebook environment. You may try to set `Trainer(devices=4)` but please note that multi-GPU inside interactive / notebook environments is considered experimental and unstable. Your mileage may vary.
/home2/jainit/miniconda3/envs/torchy/lib/python3.11/site-packages/lightning/fabric/plugins/environments/slurm.py:191: The `srun` command is available on your system but is not used. HINT: If your intention is to run Lightning on SLURM, prepend your python command with `srun` like so: srun python /home2/jainit/miniconda3/envs/torchy/lib/python3.11/ ...
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mjain_it[0m. Use [1m`wa

/home2/jainit/miniconda3/envs/torchy/lib/python3.11/site-packages/lightning/fabric/plugins/environments/slurm.py:191: The `srun` command is available on your system but is not used. HINT: If your intention is to run Lightning on SLURM, prepend your python command with `srun` like so: srun python /home2/jainit/miniconda3/envs/torchy/lib/python3.11/ ...
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3]

  | Name  | Type         | Params
---------------------------------------
0 | model | LSTM_RoBERTA | 126 M 
---------------------------------------
2.7 M     Trainable params
124 M     Non-trainable params
126 M     Total params
506.992   Total estimated model params size (MB)


Epoch 0:   0%|          | 8/7485 [00:07<1:55:49,  1.08it/s, v_num=cslw, train_loss_step=0.671]

/home2/jainit/miniconda3/envs/torchy/lib/python3.11/site-packages/lightning/pytorch/trainer/call.py:54: Detected KeyboardInterrupt, attempting graceful shutdown...
[W 2024-01-10 11:39:45,040] Trial 1 failed with parameters: {'lr': 0.00017306470503173244, 'hidden_size': 256, 'lstm_layers': 1, 'lstm_dropout': 0.14641052916191286} because of the following error: KeyError('val_acc').
Traceback (most recent call last):
  File "/home2/jainit/miniconda3/envs/torchy/lib/python3.11/site-packages/optuna/study/_optimize.py", line 200, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "/tmp/ipykernel_39413/47989888.py", line 19, in objective
    return trainer.callback_metrics["val_acc"].item()
           ~~~~~~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^
KeyError: 'val_acc'
[W 2024-01-10 11:39:45,043] Trial 1 failed with value None.


KeyError: 'val_acc'