In [1]:
import os
import pandas as pd
import numpy as np
import torch
import pytorch_lightning as pl
from transformers import RobertaTokenizerFast, RobertaForSequenceClassification
import json
from transformers import AdamW

In [2]:
# Directories, relative to the notebook's working directory.
# NOTE(review): MODELS_PATH is not referenced by any cell visible here.
DATA_FOLDER = os.path.join("./", "Data")
MODELS_PATH = os.path.join("./", "Models")

# File names of the train / validation splits and their locations in DATA_FOLDER.
TRAIN_FILE_NAME, VAL_FILE_NAME = "corefx_cleaned_train.csv", "corefx_cleaned_val.csv"
TRAIN_FILE_PATH, VAL_FILE_PATH = (
    os.path.join(DATA_FOLDER, split_file)
    for split_file in (TRAIN_FILE_NAME, VAL_FILE_NAME)
)

In [3]:
# Tokenized sequences are padded / truncated to this many tokens.
MAX_LEN = 64
# Number of samples per DataLoader batch.
BATCH_SIZE = 32

In [4]:
# Tokenizer loaded from the "roberta-base" pretrained checkpoint; it is handed
# to the data module below so every dataset encodes text the same way.
tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base")

In [5]:
class CorefxDataset(torch.utils.data.Dataset):
    """Map-style dataset that tokenizes one CSV row per access.

    The CSV file must contain a ``Text`` column (fed to the tokenizer) and a
    ``Label`` column (returned as a ``torch.long`` tensor).
    """

    def __init__(self, csv_path: str, tokenizer, max_len: int = 128):
        """Load the CSV and remember how samples should be encoded.

        Args:
            csv_path: Path of the CSV file, read with ``pandas.read_csv``.
            tokenizer: Callable invoked as ``tokenizer(text,
                padding='max_length', truncation=True, max_length=max_len)``
                whose result is indexed with ``'input_ids'`` and
                ``'attention_mask'``.
            max_len: ``max_length`` value passed to the tokenizer.
        """
        super().__init__()
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.df = pd.read_csv(csv_path)

    def __len__(self):
        """Return the number of rows in the loaded CSV."""
        return len(self.df)

    def __getitem__(self, idx):
        """Encode row ``idx``.

        Returns:
            A ``(features, label)`` tuple where ``features`` is a dict with
            ``'input_ids'`` and ``'attention_mask'`` ``torch.long`` tensors and
            ``label`` is a ``torch.long`` tensor built from the ``Label`` cell.
        """
        # Use the tokenizer injected through the constructor, not whatever
        # module-level `tokenizer` happens to exist in the notebook namespace.
        encoded = self.tokenizer(
            self.df["Text"].iloc[idx],
            padding='max_length',
            truncation=True,
            max_length=self.max_len,
        )
        label = self.df["Label"].iloc[idx]
        features = {
            'input_ids': torch.tensor(encoded['input_ids'], dtype=torch.long),
            'attention_mask': torch.tensor(encoded['attention_mask'], dtype=torch.long),
        }
        return features, torch.tensor(label, dtype=torch.long)


In [6]:
class CorefxIssuesDataModule(pl.LightningDataModule):
    """Lightning data module serving the train / validation CSV splits.

    Each loader wraps a ``CorefxDataset`` built from a CSV file located in
    ``data_path``.
    """

    def __init__(
        self,
        data_path: str,
        tokenizer,
        max_len: int = 128,
        batch_size: int = 32,
        train_file: str = "corefx_cleaned_train.csv",
        val_file: str = "corefx_cleaned_val.csv",
    ):
        """Store the configuration used to build the loaders.

        Args:
            data_path: Directory containing the CSV splits.
            tokenizer: Tokenizer forwarded to every ``CorefxDataset``.
            max_len: ``max_len`` forwarded to every ``CorefxDataset``.
            batch_size: Batch size of both loaders.
            train_file: File name of the training split inside ``data_path``.
            val_file: File name of the validation split inside ``data_path``.
        """
        super().__init__()
        self.data_path = data_path
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.batch_size = batch_size
        self.train_file = train_file
        self.val_file = val_file

    def _make_loader(self, file_name: str, shuffle: bool):
        """Build a DataLoader over the CSV ``file_name`` found in ``data_path``."""
        split = CorefxDataset(
            os.path.join(self.data_path, file_name), self.tokenizer, self.max_len
        )
        return torch.utils.data.DataLoader(
            split, batch_size=self.batch_size, shuffle=shuffle, num_workers=0
        )

    def train_dataloader(self):
        """Return the shuffled loader over the training split."""
        return self._make_loader(self.train_file, shuffle=True)

    def val_dataloader(self):
        """Return the loader over the validation split.

        Validation data is read in file order: shuffling it does not help
        evaluation and only makes per-batch results harder to compare.
        """
        return self._make_loader(self.val_file, shuffle=False)

In [40]:
class CorefxModel(pl.LightningModule):
    """LightningModule that fine-tunes a sequence-classification model.

    The wrapped ``model`` is called with ``input_ids`` and ``attention_mask``
    and must return an object exposing ``.logits``; the training objective is
    cross-entropy between those logits and the batch labels.

    NOTE: ``self.log`` (used in ``validation_step``) belongs to the PyTorch
    Lightning 1.x+ API. An install that reports
    ``'CorefxModel' object has no attribute 'log'`` is too old and has to be
    upgraded.
    """

    def __init__(self, model, lr: float = 1e-5, weight_decay: float = 0.001):
        """
        Args:
            model: The classification model to fine-tune.
            lr: Learning rate handed to AdamW.
            weight_decay: Weight decay applied to every parameter whose name
                does not match one of the no-decay patterns.
        """
        super().__init__()
        self.model = model
        self.lr = lr
        self.weight_decay = weight_decay
        # Built once instead of on every step.
        self.loss_fn = torch.nn.CrossEntropyLoss()

    def forward(self, x):
        """Run the wrapped model on a feature dict with 'input_ids' and 'attention_mask'."""
        return self.model(input_ids=x['input_ids'], attention_mask=x['attention_mask'])

    def configure_optimizers(self):
        """Create AdamW with two parameter groups (decayed / not decayed).

        Parameters whose name contains one of the ``no_decay`` substrings are
        placed in a group with ``weight_decay=0.0``; all others use
        ``self.weight_decay``.
        """
        no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
        decayed, not_decayed = [], []
        for name, param in self.model.named_parameters():
            target = not_decayed if any(nd in name for nd in no_decay) else decayed
            target.append(param)

        optimizer_parameters = [
            {"params": decayed, "weight_decay": self.weight_decay},
            {"params": not_decayed, "weight_decay": 0.0},
        ]
        return AdamW(optimizer_parameters, lr=self.lr)

    def _batch_loss(self, batch):
        """Return the cross-entropy loss of one ``(features, labels)`` batch."""
        x, y = batch
        outputs = self(x)
        return self.loss_fn(outputs.logits, y)

    def training_step(self, batch, batch_idx):
        """Return the training loss for ``batch``."""
        return self._batch_loss(batch)

    def validation_step(self, batch, batch_idx):
        """Compute the validation loss for ``batch`` and log it as 'val_loss'.

        Returning the tensor alone does not expose a metric by name; logging
        it under 'val_loss' is what lets callbacks that monitor 'val_loss'
        (e.g. early stopping) find it.
        """
        val_loss = self._batch_loss(batch)
        self.log("val_loss", val_loss, prog_bar=True)
        return val_loss

In [41]:
# Data module wired to the notebook-level data folder, tokenizer and sizes.
corefx_dm = CorefxIssuesDataModule(
    data_path=DATA_FOLDER,
    tokenizer=tokenizer,
    max_len=MAX_LEN,
    batch_size=BATCH_SIZE,
)

In [42]:
# Load lookup.json from the data folder; the number of entries in `lookup` is
# later used to size the classification head. JSON text is UTF-8, so the
# encoding is given explicitly instead of relying on the platform default.
with open(os.path.join(DATA_FOLDER, 'lookup.json'), encoding="utf-8") as json_file:
    lookup = json.load(json_file)

In [43]:
# Pretrained "roberta-base" encoder with a classification head. The head size
# is half the number of entries in `lookup` -- presumably because the lookup
# stores both label->id and id->label mappings (TODO confirm against lookup.json).
model = RobertaForSequenceClassification.from_pretrained(
    "roberta-base",
    return_dict=True,
    num_labels=len(lookup) // 2,
)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.out

In [44]:
# Wrap the Hugging Face model in the LightningModule that defines the training loop.
roberta = CorefxModel(model=model)

In [45]:
# Early stopping on the 'val_loss' metric: lower is better (mode='min'), any
# decrease counts as an improvement (min_delta=0), patience is 20.
early_stop_callback = pl.callbacks.early_stopping.EarlyStopping(
    mode='min',
    monitor='val_loss',
    patience=20,
    min_delta=0.00,
    verbose=False,
)

In [46]:
# Early stopping is registered through `callbacks=`; the dedicated
# `early_stop_callback=` Trainer argument only exists in legacy PyTorch
# Lightning releases. One GPU is used when CUDA is available, otherwise the
# Trainer falls back to its default device instead of failing.
# NOTE(review): `gpus=` is itself a PyTorch Lightning 1.x argument; newer
# releases expect `accelerator=` / `devices=` -- adjust to the installed version.
trainer = pl.Trainer(
    gpus=1 if torch.cuda.is_available() else None,
    callbacks=[early_stop_callback],
)
trainer.fit(roberta, corefx_dm)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
CUDA_VISIBLE_DEVICES: [0]

  | Name  | Type                             | Params
-----------------------------------------------------------
0 | model | RobertaForSequenceClassification | 125 M 
Validation sanity check: 0it [00:00, ?it/s]

ModuleAttributeError: 'CorefxModel' object has no attribute 'log'