# Imports and global variables

In [1]:
import os
import pandas as pd
import numpy as np
import torch
import pytorch_lightning as pl
from transformers import RobertaTokenizerFast, RobertaForSequenceClassification
import json
from transformers import AdamW
from transformers import get_linear_schedule_with_warmup
from pytorch_lightning.metrics import functional as FM
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.callbacks.early_stopping import EarlyStopping

In [2]:
DATA_FOLDER = os.path.join(os.getcwd(), "Data")
MODELS_FOLDER = os.path.join(os.getcwd(), "Models")
TRAIN_FILE_NAME = "corefx_cleaned_train.csv"
VAL_FILE_NAME = "corefx_cleaned_val.csv"
TRAIN_FILE_PATH = os.path.join(DATA_FOLDER, TRAIN_FILE_NAME)
VAL_FILE_PATH = os.path.join(DATA_FOLDER, VAL_FILE_NAME)

In [3]:
# we get the length of the train set to use in the scheduler
train_df = pd.read_csv(os.path.join(DATA_FOLDER, "corefx_cleaned_train.csv"))
TRAIN_LEN = len(train_df)
del train_df

In [4]:
BATCH_SIZE = 32
MAX_LEN = 128
DROPOUT = 0.3
SEED = 2020
EPOCHS = 50

In [5]:
def set_seed(seed):
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    np.random.seed(seed)

In [6]:
set_seed(SEED)

# Define Dataset class

In [7]:
class CorefxDataset(torch.utils.data.Dataset):

    def __init__(self, csv_path: str, tokenizer, max_len: int = 128):
        super().__init__()
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.df = pd.read_csv(csv_path)

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        encoded = self.tokenizer(self.df["Text"].iloc[idx], padding='max_length', truncation=True, max_length=self.max_len)
        label = self.df["Label"].iloc[idx]
        return ({
            'input_ids' : torch.tensor(encoded['input_ids'], dtype=torch.long),
            'attention_mask' : torch.tensor(encoded['attention_mask'], dtype=torch.long)},
            torch.tensor(label, dtype=torch.long)
            )


# Define DataModule class (specific to pytorch lightning)

In [8]:
class CorefxIssuesDataModule(pl.LightningDataModule):
    
    def __init__(self, data_path: str, tokenizer, max_len: int = 128, batch_size: int = 32):
        super().__init__()
        self.data_path = data_path
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.batch_size = batch_size

    def train_dataloader(self):
        train_split = CorefxDataset(os.path.join(self.data_path, "corefx_cleaned_train.csv"), self.tokenizer, self.max_len)
        return torch.utils.data.DataLoader(train_split, batch_size=self.batch_size, shuffle=True, num_workers=0)

    def val_dataloader(self):
        val_split = CorefxDataset(os.path.join(self.data_path, "corefx_cleaned_val.csv"), self.tokenizer, self.max_len)
        return torch.utils.data.DataLoader(val_split, batch_size=self.batch_size, shuffle=True, num_workers=0, )

# Define the model class

In [9]:
class CorefxRobertaModel(pl.LightningModule):

    def __init__(self, num_labels, dropout):
        super().__init__()
        self.lr = 3e-5
        self.hparams.num_labels = num_labels
        self.hparams.dropout = dropout
        self.save_hyperparameters()
        self.roberta_model = RobertaForSequenceClassification.from_pretrained("roberta-base", num_labels=num_labels, return_dict=True, hidden_dropout_prob=dropout)
        

    def forward(self, input_ids, attention_mask):
        return self.roberta_model(input_ids = input_ids, attention_mask = attention_mask)


    def configure_optimizers(self):
        # All this is taken from Huggingface in how they use this for BERT
        # Since Roberta is basically BERT, I keep the same
        param_optimizer = list(self.named_parameters())
        no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
        optimizer_parameters = [
            {
                "params": [
                    p for n, p in param_optimizer if not any(nd in n for nd in no_decay)
                ],
                "weight_decay": 0.001,
            },
            {
                "params": [
                    p for n, p in param_optimizer if any(nd in n for nd in no_decay)
                ],
                "weight_decay": 0.0,
            },
        ]

        num_train_steps = int(TRAIN_LEN / BATCH_SIZE * EPOCHS)
        optimizer = AdamW(optimizer_parameters, lr=self.lr)
        scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=num_train_steps)

        return [optimizer], [scheduler]


    def training_step(self, batch, batch_idx):
        self.roberta_model.train()
        x, y = batch
        outputs = self(x["input_ids"], x["attention_mask"])
        loss = torch.nn.CrossEntropyLoss()(outputs.logits, y)
        acc = FM.accuracy(outputs.logits, y, num_classes=self.hparams.num_labels)
        result = pl.TrainResult(loss)
        result.log_dict({'train_acc': acc, 'train_loss': loss}, prog_bar=True)
        return result


    def validation_step(self, batch, batch_idx):
        self.roberta_model.eval()
        x, y = batch
        outputs = self(x["input_ids"], x["attention_mask"])
        val_loss = torch.nn.CrossEntropyLoss()(outputs.logits, y)
        acc = FM.accuracy(outputs.logits, y, num_classes=self.hparams.num_labels)
        result = pl.EvalResult(checkpoint_on=val_loss, early_stop_on=val_loss)
        result.log_dict({'val_acc': acc, 'val_loss': val_loss}, prog_bar=True)
        return result

# Training

In [10]:
roberta_tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base")

In [11]:
corefx_dm = CorefxIssuesDataModule(DATA_FOLDER, roberta_tokenizer, MAX_LEN, BATCH_SIZE)

In [12]:
with open(os.path.join(DATA_FOLDER, 'lookup.json')) as json_file: 
    lookup = json.load(json_file) 

In [13]:
roberta = CorefxRobertaModel(num_labels=len(lookup.keys()) // 2, dropout=DROPOUT)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: [&#39;lm_head.bias&#39;, &#39;lm_head.dense.weight&#39;, &#39;lm_head.dense.bias&#39;, &#39;lm_head.layer_norm.weight&#39;, &#39;lm_head.layer_norm.bias&#39;, &#39;lm_head.decoder.weight&#39;, &#39;roberta.pooler.dense.weight&#39;, &#39;roberta.pooler.dense.bias&#39;]
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base a

In [14]:
early_stop_callback = pl.callbacks.early_stopping.EarlyStopping(
   monitor='val_loss',
   min_delta=0.00,
   patience=20,
   verbose=True,
   mode='min')

In [15]:
checkpoint_callback = ModelCheckpoint(
    filepath=os.path.join(MODELS_FOLDER, "pl", "best.ckpt"),
    verbose=True,
    monitor='val_loss',
    mode='min')

In [16]:
trainer = pl.Trainer(gpus=1, early_stop_callback=early_stop_callback, checkpoint_callback=checkpoint_callback, max_epochs=EPOCHS)
trainer.fit(roberta, corefx_dm)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
CUDA_VISIBLE_DEVICES: [0]

  | Name          | Type                             | Params
-------------------------------------------------------------------
0 | roberta_model | RobertaForSequenceClassification | 124 M 
Epoch 0:  79%|███████▉  | 41/52 [00:13&lt;00:03,  2.97it/s, loss=2.491, v_num=0]
Validating: 0it [00:00, ?it/s][A
Epoch 0:  81%|████████  | 42/52 [00:13&lt;00:03,  3.01it/s, loss=2.491, v_num=0]
Epoch 0:  83%|████████▎ | 43/52 [00:14&lt;00:02,  3.06it/s, loss=2.491, v_num=0]
Epoch 0:  85%|████████▍ | 44/52 [00:14&lt;00:02,  3.10it/s, loss=2.491, v_num=0]
Epoch 0:  87%|████████▋ | 45/52 [00:14&lt;00:02,  3.14it/s, loss=2.491, v_num=0]
Epoch 0:  88%|████████▊ | 46/52 [00:14&lt;00:01,  3.18it/s, loss=2.491, v_num=0]
Epoch 0:  90%|█████████ | 47/52 [00:14&lt;00:01,  3.22it/s, loss=2.491, v_num=0]
Epoch 0:  92%|█████████▏| 48/52 [00:14&lt;00:01,  3.26it/s, loss=2.491, v_num=0]
Epoch 0:  94%|█████████▍| 

1

In [17]:
print(checkpoint_callback.best_model_score)

tensor(0.8870, device=&#39;cuda:0&#39;)


In [18]:
print(checkpoint_callback.best_model_path)

d:\work\RobertaGithubIssuesClassification\Models\pl\_ckpt_epoch_18.ckpt


# Test on new data

In [19]:
roberta = CorefxRobertaModel.load_from_checkpoint(checkpoint_callback.best_model_path)
roberta.eval()

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: [&#39;lm_head.bias&#39;, &#39;lm_head.dense.weight&#39;, &#39;lm_head.dense.bias&#39;, &#39;lm_head.layer_norm.weight&#39;, &#39;lm_head.layer_norm.bias&#39;, &#39;lm_head.decoder.weight&#39;, &#39;roberta.pooler.dense.weight&#39;, &#39;roberta.pooler.dense.bias&#39;]
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base a

CorefxRobertaModel(
  (roberta_model): RobertaForSequenceClassification(
    (roberta): RobertaModel(
      (embeddings): RobertaEmbeddings(
        (word_embeddings): Embedding(50265, 768, padding_idx=1)
        (position_embeddings): Embedding(514, 768, padding_idx=1)
        (token_type_embeddings): Embedding(1, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (dropout): Dropout(p=0.3, inplace=False)
      )
      (encoder): RobertaEncoder(
        (layer): ModuleList(
          (0): RobertaLayer(
            (attention): RobertaAttention(
              (self): RobertaSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): RobertaSelfOutput(
                (den

In [20]:
issue = "AppDomain.SetPrincipalPolicy(PrincipalPolicy.WindowsPrincipal) works only once. Setting the PrincipalPolicy on the current AppDomain to WindowsPrincipal works only for the first thread being started. Any subsequent thread has Thread.CurrentPrincipal evaluated to NULL."
label = "area-System.Security"

In [21]:
encoded_pt = roberta_tokenizer(issue, padding='max_length', truncation=True, max_length=MAX_LEN, return_tensors='pt', add_special_tokens=True, return_token_type_ids=False)

In [22]:
pt_result = roberta(encoded_pt["input_ids"], encoded_pt["attention_mask"])
print(pt_result)

SequenceClassifierOutput(loss=None, logits=tensor([[ 1.1868, -1.4769, -0.4210,  3.5745,  1.4349,  0.5693,  0.1789, -0.2447,
         -0.5479, -0.5767, -2.3285, -0.5988, -1.0072, -1.5970, -0.9118,  4.4835,
         -0.5231, -1.3572, -0.5189, -0.6711, -0.9153, -0.4658]],
       grad_fn=&lt;AddmmBackward&gt;), hidden_states=None, attentions=None)


In [23]:
index = np.argmax(pt_result[0].detach().numpy())
print(f"index : {index}, category : {lookup[str(index)]}")
# not what was expected ....

index : 15, category : area-System.Data


In [24]:
softmax = torch.nn.functional.softmax(pt_result[0])

In [25]:
for i, value in enumerate(softmax[0]):
    print(f"{lookup[str(i)] : <27} : {value.item() * 100 :.2f}% confidence")

area-System.Net             : 2.31% confidence
area-Infrastructure         : 0.16% confidence
area-System.ComponentModel  : 0.46% confidence
area-System.Security        : 25.11% confidence
area-System.Runtime         : 2.96% confidence
area-System.IO              : 1.24% confidence
area-System.Xml             : 0.84% confidence
area-System.Collections     : 0.55% confidence
area-System.Threading       : 0.41% confidence
area-System.Reflection      : 0.40% confidence
area-System.Memory          : 0.07% confidence
area-System.Diagnostics     : 0.39% confidence
area-Serialization          : 0.26% confidence
area-System.Drawing         : 0.14% confidence
area-Meta                   : 0.28% confidence
area-System.Data            : 62.33% confidence
area-Microsoft.CSharp       : 0.42% confidence
area-System.Numerics        : 0.18% confidence
area-System.Text            : 0.42% confidence
area-System.Globalization   : 0.36% confidence
area-System.Linq            : 0.28% confidence
area-System

# Export to ONNX

In [26]:
torch.onnx.export(roberta,               # model being run
                  (encoded_pt["input_ids"], encoded_pt["attention_mask"]),  # model input (or a tuple for multiple inputs)
                  os.path.join(MODELS_FOLDER, "pl", "pl_roberta_github_issues.onnx"),   # where to save the model
                  export_params=True,        # store the trained parameter weights inside the model file
                  opset_version=12,          # the ONNX version to export the model to
                  do_constant_folding=True,  # whether to execute constant folding for optimization
                  input_names = ['input_ids', 'attention_mask'],   # the model's input names
                  output_names = ['output'], # the model's output names
                  dynamic_axes={'input_ids' : {0 : 'batch_size'},
                                'attention_mask' : {0 : 'batch_size'},
                                'output' : {0 : 'batch_size'}}
                    )