# Imports and global variables

In [1]:
import tensorflow as tf
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping, ModelCheckpoint, TensorBoard, LearningRateScheduler
from tensorboard.plugins.hparams import api as hp
from transformers import RobertaTokenizerFast, TFRobertaModel, TFRobertaForSequenceClassification, RobertaForSequenceClassification
import numpy as np
import pandas as pd
import os
import datetime
import torch
import json

In [2]:
# Working directories for input data, saved models and TensorBoard logs,
# each resolved against the current working directory.
DATA_FOLDER, MODELS_FOLDER, TF_LOGS_FOLDER = (
    os.path.join(os.getcwd(), folder_name)
    for folder_name in ("Data", "Models", "tf_logs")
)

In [3]:
# Load the training and validation splits from the data folder.
train_df, val_df = (
    pd.read_csv(os.path.join(DATA_FOLDER, csv_name))
    for csv_name in ("corefx_cleaned_train.csv", "corefx_cleaned_val.csv")
)

In [4]:
# Keep only the row counts of both splits, then release the DataFrames.
TRAIN_LEN, VAL_LEN = len(train_df), len(val_df)
del train_df, val_df

In [5]:
BATCH_SIZE = 32  # examples per batch; also used to derive the steps per epoch
MAX_LEN = 128  # the tokenizer pads/truncates every input to this many tokens
DROPOUT = 0.3  # passed as hidden_dropout_prob when the model is created
SEED = 2020  # seed for the dataset shuffles

# Create Tensorflow datasets

In [6]:
# Stream the CSV files as (string, int32) records, skipping the header row.
train_dataset = tf.data.experimental.CsvDataset(
    os.path.join(DATA_FOLDER, "corefx_cleaned_train.csv"),
    record_defaults=[tf.string, tf.int32],
    header=True,
)
val_dataset = tf.data.experimental.CsvDataset(
    os.path.join(DATA_FOLDER, "corefx_cleaned_val.csv"),
    record_defaults=[tf.string, tf.int32],
    header=True,
)

In [7]:
# Fast tokenizer loaded from the "roberta-base" checkpoint (same name as the model below).
tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base")

## Define the transformations to apply to the datasets

In [8]:
def tf_py_function(sentences):
    """Tokenize one batch of sentences eagerly.

    Args:
        sentences: Eager tensor of byte strings (one entry per sentence).

    Returns:
        Tuple ``(input_ids, attention_mask)`` produced by the tokenizer, with
        every sequence padded or truncated to ``MAX_LEN`` tokens.
    """
    texts = [raw_sentence.decode() for raw_sentence in sentences.numpy()]
    batch_encoding = tokenizer(
        texts,
        padding='max_length',
        truncation=True,
        max_length=MAX_LEN,
        return_tensors='tf',
    )
    return (batch_encoding["input_ids"], batch_encoding["attention_mask"])

In [9]:
def encode_data(batch_x, batch_y):
    """Tokenize a batch of raw sentences inside a tf.data pipeline.

    The tokenizer doesn't operate on tensors, so it has to be called through
    ``tf.py_function``. The static shapes are restored afterwards by setting
    them explicitly.

    Args:
        batch_x: Batch of sentences (string tensor).
        batch_y: Batch of labels, passed through unchanged.

    Returns:
        Tuple ``(input_ids, attention_mask, batch_y)``.
    """
    token_tensors = tf.py_function(tf_py_function, [batch_x], (tf.int32, tf.int32))
    for token_tensor in token_tensors:
        token_tensor.set_shape([None, MAX_LEN])
    input_ids, attention_mask = token_tensors
    return (input_ids, attention_mask, batch_y)

In [10]:
def map_to_dict(input_ids, attention_mask, labels):
    """Pack the token tensors into a feature dict and pair it with the labels.

    Returns:
        Tuple ``(features, labels)`` where ``features`` has the keys
        ``"input_ids"`` and ``"attention_mask"``.
    """
    features = dict(input_ids=input_ids, attention_mask=attention_mask)
    return features, labels

## Apply transformations and prepare datasets for training

In [11]:
def _prepare_dataset(dataset, shuffle_buffer=None):
    """Batch, tokenize and repeat a CSV dataset so it can be fed to ``fit``.

    Args:
        dataset: Dataset yielding ``(sentence, label)`` records.
        shuffle_buffer: Size of the shuffle buffer; ``None`` disables shuffling.

    Returns:
        An endlessly repeating dataset of ``(features, labels)`` batches.
    """
    if shuffle_buffer is not None:
        dataset = dataset.shuffle(shuffle_buffer, seed=SEED, reshuffle_each_iteration=True)
    return (
        dataset
        .batch(BATCH_SIZE)
        .map(encode_data, num_parallel_calls=tf.data.experimental.AUTOTUNE)
        .map(map_to_dict, num_parallel_calls=tf.data.experimental.AUTOTUNE)
        .repeat()
        .prefetch(1)
    )


train_dataset = _prepare_dataset(train_dataset, shuffle_buffer=TRAIN_LEN)
# Validation data is not shuffled: the order does not matter for evaluation,
# and shuffling it would only fill a full-size buffer and let the validation
# examples differ from one pass to the next, so val_loss values would not be
# computed on the same data.
val_dataset = _prepare_dataset(val_dataset)

# Create and train TF model

In [12]:
# Label lookup table; later cells index it with stringified class indices.
# JSON is read explicitly as UTF-8 so decoding does not depend on the
# platform's default encoding.
with open(os.path.join(DATA_FOLDER, 'lookup.json'), encoding='utf-8') as json_file:
    lookup = json.load(json_file)

In [13]:
# NOTE(review): the number of labels is half the number of lookup entries —
# presumably lookup.json stores both index->label and label->index; confirm.
github_model = TFRobertaForSequenceClassification.from_pretrained(
    "roberta-base",
    num_labels=len(lookup) // 2,
    hidden_dropout_prob=DROPOUT,
)

Some weights of the model checkpoint at roberta-base were not used when initializing TFRobertaForSequenceClassification: [&#39;lm_head&#39;]
- This IS expected if you are initializing TFRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing TFRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of TFRobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: [&#39;classifier&#39;]
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
def schedule(epoch, lr):
    """Learning-rate schedule used by the ``LearningRateScheduler`` callback.

    The rate is left untouched while ``epoch < 10`` and once ``epoch > 40``;
    for epochs 10 through 40 it is multiplied by ``exp(-0.1)``.

    Args:
        epoch: Epoch index passed in by the callback.
        lr: Current learning rate.

    Returns:
        ``lr`` unchanged outside the decay window, otherwise the decayed rate
        as a plain Python float.
    """
    if epoch < 10 or epoch > 40:
        return lr
    # Compute the factor with NumPy instead of tf.math.exp so the result is a
    # float rather than a tf.Tensor; Keras versions that validate the schedule
    # output only accept float types.
    return float(lr * np.exp(-0.1))

In [15]:
# Optimizer, loss and metric for fine-tuning; the loss is configured to
# receive raw logits from the model.
opt = tf.keras.optimizers.Adam(5e-5)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
accuracy = tf.keras.metrics.SparseCategoricalAccuracy(name="accuracy")

github_model.compile(
    optimizer=opt,
    loss=loss,
    metrics=[accuracy],
)

In [16]:
# Make sure the checkpoint directory exists before the first save.
os.makedirs(MODELS_FOLDER, exist_ok=True)

# Stop when val_loss has not improved for 30 epochs.
earlystopping = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=30)
# Multiply the learning rate by 0.75 after 5 epochs without val_loss improvement.
reduceLr = ReduceLROnPlateau(monitor='val_loss', mode='min', verbose=1, patience=5, factor=0.75, min_lr=1e-6)
# Keep only the best weights. save_weights_only is explicit because the file
# is reloaded later with load_weights and a subclassed model cannot be saved
# as a full model in HDF5 format.
checkpointer = ModelCheckpoint(
    filepath=os.path.join(MODELS_FOLDER, "github_model_weights.hdf5"),
    monitor='val_loss',
    verbose=1,
    save_best_only=True,
    save_weights_only=True,
)
scheduler = LearningRateScheduler(schedule)
# use tensorboard, with one timestamped log directory per run
log_dir = os.path.join(TF_LOGS_FOLDER, datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))
tensorboard = TensorBoard(log_dir=log_dir)

In [17]:
# Both datasets repeat endlessly, so the number of batches per epoch and per
# validation pass is given explicitly.
github_model.fit(
    train_dataset,
    validation_data=val_dataset,
    steps_per_epoch=TRAIN_LEN // BATCH_SIZE,
    epochs=100,
    validation_steps=VAL_LEN // BATCH_SIZE,
    callbacks=[earlystopping, reduceLr, checkpointer, tensorboard, scheduler],
)

Epoch 1/100
Instructions for updating:
use `tf.profiler.experimental.stop` instead.
Epoch 00001: val_loss improved from inf to 2.47690, saving model to d:\work\RobertaGithubIssuesClassification\Models\github_model_weights.hdf5
Epoch 2/100
Epoch 00002: val_loss improved from 2.47690 to 2.14337, saving model to d:\work\RobertaGithubIssuesClassification\Models\github_model_weights.hdf5
Epoch 3/100
Epoch 00003: val_loss improved from 2.14337 to 1.85898, saving model to d:\work\RobertaGithubIssuesClassification\Models\github_model_weights.hdf5
Epoch 4/100
Epoch 00004: val_loss improved from 1.85898 to 1.62820, saving model to d:\work\RobertaGithubIssuesClassification\Models\github_model_weights.hdf5
Epoch 5/100
Epoch 00005: val_loss improved from 1.62820 to 1.43296, saving model to d:\work\RobertaGithubIssuesClassification\Models\github_model_weights.hdf5
Epoch 6/100
Epoch 00006: val_loss improved from 1.43296 to 1.40135, saving model to d:\work\RobertaGithubIssuesClassification\Models\gith

&lt;tensorflow.python.keras.callbacks.History at 0x1ad16811088&gt;

# Save the TF model (Hugging Face format and SavedModel format) and create a PyTorch model from it

In [18]:
# Restore the checkpoint written during training, then save the model and the
# tokenizer together under Models/tf.
github_model.load_weights(os.path.join( MODELS_FOLDER, "github_model_weights.hdf5"))
github_model.save_pretrained(os.path.join(MODELS_FOLDER, "tf"))
tokenizer.save_pretrained(os.path.join(MODELS_FOLDER, "tf"))

(&#39;d:\\work\\RobertaGithubIssuesClassification\\Models\\tf\\vocab.json&#39;,
 &#39;d:\\work\\RobertaGithubIssuesClassification\\Models\\tf\\merges.txt&#39;,
 &#39;d:\\work\\RobertaGithubIssuesClassification\\Models\\tf\\special_tokens_map.json&#39;,
 &#39;d:\\work\\RobertaGithubIssuesClassification\\Models\\tf\\added_tokens.json&#39;)

In [19]:
# Build the PyTorch model from the TensorFlow weights saved under Models/tf.
pytorch_model = RobertaForSequenceClassification.from_pretrained(os.path.join(MODELS_FOLDER, "tf"), from_tf=True)

All TF 2.0 model weights were used when initializing RobertaForSequenceClassification.

Some weights of RobertaForSequenceClassification were not initialized from the TF 2.0 model and are newly initialized: [&#39;roberta.embeddings.position_ids&#39;]
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [20]:
# Save the PyTorch model and the tokenizer together under Models/pt.
pytorch_model.save_pretrained(os.path.join(MODELS_FOLDER, "pt"))
tokenizer.save_pretrained(os.path.join(MODELS_FOLDER, "pt"))

(&#39;d:\\work\\RobertaGithubIssuesClassification\\Models\\pt\\vocab.json&#39;,
 &#39;d:\\work\\RobertaGithubIssuesClassification\\Models\\pt\\merges.txt&#39;,
 &#39;d:\\work\\RobertaGithubIssuesClassification\\Models\\pt\\special_tokens_map.json&#39;,
 &#39;d:\\work\\RobertaGithubIssuesClassification\\Models\\pt\\added_tokens.json&#39;)

In [21]:
# Wrap the model's call method in a tf.function so it can be traced for export.
# NOTE(review): the name `callable` shadows the Python builtin of the same name
# for the rest of the session.
callable = tf.function(github_model.call)

In [22]:
# Trace the function for two int32 inputs of shape [None, MAX_LEN] named
# "input_ids" and "attention_mask".
concrete_function = callable.get_concrete_function([tf.TensorSpec([None, MAX_LEN], tf.int32, name="input_ids"), tf.TensorSpec([None, MAX_LEN], tf.int32, name="attention_mask")])

In [23]:
# Export in SavedModel format under Models/Serving/1, using the traced function as the signature.
tf.saved_model.save(github_model, os.path.join(MODELS_FOLDER, "Serving", "1"), signatures=concrete_function)

Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.
Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.
INFO:tensorflow:Assets written to: d:\work\RobertaGithubIssuesClassification\Models\Serving\1\assets


# Try models on test issue

In [24]:
# Sample issue text and the area label it is expected to receive.
issue = "AppDomain.SetPrincipalPolicy(PrincipalPolicy.WindowsPrincipal) works only once. Setting the PrincipalPolicy on the current AppDomain to WindowsPrincipal works only for the first thread being started. Any subsequent thread has Thread.CurrentPrincipal evaluated to NULL."
label = "area-System.Security"

In [25]:
# Tokenize the sample issue twice with the same settings: once as TensorFlow
# tensors and once as PyTorch tensors.
encoded_tf = tokenizer(issue, padding='max_length', truncation=True, max_length=MAX_LEN, return_tensors='tf', add_special_tokens=True, return_token_type_ids=False)
encoded_pt = tokenizer(issue, padding='max_length', truncation=True, max_length=MAX_LEN, return_tensors='pt', add_special_tokens=True, return_token_type_ids=False)

In [26]:
# Reload the TF model from Models/tf, replacing the in-memory instance, to
# check that the saved files can be loaded.
github_model = TFRobertaForSequenceClassification.from_pretrained(os.path.join(MODELS_FOLDER, "tf"))

All model checkpoint weights were used when initializing TFRobertaForSequenceClassification.

All the weights of TFRobertaForSequenceClassification were initialized from the model checkpoint at d:\work\RobertaGithubIssuesClassification\Models\tf.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForSequenceClassification for predictions without further training.


In [28]:
# Run the TF model on the encoded sample issue and show the raw output.
tf_result = github_model.predict([encoded_tf['input_ids'], encoded_tf['attention_mask']])
print(tf_result)

(array([[-0.03901269, -0.70156264, -0.00813974,  5.8706336 ,  0.60016686,
        -0.90594006,  1.7966572 , -1.9033011 , -0.07496928, -0.7290467 ,
        -2.3262384 ,  1.2474042 , -0.74498063, -0.64792466, -0.71970916,
         2.2690477 , -0.9807402 , -1.0907569 , -1.7531682 ,  1.3895686 ,
        -0.98030645, -0.19943248]], dtype=float32),)


In [29]:
# Put the PyTorch model in evaluation mode and run it on the same sample issue.
# The call is not wrapped in torch.no_grad(), so the result still tracks
# gradients and must be detached before conversion to NumPy.
pytorch_model.eval()
pt_result = pytorch_model(input_ids=encoded_pt["input_ids"], attention_mask=encoded_pt["attention_mask"])
print(pt_result)

(tensor([[-0.0390, -0.7016, -0.0081,  5.8706,  0.6002, -0.9059,  1.7967, -1.9033,
         -0.0750, -0.7290, -2.3262,  1.2474, -0.7450, -0.6479, -0.7197,  2.2690,
         -0.9807, -1.0908, -1.7532,  1.3896, -0.9803, -0.1994]],
       grad_fn=&lt;AddmmBackward&gt;),)


In [44]:
# Check that the TF and PyTorch outputs agree within a relative tolerance of 1e-3.
np.allclose(tf_result[0], pt_result[0].detach().numpy(), rtol=1e-03, atol=1e-08)

True

In [31]:
# Map the highest-scoring class index to its label. The argmax is taken over
# the logits array (first element of the output tuple) rather than over the
# tuple itself, matching how the PyTorch result is handled below.
lookup[str(np.argmax(tf_result[0]))]

&#39;area-System.Security&#39;

# Convert to ONNX and run inference

In [32]:
# Export the PyTorch model to ONNX, traced with the encoded sample issue.
# Only axis 0 (the batch) is declared dynamic for the inputs and the output.
import torch.onnx
pytorch_model.eval()
torch.onnx.export(pytorch_model,               # model being run
                  (encoded_pt["input_ids"], encoded_pt["attention_mask"]),  # model input (or a tuple for multiple inputs)
                  os.path.join(MODELS_FOLDER, "roberta_github_issues.onnx"),   # where to save the model (can be a file or file-like object)
                  export_params=True,        # store the trained parameter weights inside the model file
                  opset_version=12,          # the ONNX version to export the model to
                  do_constant_folding=True,  # whether to execute constant folding for optimization
                  input_names = ['input_ids', 'attention_mask'],   # the model's input names
                  output_names = ['output'], # the model's output names
                  dynamic_axes={'input_ids' : {0 : 'batch_size'},
                                'attention_mask' : {0 : 'batch_size'},
                                'output' : {0 : 'batch_size'}}
                    )

In [33]:
import onnx

# Load the exported file and run the ONNX checker on it.
onnx_model = onnx.load(os.path.join(MODELS_FOLDER, "roberta_github_issues.onnx"))
onnx.checker.check_model(onnx_model)

In [34]:
import onnxruntime

# Open an ONNX Runtime inference session on the exported model.
ort_session = onnxruntime.InferenceSession(os.path.join(MODELS_FOLDER, "roberta_github_issues.onnx"))

In [35]:
# Inspect the name of the session's second input.
ort_session.get_inputs()[1].name

&#39;attention_mask&#39;

In [36]:
def to_numpy(tensor):
    """Convert a torch tensor to a NumPy array on the CPU.

    A tensor that tracks gradients is detached first, since such a tensor
    cannot be converted directly.
    """
    if tensor.requires_grad:
        tensor = tensor.detach()
    return tensor.cpu().numpy()

In [37]:
# Feed the PyTorch-encoded sample issue to ONNX Runtime, keyed by the session's
# own input names; passing None as the first argument requests all outputs.
ort_inputs = {ort_session.get_inputs()[0].name: to_numpy(encoded_pt["input_ids"]), ort_session.get_inputs()[1].name: to_numpy(encoded_pt["attention_mask"])}
ort_outs = ort_session.run(None, ort_inputs)

In [38]:
# Show the ONNX, PyTorch and TensorFlow outputs one after another for comparison.
print(ort_outs)
print(pt_result)
print(tf_result)

[array([[-0.03901183, -0.70156324, -0.00814154,  5.8706365 ,  0.6001704 ,
        -0.9059382 ,  1.7966552 , -1.9032998 , -0.07497119, -0.72904754,
        -2.326238  ,  1.2474039 , -0.744979  , -0.64792377, -0.7197121 ,
         2.2690408 , -0.9807419 , -1.090755  , -1.7531687 ,  1.3895706 ,
        -0.9803071 , -0.19943394]], dtype=float32)]
(tensor([[-0.0390, -0.7016, -0.0081,  5.8706,  0.6002, -0.9059,  1.7967, -1.9033,
         -0.0750, -0.7290, -2.3262,  1.2474, -0.7450, -0.6479, -0.7197,  2.2690,
         -0.9807, -1.0908, -1.7532,  1.3896, -0.9803, -0.1994]],
       grad_fn=&lt;AddmmBackward&gt;),)
(array([[-0.03901269, -0.70156264, -0.00813974,  5.8706336 ,  0.60016686,
        -0.90594006,  1.7966572 , -1.9033011 , -0.07496928, -0.7290467 ,
        -2.3262384 ,  1.2474042 , -0.74498063, -0.64792466, -0.71970916,
         2.2690477 , -0.9807402 , -1.0907569 , -1.7531682 ,  1.3895686 ,
        -0.98030645, -0.19943248]], dtype=float32),)


In [46]:
# Check that the ONNX Runtime output agrees with the TF output within a relative tolerance of 1e-3.
np.allclose(ort_outs, tf_result[0], rtol=1e-03, atol=1e-08)

True

In [45]:
# Check that the ONNX Runtime output agrees with the PyTorch output within a relative tolerance of 1e-3.
np.allclose(ort_outs, pt_result[0].detach().numpy(), rtol=1e-03, atol=1e-08)

True

In [41]:
# Highest-scoring class index from the PyTorch output and its label from the lookup table.
index = np.argmax(to_numpy(pt_result[0]))
print(f"index : {index}, category : {lookup[str(index)]}")

index : 3, category : area-System.Security


In [42]:
# Turn the logits into per-class probabilities. The class axis is passed
# explicitly because calling softmax without `dim` relies on a deprecated
# implicit choice of dimension.
softmax = torch.nn.functional.softmax(pt_result[0], dim=-1)

In [43]:
# Print every label with its probability as a percentage; the label column is
# left-aligned and padded to 27 characters.
for i, value in enumerate(softmax[0]):
    print(f"{lookup[str(i)] : <27} : {value.item() * 100 :.2f}% confidence")

area-System.Net             : 0.25% confidence
area-Infrastructure         : 0.13% confidence
area-System.ComponentModel  : 0.26% confidence
area-System.Security        : 91.46% confidence
area-System.Runtime         : 0.47% confidence
area-System.IO              : 0.10% confidence
area-System.Xml             : 1.56% confidence
area-System.Collections     : 0.04% confidence
area-System.Threading       : 0.24% confidence
area-System.Reflection      : 0.12% confidence
area-System.Memory          : 0.03% confidence
area-System.Diagnostics     : 0.90% confidence
area-Serialization          : 0.12% confidence
area-System.Drawing         : 0.13% confidence
area-Meta                   : 0.13% confidence
area-System.Data            : 2.50% confidence
area-Microsoft.CSharp       : 0.10% confidence
area-System.Numerics        : 0.09% confidence
area-System.Text            : 0.04% confidence
area-System.Globalization   : 1.04% confidence
area-System.Linq            : 0.10% confidence
area-System.