# Imports and global variables

In [1]:
import tensorflow as tf
from transformers import RobertaTokenizerFast, TFRobertaModel, TFRobertaForSequenceClassification, RobertaForSequenceClassification
import numpy as np
import pandas as pd
import os
import torch
import json

In [2]:
# Input CSVs live under ./Data; exported model artifacts go under ./Models.
DATA_FOLDER, MODELS_PATH = (
    os.path.join("./", folder_name) for folder_name in ("Data", "Models")
)

In [3]:
# Load both splits with pandas; the following cell only needs their row counts.
train_csv_path = os.path.join(DATA_FOLDER, "corefx_cleaned_train.csv")
val_csv_path = os.path.join(DATA_FOLDER, "corefx_cleaned_val.csv")
train_df = pd.read_csv(train_csv_path)
val_df = pd.read_csv(val_csv_path)

In [4]:
# Keep only the row counts (used later to size the training/validation steps)
# and drop the DataFrames; the data itself is streamed again via tf.data.
TRAIN_LEN, VAL_LEN = len(train_df), len(val_df)
del train_df, val_df

In [5]:
# Number of tokens every sentence is padded/truncated to by the tokenizer.
MAX_LEN = 64
# Number of examples per batch in both the training and validation datasets.
BATCH_SIZE = 32

# Create TensorFlow datasets

In [6]:
# Stream the CSV files record by record: column 0 is read as a string,
# column 1 as an int32, and the header row is skipped.
csv_column_types = [tf.string, tf.int32]
train_dataset = tf.data.experimental.CsvDataset(
    os.path.join(DATA_FOLDER, "corefx_cleaned_train.csv"),
    csv_column_types,
    header=True,
)
val_dataset = tf.data.experimental.CsvDataset(
    os.path.join(DATA_FOLDER, "corefx_cleaned_val.csv"),
    csv_column_types,
    header=True,
)

In [7]:
# Batch before tokenizing so the tokenizer is invoked once per batch of sentences.
train_dataset = train_dataset.batch(batch_size=BATCH_SIZE)
val_dataset = val_dataset.batch(batch_size=BATCH_SIZE)

In [8]:
# Fast tokenizer for the same "roberta-base" checkpoint the classifier is initialized from below.
tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base")

## Define transformations to do on datasets

In [9]:
def tf_py_function(sentences):
    """Tokenize one batch of byte-string sentences in eager Python.

    Meant to be wrapped by ``tf.py_function``: ``sentences`` must expose
    ``.numpy()`` yielding byte strings. Each one is decoded to ``str`` and the
    whole batch is padded/truncated to ``MAX_LEN`` tokens.

    Returns:
        A ``(input_ids, attention_mask)`` pair of TensorFlow tensors.
    """
    texts = [raw_sentence.decode() for raw_sentence in sentences.numpy()]
    batch_encoding = tokenizer(
        texts,
        padding='max_length',
        truncation=True,
        max_length=MAX_LEN,
        return_tensors='tf',
    )
    return batch_encoding["input_ids"], batch_encoding["attention_mask"]

In [10]:
def encode_data(batch_x, batch_y):
    """Tokenize a batch of texts from inside a ``tf.data`` pipeline.

    Args:
        batch_x: batch of sentences (string tensor).
        batch_y: batch of labels, passed through untouched.

    Returns:
        ``(input_ids, attention_mask, batch_y)``.
    """
    # The tokenizer doesn't operate on tensors, so it has to run through tf.py_function.
    token_ids, mask = tf.py_function(
        func=tf_py_function,
        inp=[batch_x],
        Tout=(tf.int32, tf.int32),
    )
    # Declare the static shape (any batch size, MAX_LEN tokens) on both outputs.
    for tensor in (token_ids, mask):
        tensor.set_shape([None, MAX_LEN])
    return token_ids, mask, batch_y

In [11]:
def map_to_dict(input_ids, attention_mask, labels):
    """Repack a batch as ``(features, labels)``.

    ``features`` is a dict keyed by ``"input_ids"`` and ``"attention_mask"``,
    so the model inputs are passed by name.
    """
    features = dict(input_ids=input_ids, attention_mask=attention_mask)
    return features, labels

## Apply transformations and prepare datasets for training

In [12]:
# Both splits go through the same pipeline: tokenize -> repack as (features, labels)
# -> repeat indefinitely (epoch boundaries are set by the step counts passed to fit)
# -> prefetch one batch.
train_dataset = (
    train_dataset
    .map(encode_data, num_parallel_calls=tf.data.experimental.AUTOTUNE)
    .map(map_to_dict, num_parallel_calls=tf.data.experimental.AUTOTUNE)
    .repeat()
    .prefetch(1)
)
val_dataset = (
    val_dataset
    .map(encode_data, num_parallel_calls=tf.data.experimental.AUTOTUNE)
    .map(map_to_dict, num_parallel_calls=tf.data.experimental.AUTOTUNE)
    .repeat()
    .prefetch(1)
)

# Create and train TF model

In [13]:
# Label lookup table: used below to size the classifier head and to map
# predicted indices (as string keys) to label names.
lookup_path = os.path.join(DATA_FOLDER, 'lookup.json')
with open(lookup_path) as lookup_file:
    lookup = json.load(lookup_file)

In [17]:
# NOTE(review): the label count is taken as half the lookup size, presumably because
# lookup.json stores both index->label and label->index entries; confirm against the file.
num_labels = len(lookup) // 2
github_model = TFRobertaForSequenceClassification.from_pretrained(
    "roberta-base",
    num_labels=num_labels,
)

Some weights of the model checkpoint at roberta-base were not used when initializing TFRobertaForSequenceClassification: ['lm_head']
- This IS expected if you are initializing TFRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing TFRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of TFRobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [18]:
# Optimizer, loss and metric for fine-tuning the classifier.
opt = tf.keras.optimizers.Adam(learning_rate=1e-4, epsilon=1e-08)
# Use the loss *class*: the lowercase `sparse_categorical_crossentropy` is the functional
# form that requires (y_true, y_pred) and raises TypeError when called with only
# `from_logits`. The model outputs raw logits, hence from_logits=True.
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
accuracy = tf.keras.metrics.SparseCategoricalAccuracy("accuracy")

github_model.compile(optimizer=opt, loss=loss, metrics=[accuracy])

In [19]:
# The datasets repeat forever, so the step counts define one pass over the data.
# Ceiling division gives the exact number of batches (including a final partial one);
# `N // BATCH_SIZE + 1` would add a spurious extra step whenever N is a multiple of
# BATCH_SIZE, pulling a batch from the next pass into the current epoch.
steps_per_epoch = (TRAIN_LEN + BATCH_SIZE - 1) // BATCH_SIZE
validation_steps = (VAL_LEN + BATCH_SIZE - 1) // BATCH_SIZE
github_model.fit(
    train_dataset,
    validation_data=val_dataset,
    steps_per_epoch=steps_per_epoch,
    epochs=10,
    validation_steps=validation_steps,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x1f5aaaaea08>

# Save TF model (Hugging Face format and SavedModel format) and create PyTorch model from it

In [20]:
# Write the fine-tuned TF model and the tokenizer files into the same directory.
tf_export_dir = os.path.join(MODELS_PATH, "tf")
github_model.save_pretrained(tf_export_dir)
tokenizer.save_pretrained(tf_export_dir)

('./Models\\tf\\vocab.json',
 './Models\\tf\\merges.txt',
 './Models\\tf\\special_tokens_map.json',
 './Models\\tf\\added_tokens.json')

In [21]:
# Build the PyTorch model from the TF checkpoint saved above (from_tf=True converts the weights).
tf_checkpoint_dir = os.path.join(MODELS_PATH, "tf")
pytorch_model = RobertaForSequenceClassification.from_pretrained(
    tf_checkpoint_dir,
    from_tf=True,
)

All TF 2.0 model weights were used when initializing RobertaForSequenceClassification.

All the weights of RobertaForSequenceClassification were initialized from the TF 2.0 model.
If your task is similar to the task the model of the ckeckpoint was trained on, you can already use RobertaForSequenceClassification for predictions without further training.


In [22]:
# Write the converted PyTorch model and the tokenizer files into the same directory.
pt_export_dir = os.path.join(MODELS_PATH, "pt")
pytorch_model.save_pretrained(pt_export_dir)
tokenizer.save_pretrained(pt_export_dir)

('./Models\\pt\\vocab.json',
 './Models\\pt\\merges.txt',
 './Models\\pt\\special_tokens_map.json',
 './Models\\pt\\added_tokens.json')

In [23]:
# Wrap the model's forward pass in a tf.function so a concrete signature can be traced for export.
# NOTE(review): this name shadows the builtin `callable`; it is kept because the next cell references it.
callable = tf.function(github_model.call)

In [24]:
# Trace the forward pass for int32 inputs of shape (any batch size, MAX_LEN),
# named to match the feature keys used during training.
serving_input_specs = [
    tf.TensorSpec([None, MAX_LEN], tf.int32, name="input_ids"),
    tf.TensorSpec([None, MAX_LEN], tf.int32, name="attention_mask"),
]
concrete_function = callable.get_concrete_function(serving_input_specs)

In [25]:
# Export in SavedModel format under Models/Serving/1, with the traced function as the signature.
serving_export_dir = os.path.join(MODELS_PATH, "Serving", "1")
tf.saved_model.save(
    github_model,
    serving_export_dir,
    signatures=concrete_function,
)

Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.
Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.
INFO:tensorflow:Assets written to: ./Models\Serving\1\assets


# Try models on test issue

In [26]:
issue = "AppDomain.SetPrincipalPolicy(PrincipalPolicy.WindowsPrincipal) works only once. Setting the PrincipalPolicy on the current AppDomain to WindowsPrincipal works only for the first thread being started. Any subsequent thread has Thread.CurrentPrincipal evaluated to NULL."
label = "area-System.Security"

In [27]:
# Encode the issue twice with identical settings, differing only in the tensor
# framework, so each model receives the same token ids.
shared_tokenizer_kwargs = dict(
    padding='max_length',
    truncation=True,
    max_length=MAX_LEN,
    add_special_tokens=True,
    return_token_type_ids=False,
)
encoded_tf = tokenizer(issue, return_tensors='tf', **shared_tokenizer_kwargs)
encoded_pt = tokenizer(issue, return_tensors='pt', **shared_tokenizer_kwargs)

In [28]:
# Reload the fine-tuned TF model from disk, replacing the in-memory instance.
saved_tf_dir = os.path.join(MODELS_PATH, "tf")
github_model = TFRobertaForSequenceClassification.from_pretrained(saved_tf_dir)

All model checkpoint weights were used when initializing TFRobertaForSequenceClassification.

All the weights of TFRobertaForSequenceClassification were initialized from the model checkpoint at ./Models\tf.
If your task is similar to the task the model of the ckeckpoint was trained on, you can already use TFRobertaForSequenceClassification for predictions without further training.


In [29]:
# Run the reloaded TF model on the encoded issue; later cells read the logits from tf_result[0].
tf_result = github_model(encoded_tf)
print(tf_result)

(<tf.Tensor: shape=(1, 22), dtype=float32, numpy=
array([[-1.3786112 , -2.113226  , -0.5003843 ,  3.178758  ,  0.22252794,
        -1.0493124 ,  0.28553805,  0.8929118 , -0.50459427,  2.603506  ,
        -0.94234836, -1.491632  ,  1.275148  ,  0.3172301 , -1.9975668 ,
         0.8004527 ,  1.204209  ,  0.18403748,  1.8349376 ,  0.14521039,
        -0.09998886, -1.1488237 ]], dtype=float32)>,)


In [30]:
# Same forward pass with the converted PyTorch model, for comparison with the TF logits.
pt_result = pytorch_model(input_ids=encoded_pt["input_ids"], attention_mask=encoded_pt["attention_mask"])
print(pt_result)

(tensor([[-1.3786, -2.1132, -0.5004,  3.1788,  0.2225, -1.0493,  0.2855,  0.8929,
         -0.5046,  2.6035, -0.9423, -1.4916,  1.2751,  0.3172, -1.9976,  0.8005,
          1.2042,  0.1840,  1.8349,  0.1452, -0.1000, -1.1488]],
       grad_fn=<AddmmBackward>),)


In [36]:
# Sanity check: TF and PyTorch logits should agree within np.allclose's default tolerances.
np.allclose(tf_result[0], pt_result[0].detach().numpy())

True

In [32]:
# Take the argmax over the logits tensor (first element of the output tuple) rather than
# over the tuple itself, matching how the PyTorch result is handled later and avoiding an
# implicit tuple-to-array conversion. The lookup keys are stringified indices.
lookup[str(np.argmax(tf_result[0]))]

'area-System.Security'

# Convert to ONNX and run inference

In [37]:
import torch.onnx
# Put the model in eval mode before tracing it for export.
pytorch_model.eval()

onnx_path = os.path.join(MODELS_PATH, "roberta_github_issues.onnx")
# Example inputs for the exporter, in the same order as `input_names`.
example_inputs = (encoded_pt["input_ids"], encoded_pt["attention_mask"])

torch.onnx.export(
    pytorch_model,
    example_inputs,
    onnx_path,
    export_params=True,        # store the trained weights inside the model file
    opset_version=12,          # ONNX opset to target
    do_constant_folding=True,  # fold constants as an export-time optimization
    input_names=['input_ids', 'attention_mask'],
    output_names=['output'],
    # Only axis 0 (batch) is declared dynamic for the inputs and the output.
    dynamic_axes={
        'input_ids': {0: 'batch_size'},
        'attention_mask': {0: 'batch_size'},
        'output': {0: 'batch_size'},
    },
)

In [38]:
import onnx

# Load the exported graph back and let the ONNX checker validate it.
exported_onnx_path = os.path.join(MODELS_PATH, "roberta_github_issues.onnx")
onnx_model = onnx.load(exported_onnx_path)
onnx.checker.check_model(onnx_model)

In [39]:
import onnxruntime

# Open an ONNX Runtime inference session on the exported model file.
onnx_model_path = os.path.join(MODELS_PATH, "roberta_github_issues.onnx")
ort_session = onnxruntime.InferenceSession(onnx_model_path)

In [40]:
# Inspect the name of the second graph input (the names were set via input_names at export).
ort_session.get_inputs()[1].name

'attention_mask'

In [41]:
def to_numpy(tensor):
    """Convert a PyTorch tensor to a NumPy array on the CPU.

    Tensors that require grad are detached first, because ``numpy()`` cannot
    be called on a tensor that is part of the autograd graph.
    """
    if tensor.requires_grad:
        tensor = tensor.detach()
    return tensor.cpu().numpy()

In [42]:
# Feed the same encoded issue to ONNX Runtime, keyed by the graph's own input names.
ort_input_nodes = ort_session.get_inputs()
ort_inputs = {
    ort_input_nodes[0].name: to_numpy(encoded_pt["input_ids"]),
    ort_input_nodes[1].name: to_numpy(encoded_pt["attention_mask"]),
}
# Passing None as the output list requests every model output.
ort_outs = ort_session.run(None, ort_inputs)

In [53]:
# Show the ONNX, PyTorch and TF outputs one after another for a visual comparison.
for model_output in (ort_outs, pt_result, tf_result):
    print(model_output)

[array([[-1.3786125 , -2.1132238 , -0.5003866 ,  3.1787558 ,  0.22253025,
        -1.0493083 ,  0.28553545,  0.8929088 , -0.5045951 ,  2.603507  ,
        -0.942349  , -1.4916303 ,  1.2751487 ,  0.3172335 , -1.9975661 ,
         0.800449  ,  1.2042091 ,  0.18403693,  1.8349365 ,  0.14520907,
        -0.09998947, -1.1488235 ]], dtype=float32)]
(tensor([[-1.3786, -2.1132, -0.5004,  3.1788,  0.2225, -1.0493,  0.2855,  0.8929,
         -0.5046,  2.6035, -0.9423, -1.4916,  1.2751,  0.3172, -1.9976,  0.8005,
          1.2042,  0.1840,  1.8349,  0.1452, -0.1000, -1.1488]],
       grad_fn=<AddmmBackward>),)
(<tf.Tensor: shape=(1, 22), dtype=float32, numpy=
array([[-1.3786112 , -2.113226  , -0.5003843 ,  3.178758  ,  0.22252794,
        -1.0493124 ,  0.28553805,  0.8929118 , -0.50459427,  2.603506  ,
        -0.94234836, -1.491632  ,  1.275148  ,  0.3172301 , -1.9975668 ,
         0.8004527 ,  1.204209  ,  0.18403748,  1.8349376 ,  0.14521039,
        -0.09998886, -1.1488237 ]], dtype=float32)>

In [46]:
# Predicted class = position of the largest PyTorch logit; lookup keys are stringified indices.
pt_logits = to_numpy(pt_result[0])
index = np.argmax(pt_logits)
print(f"index : {index}, category : {lookup[str(index)]}")

index : 3, category : area-System.Security


In [54]:
# Turn the logits into probabilities across the label axis (the last one).
# `dim` is passed explicitly: relying on softmax's implicit dimension is deprecated
# and emits a warning; for this 2-D (batch, labels) tensor the result is unchanged.
softmax = torch.nn.functional.softmax(pt_result[0], dim=-1)

In [63]:
# Print each label (left-aligned in a 27-character column) with its softmax
# probability for the first (only) example, expressed as a percentage.
for i, value in enumerate(softmax[0]):
    print(f"{lookup[str(i)] : <27} : {value.item() * 100 :.2f}% confidence")

area-System.Net             : 0.38% confidence
area-Infrastructure         : 0.18% confidence
area-System.ComponentModel  : 0.92% confidence
area-System.Security        : 36.62% confidence
area-System.Runtime         : 1.90% confidence
area-System.IO              : 0.53% confidence
area-System.Xml             : 2.03% confidence
area-System.Collections     : 3.72% confidence
area-System.Threading       : 0.92% confidence
area-System.Reflection      : 20.60% confidence
area-System.Memory          : 0.59% confidence
area-System.Diagnostics     : 0.34% confidence
area-Serialization          : 5.46% confidence
area-System.Drawing         : 2.09% confidence
area-Meta                   : 0.21% confidence
area-System.Data            : 3.39% confidence
area-Microsoft.CSharp       : 5.08% confidence
area-System.Numerics        : 1.83% confidence
area-System.Text            : 9.55% confidence
area-System.Globalization   : 1.76% confidence
area-System.Linq            : 1.38% confidence
area-System