In [1]:
import tensorflow as tf
from transformers import RobertaTokenizerFast, TFRobertaModel, TFRobertaForSequenceClassification, RobertaForSequenceClassification
import numpy as np
import pandas as pd
import os
import pandas as pd
import torch

In [2]:
DATA_PATH = os.path.join("./", "Data")
MODELS_PATH = os.path.join("./", "Models")

# Data Preprocessing

In [3]:
df = pd.read_csv(os.path.join(DATA_PATH, "corefx-issues-train.tsv"), sep="\t")

In [4]:
df

Unnamed: 0,ID,Area,Title,Description
0,29338,area-System.Net,Include fragment and query in Uri.LocalPath on...,"While testing XmlUriResolver, @pjanotti discov..."
1,29337,area-System.Net,Unify setting null CookieContainer behavior on...,For HttpClientHandler layer (above the WinHttp...
2,29334,area-System.Net,Check URI scheme length only after verifying t...,URI construction is failing on valid URIs unde...
3,29331,area-Infrastructure,"[Perf] Ubuntu16.04 runs blocked by multiple ""P...",[perf_ubuntu16.04_release](https://ci2.dot.net...
4,29329,area-System.ComponentModel,Port System.ComponentModel.Composition.Registr...,"Greetings, regarding [Port System.Component..."
...,...,...,...,...
1610,26959,area-System.Runtime,Re-evaluate default buffer size for getpw nati...,"By default, we are allocating 1K of memory on ..."
1611,26957,area-System.Net,Validate ClientWebSocket wss connections work ...,After https://github.com/dotnet/corefx/pull/26...
1612,26956,area-System.Numerics,"Add Quaternion.Divide(Quaternion, float)",I noticed that Quaternion.Divide Method has no...
1613,26954,area-System.Runtime,Proposal: TryForSufficientStack method to supp...,"_From @kkokosa on February 8, 2018 12:8_ Due ..."


In [5]:
df = df.replace(np.nan, '', regex=True)

In [6]:
df[df.isnull().any(axis=1)]

Unnamed: 0,ID,Area,Title,Description


In [7]:
df.Area.unique()

array(['area-System.Net', 'area-Infrastructure',
       'area-System.ComponentModel', 'area-System.Security',
       'area-System.Runtime', 'area-System.IO', 'area-System.Xml',
       'area-System.Collections', 'area-System.Threading',
       'area-System.Reflection', 'area-System.Memory',
       'area-System.Diagnostics', 'area-Serialization',
       'area-System.Drawing', 'area-Meta', 'area-System.Data',
       'area-Microsoft.CSharp', 'area-System.Numerics',
       'area-System.Text', 'area-System.Globalization',
       'area-System.Linq', 'area-System.Console'], dtype=object)

In [8]:
lookup = {}
for i, area in enumerate(df.Area.unique()):
    lookup[area] = i
    lookup[i] = area

In [9]:
lookup

{'area-System.Net': 0,
 0: 'area-System.Net',
 'area-Infrastructure': 1,
 1: 'area-Infrastructure',
 'area-System.ComponentModel': 2,
 2: 'area-System.ComponentModel',
 'area-System.Security': 3,
 3: 'area-System.Security',
 'area-System.Runtime': 4,
 4: 'area-System.Runtime',
 'area-System.IO': 5,
 5: 'area-System.IO',
 'area-System.Xml': 6,
 6: 'area-System.Xml',
 'area-System.Collections': 7,
 7: 'area-System.Collections',
 'area-System.Threading': 8,
 8: 'area-System.Threading',
 'area-System.Reflection': 9,
 9: 'area-System.Reflection',
 'area-System.Memory': 10,
 10: 'area-System.Memory',
 'area-System.Diagnostics': 11,
 11: 'area-System.Diagnostics',
 'area-Serialization': 12,
 12: 'area-Serialization',
 'area-System.Drawing': 13,
 13: 'area-System.Drawing',
 'area-Meta': 14,
 14: 'area-Meta',
 'area-System.Data': 15,
 15: 'area-System.Data',
 'area-Microsoft.CSharp': 16,
 16: 'area-Microsoft.CSharp',
 'area-System.Numerics': 17,
 17: 'area-System.Numerics',
 'area-System.Text':

In [10]:
df.Area = df.Area.apply(lambda x: lookup[x])

In [11]:
df.Area

0        0
1        0
2        0
3        1
4        2
        ..
1610     4
1611     0
1612    17
1613     4
1614     5
Name: Area, Length: 1615, dtype: int64

In [12]:
labels = df.Area
titles = df.Title
descriptions = df.Description

In [13]:
text = []
for title, description in zip(titles, descriptions):
    text.append(" ".join(title.split()) + " " + " ".join(description.split()))

In [14]:
len(labels)

1615

In [15]:
len(text)

1615

In [16]:
len(max(text))

236

In [17]:
text[:5]

["Include fragment and query in Uri.LocalPath on Unix While testing XmlUriResolver, @pjanotti discovered that any segments of a file path following a '#' symbol will be cut out of Uri.LocalPath on Unix. Based on additional tests, this also occurs for the '?' symbol. This is happening because the Unix specific case for local path only uses the path component of the URI: https://github.com/dotnet/corefx/blob/9e8d443ff78c4f0a9a6bedf7f95961cf96ceff0a/src/System.Private.Uri/src/System/Uri.cs#L1032-L1037 The fix here is to include the fragment and query in LocalPath in the Unix path specific case. This PR enables the test case in XmlUriResolver that uncovered this issues, and adds some additional cases to our URI tests. Fixes: #28486",
 "Unify setting null CookieContainer behavior on HttpClientHandler For HttpClientHandler layer (above the WinHttpHandler layer on Windows), we should be consistent and throw the exception in the CookieContainer setter when null value is provided, to match .NET

In [18]:
labels[:5]

0    0
1    0
2    0
3    1
4    2
Name: Area, dtype: int64

In [19]:
[lookup[x] for x in labels[:5]]

['area-System.Net',
 'area-System.Net',
 'area-System.Net',
 'area-Infrastructure',
 'area-System.ComponentModel']

In [20]:
del df
df = pd.DataFrame(list(zip(text, labels)), columns =['Text', 'Label'])
df.to_csv(os.path.join(DATA_PATH, "cleaned_train.csv"), index=False)

In [21]:
del df

# Dataset Creation

In [22]:
BATCH_SIZE = 32
MAX_LEN = 128

In [23]:
github_dataset = tf.data.experimental.CsvDataset(os.path.join(DATA_PATH, "cleaned_train.csv"), [tf.string, tf.int32], header=True)

In [24]:
github_dataset = github_dataset.batch(BATCH_SIZE)

In [25]:
tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base")

In [26]:
def tf_py_function(sentences):
    decoded = []
    for sentence in sentences.numpy():
        decoded.append(sentence.decode())
    encoded = tokenizer(decoded, padding='max_length', truncation=True, max_length=MAX_LEN, return_tensors='tf')
    input_ids = encoded["input_ids"]
    attention_mask = encoded["attention_mask"]
    return (input_ids, attention_mask)

In [27]:
def encode_data(batch_x, batch_y):
    input_ids, attention_mask = tf.py_function(tf_py_function, [batch_x], (tf.int32, tf.int32))
    input_ids.set_shape([None, MAX_LEN])
    attention_mask.set_shape([None, MAX_LEN])
    return (input_ids, attention_mask, batch_y)

In [28]:
def map_to_dict(input_ids, attention_mask, labels):
    return ({"input_ids": input_ids, "attention_mask": attention_mask}, labels)

In [29]:
new_dataset = github_dataset.map(encode_data, num_parallel_calls=tf.data.experimental.AUTOTUNE).map(map_to_dict, num_parallel_calls=tf.data.experimental.AUTOTUNE)

In [30]:
github_train = new_dataset.take(int(len(labels) // BATCH_SIZE *0.8)).cache()
github_valid = new_dataset.skip(int(len(labels) // BATCH_SIZE *0.8)).cache()
github_train = github_train.repeat().prefetch(1)
github_valid = github_valid.repeat().prefetch(1)

# Create and train TF model

In [31]:
github_model = TFRobertaForSequenceClassification.from_pretrained("roberta-base", num_labels=22)

Some weights of the model checkpoint at roberta-base were not used when initializing TFRobertaForSequenceClassification: ['lm_head']
- This IS expected if you are initializing TFRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing TFRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of TFRobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [32]:
opt = tf.keras.optimizers.Adam(learning_rate=1e-5, epsilon=1e-08)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
accuracy = tf.keras.metrics.SparseCategoricalAccuracy("accuracy")

github_model.compile(optimizer=opt, loss=loss, metrics=[accuracy])

In [33]:
github_model.fit(github_train, validation_data=github_valid, steps_per_epoch= int(int(len(labels) // BATCH_SIZE *0.8)) , epochs=10, validation_steps=int(int(len(labels) // BATCH_SIZE *0.2)) )

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x1baa062f4c8>

# Save TF model (Huggingface format and SavedModel format) and create Pytorch model from it

In [34]:
github_model.save_pretrained(os.path.join(MODELS_PATH, "tf"))
tokenizer.save_pretrained(os.path.join(MODELS_PATH, "tf"))

('./Models\\tf\\vocab.json',
 './Models\\tf\\merges.txt',
 './Models\\tf\\special_tokens_map.json',
 './Models\\tf\\added_tokens.json')

In [35]:
pytorch_model = RobertaForSequenceClassification.from_pretrained(os.path.join(MODELS_PATH, "tf"), from_tf=True)

All TF 2.0 model weights were used when initializing RobertaForSequenceClassification.

All the weights of RobertaForSequenceClassification were initialized from the TF 2.0 model.
If your task is similar to the task the model of the ckeckpoint was trained on, you can already use RobertaForSequenceClassification for predictions without further training.


In [36]:
pytorch_model.save_pretrained(os.path.join(MODELS_PATH, "pt"))
tokenizer.save_pretrained(os.path.join(MODELS_PATH, "pt"))

('./Models\\pt\\vocab.json',
 './Models\\pt\\merges.txt',
 './Models\\pt\\special_tokens_map.json',
 './Models\\pt\\added_tokens.json')

In [37]:
callable = tf.function(github_model.call)

In [38]:
concrete_function = callable.get_concrete_function([tf.TensorSpec([None, MAX_LEN], tf.int32, name="input_ids"), tf.TensorSpec([None, MAX_LEN], tf.int32, name="attention_mask")])

In [39]:
tf.saved_model.save(github_model, os.path.join(MODELS_PATH, "Serving", "1"), signatures=concrete_function)

Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.
Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.
INFO:tensorflow:Assets written to: ./Models\Serving\1\assets


# Try models on test issue

In [40]:
issue = "AppDomain.SetPrincipalPolicy(PrincipalPolicy.WindowsPrincipal) works only once. Setting the PrincipalPolicy on the current AppDomain to WindowsPrincipal works only for the first thread being started. Any subsequent thread has Thread.CurrentPrincipal evaluated to NULL."

In [41]:
encoded_tf = tokenizer(issue, padding='max_length', truncation=True, max_length=MAX_LEN, return_tensors='tf', add_special_tokens=True, return_token_type_ids=False)
encoded_pt = tokenizer(issue, padding='max_length', truncation=True, max_length=MAX_LEN, return_tensors='pt', add_special_tokens=True, return_token_type_ids=False)

In [42]:
github_model = TFRobertaForSequenceClassification.from_pretrained(os.path.join(MODELS_PATH, "tf"))

All model checkpoint weights were used when initializing TFRobertaForSequenceClassification.

All the weights of TFRobertaForSequenceClassification were initialized from the model checkpoint at ./Models\tf.
If your task is similar to the task the model of the ckeckpoint was trained on, you can already use TFRobertaForSequenceClassification for predictions without further training.


In [43]:
tf_result = github_model(encoded_tf)
print(tf_result)

(<tf.Tensor: shape=(1, 22), dtype=float32, numpy=
array([[-0.5988496 ,  1.1280658 ,  0.47194374, -0.50786096,  1.2456827 ,
         2.5384133 , -0.6554079 , -0.72741085,  0.67490077, -0.09534095,
        -1.4605486 ,  1.3067842 , -0.6346639 , -0.4347392 ,  0.521743  ,
         1.1091499 , -0.45329064, -0.6518139 , -0.71086705, -0.8429568 ,
        -0.49102157, -0.33502215]], dtype=float32)>,)


In [44]:
pt_result = pytorch_model(input_ids=encoded_pt["input_ids"], attention_mask=encoded_pt["attention_mask"])
print(pt_result)

(tensor([[-0.5989,  1.1281,  0.4719, -0.5079,  1.2457,  2.5384, -0.6554, -0.7274,
          0.6749, -0.0953, -1.4605,  1.3068, -0.6347, -0.4347,  0.5217,  1.1091,
         -0.4533, -0.6518, -0.7109, -0.8430, -0.4910, -0.3350]],
       grad_fn=<AddmmBackward>),)


# Convert to ONNX and run inference

In [45]:
import torch.onnx
pytorch_model.eval()
torch.onnx.export(pytorch_model,               # model being run
                  (encoded_pt["input_ids"], encoded_pt["attention_mask"]),  # model input (or a tuple for multiple inputs)
                  os.path.join(MODELS_PATH, "roberta_github_issues.onnx"),   # where to save the model (can be a file or file-like object)
                  export_params=True,        # store the trained parameter weights inside the model file
                  opset_version=12,          # the ONNX version to export the model to
                  do_constant_folding=True,  # whether to execute constant folding for optimization
                  input_names = ['input_ids', 'attention_mask'],   # the model's input names
                  output_names = ['output'], # the model's output names
                  dynamic_axes={'input_ids' : {0 : 'batch_size'},
                                'attention_mask' : {0 : 'batch_size'},
                                'output' : {0 : 'batch_size'}}
                    )

In [46]:
import onnx

onnx_model = onnx.load(os.path.join(MODELS_PATH, "roberta_github_issues.onnx"))
onnx.checker.check_model(onnx_model)

In [47]:
import onnxruntime

ort_session = onnxruntime.InferenceSession(os.path.join(MODELS_PATH, "roberta_github_issues.onnx"))

In [48]:
ort_session.get_inputs()[1].name

'attention_mask'

In [49]:
def to_numpy(tensor):
    return tensor.detach().cpu().numpy() if tensor.requires_grad else tensor.cpu().numpy()

In [50]:
ort_inputs = {ort_session.get_inputs()[0].name: to_numpy(encoded_pt["input_ids"]), ort_session.get_inputs()[1].name: to_numpy(encoded_pt["attention_mask"])}
ort_outs = ort_session.run(None, ort_inputs)

In [51]:
print(ort_outs)
print(pt_result)
print(tf_result)

[array([[-0.59884876,  1.1280674 ,  0.4719437 , -0.5078648 ,  1.2456806 ,
         2.538411  , -0.6554067 , -0.7274111 ,  0.67490226, -0.09534154,
        -1.4605478 ,  1.306784  , -0.6346635 , -0.43473876,  0.5217458 ,
         1.1091526 , -0.45329082, -0.6518149 , -0.7108661 , -0.8429569 ,
        -0.49102157, -0.33502263]], dtype=float32)]
(tensor([[-0.5989,  1.1281,  0.4719, -0.5079,  1.2457,  2.5384, -0.6554, -0.7274,
          0.6749, -0.0953, -1.4605,  1.3068, -0.6347, -0.4347,  0.5217,  1.1091,
         -0.4533, -0.6518, -0.7109, -0.8430, -0.4910, -0.3350]],
       grad_fn=<AddmmBackward>),)
(<tf.Tensor: shape=(1, 22), dtype=float32, numpy=
array([[-0.5988496 ,  1.1280658 ,  0.47194374, -0.50786096,  1.2456827 ,
         2.5384133 , -0.6554079 , -0.72741085,  0.67490077, -0.09534095,
        -1.4605486 ,  1.3067842 , -0.6346639 , -0.4347392 ,  0.521743  ,
         1.1091499 , -0.45329064, -0.6518139 , -0.71086705, -0.8429568 ,
        -0.49102157, -0.33502215]], dtype=float32)>

In [55]:
index = np.argmax(to_numpy(pt_result[0]))
print(f"index : {index}, category : {lookup[index]}")

index : 5, category : area-System.IO


In [53]:
softmax = torch.nn.functional.softmax(pt_result[0])

In [54]:
softmax

tensor([[0.0141, 0.0791, 0.0410, 0.0154, 0.0889, 0.3239, 0.0133, 0.0124, 0.0503,
         0.0233, 0.0059, 0.0945, 0.0136, 0.0166, 0.0431, 0.0776, 0.0163, 0.0133,
         0.0126, 0.0110, 0.0157, 0.0183]], grad_fn=<SoftmaxBackward>)