In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
import tf_keras
from keras import layers
from transformers import RobertaTokenizerFast, RobertaConfig, TFRobertaModel

# Pin TensorFlow to a single physical GPU and enable on-demand memory growth for it.
gpu = 1  # index of the preferred GPU
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    # Fall back to the first GPU when the preferred index does not exist, instead of
    # raising IndexError on machines with fewer GPUs. With no GPU at all, nothing is
    # configured and TensorFlow runs on the CPU.
    if gpu >= len(gpus):
        gpu = 0
    tf.config.experimental.set_visible_devices(gpus[gpu], 'GPU')
    tf.config.experimental.set_memory_growth(gpus[gpu], True)

# Create Model

In [None]:
# Pretrained checkpoint shared by the config, the tokenizer and the encoder
model_name = 'microsoft/codebert-base'

# Load the checkpoint's config with hidden-state outputs switched off
config = RobertaConfig.from_pretrained(model_name, output_hidden_states=False)

tokenizer = RobertaTokenizerFast.from_pretrained(model_name, config=config)
transformer_model = TFRobertaModel.from_pretrained(model_name, config=config)

All model checkpoint layers were used when initializing TFRobertaModel.

All the layers of TFRobertaModel were initialized from the model checkpoint at microsoft/codebert-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaModel for predictions without further training.


In [None]:
# Sequence length expected by the input layers and width of the classification head
MAX_LEN = 512
NUM_CLASSES = 7

# Take the encoder (first layer of the loaded Transformers model) to reuse it as a Keras layer
bert = transformer_model.layers[0]

# Symbolic inputs for the tokenized data
input_ids = tf_keras.layers.Input(shape=(MAX_LEN,), name='input_ids', dtype='int32')
attention_mask = tf_keras.layers.Input(shape=(MAX_LEN,), name='attention_mask', dtype='int32')
inputs = {'input_ids': input_ids, 'attention_mask': attention_mask}

# Run the encoder; element 1 of its output is the pooled representation
# (listed as `pooler_output` in the model summary)
bert_model = bert(inputs)[1]

# NOTE(review): training=False keeps this dropout inactive even during fit() —
# confirm that is intended.
dropout = tf_keras.layers.Dropout(config.hidden_dropout_prob, name='pooled_output')
pooled_output = dropout(bert_model, training=False)

# Softmax classification head.
# NOTE(review): softmax scores sum to 1, so at most one class can exceed the 0.5
# threshold that getMetrics applies per class. If samples can carry several labels,
# a sigmoid head with binary cross-entropy would be needed — confirm.
classifier = tf_keras.layers.Dense(
    units=NUM_CLASSES,
    activation="softmax",
    kernel_initializer=tf_keras.initializers.TruncatedNormal(stddev=config.initializer_range),
    name='java',
)
outputs = classifier(pooled_output)

model = tf_keras.models.Model(inputs=inputs, outputs=outputs, name='BERT_MultiLabel_MultiClass')

model.summary()

Model: "BERT_MultiLabel_MultiClass"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 attention_mask (InputLayer  [(None, 512)]                0         []                            
 )                                                                                                
                                                                                                  
 input_ids (InputLayer)      [(None, 512)]                0         []                            
                                                                                                  
 roberta (TFRobertaMainLaye  TFBaseModelOutputWithPooli   1246456   ['attention_mask[0][0]',      
 r)                          ngAndCrossAttentions(last_   32         'input_ids[0][0]']           
                             hidden_state=(None, 512, 7                  

# Load and process data

In [None]:
# Parquet file of every available split, keyed by "<language>_<train|test>"
splits = {
    'java_train': 'data/java_train-00000-of-00001.parquet',
    'java_test': 'data/java_test-00000-of-00001.parquet',
    'python_train': 'data/python_train-00000-of-00001.parquet',
    'python_test': 'data/python_test-00000-of-00001.parquet',
    'pharo_train': 'data/pharo_train-00000-of-00001.parquet',
    'pharo_test': 'data/pharo_test-00000-of-00001.parquet',
}

# Every split lives under the same Hugging Face dataset repository
dataset_root = "hf://datasets/NLBSE/nlbse25-code-comment-classification/"

java_train = pd.read_parquet(dataset_root + splits["java_train"])
java_test = pd.read_parquet(dataset_root + splits["java_test"])

In [None]:
def tokenize_data(df, tokenizer):
    """Tokenize `df` with `tokenizer`, padding or truncating every entry to 512 tokens.

    Args:
        df: the texts to tokenize; passed to the tokenizer unchanged (this
            notebook passes a list of strings).
        tokenizer: callable tokenizer accepting `padding`, `truncation` and
            `max_length` keyword arguments.

    Returns:
        Whatever `tokenizer` returns for the given texts.
    """
    return tokenizer(df, max_length=512, truncation=True, padding='max_length')

# Tokenize the `combo` column of both Java splits
java_train_inputs = tokenize_data(java_train["combo"].tolist(), tokenizer)
java_test_inputs = tokenize_data(java_test["combo"].tolist(), tokenizer)

In [None]:
# Pull the `labels` column of each split out as a plain Python list
java_train_labels = java_train["labels"].tolist()
java_test_labels = java_test["labels"].tolist()

In [None]:
# Build a tf.data.Dataset from the preprocessed data
def create_tf_dataset(inputs, labels):
    """Slice tokenizer output and labels into a dataset of (features, label) elements.

    Args:
        inputs: mapping-like tokenizer output; converted to a plain dict so its
            keys become the feature names.
        labels: one label entry per sample, sliced in step with `inputs`.

    Returns:
        A `tf.data.Dataset` yielding `(feature_dict, label)` pairs.
    """
    features = dict(inputs)
    return tf.data.Dataset.from_tensor_slices((features, labels))

# One dataset per Java split, pairing tokenized inputs with their labels
java_train_dataset = create_tf_dataset(inputs=java_train_inputs, labels=java_train_labels)
java_test_dataset = create_tf_dataset(inputs=java_test_inputs, labels=java_test_labels)

# Compile and train

In [None]:
# Stage 1: compile the model with the RoBERTa encoder frozen, so only the head trains.
# The encoder is looked up by its layer name ("roberta" in model.summary()) rather than
# by position, so the lookup does not depend on the order of the input layers.
model.get_layer('roberta').trainable = False

optimizer = tf_keras.optimizers.AdamW(learning_rate=5e-4, epsilon=1e-8)
# The output layer already applies softmax, so the loss receives probabilities, not
# logits. Declaring from_logits=True here made Keras warn about the mismatch.
loss = tf_keras.losses.CategoricalCrossentropy(from_logits=False)
# Order matters: the evaluation cell reads the last four results by position
# (true positives, true negatives, false positives, false negatives).
metric = [
    tf_keras.metrics.CategoricalAccuracy('accuracy'),
    tf_keras.metrics.TruePositives(),
    tf_keras.metrics.TrueNegatives(),
    tf_keras.metrics.FalsePositives(),
    tf_keras.metrics.FalseNegatives(),
]

# `metric` is already a list; pass it directly instead of nesting it in another list
model.compile(optimizer=optimizer, loss=loss, metrics=metric)
model.summary()

batch_size = 8

Model: "BERT_MultiLabel_MultiClass"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 attention_mask (InputLayer  [(None, 512)]                0         []                            
 )                                                                                                
                                                                                                  
 input_ids (InputLayer)      [(None, 512)]                0         []                            
                                                                                                  


 roberta (TFRobertaMainLaye  TFBaseModelOutputWithPooli   1246456   ['attention_mask[0][0]',      
 r)                          ngAndCrossAttentions(last_   32         'input_ids[0][0]']           
                             hidden_state=(None, 512, 7                                           
                             68),                                                                 
                              pooler_output=(None, 768)                                           
                             , past_key_values=None, hi                                           
                             dden_states=None, attentio                                           
                             ns=None, cross_attentions=                                           
                             None)                                                                
                                                                                                  
 pooled_ou

In [None]:
# Shuffle, batch and prefetch the training data.
# The shuffle buffer spans the whole training set: a buffer smaller than the dataset
# only shuffles within a sliding window, so any ordering in the source file would
# leak into the batches.
train_dataset = (
    java_train_dataset
    .shuffle(len(java_train_labels))
    .batch(batch_size)
    .prefetch(tf.data.AUTOTUNE)  # let tf.data pick the prefetch depth
)

model.fit(train_dataset, epochs=5)

Epoch 1/5


  output, from_logits = _get_logits(


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tf_keras.src.callbacks.History at 0x7f6c7d92c4c0>

In [None]:
# Evaluate on test data.
# Stored under its own name so the later prediction cell, which assigns
# `java_test_results`, does not reuse one name for two different kinds of result.
java_test_eval = model.evaluate(java_test_dataset.batch(batch_size))

# evaluate() returns the loss followed by the metrics in compile order, so the last
# four entries are true positives, true negatives, false positives, false negatives.
TP, TN, FP, FN = java_test_eval[-4:]

# Report 0.0 instead of raising ZeroDivisionError when a denominator is empty
# (e.g. the model predicted no positives at all).
precision = TP / (TP + FP) if (TP + FP) > 0 else 0.0
recall = TP / (TP + FN) if (TP + FN) > 0 else 0.0
f1 = (2 * precision * recall) / (precision + recall) if (precision + recall) > 0 else 0.0

print(f"Java Test precision: {precision}")
print(f"Java Test recall: {recall}")
print(f"Java Test f1: {f1}")

Java Test precision: 0.7151162790697675
Java Test recall: 0.21243523316062177
Java Test f1: 0.32756324900133155


In [None]:
# Stage 2: set the RoBERTa encoder as trainable and retrain end to end
# (lower learning rate and more epochs). The encoder is looked up by its layer name
# ("roberta" in model.summary()) rather than by position.
model.get_layer('roberta').trainable = True

optimizer = tf_keras.optimizers.AdamW(learning_rate=5e-5, epsilon=1e-8)
# The output layer already applies softmax, so the loss receives probabilities, not
# logits. Declaring from_logits=True here made Keras warn about the mismatch.
loss = tf_keras.losses.CategoricalCrossentropy(from_logits=False)
metric = [
    tf_keras.metrics.CategoricalAccuracy('accuracy'),
    tf_keras.metrics.TruePositives(),
    tf_keras.metrics.TrueNegatives(),
    tf_keras.metrics.FalsePositives(),
    tf_keras.metrics.FalseNegatives(),
]

# Recompile so the change to `trainable` takes effect; `metric` is already a list,
# so it is passed directly instead of being nested in another list.
model.compile(optimizer=optimizer, loss=loss, metrics=metric)

model.fit(train_dataset, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tf_keras.src.callbacks.History at 0x7f6b4cf49f90>

# Calculate metrics (precision, recall, f1-score)

In [None]:
# Predict class scores for the batched test split
java_test_batches = java_test_dataset.batch(batch_size)
java_test_results = model.predict(java_test_batches)



In [None]:
# calculate and output metrics
def getMetrics(y_true, y_pred, threshold=.5):
    """Compute per-class precision, recall and F1 from predicted scores.

    Args:
        y_true: object with a `labels` attribute whose `.tolist()` yields one
            label vector per sample (this notebook passes the test DataFrame).
        y_pred: 2-D array-like of scores, one row per sample and one column per class.
        threshold: a score strictly greater than this counts as a positive prediction.

    Returns:
        A list with one `[precision, recall, f1]` entry per column of `y_pred`.
        A ratio whose denominator is zero (e.g. a class that is never predicted)
        is reported as 0.0 instead of raising ZeroDivisionError.

    Raises:
        ValueError: if `y_pred` is not 2-D or its shape differs from the label matrix.
    """
    scores = np.asarray(y_pred)
    # Build the label matrix once instead of once per class
    true = np.asarray(y_true.labels.tolist())
    if scores.ndim != 2 or true.shape != scores.shape:
        raise ValueError(
            f"labels {true.shape} and predictions {scores.shape} must be 2-D arrays of the same shape"
        )

    pred = (scores > threshold).astype(int)

    # Column-wise confusion counts. A cell where prediction and label agree on 1 is a
    # true positive; where they disagree, the prediction decides between false
    # positive (predicted 1) and false negative (predicted 0).
    tp = ((pred == 1) & (true == 1)).sum(axis=0).tolist()
    fp = ((pred == 1) & (true != 1)).sum(axis=0).tolist()
    fn = ((pred == 0) & (true != 0)).sum(axis=0).tolist()

    results = []
    for TP, FP, FN in zip(tp, fp, fn):
        precision = TP / (TP + FP) if (TP + FP) else 0.0
        recall = TP / (TP + FN) if (TP + FN) else 0.0
        f1 = (2 * precision * recall) / (precision + recall) if (precision + recall) else 0.0
        results.append([precision, recall, f1])

    return results

results = np.array(getMetrics(java_test, java_test_results))
print(results)

# Macro average: unweighted mean over the classes, for however many classes were
# scored (no hard-coded class count).
macro_precision, macro_recall, macro_f1 = results.mean(axis=0)
print(macro_precision)
print(macro_recall)
print(macro_f1)


[[0.87241003 0.89686099 0.88446656]
 [1.         1.         1.        ]
 [0.31707317 0.38235294 0.34666667]
 [0.95264624 0.79350348 0.86582278]
 [0.77678571 0.94565217 0.85294118]
 [0.90909091 0.66666667 0.76923077]
 [0.44444444 0.23529412 0.30769231]]
0.7532072158317813
0.7029043380326782
0.7181171801398294
