In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
import tf_keras
from keras import layers
from transformers import RobertaTokenizerFast, RobertaConfig, TFRobertaModel

gpu = 1
gpus = tf.config.experimental.list_physical_devices('GPU')
tf.config.experimental.set_visible_devices(gpus[gpu], 'GPU')
tf.config.experimental.set_memory_growth(gpus[gpu], True)

2024-12-04 17:53:50.823086: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1733360030.836903 1622778 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1733360030.841121 1622778 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-12-04 17:53:50.856806: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


# Create Model

In [None]:
model_name = 'microsoft/codebert-base'
config = RobertaConfig.from_pretrained(model_name)
config.output_hidden_states = False
tokenizer = RobertaTokenizerFast.from_pretrained(model_name, config=config)
transformer_model = TFRobertaModel.from_pretrained(model_name, config=config)

I0000 00:00:1733360038.303328 1622778 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 78777 MB memory:  -> device: 1, name: NVIDIA A100 80GB PCIe, pci bus id: 0000:c1:00.0, compute capability: 8.0
All model checkpoint layers were used when initializing TFRobertaModel.

All the layers of TFRobertaModel were initialized from the model checkpoint at microsoft/codebert-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaModel for predictions without further training.


In [None]:
bert = transformer_model.layers[0]

# Create input layer for tokenized data
input_ids = tf_keras.layers.Input(shape=(512,), name='input_ids', dtype='int32')
attention_mask = tf_keras.layers.Input(shape=(512,), name='attention_mask', dtype='int32') 
inputs = {'input_ids': input_ids, 'attention_mask': attention_mask}
# inputs = {'input_ids': input_ids}

# Load the bert model as a layer
bert_model = bert(inputs)[1]
dropout = tf_keras.layers.Dropout(config.hidden_dropout_prob, name='pooled_output')
pooled_output = dropout(bert_model, training=False)

outputs = tf_keras.layers.Dense(units=7, activation="softmax", kernel_initializer=tf_keras.initializers.TruncatedNormal(stddev=config.initializer_range), name='pharo')(pooled_output)
model = tf_keras.models.Model(inputs=inputs, outputs=outputs, name='BERT_MultiLabel_MultiClass')

model.summary()

Model: "BERT_MultiLabel_MultiClass"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 attention_mask (InputLayer  [(None, 512)]                0         []                            
 )                                                                                                
                                                                                                  
 input_ids (InputLayer)      [(None, 512)]                0         []                            
                                                                                                  
 roberta (TFRobertaMainLaye  TFBaseModelOutputWithPooli   1246456   ['attention_mask[0][0]',      
 r)                          ngAndCrossAttentions(last_   32         'input_ids[0][0]']           
                             hidden_state=(None, 512, 7                  

# Load and process data

In [2]:
splits = {'java_train': 'data/java_train-00000-of-00001.parquet', 'java_test': 'data/java_test-00000-of-00001.parquet', 'python_train': 'data/python_train-00000-of-00001.parquet', 'python_test': 'data/python_test-00000-of-00001.parquet', 'pharo_train': 'data/pharo_train-00000-of-00001.parquet', 'pharo_test': 'data/pharo_test-00000-of-00001.parquet'}

pharo_train = pd.read_parquet("hf://datasets/NLBSE/nlbse25-code-comment-classification/" + splits["pharo_train"])
pharo_test = pd.read_parquet("hf://datasets/NLBSE/nlbse25-code-comment-classification/" + splits["pharo_test"])

In [None]:
def tokenize_data(df, tokenizer):
    # Tokenize data and truncate/pad to 512
    inputs = tokenizer(df, padding='max_length', truncation=True, max_length=512)
    return inputs

# Tokenize all data
pharo_train_inputs = tokenize_data(pharo_train.combo.tolist(), tokenizer)
pharo_test_inputs = tokenize_data(pharo_test.combo.tolist(), tokenizer)

In [None]:
# store labels in list form
pharo_train_labels = pharo_train.labels.tolist()
pharo_test_labels = pharo_test.labels.tolist()

In [None]:
# Create dataset from the preprocessed data
def create_tf_dataset(inputs, labels):
    return tf.data.Dataset.from_tensor_slices((dict(inputs), labels))

pharo_train_dataset = create_tf_dataset(pharo_train_inputs, pharo_train_labels)
pharo_test_dataset = create_tf_dataset(pharo_test_inputs, pharo_test_labels)

# Compile and train

In [None]:
# Compile the model with RoBERTa base frozen
model.layers[2].trainable = False

optimizer = tf_keras.optimizers.AdamW(learning_rate=5e-5, epsilon=1e-8)
loss = tf_keras.losses.CategoricalCrossentropy(from_logits = True)
metric = [tf_keras.metrics.CategoricalAccuracy('accuracy'), tf_keras.metrics.TruePositives(), tf_keras.metrics.TrueNegatives(), tf_keras.metrics.FalsePositives(), tf_keras.metrics.FalseNegatives()]

model.compile(optimizer=optimizer, loss=loss, metrics=[metric])
model.summary()

batch_size = 8

Model: "BERT_MultiLabel_MultiClass"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 attention_mask (InputLayer  [(None, 512)]                0         []                            
 )                                                                                                
                                                                                                  
 input_ids (InputLayer)      [(None, 512)]                0         []                            
                                                                                                  
 roberta (TFRobertaMainLaye  TFBaseModelOutputWithPooli   1246456   ['attention_mask[0][0]',      
 r)                          ngAndCrossAttentions(last_   32         'input_ids[0][0]']           
                             hidden_state=(None, 512, 7                  

In [None]:
# Shuffle and batch the training data use autotune for faster execution
train_dataset = pharo_train_dataset.shuffle(1000).batch(batch_size).prefetch(tf.data.experimental.AUTOTUNE)

model.fit(train_dataset, epochs=5)

Epoch 1/5


  output, from_logits = _get_logits(
I0000 00:00:1733360065.480496 1684088 service.cc:148] XLA service 0x7f24ad1051c0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1733360065.480522 1684088 service.cc:156]   StreamExecutor device (0): NVIDIA A100 80GB PCIe, Compute Capability 8.0
2024-12-04 17:54:25.485751: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:268] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
I0000 00:00:1733360065.500847 1684088 cuda_dnn.cc:529] Loaded cuDNN version 90300
I0000 00:00:1733360065.573224 1684088 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tf_keras.src.callbacks.History at 0x7f25bf162230>

In [None]:
#Set RoBERTa base as trainable and retrain (lower learning rate and more epochs)
model.layers[2].trainable = True

optimizer = tf_keras.optimizers.AdamW(learning_rate=5e-5, epsilon=1e-8)
loss = tf_keras.losses.CategoricalCrossentropy(from_logits = True)
metric = [tf_keras.metrics.CategoricalAccuracy('accuracy'), tf_keras.metrics.TruePositives(), tf_keras.metrics.TrueNegatives(), tf_keras.metrics.FalsePositives(), tf_keras.metrics.FalseNegatives()]

model.compile(optimizer=optimizer, loss=loss, metrics=[metric])

model.fit(train_dataset, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tf_keras.src.callbacks.History at 0x7f25be549a80>

# Calculate metrics (precision, recall, f1-score)

In [None]:
# get results
pharo_test_results = model.predict(pharo_test_dataset.batch(batch_size))



In [None]:
# calculate and output metrics
def getMetrics(y_true, y_pred):
    TP = [0]*7
    TN = [0]*7
    FP = [0]*7
    FN = [0]*7

    results = []

    for i in range(7):
        pred_col = y_pred[:, i].tolist()
        true_col = np.array(y_true.labels.tolist())[:, i].tolist()

        for j in range(len(pred_col)):
            pred_col[j] = int(pred_col[j] > .5)

        for pair in zip(true_col, pred_col):
            if pair[0] == pair[1]:
                if pair[0] == 1:
                    TP[i] += 1
                else:
                    TN[i] += 1
            else:
                if pair[1] == 1:
                    FP[i] += 1
                else:
                    FN[i] += 1
        precision = TP[i]/(TP[i]+FP[i])
        recall = TP[i]/(TP[i]+FN[i])
        f1 = (2*precision*recall)/(precision+recall)
        results.append([precision, recall, f1])


    return results

results = np.array(getMetrics(pharo_test, pharo_test_results))
print(results)
print(sum(results[:, 0])/7)
print(sum(results[:, 1])/7)
print(sum(results[:, 2])/7)

[[0.60526316 0.53488372 0.56790123]
 [0.95495495 0.8907563  0.92173913]
 [0.63461538 0.63461538 0.63461538]
 [1.         0.25       0.4       ]
 [0.92307692 0.8        0.85714286]
 [0.74418605 0.74418605 0.74418605]
 [0.2        0.1        0.13333333]]
0.7231566381505182
0.5649202077968933
0.6084168552294125
