In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
import tf_keras
from keras import layers
from transformers import BertTokenizerFast, BertConfig, TFBertModel, RobertaTokenizerFast, RobertaConfig, TFRobertaModel

gpu = 0
gpus = tf.config.experimental.list_physical_devices('GPU')
tf.config.experimental.set_visible_devices(gpus[gpu], 'GPU')
tf.config.experimental.set_memory_growth(gpus[gpu], True)

2024-12-04 08:30:35.124718: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1733326235.138362 1385758 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1733326235.142634 1385758 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-12-04 08:30:35.158142: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
splits = {'java_train': 'data/java_train-00000-of-00001.parquet', 'java_test': 'data/java_test-00000-of-00001.parquet', 'python_train': 'data/python_train-00000-of-00001.parquet', 'python_test': 'data/python_test-00000-of-00001.parquet', 'pharo_train': 'data/pharo_train-00000-of-00001.parquet', 'pharo_test': 'data/pharo_test-00000-of-00001.parquet'}

java_train = pd.read_parquet("hf://datasets/NLBSE/nlbse25-code-comment-classification/" + splits["java_train"])
python_train = pd.read_parquet("hf://datasets/NLBSE/nlbse25-code-comment-classification/" + splits["python_train"])
pharo_train = pd.read_parquet("hf://datasets/NLBSE/nlbse25-code-comment-classification/" + splits["pharo_train"])

java_test = pd.read_parquet("hf://datasets/NLBSE/nlbse25-code-comment-classification/" + splits["java_test"])
python_test = pd.read_parquet("hf://datasets/NLBSE/nlbse25-code-comment-classification/" + splits["python_test"])
pharo_test = pd.read_parquet("hf://datasets/NLBSE/nlbse25-code-comment-classification/" + splits["pharo_test"])

In [3]:
# model_name = 'bert-base-uncased'

# config = BertConfig.from_pretrained(model_name)
# config.output_hidden_states = False

# tokenizer = BertTokenizerFast.from_pretrained(model_name, config=config)
# transformer_model = TFBertModel.from_pretrained(model_name, config=config)

model_name = 'microsoft/codebert-base'

config = RobertaConfig.from_pretrained(model_name)
config.output_hidden_states = False

tokenizer = RobertaTokenizerFast.from_pretrained(model_name, config=config)
transformer_model = TFRobertaModel.from_pretrained(model_name, config=config)

I0000 00:00:1733326250.473420 1385758 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 77661 MB memory:  -> device: 0, name: NVIDIA A100 80GB PCIe, pci bus id: 0000:81:00.0, compute capability: 8.0
All model checkpoint layers were used when initializing TFRobertaModel.

All the layers of TFRobertaModel were initialized from the model checkpoint at microsoft/codebert-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaModel for predictions without further training.


In [4]:
bert = transformer_model.layers[0]

input_ids = tf_keras.layers.Input(shape=(512,), name='input_ids', dtype='int32')
# attention_mask = Input(shape=(max_length,), name='attention_mask', dtype='int32') 
# inputs = {'input_ids': input_ids, 'attention_mask': attention_mask}
inputs = {'input_ids': input_ids}

# Load the bert model as a layer
bert_model = bert(inputs)[1]
dropout = tf_keras.layers.Dropout(config.hidden_dropout_prob, name='pooled_output')
pooled_output = dropout(bert_model, training=False)

# Additional dense layers
javaDense = tf_keras.layers.Dense(64, kernel_initializer=tf_keras.initializers.TruncatedNormal(stddev=config.initializer_range), use_bias=False)(pooled_output)
javaDense = tf_keras.layers.BatchNormalization(center=True, scale=False)(javaDense)
javaDense = tf_keras.layers.Activation('relu')(javaDense)

pythonDense = tf_keras.layers.Dense(64, kernel_initializer=tf_keras.initializers.TruncatedNormal(stddev=config.initializer_range), use_bias=False)(pooled_output)
pythonDense = tf_keras.layers.BatchNormalization(center=True, scale=False)(pythonDense)
pythonDense = tf_keras.layers.Activation('relu')(pythonDense)

pharoDense = tf_keras.layers.Dense(64, kernel_initializer=tf_keras.initializers.TruncatedNormal(stddev=config.initializer_range), use_bias=False)(pooled_output)
pharoDense = tf_keras.layers.BatchNormalization(center=True, scale=False)(pharoDense)
pharoDense = tf_keras.layers.Activation('relu')(pharoDense)

# Build model output
java = tf_keras.layers.Dense(units=7, kernel_initializer=tf_keras.initializers.TruncatedNormal(stddev=config.initializer_range), name='java')(javaDense)
python = tf_keras.layers.Dense(units=5, kernel_initializer=tf_keras.initializers.TruncatedNormal(stddev=config.initializer_range), name='python')(pythonDense)
pharo = tf_keras.layers.Dense(units=7, kernel_initializer=tf_keras.initializers.TruncatedNormal(stddev=config.initializer_range), name='pharo')(pharoDense)
outputs = {'java': java, 'python': python, 'pharo': pharo}

# And combine it all in a model object
model = tf_keras.models.Model(inputs=inputs, outputs=outputs, name='BERT_MultiLabel_MultiClass')

# Take a look at the model
model.summary()

Model: "BERT_MultiLabel_MultiClass"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_ids (InputLayer)      [(None, 512)]                0         []                            
                                                                                                  
 roberta (TFRobertaMainLaye  TFBaseModelOutputWithPooli   1246456   ['input_ids[0][0]']           
 r)                          ngAndCrossAttentions(last_   32                                      
                             hidden_state=(None, 512, 7                                           
                             68),                                                                 
                              pooler_output=(None, 768)                                           
                             , past_key_values=None, hi                  

In [5]:
def tokenize_data(df, tokenizer):
    inputs = tokenizer(df, padding='max_length', truncation=True, max_length=512)
    return inputs

java_train_inputs = tokenize_data(java_train.combo.tolist(), tokenizer)
python_train_inputs = tokenize_data(python_train.combo.tolist(), tokenizer)
pharo_train_inputs = tokenize_data(pharo_train.combo.tolist(), tokenizer)

java_test_inputs = tokenize_data(java_test.combo.tolist(), tokenizer)
python_test_inputs = tokenize_data(python_test.combo.tolist(), tokenizer)
pharo_test_inputs = tokenize_data(pharo_test.combo.tolist(), tokenizer)

In [6]:
javaLen = len(java_train.labels.tolist())
pythonLen = len(python_train.labels.tolist())
pharoLen = len(pharo_train.labels.tolist())

java_train_labels = {'java': java_train.labels.tolist(), 'python': [[np.int64(0)]*5]*javaLen, 'pharo': [[np.int64(0)]*7]*javaLen}
python_train_labels = {'java': [[np.int64(0)]*7]*pythonLen, 'python': python_train.labels.tolist(), 'pharo': [[np.int64(0)]*7]*pythonLen}
pharo_train_labels = {'java': [[np.int64(0)]*7]*pharoLen, 'python': [[np.int64(0)]*5]*pharoLen, 'pharo': pharo_train.labels.tolist()}

javaLen = len(java_test.labels.tolist())
pythonLen = len(python_test.labels.tolist())
pharoLen = len(pharo_test.labels.tolist())

java_test_labels = {'java': java_test.labels.tolist(), 'python': [[np.int64(0)]*5]*javaLen, 'pharo': [[np.int64(0)]*7]*javaLen}
python_test_labels = {'java': [[np.int64(0)]*7]*pythonLen, 'python': python_test.labels.tolist(), 'pharo': [[np.int64(0)]*7]*pythonLen}
pharo_test_labels = {'java': [[np.int64(0)]*7]*pharoLen, 'python': [[np.int64(0)]*5]*pharoLen, 'pharo': pharo_test.labels.tolist()}

In [7]:
def create_tf_dataset(inputs, labels):
    return tf.data.Dataset.from_tensor_slices((dict(inputs), dict(labels)))

java_train_dataset = create_tf_dataset(java_train_inputs, java_train_labels)
python_train_dataset = create_tf_dataset(python_train_inputs, python_train_labels)
pharo_train_dataset = create_tf_dataset(pharo_train_inputs, pharo_train_labels)

java_test_dataset = create_tf_dataset(java_test_inputs, java_test_labels)
python_test_dataset = create_tf_dataset(python_test_inputs, python_test_labels)
pharo_test_dataset = create_tf_dataset(pharo_test_inputs, pharo_test_labels)

In [8]:
def selectiveLoss(y_true, y_pred):
    if keras.backend.all(keras.backend.equal(y_true, 0)):
        return 0.0
    else:
        loss = tf_keras.losses.CategoricalCrossentropy(from_logits = True)
        return loss(y_true, y_pred)

In [10]:
optimizer = tf_keras.optimizers.AdamW(learning_rate=5e-5, epsilon=1e-8)
#loss = {'java': tf_keras.losses.CategoricalCrossentropy(from_logits = True), 'python': tf_keras.losses.CategoricalCrossentropy(from_logits = True), 'pharo': tf_keras.losses.CategoricalCrossentropy(from_logits = True)}
loss = {'java': selectiveLoss, 'python': selectiveLoss, 'pharo': selectiveLoss}
metric = {'java': tf_keras.metrics.CategoricalAccuracy('accuracy'), 'python': tf_keras.metrics.CategoricalAccuracy('accuracy'), 'pharo': tf_keras.metrics.CategoricalAccuracy('accuracy')}

model.layers[1].trainable = False 

model.compile(
    optimizer = optimizer,
    loss = loss, 
    metrics = metric)

# Concatenate all datasets
train_dataset = java_train_dataset.concatenate(python_train_dataset).concatenate(pharo_train_dataset)

# Shuffle and batch the training dataset
train_dataset = train_dataset.shuffle(1000).batch(8).prefetch(tf.data.experimental.AUTOTUNE)

# Train the model
model.fit(train_dataset, epochs=5)

Epoch 1/5


  inputs = self._flatten_to_reference_inputs(inputs)
I0000 00:00:1733326606.258864 1388464 service.cc:148] XLA service 0x7f34c1b5a6b0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1733326606.258896 1388464 service.cc:156]   StreamExecutor device (0): NVIDIA A100 80GB PCIe, Compute Capability 8.0
2024-12-04 08:36:46.264109: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:268] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
I0000 00:00:1733326606.282503 1388464 cuda_dnn.cc:529] Loaded cuDNN version 90300
I0000 00:00:1733326606.342543 1388464 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tf_keras.src.callbacks.History at 0x7f35ea71c4f0>

In [None]:
optimizer = tf_keras.optimizers.AdamW(learning_rate=5e-5, epsilon=1e-8)
#loss = {'java': tf_keras.losses.CategoricalCrossentropy(from_logits = True), 'python': tf_keras.losses.CategoricalCrossentropy(from_logits = True), 'pharo': tf_keras.losses.CategoricalCrossentropy(from_logits = True)}
loss = {'java': selectiveLoss, 'python': selectiveLoss, 'pharo': selectiveLoss}
metric = {'java': tf_keras.metrics.CategoricalAccuracy('accuracy'), 'python': tf_keras.metrics.CategoricalAccuracy('accuracy'), 'pharo': tf_keras.metrics.CategoricalAccuracy('accuracy')}

model.layers[1].trainable = True

model.compile(
    optimizer = optimizer,
    loss = loss, 
    metrics = metric)

# Concatenate all datasets
train_dataset = java_train_dataset.concatenate(python_train_dataset).concatenate(pharo_train_dataset)

# Shuffle and batch the training dataset
train_dataset = train_dataset.shuffle(1000).batch(8).prefetch(tf.data.experimental.AUTOTUNE)

# Train the model
model.fit(train_dataset, epochs=10)

Epoch 1/10


  inputs = self._flatten_to_reference_inputs(inputs)


   1/1350 [..............................] - ETA: 11:40:41 - loss: 1.5208 - java_loss: 1.5208 - pharo_loss: 0.0000e+00 - python_loss: 0.0000e+00 - java_accuracy: 0.3750 - pharo_accuracy: 0.0000e+00 - python_accuracy: 0.5000

In [93]:
optimizer = tf_keras.optimizers.AdamW(learning_rate=5e-4, epsilon=1e-8, weight_decay=0.01, clipnorm=1.0)
#loss = {'java': tf_keras.losses.CategoricalCrossentropy(from_logits = True), 'python': tf_keras.losses.CategoricalCrossentropy(from_logits = True), 'pharo': tf_keras.losses.CategoricalCrossentropy(from_logits = True)}
loss = {'java': selectiveLoss, 'python': None, 'pharo': None}
metric = {'java': tf_keras.metrics.CategoricalAccuracy('accuracy'), 'python': None, 'pharo': None}

model.layers[1].trainable = False 

model.compile(
    optimizer = optimizer,
    loss = loss, 
    metrics = metric)

# Shuffle and batch the training dataset
train_dataset = java_train_dataset.shuffle(1000).batch(16).prefetch(tf.data.experimental.AUTOTUNE)

# Train the model
model.fit(train_dataset, epochs=5)

Epoch 1/5


  inputs = self._flatten_to_reference_inputs(inputs)


















Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tf_keras.src.callbacks.History at 0x7fdf9e8d2650>

In [94]:
optimizer = tf_keras.optimizers.AdamW(learning_rate=5e-4, epsilon=1e-8, weight_decay=0.01, clipnorm=1.0)
#loss = {'java': tf_keras.losses.CategoricalCrossentropy(from_logits = True), 'python': tf_keras.losses.CategoricalCrossentropy(from_logits = True), 'pharo': tf_keras.losses.CategoricalCrossentropy(from_logits = True)}
loss = {'java': None, 'python': selectiveLoss, 'pharo': None}
metric = {'java': None, 'python': tf_keras.metrics.CategoricalAccuracy('accuracy'), 'pharo': None}

model.layers[1].trainable = False 

model.compile(
    optimizer = optimizer,
    loss = loss, 
    metrics = metric)

# Shuffle and batch the training dataset
train_dataset = python_train_dataset.shuffle(1000).batch(16).prefetch(tf.data.experimental.AUTOTUNE)

# Train the model
model.fit(train_dataset, epochs=5)

Epoch 1/5


  inputs = self._flatten_to_reference_inputs(inputs)


















Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tf_keras.src.callbacks.History at 0x7fdf83c3b160>

In [95]:
optimizer = tf_keras.optimizers.AdamW(learning_rate=5e-4, epsilon=1e-8, weight_decay=0.01, clipnorm=1.0)
#loss = {'java': tf_keras.losses.CategoricalCrossentropy(from_logits = True), 'python': tf_keras.losses.CategoricalCrossentropy(from_logits = True), 'pharo': tf_keras.losses.CategoricalCrossentropy(from_logits = True)}
loss = {'java': None, 'python': None, 'pharo': selectiveLoss}
metric = {'java': None, 'python': None, 'pharo': tf_keras.metrics.CategoricalAccuracy('accuracy')}

model.layers[1].trainable = False 

model.compile(
    optimizer = optimizer,
    loss = loss, 
    metrics = metric)

# Shuffle and batch the training dataset
train_dataset = pharo_train_dataset.shuffle(1000).batch(16).prefetch(tf.data.experimental.AUTOTUNE)

# Train the model
model.fit(train_dataset, epochs=5)

Epoch 1/5
















Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tf_keras.src.callbacks.History at 0x7fdf9f4d5630>

In [97]:
optimizer = tf_keras.optimizers.AdamW(learning_rate=5e-4, epsilon=1e-8, weight_decay=0.01, clipnorm=1.0)
#loss = {'java': tf_keras.losses.CategoricalCrossentropy(from_logits = True), 'python': tf_keras.losses.CategoricalCrossentropy(from_logits = True), 'pharo': tf_keras.losses.CategoricalCrossentropy(from_logits = True)}
loss = {'java': selectiveLoss, 'python': selectiveLoss, 'pharo': selectiveLoss}
metric = {'java': tf_keras.metrics.CategoricalAccuracy('accuracy'), 'python': tf_keras.metrics.CategoricalAccuracy('accuracy'), 'pharo': tf_keras.metrics.CategoricalAccuracy('accuracy')}

model.layers[1].trainable = False

model.compile(
    optimizer = optimizer,
    loss = loss, 
    metrics = metric)

In [11]:
# Evaluate on test datasets
# test_dataset = java_test_dataset.concatenate(python_test_dataset).concatenate(pharo_test_dataset)
# test_acc = model.evaluate(test_dataset.batch(16))
java_results = model.evaluate(java_test_dataset.batch(8))
python_results = model.evaluate(python_test_dataset.batch(8))
pharo_results = model.evaluate(pharo_test_dataset.batch(8))

print(f"Java Test Accuracy: {java_results[-3]}")
print(f"Python Test Accuracy: {python_results[-1]}")
print(f"Pharo Test Accuracy: {pharo_results[-2]}")

Java Test Accuracy: 0.5194202661514282
Python Test Accuracy: 0.32019704580307007
Pharo Test Accuracy: 0.4117647111415863


In [None]:
optimizer =  tf_keras.optimizers.AdamW([
    {'params': model.layers[1].parameters(), 'lr': 3e-5},  # For BERT layers
    {'params': model.layers[3].parameters(), 'lr': 5e-4},  # For classification heads java
    {'params': model.layers[4].parameters(), 'lr': 5e-4},  # For classification heads pharo
    {'params': model.layers[5].parameters(), 'lr': 5e-4}  # For classification heads python
], eps=1e-8, weight_decay=0.01, clipnorm=1.0)
#loss = {'java': tf_keras.losses.CategoricalCrossentropy(from_logits = True), 'python': tf_keras.losses.CategoricalCrossentropy(from_logits = True), 'pharo': tf_keras.losses.CategoricalCrossentropy(from_logits = True)}
loss = {'java': selectiveLoss, 'python': selectiveLoss, 'pharo': selectiveLoss}
metric = {'java': tf_keras.metrics.CategoricalAccuracy('accuracy'), 'python': tf_keras.metrics.CategoricalAccuracy('accuracy'), 'pharo': tf_keras.metrics.CategoricalAccuracy('accuracy')}

model.layers[1].trainable = True

model.compile(
    optimizer = optimizer,
    loss = loss, 
    metrics = metric)

# Concatenate all datasets
train_dataset = java_train_dataset.concatenate(python_train_dataset).concatenate(pharo_train_dataset)

# Shuffle and batch the training dataset
train_dataset = train_dataset.shuffle(1000).batch(16).prefetch(tf.data.experimental.AUTOTUNE)

# Train the model
model.fit(train_dataset, epochs=10)

In [None]:
# Evaluate on test datasets
test_dataset = java_test_dataset.concatenate(python_test_dataset).concatenate(pharo_test_dataset)
test_acc = model.evaluate(test_dataset.batch(16))

print(f"Java Test Accuracy: {test_acc[-3]}")
print(f"Python Test Accuracy: {test_acc[-2]}")
print(f"Pharo Test Accuracy: {test_acc[-1]}")