In [46]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
import tf_keras
from keras import layers
from transformers import BertTokenizerFast, BertConfig, TFBertModel, RobertaTokenizerFast, RobertaConfig, TFRobertaModel

gpu = 0
gpus = tf.config.experimental.list_physical_devices('GPU')
tf.config.experimental.set_visible_devices(gpus[gpu], 'GPU')
tf.config.experimental.set_memory_growth(gpus[gpu], True)

In [47]:
splits = {'java_train': 'data/java_train-00000-of-00001.parquet', 'java_test': 'data/java_test-00000-of-00001.parquet', 'python_train': 'data/python_train-00000-of-00001.parquet', 'python_test': 'data/python_test-00000-of-00001.parquet', 'pharo_train': 'data/pharo_train-00000-of-00001.parquet', 'pharo_test': 'data/pharo_test-00000-of-00001.parquet'}

java_train = pd.read_parquet("hf://datasets/NLBSE/nlbse25-code-comment-classification/" + splits["java_train"])
python_train = pd.read_parquet("hf://datasets/NLBSE/nlbse25-code-comment-classification/" + splits["python_train"])
pharo_train = pd.read_parquet("hf://datasets/NLBSE/nlbse25-code-comment-classification/" + splits["pharo_train"])

java_test = pd.read_parquet("hf://datasets/NLBSE/nlbse25-code-comment-classification/" + splits["java_test"])
python_test = pd.read_parquet("hf://datasets/NLBSE/nlbse25-code-comment-classification/" + splits["python_test"])
pharo_test = pd.read_parquet("hf://datasets/NLBSE/nlbse25-code-comment-classification/" + splits["pharo_test"])

In [48]:
# model_name = 'bert-base-uncased'

# config = BertConfig.from_pretrained(model_name)
# config.output_hidden_states = False

# tokenizer = BertTokenizerFast.from_pretrained(model_name, config=config)
# transformer_model = TFBertModel.from_pretrained(model_name, config=config)

model_name = 'microsoft/codebert-base'

config = RobertaConfig.from_pretrained(model_name)
config.output_hidden_states = False

tokenizer = RobertaTokenizerFast.from_pretrained(model_name, config=config)
transformer_model = TFRobertaModel.from_pretrained(model_name, config=config)

All model checkpoint layers were used when initializing TFRobertaModel.

All the layers of TFRobertaModel were initialized from the model checkpoint at microsoft/codebert-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaModel for predictions without further training.


In [49]:
bert = transformer_model.layers[0]

input_ids = tf_keras.layers.Input(shape=(512,), name='input_ids', dtype='int32')
attention_mask = tf_keras.layers.Input(shape=(512,), name='attention_mask', dtype='int32') 
inputs = {'input_ids': input_ids, 'attention_mask': attention_mask}
# inputs = {'input_ids': input_ids}

# Load the bert model as a layer
bert_model = bert(inputs)[1]
dropout = tf_keras.layers.Dropout(config.hidden_dropout_prob, name='pooled_output')
pooled_output = dropout(bert_model, training=False)

# Additional dense layers
Dense128 = tf_keras.layers.Dense(128, kernel_initializer=tf_keras.initializers.TruncatedNormal(stddev=config.initializer_range), use_bias=False)(pooled_output)
Dense128 = tf_keras.layers.BatchNormalization(center=True, scale=False)(Dense128)
Dense128 = tf_keras.layers.Activation('relu')(Dense128)

javaDense = tf_keras.layers.Dense(64, kernel_initializer=tf_keras.initializers.TruncatedNormal(stddev=config.initializer_range), use_bias=False)(Dense128)
javaDense = tf_keras.layers.BatchNormalization(center=True, scale=False)(javaDense)
javaDense = tf_keras.layers.Activation('relu')(javaDense)

pythonDense = tf_keras.layers.Dense(64, kernel_initializer=tf_keras.initializers.TruncatedNormal(stddev=config.initializer_range), use_bias=False)(Dense128)
pythonDense = tf_keras.layers.BatchNormalization(center=True, scale=False)(pythonDense)
pythonDense = tf_keras.layers.Activation('relu')(pythonDense)

pharoDense = tf_keras.layers.Dense(64, kernel_initializer=tf_keras.initializers.TruncatedNormal(stddev=config.initializer_range), use_bias=False)(Dense128)
pharoDense = tf_keras.layers.BatchNormalization(center=True, scale=False)(pharoDense)
pharoDense = tf_keras.layers.Activation('relu')(pharoDense)

# Build model output
java = tf_keras.layers.Dense(units=7, activation="softmax", kernel_initializer=tf_keras.initializers.TruncatedNormal(stddev=config.initializer_range), name='java')(javaDense)
python = tf_keras.layers.Dense(units=5, activation="softmax", kernel_initializer=tf_keras.initializers.TruncatedNormal(stddev=config.initializer_range), name='python')(pythonDense)
pharo = tf_keras.layers.Dense(units=7, activation="softmax", kernel_initializer=tf_keras.initializers.TruncatedNormal(stddev=config.initializer_range), name='pharo')(pharoDense)
outputs = {'java': java, 'python': python, 'pharo': pharo}

# And combine it all in a model object
model = tf_keras.models.Model(inputs=inputs, outputs=outputs, name='BERT_MultiLabel_MultiClass')

# Take a look at the model
model.summary()

Model: "BERT_MultiLabel_MultiClass"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 attention_mask (InputLayer  [(None, 512)]                0         []                            
 )                                                                                                
                                                                                                  
 input_ids (InputLayer)      [(None, 512)]                0         []                            
                                                                                                  
 roberta (TFRobertaMainLaye  TFBaseModelOutputWithPooli   1246456   ['attention_mask[0][0]',      
 r)                          ngAndCrossAttentions(last_   32         'input_ids[0][0]']           
                             hidden_state=(None, 512, 7                  

In [50]:
def tokenize_data(df, tokenizer):
    inputs = tokenizer(df, padding='max_length', truncation=True, max_length=512)
    return inputs

java_train_inputs = tokenize_data(java_train.combo.tolist(), tokenizer)
python_train_inputs = tokenize_data(python_train.combo.tolist(), tokenizer)
pharo_train_inputs = tokenize_data(pharo_train.combo.tolist(), tokenizer)

java_test_inputs = tokenize_data(java_test.combo.tolist(), tokenizer)
python_test_inputs = tokenize_data(python_test.combo.tolist(), tokenizer)
pharo_test_inputs = tokenize_data(pharo_test.combo.tolist(), tokenizer)

In [51]:
javaLen = len(java_train.labels.tolist())
pythonLen = len(python_train.labels.tolist())
pharoLen = len(pharo_train.labels.tolist())

java_train_labels = {'java': java_train.labels.tolist(), 'python': [[np.int64(0)]*5]*javaLen, 'pharo': [[np.int64(0)]*7]*javaLen}
python_train_labels = {'java': [[np.int64(0)]*7]*pythonLen, 'python': python_train.labels.tolist(), 'pharo': [[np.int64(0)]*7]*pythonLen}
pharo_train_labels = {'java': [[np.int64(0)]*7]*pharoLen, 'python': [[np.int64(0)]*5]*pharoLen, 'pharo': pharo_train.labels.tolist()}

javaLen = len(java_test.labels.tolist())
pythonLen = len(python_test.labels.tolist())
pharoLen = len(pharo_test.labels.tolist())

java_test_labels = {'java': java_test.labels.tolist(), 'python': [[np.int64(0)]*5]*javaLen, 'pharo': [[np.int64(0)]*7]*javaLen}
python_test_labels = {'java': [[np.int64(0)]*7]*pythonLen, 'python': python_test.labels.tolist(), 'pharo': [[np.int64(0)]*7]*pythonLen}
pharo_test_labels = {'java': [[np.int64(0)]*7]*pharoLen, 'python': [[np.int64(0)]*5]*pharoLen, 'pharo': pharo_test.labels.tolist()}

In [52]:
def create_tf_dataset(inputs, labels):
    return tf.data.Dataset.from_tensor_slices((dict(inputs), dict(labels)))

java_train_dataset = create_tf_dataset(java_train_inputs, java_train_labels)
python_train_dataset = create_tf_dataset(python_train_inputs, python_train_labels)
pharo_train_dataset = create_tf_dataset(pharo_train_inputs, pharo_train_labels)

java_test_dataset = create_tf_dataset(java_test_inputs, java_test_labels)
python_test_dataset = create_tf_dataset(python_test_inputs, python_test_labels)
pharo_test_dataset = create_tf_dataset(pharo_test_inputs, pharo_test_labels)

In [53]:
def selectiveCategoricalCrossentropy(y_true, y_pred):
    if keras.backend.all(keras.backend.equal(y_true, 0)):
        return 0.0
    else:
        loss = tf_keras.losses.CategoricalCrossentropy()
        return loss(y_true, y_pred)

In [None]:
optimizer = tf_keras.optimizers.AdamW(learning_rate=5e-5, epsilon=1e-8)
loss = {'java': selectiveCategoricalCrossentropy, 'python': selectiveCategoricalCrossentropy, 'pharo': selectiveCategoricalCrossentropy}

model.layers[2].trainable = False 

model.compile(
    optimizer = optimizer,
    loss = loss)

# Concatenate all datasets
train_dataset = java_train_dataset.concatenate(python_train_dataset).concatenate(pharo_train_dataset)

# Shuffle and batch the training dataset
train_dataset = train_dataset.shuffle(1000).batch(8).prefetch(tf.data.experimental.AUTOTUNE)

# Train the model
model.fit(train_dataset, epochs=5)

In [None]:
optimizer = tf_keras.optimizers.AdamW(learning_rate=5e-5, epsilon=1e-8)
loss = {'java': selectiveCategoricalCrossentropy, 'python': selectiveCategoricalCrossentropy, 'pharo': selectiveCategoricalCrossentropy}

model.layers[2].trainable = True

model.compile(
    optimizer = optimizer,
    loss = loss)

# Concatenate all datasets
train_dataset = java_train_dataset.concatenate(python_train_dataset).concatenate(pharo_train_dataset)

# Shuffle and batch the training dataset
train_dataset = train_dataset.shuffle(1000).batch(8).prefetch(tf.data.experimental.AUTOTUNE)

# Train the model
model.fit(train_dataset, epochs=10)

In [None]:
# def trainModel(model, optimizer, loss, train_dataset, num_epochs):
#     model.compile(
#         optimizer = optimizer,
#         loss = loss)
    
#     # Shuffle and batch the training dataset
#     train_dataset = train_dataset.shuffle(1000).batch(8).prefetch(tf.data.experimental.AUTOTUNE)

#     # Train the model
#     model.fit(train_dataset, epochs=num_epochs)
#     return model

In [None]:
# optimizer = tf_keras.optimizers.AdamW(learning_rate=5e-5, epsilon=1e-8, weight_decay=0.01)
# model.layers[2].trainable = False 

# loss1 = {'java': tf_keras.losses.CategoricalCrossentropy(), 'python': None, 'pharo': None}
# loss2 = {'java': None, 'python': tf_keras.losses.CategoricalCrossentropy(), 'pharo': None}
# loss3 = {'java': None, 'python': None, 'pharo': tf_keras.losses.CategoricalCrossentropy()}
# #loss4 = {'java': selectiveCategoricalCrossentropy, 'python': selectiveCategoricalCrossentropy, 'pharo': selectiveCategoricalCrossentropy}

# model = trainModel(model, optimizer, loss1, java_train_dataset, 5)
# model = trainModel(model, optimizer, loss2, python_train_dataset, 5)
# model = trainModel(model, optimizer, loss3, pharo_train_dataset, 5)

# model.layers[2].trainable = True
# optimizer = tf_keras.optimizers.AdamW(learning_rate=3e-5, epsilon=1e-8, weight_decay=0.01)

# model = trainModel(model, optimizer, loss1, java_train_dataset, 10)
# model = trainModel(model, optimizer, loss2, python_train_dataset, 10)
# model = trainModel(model, optimizer, loss3, pharo_train_dataset, 10)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
def getMetrics(y_true, y_pred):
    size = y_pred.shape[1]
    TP = [0]*size
    TN = [0]*size
    FP = [0]*size
    FN = [0]*size

    results = []

    for i in range(size):
        pred_col = y_pred[:, i].tolist()
        true_col = np.array(y_true.labels.tolist())[:, i].tolist()

        for j in range(len(pred_col)):
            pred_col[j] = int(pred_col[j] > .5)

        for pair in zip(true_col, pred_col):
            print(pair)
            if pair[0] == pair[1]:
                if pair[0] == 1:
                    TP[i] += 1
                else:
                    TN[i] += 1
            else:
                if pair[1] == 1:
                    FP[i] += 1
                else:
                    FN[i] += 1
        precision = TP[i]/(TP[i]+FP[i])
        recall = TP[i]/(TP[i]+FN[i])
        f1 = (2*precision*recall)/(precision+recall)
        results.append([precision, recall, f1])


    return results

In [None]:
java_preds = model.predict(java_test_dataset.batch(8))
python_preds = model.predict(python_test_dataset.batch(8))
pharo_preds = model.predict(pharo_test_dataset.batch(8))



In [None]:
java_results = np.array(getMetrics(java_test, java_preds["java"]))
python_results = np.array(getMetrics(python_test, python_preds["python"]))
pharo_results = np.array(getMetrics(pharo_test, pharo_preds["pharo"]))

print("**********JAVA**********")
print(java_results)
print(sum(java_results[:, 0])/7)
print(sum(java_results[:, 1])/7)
print(sum(java_results[:, 2])/7)
print("**********PYTHON**********")
print(python_results)
print(sum(python_results[:, 0])/5)
print(sum(python_results[:, 1])/5)
print(sum(python_results[:, 2])/5)
print("**********PHARO**********")
print(pharo_results)
print(sum(pharo_results[:, 0])/7)
print(sum(pharo_results[:, 1])/7)
print(sum(pharo_results[:, 2])/7)

(0, 0)
(0, 0)
(0, 0)
(0, 0)
(0, 0)
(1, 0)
(1, 0)
(1, 0)
(0, 0)
(0, 0)
(0, 0)
(0, 0)
(0, 0)
(0, 0)
(0, 0)
(0, 0)
(0, 0)
(0, 0)
(0, 0)
(0, 0)
(0, 0)
(0, 0)
(0, 0)
(0, 0)
(0, 0)
(1, 0)
(1, 0)
(0, 0)
(0, 0)
(1, 0)
(1, 0)
(1, 0)
(0, 0)
(1, 0)
(0, 0)
(1, 0)
(0, 0)
(0, 0)
(0, 0)
(0, 0)
(1, 0)
(1, 0)
(1, 0)
(0, 0)
(0, 0)
(1, 0)
(1, 0)
(1, 0)
(0, 0)
(1, 0)
(1, 0)
(1, 0)
(0, 0)
(0, 0)
(0, 0)
(1, 0)
(1, 0)
(1, 0)
(1, 0)
(1, 0)
(1, 0)
(0, 0)
(1, 0)
(1, 0)
(0, 0)
(0, 0)
(0, 0)
(0, 0)
(0, 0)
(0, 0)
(0, 0)
(0, 0)
(1, 0)
(1, 0)
(0, 0)
(0, 0)
(0, 0)
(1, 0)
(0, 0)
(0, 0)
(0, 0)
(0, 0)
(1, 0)
(0, 0)
(0, 0)
(0, 0)
(1, 0)
(0, 0)
(1, 0)
(1, 0)
(0, 0)
(0, 0)
(0, 0)
(1, 0)
(0, 0)
(1, 0)
(1, 0)
(0, 0)
(1, 0)
(1, 0)
(0, 0)
(0, 0)
(0, 0)
(0, 0)
(0, 0)
(0, 0)
(0, 0)
(0, 0)
(0, 0)
(0, 0)
(0, 0)
(0, 0)
(0, 0)
(0, 0)
(0, 0)
(0, 0)
(1, 0)
(0, 0)
(0, 0)
(1, 0)
(1, 0)
(1, 0)
(0, 0)
(0, 0)
(0, 0)
(0, 0)
(1, 0)
(0, 0)
(0, 0)
(1, 0)
(0, 0)
(0, 0)
(0, 0)
(0, 0)
(0, 0)
(0, 0)
(0, 0)
(0, 0)
(0, 0)
(0, 0)
(0, 0)
(1, 0)
(1, 0)

ZeroDivisionError: division by zero