# Finetune model

In [1]:
from transformers import RobertaTokenizer, TFBertForMaskedLM
import numpy as np
import json
import os

## Use this cell to check whether CUDA cores are correctly configured and available.
Num GPUS should be 1

In [2]:
import tensorflow as tf

print("Num GPUs Available: ", len(tf.config.list_physical_devices("GPU")))

Num GPUs Available:  1


## Select a model with one of the following strings

In [None]:
model_string = "microsoft/codebert-base-mlm"
#model_string = "microsoft/graphcodebert-base"

## Get tokenizer and model based on selected model

In [3]:
tokenizer = RobertaTokenizer.from_pretrained(model_string)
model = TFBertForMaskedLM.from_pretrained(model_string)

You are using a model of type roberta to instantiate a model of type bert. This is not supported for all configurations of models and can yield errors.
Some layers from the model checkpoint at microsoft/graphcodebert-base were not used when initializing TFBertForMaskedLM: ['roberta', 'lm_head']
- This IS expected if you are initializing TFBertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFBertForMaskedLM were not initialized from the model checkpoint at microsoft/graphcodebert-base and are newly initialized: ['bert', 'mlm___cls']
You should probably TRAIN this model on a down-stream task to be able to u

## Define dataset based on given json of paths

In [4]:
dataset = json.loads(open("data/paths.json").read())

## Select single or multipath AST representation with one of the following lines

In [None]:
text_lst = [method + "(...) " + dataset[method][0] for method in dataset] # Single path
#text_lst = [method + "(...) " + " | ".join(dataset[method]) for method in dataset] # Multiple paths

## Get number of methods and check that dataset format is correct

In [1]:
len_lst = [len(line.split(" ")) for line in text_lst]
print(text_lst[0])
print(max(len_lst))
print(len(text_lst))

NameError: name 'text_lst' is not defined

## Split the dataset into training and testing

In [6]:
np.random.seed(42)
np.random.shuffle(text_lst)
train = text_lst[:int(len(text_lst)*0.8)]
test = text_lst[int(len(text_lst)*0.8):]
print(train[0])
print(test[0])

testInvalidHostConfiguration(...) program -> local_variable_declaration -> ResolveEndpointFailedException | program -> local_variable_declaration -> variable_declarator -> cause | program -> local_variable_declaration -> variable_declarator -> method_invocation -> assertIsInstanceOf | program -> local_variable_declaration -> variable_declarator -> method_invocation -> argument_list -> class_literal -> ResolveEndpointFailedException | program -> local_variable_declaration -> variable_declarator -> method_invocation -> argument_list -> method_invocation -> exception | program -> local_variable_declaration -> variable_declarator -> method_invocation -> argument_list -> method_invocation -> getCause | program -> local_variable_declaration -> variable_declarator -> method_invocation -> argument_list -> method_invocation -> argument_list | program -> expression_statement -> method_invocation -> assertTrue | program -> expression_statement -> method_invocation -> argument_list -> method_invoc

## Tokenize inputs
dict_keys should be 'input_ids' and 'attention_mask'

In [7]:
inputs = tokenizer(
    train, max_length=64, truncation=True, padding="max_length", return_tensors="tf"
)


print(inputs.keys())

dict_keys(['input_ids', 'attention_mask'])


## Tokenize inputs
dict_keys should be 'input_ids', 'attention_mask' and 'labels'

In [8]:
inputs["labels"] = tokenizer(
    train, max_length=64, truncation=True, padding="max_length", return_tensors="tf"
)["input_ids"]
print(inputs.keys())

dict_keys(['input_ids', 'attention_mask', 'labels'])


## Mask function name in 'input_ids'

In [9]:
inp_ids = []
for inp in tokenizer(
    train, max_length=64, truncation=True, padding="max_length", return_tensors="tf"
)["input_ids"].numpy():
    i = 1
    while inp[i] != 1640:
        inp[i] = tokenizer.mask_token_id
        i += 1
    inp_ids.append(inp)
inp_ids = tf.convert_to_tensor(inp_ids)
inputs["input_ids"] = inp_ids
print(inputs["input_ids"][0])
print(inputs["labels"][0])
print(tokenizer.decode(inputs["input_ids"][0]))

tf.Tensor(
[    0 50264 50264 50264 50264  1640 41137   586 43839   400  1215 48123
  1215 32639 36466 43839  4787 18224 18547  2300   597 13355 48847  1721
   586 43839   400  1215 48123  1215 32639 36466 43839 15594  1215 32639
   271  2630 43839  1303  1721   586 43839   400  1215 48123  1215 32639
 36466 43839 15594  1215 32639   271  2630 43839  5448  1215 24701 15644
 43839 18088  6209     2], shape=(64,), dtype=int32)
tf.Tensor(
[    0 21959 49695 40534 49602  1640 41137   586 43839   400  1215 48123
  1215 32639 36466 43839  4787 18224 18547  2300   597 13355 48847  1721
   586 43839   400  1215 48123  1215 32639 36466 43839 15594  1215 32639
   271  2630 43839  1303  1721   586 43839   400  1215 48123  1215 32639
 36466 43839 15594  1215 32639   271  2630 43839  5448  1215 24701 15644
 43839 18088  6209     2], shape=(64,), dtype=int32)
<s><mask><mask><mask><mask>(...) program -> local_variable_declaration -> ResolveEndpointFailedException | program -> local_variable_declarati

## Set checkpoint directory and callbacks

In [10]:
checkpoint_path = "trained_models/2024-03-20.ckpt"
checkpoint_dir = os.path.dirname(checkpoint_path)

checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_path, save_weights_only=True, verbose=1
)

## Finetune model

In [11]:
#Source:https://stackoverflow.com/questions/38445982/how-to-log-keras-loss-output-to-a-file
from keras.callbacks import CSVLogger
csv_logger = CSVLogger('logs.csv', append=False, separator=',')
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.0001),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=["accuracy"],
)
history = model.fit(
    [inputs.input_ids, inputs.attention_mask],
    inputs.labels,
    verbose=1,
    batch_size=8,
    epochs=10,
    callbacks=[checkpoint_callback,csv_logger],
    
)

Epoch 1/10
Epoch 1: saving model to trained_models\2024-03-24.ckpt
Epoch 2/10
Epoch 2: saving model to trained_models\2024-03-24.ckpt
Epoch 3/10
Epoch 3: saving model to trained_models\2024-03-24.ckpt
Epoch 4/10
Epoch 4: saving model to trained_models\2024-03-24.ckpt
Epoch 5/10
Epoch 5: saving model to trained_models\2024-03-24.ckpt
Epoch 6/10
Epoch 6: saving model to trained_models\2024-03-24.ckpt
Epoch 7/10
Epoch 7: saving model to trained_models\2024-03-24.ckpt
Epoch 8/10
Epoch 8: saving model to trained_models\2024-03-24.ckpt
Epoch 9/10
Epoch 9: saving model to trained_models\2024-03-24.ckpt
Epoch 10/10
Epoch 10: saving model to trained_models\2024-03-24.ckpt


## Plot loss and accuracy

In [None]:
import matplotlib.pyplot as plt
from tools import logs_to_list
accuracy,loss= logs_to_list("logs.csv")
loss_epochs = range(1, len(accuracy) + 1)
accuracy_epochs = range(1, len(accuracy) + 1)
print(accuracy)
print(loss)
# figure for loss
plt.plot(loss_epochs, loss, label='Loss', color='red')
plt.title('Training Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()
# figure for accuracy
plt.plot(accuracy_epochs, accuracy, label='Accuracy', color='red')
plt.title('Accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()
plt.show()

## Utility cell to test fine tuned model

In [13]:
query = "<mask><mask><mask>(...) program (return_statement (XPath))"
inp = tokenizer(query, return_tensors="tf")
mask_loc = np.where(inp.input_ids.numpy()[0] == tokenizer.mask_token_id)[0].tolist()
out = model(inp).logits[0].numpy()
predicted_tokens = np.argmax(out[mask_loc], axis=1).tolist()
print(tokenizer.decode(predicted_tokens))

createCPath


## Push model to hugging face

In [14]:
from huggingface_hub import login

login(token="TOKEN_HERE", add_to_git_credential= True)

model.push_to_hub("MODEL_NAME")
tokenizer.push_to_hub("MODEL_NAME")

Token is valid (permission: write).
Your token has been saved in your configured git credential helpers (manager).
Your token has been saved to C:\Users\jaspe\.cache\huggingface\token
Login successful


tf_model.h5:   0%|          | 0.00/655M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/JasperGrant/ASTBERT-gb-5k-methods-multipath/commit/5aa1c434b458c82ce62ad08c6b7422c083f52e66', commit_message='Upload tokenizer', commit_description='', oid='5aa1c434b458c82ce62ad08c6b7422c083f52e66', pr_url=None, pr_revision=None, pr_num=None)