Installing the necessary packages

In [None]:
%pip install transformers datasets torch evaluate

In [None]:
%pip install accelerate -U

In [None]:
%pip install rouge_score

In [None]:
#installing gradio for interface
%pip uninstall gradio -y
%pip install gradio

Loading the CodeXGLUE code-to-text dataset (JavaScript subset)

In [1]:
from datasets import load_dataset

# Fetch the "javascript" configuration of the CodeXGLUE code-to-text dataset
dataset = load_dataset(
    "code_x_glue_ct_code_to_text",
    "javascript",
    trust_remote_code=True,
)
# Bind the train / validation / test splits to their own names
train_data, val_data, test_data = (
    dataset[split_name] for split_name in ("train", "validation", "test")
)

Preprocessing the dataset

In [2]:
import pandas as pd

# Materialise the train and validation splits as pandas DataFrames
tdf, vdf = pd.DataFrame(train_data), pd.DataFrame(val_data)

# Store the number of entries in each row's `code_tokens` list
for frame in (tdf, vdf):
    frame['code_tokens_length'] = frame['code_tokens'].str.len()

# Keep only the rows holding at most 512 code tokens and report how many remain
sampled_tdf = tdf.loc[tdf['code_tokens_length'] <= 512]
print(len(sampled_tdf))

sampled_vdf = vdf.loc[vdf['code_tokens_length'] <= 512]
print(len(sampled_vdf))

Sampling the dataset (If necessary)

In [None]:
# sampled_tdf = sampled_tdf.sample(n=50000, random_state=52)
# len(sampled_tdf)
# sampled_vdf = sampled_vdf.sample(n=3400, random_state=52)
# len(sampled_vdf)

In [None]:
sampled_tdf.head()

Unnamed: 0,id,repo,path,func_name,original_string,language,code,code_tokens,docstring,docstring_tokens,sha,url,code_tokens_length
56772,56772,ariatemplates/ariatemplates,src/aria/utils/css/Units.js,,"function (newUnit, valueInPixels, elem, proper...",javascript,"function (newUnit, valueInPixels, elem, proper...","[function, (, newUnit, ,, valueInPixels, ,, el...",Converts the value of a given CSS property of ...,"[Converts, the, value, of, a, given, CSS, prop...",7ed5d065818ae159bf361c9dfb209b1cf3883c90,https://github.com/ariatemplates/ariatemplates...,31
13711,13711,evaisse/wiresrc,lib/detect.js,,"function (allDependencies, patterns) {\n re...",javascript,"function (allDependencies, patterns) {\n re...","[function, (, allDependencies, ,, patterns, ),...",Excludes dependencies that match any of the pa...,"[Excludes, dependencies, that, match, any, of,...",73b6e8b25bca095d665bf85478429996613f88a3,https://github.com/evaisse/wiresrc/blob/73b6e8...,69
3079,3079,appiphony/appiphony-lightning-js,public/lib/emberjs/ember.js,,function(fullName) {\n Ember.assert('fu...,javascript,function(fullName) {\n Ember.assert('fu...,"[function, (, fullName, ), {, Ember, ., assert...",Unregister a fullName\n\n```javascript\nvar co...,"[Unregister, a, fullName]",704953fdc60b62d3073fc5cace716a201d38b36c,https://github.com/appiphony/appiphony-lightni...,68
15399,15399,SeleniumHQ/selenium,third_party/js/mozmill/shared-modules/addons.js,addonsManager_getSearchFilterByValue,function addonsManager_getSearchFilterByValue(...,javascript,function addonsManager_getSearchFilterByValue(...,"[function, addonsManager_getSearchFilterByValu...",Get the search filter element for the specifie...,"[Get, the, search, filter, element, for, the, ...",38d5e4440b2c866a78a1ccb2a18d9795a1bdeafd,https://github.com/SeleniumHQ/selenium/blob/38...,45
33064,33064,cfpb/AtomicComponent,src/utilities/object-assign/index.js,assign,function assign( destination ) {\n destinatio...,javascript,function assign( destination ) {\n destinatio...,"[function, assign, (, destination, ), {, desti...",Copies properties of all sources to the destin...,"[Copies, properties, of, all, sources, to, the...",f45d7ded6687672c8b701c9910ddfe90c7ede742,https://github.com/cfpb/AtomicComponent/blob/f...,123


In [6]:
#to convert back to Hugging face datasets format
from datasets import Dataset
# preserve_index=False: the filtered frames keep their original (now gappy)
# pandas index, which from_pandas would otherwise store in the dataset as an
# extra `__index_level_0__` column.
train_data = Dataset.from_pandas(sampled_tdf, preserve_index=False)
val_data = Dataset.from_pandas(sampled_vdf, preserve_index=False)

In [10]:
len(train_data)

57871

In [18]:
train_data['code'][0]

'function (newUnit, valueInPixels, elem, property) {\n                return this.__convertFromPixels[newUnit].call(this, valueInPixels, elem, property);\n            }'

In [None]:
train_data['docstring'][0]

In [None]:
val_data.shape

Tokenizing source (code) and target (docstring) 

In [8]:
from torch.utils.data import Dataset
from transformers import RobertaTokenizer
#codebertfull2 was the finetuned model by us 
tokenizer = RobertaTokenizer.from_pretrained("codebertfull2")

#In this class the code and docstring columns are tokenized and it has 3 different parts - input_ids, attention_mask, labels (i.e input_ids of docstring)
class CodeSummaryDataset(Dataset):
    """Map-style dataset that tokenizes (code, docstring) pairs on access.

    Every item is a dict of three 1-D tensors:
      * ``input_ids`` / ``attention_mask``: the tokenized ``"code"`` field,
        truncated/padded to ``max_source_length``;
      * ``labels``: the token ids of the ``"docstring"`` field,
        truncated/padded to ``max_target_length``.

    Args:
        data: Indexable, sized collection whose items are mappings with
            ``"code"`` and ``"docstring"`` keys.
        tokenizer: Callable tokenizer that accepts ``truncation``, ``padding``,
            ``max_length`` and ``return_tensors`` and returns a mapping with
            ``"input_ids"`` and ``"attention_mask"``.
        max_source_length: Fixed length of the encoded code (default 512).
        max_target_length: Fixed length of the encoded docstring (default 128).
        ignore_pad_token_for_loss: When True, padding positions in ``labels``
            are replaced by -100 (the index PyTorch's cross-entropy ignores),
            so padding does not contribute to the loss. Defaults to False to
            keep the historical behaviour; any consumer that decodes the labels
            must map -100 back to the pad id first.
    """

    def __init__(self, data, tokenizer, max_source_length=512,
                 max_target_length=128, ignore_pad_token_for_loss=False):
        self.data = data
        self.tokenizer = tokenizer
        self.max_source_length = max_source_length
        self.max_target_length = max_target_length
        self.ignore_pad_token_for_loss = ignore_pad_token_for_loss

    def __len__(self):
        """Return the number of (code, docstring) pairs."""
        return len(self.data)

    def _encode(self, text, max_length):
        """Tokenize ``text`` into fixed-length PyTorch tensors of shape (1, max_length)."""
        return self.tokenizer(
            text,
            truncation=True,
            padding="max_length",
            max_length=max_length,
            return_tensors="pt",
        )

    def __getitem__(self, idx):
        """Tokenize and return the pair stored at position ``idx``."""
        # Fetch the row once; each lookup may materialise the whole record.
        row = self.data[idx]

        inputs = self._encode(row["code"], self.max_source_length)
        targets = self._encode(row["docstring"], self.max_target_length)

        # squeeze(0) drops only the batch dimension added by return_tensors="pt".
        labels = targets["input_ids"].squeeze(0)
        if self.ignore_pad_token_for_loss:
            labels = labels.masked_fill(labels == self.tokenizer.pad_token_id, -100)

        return {
            "input_ids": inputs["input_ids"].squeeze(0),
            "attention_mask": inputs["attention_mask"].squeeze(0),
            "labels": labels,
        }


In [None]:
# Wrap both splits in the tokenizing dataset class
train_dataset, val_dataset = (
    CodeSummaryDataset(split, tokenizer) for split in (train_data, val_data)
)

# Show one tokenized example as a sanity check
train_dataset[8]

Using GPU if available

In [None]:
import torch

# Report whether CUDA is usable and, if so, the name of device 0
if not torch.cuda.is_available():
    print("GPU is not available")
else:
    print("GPU is available")
    print("GPU Name:", torch.cuda.get_device_name(0))


Training the model

In [None]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments, EncoderDecoderModel, RobertaTokenizer, EarlyStoppingCallback
from datasets import Dataset
import numpy as np
import evaluate
import torch

# Load the tokenizer stored with the previously fine-tuned checkpoint
tokenizer = RobertaTokenizer.from_pretrained("codebertfull2")

# Continue training from the previously fine-tuned encoder-decoder checkpoint
my_model = EncoderDecoderModel.from_pretrained('codebertfull2')

# Move the model to the GPU when one is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
my_model.to(device)

# Reuse RoBERTa's CLS/SEP tokens as the sequence start/end markers and tell the
# model which ids start, end and pad a generated sequence
tokenizer.bos_token = tokenizer.cls_token
tokenizer.eos_token = tokenizer.sep_token
my_model.config.decoder_start_token_id = tokenizer.bos_token_id
my_model.config.eos_token_id = tokenizer.eos_token_id
my_model.config.pad_token_id = tokenizer.pad_token_id

# Training configuration
training_args = Seq2SeqTrainingArguments(
    output_dir="./Codebert_res",                        # directory where checkpoints are written
    eval_strategy="epoch",                              # evaluate once per epoch
    learning_rate=7e-5,                                 # initial learning rate (decayed by the scheduler below)
    per_device_train_batch_size=32,                     # per-device batch size for training
    per_device_eval_batch_size=32,                      # per-device batch size for evaluation
    weight_decay=0.02,                                  # weight-decay coefficient passed to the optimizer
    save_total_limit=5,                                 # maximum number of checkpoints kept on disk
    num_train_epochs=8,                                 # upper bound on epochs (early stopping may end sooner)
    predict_with_generate=True,                         # evaluate with generate() so text metrics can be computed
    lr_scheduler_type="linear",
    logging_dir="./Codebert_log",
    logging_steps=10,
    save_strategy="epoch",
    load_best_model_at_end=True,                       # reload the best checkpoint when training ends
    metric_for_best_model="eval_loss",                 # "best" = lowest evaluation loss
    greater_is_better=False,                           # lower loss indicates a better model
    fp16=True,                                         # mixed-precision training
    gradient_accumulation_steps=3,                     # 32 x 3 = 96 samples per optimizer step and device
    # resume_from_checkpoint=True
)

# Metrics are loaded through `evaluate`: `datasets.load_metric` is deprecated
# and no longer exists in recent `datasets` releases.
rouge = evaluate.load("rouge")
meteor = evaluate.load("meteor")


def compute_metrics(pred):
    """Compute ROUGE-1/2/L and METEOR for one evaluation pass.

    Args:
        pred: Prediction object exposing ``predictions`` (generated token ids)
            and ``label_ids`` (reference token ids).

    Returns:
        dict with the keys ``rouge1``, ``rouge2``, ``rougeL`` and ``meteor``.
    """
    pred_ids = pred.predictions
    if isinstance(pred_ids, tuple):
        pred_ids = pred_ids[0]

    # -100 marks ignored/padded positions and is not a valid token id, so map it
    # back to the pad token before decoding.
    pad_id = tokenizer.pad_token_id
    pred_ids = np.where(pred_ids != -100, pred_ids, pad_id)
    labels_ids = np.where(pred.label_ids != -100, pred.label_ids, pad_id)

    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    labels_str = tokenizer.batch_decode(labels_ids, skip_special_tokens=True)

    # `evaluate`'s ROUGE returns aggregated F-measures as plain floats.
    rouge_output = rouge.compute(predictions=pred_str, references=labels_str)
    meteor_output = meteor.compute(predictions=pred_str, references=labels_str)
    return {
        "rouge1": rouge_output["rouge1"],
        "rouge2": rouge_output["rouge2"],
        "rougeL": rouge_output["rougeL"],
        "meteor": meteor_output["meteor"],
    }


trainer = Seq2SeqTrainer(
    model=my_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]  # stop after 3 evaluations without improvement
)

# Train the model
trainer.train()


# Re-evaluate the final model with the compute_metrics function
eval_results = trainer.evaluate()
print(f"Evaluation results: {eval_results}")

Saving the tokenizer and model

In [None]:
# Write the model and the tokenizer to the same local directory,
# 'codebertfull3', which the later cells load from.
my_model.save_pretrained('codebertfull3')
tokenizer.save_pretrained('codebertfull3')

Calculating the metrics for 100 Test samples

In [None]:
import torch
from transformers import EncoderDecoderModel, RobertaTokenizer
import pandas as pd
from datasets import Dataset, load_metric

# Turn the test split into a DataFrame and keep rows with at most 512 entries in `code_tokens`
testdf = pd.DataFrame(test_data)
testdf['code_tokens_length'] = testdf['code_tokens'].str.len()
sampled_testdf = testdf[testdf['code_tokens_length'] <= 512]
# Draw a fixed, seeded sample of 100 rows for evaluation
sampled_testdf= sampled_testdf.sample(n=100,random_state=62)
print(len(sampled_testdf))
# NOTE: this rebinds `test_data` to the 100-row sample
test_data=Dataset.from_pandas(sampled_testdf)

# Load the tokenizer and the model from the 'codebertfull3' directory (no training happens in this cell)
tokenizer = RobertaTokenizer.from_pretrained("codebertfull3")
model= EncoderDecoderModel.from_pretrained('codebertfull3')
# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

def generate_predictions(model, tokenizer, dataset, device):
    """Generate one summary per example and collect the reference docstrings.

    Args:
        model: Model exposing ``eval()`` and ``generate()``.
        tokenizer: Callable tokenizer whose result supports ``.to(device)`` and
            item access to ``'input_ids'`` / ``'attention_mask'``, and which
            provides ``decode()``.
        dataset: Iterable of mappings with ``'code'`` and ``'docstring'`` keys.
        device: Device the encoded inputs are moved to before generation.

    Returns:
        Tuple ``(predictions, references)`` of two equally long lists of strings.
    """
    model.eval()
    predictions = []
    references = []

    for example in dataset:
        encoded = tokenizer(
            example['code'],
            padding='max_length',
            truncation=True,
            max_length=512,
            return_tensors='pt',
        ).to(device)

        with torch.no_grad():
            # The input is padded to 512 tokens, so pass the attention mask
            # explicitly rather than relying on generate() to infer it.
            outputs = model.generate(
                encoded['input_ids'],
                attention_mask=encoded['attention_mask'],
                max_length=128,
                num_beams=10,
                early_stopping=True,
                length_penalty=6,
            )

        predictions.append(tokenizer.decode(outputs[0], skip_special_tokens=True))
        references.append(example['docstring'])

    return predictions, references

# Generate predictions for the sampled test examples
predictions, references = generate_predictions(model, tokenizer, test_data, device)

# Load the ROUGE and METEOR metrics
# NOTE(review): `load_metric` is deprecated in `datasets` and absent from recent
# releases - confirm the installed version. Migrating to `evaluate.load` would
# change the ROUGE result to plain floats, so every `.mid.*` access on
# `results` (here and in the next cell) would need updating as well.
rouge_metric = load_metric("rouge")
meteor = load_metric("meteor")
# Compute ROUGE and METEOR over all predictions (BLEU is computed further below)
results = rouge_metric.compute(predictions=predictions, references=references)
meteor_output = meteor.compute(predictions=predictions, references=references)

# Print the mid (aggregate) F-measures and the METEOR score
print("ROUGE-1:", results['rouge1'].mid.fmeasure)
print("ROUGE-2:", results['rouge2'].mid.fmeasure)
print("ROUGE-L:", results['rougeL'].mid.fmeasure)
print("meteor:",meteor_output['meteor'])

# BLEU: per-sentence scores with NLTK, corpus-level score with sacreBLEU
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
import sacrebleu

# Smoothed BLEU-4 of every prediction against its single reference, both split on whitespace
smoothing = SmoothingFunction().method1
bleu_scores = [
    sentence_bleu(
        [reference.split()],
        prediction.split(),
        weights=(0.25, 0.25, 0.25, 0.25),
        smoothing_function=smoothing,
    )
    for reference, prediction in zip(references, predictions)
]

# Corpus-level BLEU over all predictions at once
corpus_bleu_score = sacrebleu.corpus_bleu(predictions, [references])

mean_sentence_bleu = sum(bleu_scores) / len(bleu_scores)
print("Average BLEU-4:", mean_sentence_bleu)
print("Corpus BLEU-4:", corpus_bleu_score.score)

In [None]:
print("ROUGE-Lf1:", results['rougeL'].mid.fmeasure)
print("ROUGE-L precision:", results['rougeL'].mid.precision)
print("ROUGE-L recall:", results['rougeL'].mid.recall)


In [None]:
#printing the predicted and reference summaries along with the code of a test sample 
print(test_data['code'][7])
print(f'predicted: {predictions[7]}\n')
print(f'refernce: {references[7]}\n')

Human Evaluation (providing own code)

In [None]:
from transformers import RobertaTokenizer, EncoderDecoderModel
import torch

# Determine device (GPU or CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the tokenizer from the same fine-tuned checkpoint directory as the model
# (the directory receives both via save_pretrained), so the two always match.
# The test-set evaluation cell loads them the same way.
tokenizer = RobertaTokenizer.from_pretrained('codebertfull3')
model = EncoderDecoderModel.from_pretrained('codebertfull3').to(device)

def generate_summary(code):
    """Generate a natural-language summary for a code snippet.

    Uses the module-level ``tokenizer``, ``model`` and ``device``.

    Args:
        code: Source code to summarize. ``None``, empty or whitespace-only input
            yields an empty summary without running the model.

    Returns:
        The decoded summary string.
    """
    # The live interface calls this on every edit, including an empty box.
    if code is None or not code.strip():
        return ""

    # Tokenize the input code and move tensors to the correct device
    inputs = tokenizer(code, return_tensors="pt", padding="max_length", truncation=True, max_length=512).to(device)

    # The input is padded to 512 tokens, so pass the attention mask explicitly
    # rather than relying on generate() to infer it.
    # (max_length and num_beams can be adjusted as needed)
    summary_ids = model.generate(
        inputs.input_ids,
        attention_mask=inputs.attention_mask,
        max_length=128,
        num_beams=8,
        early_stopping=True,
        length_penalty=4,
    )

    # Decode the generated summary
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

# Example input snippet.
# NOTE: despite the variable name, the snippet below is Java source
# (a class with `public static void main`), not JavaScript.
js_code = """
public class prax {
    public static void main(String[] args) {
        int n = 10;
        int a = 0, b = 1;
        System.out.print("Fibonacci Series: " + a + " " + b);
        for (int i = 2; i < n; i++) {
            int next = a + b;
            System.out.print(" " + next);
            a = b;
            b = next;
        }
    }
}
"""

# Generate the summary and print it next to the input
summary = generate_summary(js_code)
print("Original Code:", js_code)
print("Summary:", summary)


Setting up the interface with gradio

In [None]:
import gradio as gr
# Text box that shows the generated summary
output_text=gr.Textbox()
# Interface settings: a plain text box feeds generate_summary and the result
# is re-rendered live as the input changes
interface_settings = {
    "fn": generate_summary,
    "inputs": "textbox",
    "outputs": output_text,
    "title": "Automatic Code Summarizer for Javascript",
    "description": "This app can summarize your javascript code snippets in natural language",
    "live": True,
}
demo = gr.Interface(**interface_settings)
demo.launch(inline=True)

Approach 2 - training on the full dataset in a single run (without growing the training data incrementally)

Preprocessing the dataset

In [None]:
from datasets import Dataset, DatasetDict,load_dataset
import pandas as pd
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments, EncoderDecoderModel, RobertaTokenizer, EarlyStoppingCallback
import torch

# Local directory / hub id the tokenizer and model are both loaded from
model_name = 'NewCodeBertFull3'
# Load the tokenizer
tokenizer = RobertaTokenizer.from_pretrained(model_name)
# Load the encoder-decoder model
model = EncoderDecoderModel.from_pretrained(model_name)
print("Tokenizer and model loaded successfully.")

# Move the model to the GPU when one is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Fetch the "javascript" configuration of the CodeXGLUE code-to-text dataset
dataset = load_dataset("code_x_glue_ct_code_to_text", "javascript",trust_remote_code=True)


def _within_token_limit(frame):
    """Add a `code_tokens_length` column to `frame` (length of the
    tokenizer-encoded `code` field) and return the rows where it is <= 512."""
    frame['code_tokens_length'] = frame['code'].apply(lambda source: len(tokenizer.encode(source)))
    return frame[frame['code_tokens_length'] <= 512]


# One DataFrame per split
train_df = pd.DataFrame(dataset['train'])
val_df = pd.DataFrame(dataset['validation'])
test_df = pd.DataFrame(dataset['test'])

# Drop every example whose encoded code exceeds the 512-token limit
sampled_train_df = _within_token_limit(train_df)
sampled_val_df = _within_token_limit(val_df)
sampled_test_df = _within_token_limit(test_df)

# Report how many rows survived the filter
print(f"Available training samples: {len(sampled_train_df)}")
print(f"Available validation samples: {len(sampled_val_df)}")
print(f"Available test samples: {len(sampled_test_df)}")

# Cap each split at its target size, or at the number of rows that are left
n_train_samples, n_val_samples, n_test_samples = (
    min(cap, len(frame))
    for cap, frame in (
        (60000, sampled_train_df),
        (5000, sampled_val_df),
        (5000, sampled_test_df),
    )
)

# Seeded sampling so the same rows are drawn on every run
sampled_train_df = sampled_train_df.sample(n=n_train_samples, random_state=42)
sampled_val_df = sampled_val_df.sample(n=n_val_samples, random_state=42)
sampled_test_df = sampled_test_df.sample(n=n_test_samples, random_state=42)

# Back to Hugging Face datasets, bundled under the usual split names
train_data = Dataset.from_pandas(sampled_train_df)
val_data = Dataset.from_pandas(sampled_val_df)
test_data = Dataset.from_pandas(sampled_test_df)

split_to_dataset = {'train': train_data, 'validation': val_data, 'test': test_data}
filtered_dataset = DatasetDict(split_to_dataset)


Training the model

In [None]:
def tokenize_function(examples):
    """Tokenize a batch of examples for sequence-to-sequence training.

    Uses the module-level ``tokenizer``.

    Args:
        examples: Batch mapping with ``'code'`` and ``'docstring'`` lists.

    Returns:
        The tokenized code (``input_ids``, ``attention_mask``, each padded or
        truncated to 512 tokens) plus a ``labels`` entry holding the docstring
        token ids padded or truncated to 128, with padding replaced by -100.
    """
    model_inputs = tokenizer(examples['code'], max_length=512, truncation=True, padding="max_length")
    # `text_target=` replaces the deprecated `as_target_tokenizer()` context manager.
    labels = tokenizer(text_target=examples['docstring'], max_length=128, truncation=True, padding="max_length")

    # -100 is the index the cross-entropy loss ignores; masking the padding
    # keeps pad positions from contributing to the training/eval loss.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [token_id if token_id != pad_id else -100 for token_id in sequence]
        for sequence in labels["input_ids"]
    ]
    return model_inputs

# Tokenize every split in batches
tokenized_datasets = filtered_dataset.map(tokenize_function, batched=True)

# Reuse RoBERTa's CLS/SEP tokens as the sequence start/end markers and tell the
# model which ids start, end and pad a generated sequence
tokenizer.bos_token = tokenizer.cls_token
tokenizer.eos_token = tokenizer.sep_token
model.config.decoder_start_token_id = tokenizer.bos_token_id
model.config.eos_token_id = tokenizer.eos_token_id
model.config.pad_token_id = tokenizer.pad_token_id

# Training configuration: evaluate and checkpoint once per epoch
training_args = Seq2SeqTrainingArguments(
    output_dir="./NewCodebert_res",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    save_total_limit=4,
    num_train_epochs=7,
    lr_scheduler_type="linear",
    logging_dir="./NewCodebert_log",
    logging_steps=10,
    save_strategy="epoch",
    load_best_model_at_end=True,          # Automatically load the best model at the end
    metric_for_best_model="eval_loss",    # Use evaluation loss to select the best model
    greater_is_better=False,              # Lower loss indicates a better model
    fp16=True,                            # Enable mixed precision training
    gradient_accumulation_steps=3,
    # resume_from_checkpoint=True
)

# No compute_metrics is passed, so evaluation reports the loss only
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]  # Add early stopping
)

#train the model
trainer.train()

#save the model and tokenizer to the directory the evaluation cells load from
model.save_pretrained('NewCodeBertFull4')
tokenizer.save_pretrained('NewCodeBertFull4')

# Evaluate the model on the validation split
results = trainer.evaluate()
print(results)

Calculating the metrics for 100 test samples

In [None]:
import torch
from transformers import EncoderDecoderModel, RobertaTokenizer
import pandas as pd
from datasets import Dataset, load_metric

# Draw a fixed, seeded sample of 100 rows.
# NOTE: this rebinds `sampled_test_df` (and `test_data` below) to the 100-row sample.
sampled_test_df= sampled_test_df.sample(n=100,random_state=123)
print(len(sampled_test_df))
test_data=Dataset.from_pandas(sampled_test_df)

# Load the tokenizer and the model from the 'NewCodeBertFull4' directory (no training happens in this cell)
tokenizer = RobertaTokenizer.from_pretrained("NewCodeBertFull4")
model= EncoderDecoderModel.from_pretrained('NewCodeBertFull4')
# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

def generate_predictions(model, tokenizer, dataset, device):
    """Generate one summary per example and collect the reference docstrings.

    Args:
        model: Model exposing ``eval()`` and ``generate()``.
        tokenizer: Callable tokenizer whose result supports ``.to(device)`` and
            item access to ``'input_ids'`` / ``'attention_mask'``, and which
            provides ``decode()``.
        dataset: Iterable of mappings with ``'code'`` and ``'docstring'`` keys.
        device: Device the encoded inputs are moved to before generation.

    Returns:
        Tuple ``(predictions, references)`` of two equally long lists of strings.
    """
    model.eval()
    predictions = []
    references = []

    for example in dataset:
        encoded = tokenizer(
            example['code'],
            padding='max_length',
            truncation=True,
            max_length=512,
            return_tensors='pt',
        ).to(device)

        with torch.no_grad():
            # The input is padded to 512 tokens, so pass the attention mask
            # explicitly rather than relying on generate() to infer it.
            outputs = model.generate(
                encoded['input_ids'],
                attention_mask=encoded['attention_mask'],
                max_length=128,
                num_beams=30,
                early_stopping=True,
                length_penalty=1,
            )

        predictions.append(tokenizer.decode(outputs[0], skip_special_tokens=True))
        references.append(example['docstring'])

    return predictions, references

# Generate predictions for the sampled test examples
predictions, references = generate_predictions(model, tokenizer, test_data, device)

# Load the ROUGE and METEOR metrics
# NOTE(review): `load_metric` is deprecated in `datasets` and absent from recent
# releases - confirm the installed version. Migrating to `evaluate.load` would
# change the ROUGE result to plain floats, so the `.mid.fmeasure` accesses
# below would need updating as well.
rouge_metric = load_metric("rouge")
meteor = load_metric("meteor")
# Compute ROUGE and METEOR over all predictions
results = rouge_metric.compute(predictions=predictions, references=references)
meteor_output = meteor.compute(predictions=predictions, references=references)

# Print the mid (aggregate) F-measures and the METEOR score
print("ROUGE-1:", results['rouge1'].mid.fmeasure)
print("ROUGE-2:", results['rouge2'].mid.fmeasure)
print("ROUGE-L:", results['rougeL'].mid.fmeasure)
print("meteor:",meteor_output['meteor'])

# BLEU: per-sentence scores with NLTK, corpus-level score with sacreBLEU
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
import sacrebleu

# Smoothed BLEU-4 of every prediction against its single reference, both split on whitespace
smoothing = SmoothingFunction().method1
bleu_scores = [
    sentence_bleu(
        [reference.split()],
        prediction.split(),
        weights=(0.25, 0.25, 0.25, 0.25),
        smoothing_function=smoothing,
    )
    for reference, prediction in zip(references, predictions)
]

# Corpus-level BLEU over all predictions at once
corpus_bleu_score = sacrebleu.corpus_bleu(predictions, [references])

mean_sentence_bleu = sum(bleu_scores) / len(bleu_scores)
print("Average BLEU-4:", mean_sentence_bleu)
print("Corpus BLEU-4:", corpus_bleu_score.score)

Human Evaluation (providing own code)

In [None]:
from transformers import RobertaTokenizer, EncoderDecoderModel
import torch

# Determine device (GPU or CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the tokenizer and the model from the same 'NewCodeBertFull4' directory;
# the model is moved to the selected device right away
tokenizer = RobertaTokenizer.from_pretrained('NewCodeBertFull4')
model = EncoderDecoderModel.from_pretrained('NewCodeBertFull4').to(device)

def generate_summary2(code):
    """Generate a natural-language summary for a code snippet.

    Uses the module-level ``tokenizer``, ``model`` and ``device``.

    Args:
        code: Source code to summarize. ``None``, empty or whitespace-only input
            yields an empty summary without running the model.

    Returns:
        The decoded summary string.
    """
    # The live interface calls this on every edit, including an empty box.
    if code is None or not code.strip():
        return ""

    # Tokenize the input code and move tensors to the correct device
    inputs = tokenizer(code, return_tensors="pt", padding="max_length", truncation=True, max_length=512).to(device)

    # The input is padded to 512 tokens, so pass the attention mask explicitly
    # rather than relying on generate() to infer it.
    # (max_length and num_beams can be adjusted as needed)
    summary_ids = model.generate(
        inputs.input_ids,
        attention_mask=inputs.attention_mask,
        max_length=128,
        num_beams=8,
        early_stopping=True,
        length_penalty=4,
    )

    # Decode the generated summary
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

# Example input snippet.
# NOTE: despite the variable name, the snippet below is Java source
# (a class with `public static void main`), not JavaScript.
js_code = """
public class fibonacci{
    public static void main(String[] args) {
        int n = 10;
        int a = 0, b = 1;
        System.out.print("Fibonacci Series: " + a + " " + b);
        for (int i = 2; i < n; i++) {
            int next = a + b;
            System.out.print(" " + next);
            a = b;
            b = next;
        }
    }
}
"""

# Generate the summary and print it next to the input
summary = generate_summary2(js_code)
print("Original Code:", js_code)
print("Summary:", summary)


Setting up the interface with gradio

In [None]:
import gradio as gr
# Text box that shows the generated summary
output_text=gr.Textbox()
# Interface settings: a plain text box feeds generate_summary2 and the result
# is re-rendered live as the input changes
interface_settings = {
    "fn": generate_summary2,
    "inputs": "textbox",
    "outputs": output_text,
    "title": "Automatic Code Summarizer for Javascript",
    "description": "This app can summarize your javascript code snippets in natural language",
    "live": True,
}
demo = gr.Interface(**interface_settings)
demo.launch(inline=True)