## Fine-Tuning PLBart for Python code generation from English algorithmic statements

In [1]:
# Install required packages
!pip install pandas
!pip install transformers datasets sentencepiece
!pip install huggingface_hub

Collecting datasets
  Downloading datasets-3.0.1-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.17-py310-none-any.whl.metadata (7.2 kB)
INFO: pip is looking at multiple versions of multiprocess to determine which version is compatible with other requirements. This could take a while.
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.1-py3-none-any.whl (471 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m471.6/471.6 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m4.8 MB/s[0m eta [36m0:00:0

In [2]:
# Imports
import json
import pandas as pd
from datasets import Dataset
from transformers import PLBartTokenizer, PLBartForConditionalGeneration, Seq2SeqTrainer, Seq2SeqTrainingArguments

In [3]:
# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

# Load the JSON file
import pandas as pd
import json

dataset = '/content/drive/MyDrive/itrl/standardized_dataset.json'
with open(dataset, 'r') as f:
    data = json.load(f)

# Convert to pandas DataFrame
df = pd.DataFrame(data)

# Display the first few records to check
df.head()

Mounted at /content/drive


Unnamed: 0,text,code
0,Assign number two to variable A.,A = 2
1,Assign number two to variable A.,A = 2
2,Store number two in variable A,A = 2
3,Declare the variable A and store the number tw...,A = 2
4,Assign number five to variable B.,B = 5


### Preprocessing the Dataset

In [None]:
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict
from transformers import PLBartTokenizer, PLBartForConditionalGeneration

# Split data into train and valid sets
train_df, valid_df = train_test_split(df, test_size=0.2, random_state=42)

# Convert to Hugging Face Dataset format
train_dataset = Dataset.from_pandas(train_df)
valid_dataset = Dataset.from_pandas(valid_df)

# Load the PLBart tokenizer and model
tokenizer = PLBartTokenizer.from_pretrained("uclanlp/plbart-python-en_XX", src_lang="en_XX", tgt_lang="python")

# Add the new token for newline and indentation to the tokenizer
tokenizer.add_tokens('__newline_indent__')

# Function to combine newline and indentation into one token
def preprocess_code(example):
    # Combine newline and indentation
    example['code'] = example['code'].replace('\n    ', '__newline_indent__')
    return example

# Apply preprocessing to standardize indentation and tokenize the data
train_dataset = train_dataset.map(preprocess_code)
valid_dataset = valid_dataset.map(preprocess_code)

# Update the model to handle the new token
model = PLBartForConditionalGeneration.from_pretrained("uclanlp/plbart-python-en_XX")
model.resize_token_embeddings(len(tokenizer))

# Define a function to tokenize the data (with the assumption that newline+indent has been replaced by <newline_indent>)
def tokenize_data(example):
    inputs = tokenizer(example['text'], max_length=256, truncation=True, padding='max_length')
    outputs = tokenizer(example['code'], max_length=256, truncation=True, padding='max_length')
    inputs['labels'] = outputs['input_ids']  # Set the labels for the training task
    return inputs

# Tokenize the training dataset
tokenized_train_dataset = train_dataset.map(tokenize_data, batched=True)

# Tokenize the validation dataset
tokenized_valid_dataset = valid_dataset.map(tokenize_data, batched=True)

# Combine into a DatasetDict
dataset = DatasetDict({
    'train': tokenized_train_dataset,
    'valid': tokenized_valid_dataset
})



Map:   0%|          | 0/998 [00:00<?, ? examples/s]

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

Map:   0%|          | 0/998 [00:00<?, ? examples/s]

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

### Fine-tuning the plbart-python-en_XX model

In [None]:
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=10,
    report_to="none",
    predict_with_generate=True,
    logging_dir='./logs',
    logging_steps=10,
)

# Create a Seq2SeqTrainer instance
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_valid_dataset,
)

trainer.train()
trainer.evaluate()

model.save_pretrained("./saved_model")
tokenizer.save_pretrained("./saved_model")
model.save_pretrained("/content/drive/MyDrive/itrl/plbart_algo2code")
tokenizer.save_pretrained("/content/drive/MyDrive/itrl/plbart_algo2code")

Epoch,Training Loss,Validation Loss
1,2.4336,1.582546
2,0.0367,0.031222
3,0.0152,0.019035
4,0.012,0.015455
5,0.0106,0.013009
6,0.0065,0.012083
7,0.0095,0.010921
8,0.007,0.010347
9,0.006,0.010175
10,0.0046,0.010017


Non-default generation parameters: {'forced_eos_token_id': 2}
Non-default generation parameters: {'forced_eos_token_id': 2}
Non-default generation parameters: {'forced_eos_token_id': 2}


Non-default generation parameters: {'forced_eos_token_id': 2}
Non-default generation parameters: {'forced_eos_token_id': 2}


('/content/drive/MyDrive/itrl/model-ft-e10-cl256sd/tokenizer_config.json',
 '/content/drive/MyDrive/itrl/model-ft-e10-cl256sd/special_tokens_map.json',
 '/content/drive/MyDrive/itrl/model-ft-e10-cl256sd/sentencepiece.bpe.model',
 '/content/drive/MyDrive/itrl/model-ft-e10-cl256sd/added_tokens.json')

### Model Inference

In [4]:
from transformers import PLBartForConditionalGeneration, PLBartTokenizer
import torch

class ModelInference:
    def __init__(self, model_path):
        # Load the fine-tuned model and tokenizer
        self.tokenizer = PLBartTokenizer.from_pretrained(model_path)
        self.model = PLBartForConditionalGeneration.from_pretrained(model_path)
        self.model.eval()  # Set model to evaluation mode

    def generate_code(self, input_text):
        # Tokenize the input
        inputs = self.tokenizer(input_text, return_tensors="pt")

        # Generate the output
        with torch.no_grad():  # Disable gradient calculation
            output = self.model.generate(**inputs, decoder_start_token_id=self.tokenizer.lang_code_to_id["__python__"])

        # Decode the output to get the generated Python code
        generated_code = self.tokenizer.decode(output[0], skip_special_tokens=True)

        # Postprocess the generated output to replace the custom token with newline and indent
        generated_code = generated_code.replace('__newline_indent__', '\n    ')

        return generated_code

# Usage
model_path = "/content/drive/MyDrive/itrl/plbart_algo2code"
code_generator = ModelInference(model_path)

# Test the inference function with an example input
input_text = "Print\"have a good day\""
generated_code = code_generator.generate_code(input_text)

print("Generated Python Code:\n", generated_code)



Generated Python Code:
 print(" have a good day")
