In [1]:
!pip install transformers datasets torch


[0m

In [2]:
import os
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainer, Seq2SeqTrainingArguments

# Disable WandB logging
os.environ["WANDB_DISABLED"] = "true"


In [3]:
# Set your model directory (change this if needed)
MODEL_DIR = "/root/ngp/new2/berlin/AI models/expanded_trained_codet5"

# Load dataset
dataset = load_dataset('json', data_files={
    'train': '../Datasets/Training_datasets/combined.jsonl',
    'test': '../Datasets/Training_datasets/Training_dataset_simple_3k.jsonl'
})

# Preview dataset
print("Example training data:", dataset["train"][0])
print("Example test data:", dataset["test"][0])


Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Example training data: {'question': 'What is the square of of a variable p? Implement it in Python.', 'expected_code': 'def solve(p):\n    return p ** 2'}
Example test data: {'question': 'Convert this word problem into Python code: What is the sum of 3 and 5?', 'expected_code': 'def solve(): return 3 + 5'}


In [4]:
# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained("Salesforce/codet5-base")

# Preprocessing function
def preprocess_function(examples):
    inputs = tokenizer(examples["question"], max_length=128, truncation=True, padding="max_length")
    targets = tokenizer(examples["expected_code"], max_length=128, truncation=True, padding="max_length")
    inputs["labels"] = targets["input_ids"]
    return inputs

# Tokenize dataset
tokenized_dataset = dataset.map(preprocess_function, batched=True)

print("Tokenized dataset example:", tokenized_dataset["train"][0])


Map:   0%|          | 0/14944 [00:00<?, ? examples/s]

Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

Tokenized dataset example: {'question': 'What is the square of of a variable p? Implement it in Python.', 'expected_code': 'def solve(p):\n    return p ** 2', 'input_ids': [1, 23801, 353, 326, 8576, 434, 434, 279, 2190, 293, 35, 10886, 518, 316, 6600, 18, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'la

In [5]:
training_args = Seq2SeqTrainingArguments( 
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=6,
    weight_decay=0.01,
    save_total_limit=2,
    predict_with_generate=True,
    logging_dir="./logs",
    warmup_steps=500,
    logging_steps=100,
    save_steps=500,
)

print("Training arguments set up successfully!")


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Training arguments set up successfully!


In [6]:
# Load pre-trained model if available
if os.path.exists(MODEL_DIR):
    print(f"Loading pre-trained model from {MODEL_DIR}...")
    model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_DIR)
    tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR)
else:
    print("No pre-trained model found. Loading base model...")
    model = AutoModelForSeq2SeqLM.from_pretrained("Salesforce/codet5-base")


Loading pre-trained model from /root/ngp/new2/berlin/AI models/expanded_trained_codet5...


In [7]:
# Initialize trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
)

print("Trainer initialized! Starting fine-tuning...")


  trainer = Seq2SeqTrainer(


Trainer initialized! Starting fine-tuning...


In [8]:
trainer.train()

print("Fine-tuning complete!")


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss
1,0.4383,7.3e-05
2,0.3805,6.9e-05
3,0.3594,6.5e-05
4,0.3938,5.7e-05
5,0.3594,5.7e-05
6,0.3485,5.7e-05


Fine-tuning complete!


In [9]:
# Save the updated model
model.save_pretrained(MODEL_DIR)
tokenizer.save_pretrained(MODEL_DIR)

print(f"Updated model saved to {MODEL_DIR}!")


Updated model saved to /root/ngp/new2/berlin/AI models/expanded_trained_codet5!


In [10]:
import torch

def generate_code(question, model, tokenizer, max_length=200, temperature=1.0):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # Detect device
    model.to(device)  # Move model to the same device

    inputs = tokenizer(question, return_tensors="pt").to(device)  # Move inputs to the same device
    with torch.no_grad():
        outputs = model.generate(inputs["input_ids"], max_length=max_length, temperature = temperature)

    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Test with a sample question
test_question = "Convert this word problem into Python code: What is the sum of a and b"
generated_code = generate_code(test_question, model, tokenizer, max_length=100, temperature=1.2)

print("Test Question:", test_question)
print("Generated Code:", generated_code)



Test Question: Convert this word problem into Python code: What is the sum of a and b
Generated Code: def solve(): return a + b


