In [1]:
import pandas as pd
import re

# Locations of the two CSV splits, relative to the notebook's working directory.
train_path, test_path = './train.csv', './test.csv'

# Read each split into its own DataFrame (train first, then test).
train_df, test_df = (pd.read_csv(csv_path) for csv_path in (train_path, test_path))

import contractions


def remove_urls(text):
    """Remove URL-like tokens and @mentions from ``text``.

    Deletes every match of ``http``, ``www`` or ``@`` followed by one or more
    non-whitespace characters. It has to run while punctuation is still
    present: once ``@`` has been stripped, the mention branch cannot match.
    """
    return re.sub(r'http\S+|www\S+|@\S+', '', text)


def expand_contractions(text):
    """Expand English contractions in ``text`` via ``contractions.fix``."""
    return contractions.fix(text)


def clean_text(text):
    """Drop every character that is not an ASCII letter, digit or whitespace."""
    return re.sub(r'[^a-zA-Z0-9\s]', '', text)


# Order matters. Punctuation stripping must come last: running it first removes
# the '@' that remove_urls looks for and the apostrophes that let
# expand_contractions recognise forms such as "we'll" or "it's" (which would
# otherwise collapse into the ordinary words "well" / "its").
for frame in (train_df, test_df):
    for column in ('Body_question', 'Body_answer'):
        for step in (remove_urls, expand_contractions, clean_text):
            frame[column] = frame[column].apply(step)


print(train_df.head())

   Unnamed: 0                                      Body_question  \
0        6604  I Am new to machine learning and I try to crea...   
1         106  I Am using Neural Networks to solve different ...   
2        2993  I have a training set composed of images havin...   
3       10766  I have encountered a strange situation where t...   
4        4315  I have trained a CNN model and I have applied ...   

                                         Body_answer  
0  You have two separate problems going on\n\nUse...  
1  UPDATE the landscape has changed quite a bit s...  
2  I have a suggestion for you Maybe not complete...  
3  If I am getting you right you are trying to pr...  
4  Do you only have one single model If you were ...  


In [2]:
#Starting code
from datasets import load_dataset
from transformers import RobertaTokenizer


# Build a DatasetDict with a 'train' and a 'test' split, each backed by its CSV file.
dataset = load_dataset('csv', data_files=dict(train='./train.csv', test='./test.csv'))
print(dataset)

# Tokenizer that ships with the CodeT5 base checkpoint.
tokenizer = RobertaTokenizer.from_pretrained('Salesforce/codet5-base')

# preprocess function
def preprocess_batch(examples):
    """Tokenize a batch of question/answer rows for seq2seq training.

    Args:
        examples: Batch mapping with 'Body_question' and 'Body_answer'
            entries, each a list of strings (one item per row in the batch).

    Returns:
        The tokenizer output for the prefixed questions (padded/truncated to
        512 tokens) with an added 'labels' entry: the answer token ids
        (padded/truncated to 256 tokens) where padding positions are -100.
    """
    inputs = ["question: " + question for question in examples['Body_question']]
    targets = examples['Body_answer']
    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding='max_length')
    labels = tokenizer(targets, max_length=256, truncation=True, padding='max_length')
    # The labels are padded to a fixed length here, so the pad ids must be
    # swapped for -100 (the index the loss ignores); otherwise the model is
    # also trained to predict padding tokens.
    pad_id = tokenizer.pad_token_id
    model_inputs['labels'] = [
        [-100 if token_id == pad_id else token_id for token_id in label_ids]
        for label_ids in labels['input_ids']
    ]
    return model_inputs

# Tokenize every split in batches; the columns listed for the train split are
# removed from the result.
tokenized_datasets = dataset.map(
    function=preprocess_batch,
    batched=True,
    remove_columns=dataset["train"].column_names,
)

DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0', 'Body_question', 'Body_answer'],
        num_rows: 9751
    })
    test: Dataset({
        features: ['Unnamed: 0', 'Body_question', 'Body_answer'],
        num_rows: 2438
    })
})


In [2]:
#Edited Code
from datasets import load_dataset
from transformers import RobertaTokenizer
from transformers import AutoTokenizer

# Build a DatasetDict with a 'train' and a 'test' split, each backed by its CSV file.
dataset = load_dataset('csv', data_files=dict(train='./train.csv', test='./test.csv'))
print(dataset)

# Tokenizer for the CodeT5 base checkpoint. AutoTokenizer is used here in place
# of the earlier explicit call:
#tokenizer = RobertaTokenizer.from_pretrained('Salesforce/codet5-base')
tokenizer = AutoTokenizer.from_pretrained('Salesforce/codet5-base')

# preprocess function
def preprocess_batch(examples):
    """Tokenize a batch of question/answer rows for seq2seq training.

    Args:
        examples: Batch mapping with 'Body_question' and 'Body_answer'
            entries, each a list of strings (one item per row in the batch).

    Returns:
        The tokenizer output for the prefixed questions with an added
        'labels' entry holding the answer token ids. Both sides are
        padded/truncated to 512 tokens; padding positions in the labels
        are set to -100.
    """
    inputs = ["question: " + question for question in examples['Body_question']]
    targets = examples['Body_answer']
    model_inputs = tokenizer(inputs, truncation=True, padding="max_length", max_length=512)
    labels = tokenizer(targets, truncation=True, padding="max_length", max_length=512)
    # The labels are padded to a fixed length here, so the pad ids must be
    # swapped for -100 (the index the loss ignores); otherwise the model is
    # also trained to predict padding tokens.
    pad_id = tokenizer.pad_token_id
    model_inputs['labels'] = [
        [-100 if token_id == pad_id else token_id for token_id in label_ids]
        for label_ids in labels['input_ids']
    ]
    return model_inputs

# Tokenize every split in batches; the columns listed for the train split are
# removed from the result.
tokenized_datasets = dataset.map(
    function=preprocess_batch,
    batched=True,
    remove_columns=dataset["train"].column_names,
)

DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0', 'Body_question', 'Body_answer'],
        num_rows: 9751
    })
    test: Dataset({
        features: ['Unnamed: 0', 'Body_question', 'Body_answer'],
        num_rows: 2438
    })
})


In [None]:
import numpy as np
# NOTE(review): `lengths` is not assigned in any cell visible in this notebook,
# so this cell only works if it already exists in the kernel. Presumably it
# holds per-example token counts - confirm and define it before this cell so
# that Restart & Run All succeeds.
# 90th percentile of `lengths`, truncated to an integer.
optimal_max_length = int(np.percentile(a=lengths, q=90))
optimal_max_length

583

In [3]:
from transformers import T5ForConditionalGeneration
import torch

# Prefer the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Load the pretrained CodeT5 base weights and place the model on that device.
model = T5ForConditionalGeneration.from_pretrained('Salesforce/codet5-base').to(device)

In [4]:
from transformers import DataCollatorForSeq2Seq

# Expose both tokenized splits as PyTorch tensors (train first, then test).
train_dataset, test_dataset = (
    tokenized_datasets[split].with_format('torch') for split in ('train', 'test')
)

# Seq2seq collator. With padding="max_length" and max_length=512 it pads to a
# fixed length of 512 rather than to the longest sequence of each batch.
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    max_length=512,
    padding="max_length",
)

In [None]:
# define training arguments
from transformers import TrainingArguments, Trainer
import torch
# Release cached GPU memory left over from earlier cells before training.
torch.cuda.empty_cache()

# Root folder for checkpoints and logs.
output_dir = './Checkpoints/codet5_results'


# Define training arguments.
training_args = TrainingArguments(
    output_dir=output_dir,          # Save model and checkpoints here
    eval_strategy='epoch',         # Evaluate at the end of each epoch (current name of the deprecated `evaluation_strategy`)
    save_strategy='epoch',         # Save model at the end of each epoch (must match eval_strategy for load_best_model_at_end)
    learning_rate=5e-5,            # Learning rate
    per_device_train_batch_size=4, # Training batch size
    per_device_eval_batch_size=4,  # Evaluation batch size
    gradient_accumulation_steps=2, # Effective train batch size per device: 4 * 2 = 8
    num_train_epochs=3,            # Number of epochs
    weight_decay=0.01,             # Regularization weight decay
    logging_dir=f'{output_dir}/logs',  # Logs directory
    save_total_limit=2,            # Keep only the last 2 checkpoints
    load_best_model_at_end=True,    # Load the best model (based on loss) at the end of training
    fp16=torch.cuda.is_available()  # Mixed precision needs a GPU; stay in fp32 on the CPU fallback
)

# Define Trainer.
# `processing_class` is the replacement for the deprecated `tokenizer` keyword;
# the warning recorded for this cell points at the `trainer = Trainer(` call.
# NOTE(review): requires a transformers release that accepts `processing_class`.
trainer = Trainer(
    model=model,                   # Model instance
    args=training_args,            # Training arguments
    train_dataset=train_dataset,   # Training dataset
    eval_dataset=test_dataset,     # Evaluation dataset
    processing_class=tokenizer,    # Tokenizer instance
    data_collator=data_collator    # Handles padding and batching
)

# Train the model
trainer.train()

# Write the final (best) model to the root of the trainer's output directory.
# Checkpoints saved during training live in `checkpoint-*` subfolders, so
# without this step `from_pretrained(output_dir)` has nothing to load.
trainer.save_model()

# Evaluate the model
results = trainer.evaluate()
print("Evaluation Results:", results)

  trainer = Trainer(


  0%|          | 0/3657 [00:00<?, ?it/s]

  batch["labels"] = torch.tensor(batch["labels"], dtype=torch.int64)
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


In [None]:
from transformers import T5ForConditionalGeneration, RobertaTokenizer

import torch

# Specify the checkpoint directory
checkpoint_dir = './Checkpoints/codet5_results'

# Load the model and tokenizer from the checkpoint
model = T5ForConditionalGeneration.from_pretrained(checkpoint_dir)
tokenizer = RobertaTokenizer.from_pretrained(checkpoint_dir)

# from_pretrained leaves the weights on the CPU; move the model to the GPU when
# one is available so it sits on the same device as the generation inputs.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

# Verify that the model and tokenizer are loaded
print("Model and tokenizer successfully loaded from checkpoint!")


Model and tokenizer successfully loaded from checkpoint!


In [16]:
# Example prompt
prompt = "Write a Python function to add two numbers."

# Tokenize the input prompt (same "question: " prefix as used in training) and
# move the tensors to whatever device the model lives on. Hard-coding "cuda"
# fails on CPU-only machines and whenever the model itself is still on the CPU.
encoded = tokenizer("question: " + prompt, return_tensors="pt").to(model.device)
input_ids = encoded.input_ids

# Generate the output. Alternative decoding strategies that were tried:
#outputs = model.generate(input_ids=input_ids, max_length=256)
#outputs = model.generate(input_ids=input_ids,max_length=256,num_beams=5)  # Beam search
# Top-k sampling (top_p=0.95 can be added for nucleus sampling).
outputs = model.generate(
    input_ids=input_ids,
    attention_mask=encoded.attention_mask,
    max_length=256,
    do_sample=True,
    top_k=50,
)

# Decode the generated output
generated_code = tokenizer.decode(outputs[0], skip_special_tokens=True)

print("Generated Code:")
print(generated_code)


Generated Code:
This is a non-negative sign function. It has the same impact as the difference between the two are added since the sum of two values are similar to the sum between each two values.   You can write a Python function that can add a number to a fixed number, with a special case to increase the weights, and then apply a mathematical equation and then use that as a function with the same value.  So, it performs mathematical mathematical approximation.  The sum will change if you add two items to the same number.  See this comment for a list of example to be able to calculate, and you can build this model for further processing than the mathematical terms, depending if you have two instances, one for each class.


As the function is equivalent, it will become able to add two numeric members.  You may also try to add a function which on the end:
import math
import numpy as np

def add(x):
    return x * (1 + y)


    return sum(x)


def exponential(x):
    random.rand(0, 2)
  