In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd

train_path = '/content/drive/MyDrive/train.csv'
test_path = '/content/drive/MyDrive/test.csv'

train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)

print(train_df.head())

   Unnamed: 0                                      Body_question  \
0        6604  I'm new to machine learning and I try to creat...   
1         106  I'm using Neural Networks to solve different M...   
2        2993  I have a training set composed of images havin...   
3       10766  I have encountered a strange situation where t...   
4        4315  I have trained a CNN model and I have applied ...   

                                         Body_answer  
0  You have two separate problems going on.\n\nUs...  
1  UPDATE: the landscape has changed quite a bit ...  
2  I have a suggestion for you. Maybe not complet...  
3  If I am getting you right, you are trying to p...  
4  Do you only have one single model? If you were...  


In [None]:
!pip install datasets transformers

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m12.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (

In [None]:
from datasets import load_dataset
from transformers import RobertaTokenizer

# load data
dataset = load_dataset(
    'csv',
    data_files={
        'train': '/content/drive/MyDrive/train.csv',
        'test': '/content/drive/MyDrive/test.csv'
    }
)
print(dataset)

# load tokenizer
tokenizer = RobertaTokenizer.from_pretrained('Salesforce/codet5-base')

# preprocess function
def preprocess_batch(examples):
    inputs = ["question: " + question for question in examples['Body_question']]
    targets = examples['Body_answer']
    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding='max_length')
    labels = tokenizer(targets, max_length=256, truncation=True, padding='max_length')
    model_inputs['labels'] = labels['input_ids']
    return model_inputs

# apply the preprocess to the dataset
tokenized_datasets = dataset.map(preprocess_batch, batched=True, remove_columns=dataset["train"].column_names)

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0', 'Body_question', 'Body_answer'],
        num_rows: 9751
    })
    test: Dataset({
        features: ['Unnamed: 0', 'Body_question', 'Body_answer'],
        num_rows: 2438
    })
})


tokenizer_config.json:   0%|          | 0.00/1.48k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/703k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/294k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/12.5k [00:00<?, ?B/s]

Map:   0%|          | 0/9751 [00:00<?, ? examples/s]

Map:   0%|          | 0/2438 [00:00<?, ? examples/s]

In [None]:
from transformers import T5ForConditionalGeneration

model = T5ForConditionalGeneration.from_pretrained('Salesforce/codet5-base')

In [None]:
from transformers import DataCollatorForSeq2Seq

# convert tokenized data to PyTorch format
train_dataset = tokenized_datasets['train'].with_format('torch')
test_dataset = tokenized_datasets['test'].with_format('torch')

# define data collator for dynamic padding
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
    padding=True,
    max_length=512
)

In [None]:
# !pip install sacrebleu bert-score
# !pip install evaluate

In [None]:
# pip install protobuf==3.20.*
# pip install --upgrade accelerate

In [None]:
# define training arguments
from transformers import TrainingArguments, Trainer
import torch
torch.cuda.empty_cache()

output_dir = '/content/drive/MyDrive/codet5_results'

# define training arguments
training_args = TrainingArguments(
    output_dir=output_dir,          # Save model and checkpoints here
    evaluation_strategy='epoch',   # Evaluate at the end of each epoch
    save_strategy='epoch',         # Save model at the end of each epoch
    learning_rate=5e-5,            # Learning rate
    per_device_train_batch_size=8, # Training batch size
    per_device_eval_batch_size=8,  # Evaluation batch size
    num_train_epochs=3,            # Number of epochs
    weight_decay=0.01,             # Regularization weight decay
    logging_dir=f'{output_dir}/logs',  # Logs directory
    save_total_limit=2,            # Keep only the last 2 checkpoints
    load_best_model_at_end=True    # Load the best model (based on loss) at the end of training
)

# define Trainer
trainer = Trainer(
    model=model,                   # Model instance
    args=training_args,            # Training arguments
    train_dataset=train_dataset,   # Training dataset
    eval_dataset=test_dataset,     # Evaluation dataset
    tokenizer=tokenizer,           # Tokenizer instance
    data_collator=data_collator    # Handles padding and batching
)


trainer.train()

# evaluate
results = trainer.evaluate()
print("Evaluation Results:", results)

  trainer = Trainer(
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


  batch["labels"] = torch.tensor(batch["labels"], dtype=torch.int64)
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss
1,2.5243,2.417823
2,2.4036,2.362776
3,2.3151,2.346093


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


Evaluation Results: {'eval_loss': 2.34609317779541, 'eval_runtime': 34.9107, 'eval_samples_per_second': 69.835, 'eval_steps_per_second': 8.737, 'epoch': 3.0}


In [None]:
from transformers import T5ForConditionalGeneration, RobertaTokenizer

checkpoint_dir = '/content/drive/MyDrive/codet5_results/checkpoint-2438'
model = T5ForConditionalGeneration.from_pretrained(checkpoint_dir)
tokenizer = RobertaTokenizer.from_pretrained(checkpoint_dir)

print("Model and tokenizer successfully loaded from checkpoint!")


Model and tokenizer successfully loaded from checkpoint!


In [None]:
# prompt
prompt = "Write a Python function to add two numbers."
input_ids = tokenizer("question: " + prompt, return_tensors="pt").input_ids

outputs = model.generate(input_ids=input_ids, max_length=256)
generated_code = tokenizer.decode(outputs[0], skip_special_tokens=True)

print("Generated Code:")
print(generated_code)


Generated Code:
You can use the mathematical mathematical mathematical mathematical mathematical mathematical mathematical mathematical mathematical mathematical mathematical mathematical mathematical mathematical mathematical mathematical mathematical mathematical mathematical mathematical mathematical mathematical mathematical mathematical mathematical mathematical mathematical mathematical mathematical mathematical mathematical mathematical mathematical mathematical mathematical mathematical mathematical mathematical mathematical mathematical mathematical mathematical mathematical mathematical mathematical mathematical mathematical mathematical mathematical mathematical mathematical mathematical mathematical mathematical mathematical mathematical mathematical mathematical mathematical mathematical mathematical mathematical mathem
