In [1]:
!pip install transformers datasets rouge-score -q

  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone


In [2]:
from datasets import load_dataset

ds = load_dataset("michaelfeil/mined_docstrings_pypi_embedded",split='train[:1%]')

README.md:   0%|          | 0.00/547 [00:00<?, ?B/s]

train-00000-of-00005.parquet:   0%|          | 0.00/293M [00:00<?, ?B/s]

train-00001-of-00005.parquet:   0%|          | 0.00/291M [00:00<?, ?B/s]

train-00002-of-00005.parquet:   0%|          | 0.00/290M [00:00<?, ?B/s]

train-00003-of-00005.parquet:   0%|          | 0.00/289M [00:00<?, ?B/s]

train-00004-of-00005.parquet:   0%|          | 0.00/292M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/280053 [00:00<?, ? examples/s]

In [3]:
ds

Dataset({
    features: ['index', 'package', 'name', 'docstring', 'code', 'signature', 'embed_func_code'],
    num_rows: 2801
})

In [4]:
from transformers import RobertaTokenizerFast
model_name = "Salesforce/codet5-small"

# Load tokenizer
tokenizer = RobertaTokenizerFast.from_pretrained(model_name)
tokenizer.is_fast

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

tokenizer_config.json:   0%|          | 0.00/1.48k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/703k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/294k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/12.5k [00:00<?, ?B/s]

True

In [5]:
ds = ds.filter(lambda x: [code.lstrip().startswith("def") for code in x["code"]], batched=True)

Filter:   0%|          | 0/2801 [00:00<?, ? examples/s]

In [6]:
ds = ds.filter(lambda x: [docstring != None for docstring in x["docstring"]], batched=True)

Filter:   0%|          | 0/2314 [00:00<?, ? examples/s]

In [7]:
ds = ds.filter(lambda x: [len(docstring.split(" ")) >= 5 for docstring in x["docstring"]], batched=True)

Filter:   0%|          | 0/1046 [00:00<?, ? examples/s]

In [8]:
ds

Dataset({
    features: ['index', 'package', 'name', 'docstring', 'code', 'signature', 'embed_func_code'],
    num_rows: 995
})

In [9]:
# Preprocess function
def preprocess_function(examples):
    inputs = examples["code"]
    targets = examples['docstring']
    model_inputs = tokenizer(inputs, truncation=True)
    labels = tokenizer(targets, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Apply preprocessing
tokenized_datasets = ds.map(preprocess_function, batched=True)

Map:   0%|          | 0/995 [00:00<?, ? examples/s]

In [10]:
from datasets import load_dataset, DatasetDict

# Assuming tokenized_datasets is already loaded and preprocessed
train_test_split = tokenized_datasets.train_test_split(test_size=0.1)

# Extract train and test datasets
train_dataset = train_test_split["train"]
test_dataset = train_test_split["test"]

# Split the train dataset further to create a validation dataset (e.g., 10% of the train data)
train_val_split = train_dataset.train_test_split(test_size=0.1)  # 10% of the train set for validation

# Extract the new train and validation datasets
train_dataset = train_val_split["train"]
val_dataset = train_val_split["test"]

In [11]:
train_dataset.num_rows, val_dataset.num_rows, test_dataset.num_rows

(805, 90, 100)

In [12]:
from transformers import T5ForConditionalGeneration
import torch

# Load the model
model = T5ForConditionalGeneration.from_pretrained(model_name)

# Check if GPU is available, otherwise fall back to CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Move the model to the selected device
model = model.to(device)
print(f"Model moved to device: {device}")

config.json:   0%|          | 0.00/1.57k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/242M [00:00<?, ?B/s]

Model moved to device: cuda


In [13]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./t5_finetuned",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=1,
    weight_decay=0.01,

    num_train_epochs=1,
    logging_dir="./logs",
    logging_steps=50,
    
    eval_strategy="no",
    
    save_strategy="steps",  # Save model at specific intervals
    save_steps=10,  # Save every 100 steps

    fp16=True,  # Use mixed precision for faster training if on GPU
    remove_unused_columns = True,
    gradient_checkpointing=True,

    
)

In [14]:
from nltk.translate.bleu_score import sentence_bleu
from rouge_score import rouge_scorer
import numpy as np

# Define compute_metrics function
def compute_metrics(eval_pred):
    predictions, labels = eval_pred

    # Decode predictions and labels to text
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Initialize metrics
    rouge_scorer_instance = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL', 'rougeLsum'], use_stemmer=True)
    rouge_scores = {'rouge1': [], 'rouge2': [], 'rougeL': [], 'rougeLsum': []}
    gen_lengths = []

    for pred, label in zip(decoded_preds, decoded_labels):
        # ROUGE scores
        scores = rouge_scorer_instance.score(label, pred)
        for key in rouge_scores:
            rouge_scores[key].append(scores[key].fmeasure)

        # Track generated length
        gen_lengths.append(len(pred.split()))

    # Average ROUGE scores
    results = {
        'rouge1': np.mean(rouge_scores['rouge1']),
        'rouge2': np.mean(rouge_scores['rouge2']),
        'rougeL': np.mean(rouge_scores['rougeL']),
        'rougeLsum': np.mean(rouge_scores['rougeLsum']),
        'gen_len': np.mean(gen_lengths),  # Average generated length
    }

    return results

In [15]:
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [16]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,  # Add this line
)

In [17]:
from kaggle_secrets import UserSecretsClient
import wandb

user_secrets = UserSecretsClient()

# I have saved my API token with "wandb_api" as Label. 
# If you use some other Label make sure to change the same below. 
wandb_api = user_secrets.get_secret("wandb_api") 

wandb.login(key=wandb_api)

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33migor-joaquim[0m ([33migor-joaquim-universidade-federal-de-minas-gerais[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [18]:
trainer.train()

[34m[1mwandb[0m: Tracking run with wandb version 0.19.1
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/kaggle/working/wandb/run-20250131_014513-2ejf5f43[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33m./t5_finetuned[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/igor-joaquim-universidade-federal-de-minas-gerais/huggingface[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/igor-joaquim-universidade-federal-de-minas-gerais/huggingface/runs/2ejf5f43[0m
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Step,Training Loss
50,3.3778


TrainOutput(global_step=51, training_loss=3.337572728886324, metrics={'train_runtime': 50.6352, 'train_samples_per_second': 15.898, 'train_steps_per_second': 1.007, 'total_flos': 106602815815680.0, 'train_loss': 3.337572728886324, 'epoch': 1.0})

In [19]:
# results = trainer.evaluate()
# results

In [20]:
text = "def rolling(self, *args, **kwargs) -> RollingGroupby: from pandas.core.window import RollingGroupby return RollingGroupby( self._selected_obj, *args, _grouper=self.grouper, _as_index=self.as_index, **kwargs, )"
input_ids = tokenizer(text, max_length=512, truncation=True, padding="max_length",return_tensors="pt").input_ids.to(device)

# simply generate a single sequence
generated_ids = model.generate(input_ids)
print(tokenizer.decode(generated_ids[0], skip_special_tokens=True))



:


In [21]:
model.save_pretrained("./fine_tuned_codet5")
tokenizer.save_pretrained("./fine_tuned_codet5")

('./fine_tuned_codet5/tokenizer_config.json',
 './fine_tuned_codet5/special_tokens_map.json',
 './fine_tuned_codet5/vocab.json',
 './fine_tuned_codet5/merges.txt',
 './fine_tuned_codet5/added_tokens.json',
 './fine_tuned_codet5/tokenizer.json')