In [1]:
!pip install -q transformers datasets accelerate torch

import json
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq

try:
    with open("docstring_data.json", "r") as f: data = json.load(f)
    print(f"Loaded {len(data)} examples.")
except: print("❌ Upload docstring_data.json first!")

df = pd.DataFrame(data)
dataset = Dataset.from_pandas(df)
dataset = dataset.train_test_split(test_size=0.1)

checkpoint = "Salesforce/codet5-small"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

def preprocess(examples):
    inputs = ["Generate Python Docstring: " + code for code in examples["input_code"]]
    model_inputs = tokenizer(inputs, max_length=512, truncation=True)

    labels = tokenizer(text_target=examples["target_code"], max_length=512, truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_datasets = dataset.map(preprocess, batched=True)
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

args = Seq2SeqTrainingArguments(
    "docstringer-model",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    weight_decay=0.01,
    save_total_limit=2,
    num_train_epochs=5,
    fp16=True,
)

trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    data_collator=DataCollatorForSeq2Seq(tokenizer, model=model),
    tokenizer=tokenizer
)

print("🚀 Starting Training...")
trainer.train()

trainer.save_model("my_docstringer_model")
tokenizer.save_pretrained("my_docstringer_model")
print("✅ Model Saved!")

Loaded 3432 examples.


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

Map:   0%|          | 0/3088 [00:00<?, ? examples/s]

Map:   0%|          | 0/344 [00:00<?, ? examples/s]

config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/242M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

  trainer = Seq2SeqTrainer(


🚀 Starting Training...


  | |_| | '_ \/ _` / _` |  _/ -_)
[34m[1mwandb[0m: (1) Create a W&B account
[34m[1mwandb[0m: (2) Use an existing W&B account
[34m[1mwandb[0m: (3) Don't visualize my results
[34m[1mwandb[0m: Enter your choice:

 3


[34m[1mwandb[0m: You chose "Don't visualize my results"


Epoch,Training Loss,Validation Loss
1,0.643,0.066411
2,0.1125,0.052609
3,0.0964,0.04704
4,0.0786,0.044617
5,0.0755,0.043938


✅ Model Saved!
