<a href="https://colab.research.google.com/github/HammadHARahim/create-react-app-lambda/blob/master/moondream3_preview.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install torch transformers datasets evaluate accelerate sentencepiece

In [3]:
from transformers import RobertaTokenizer, T5ForConditionalGeneration

# Load pretrained CodeT5-small
model_name = "Salesforce/codet5-small"
tokenizer = RobertaTokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

# Example: Python/Java function
code_snippet = """
public Object changeInternalCrn(ProceedingJoinPoint proceedingJoinPoint) {
    String userCrnString = ThreadBasedUserCrnProvider.getUserCrn();
    MethodSignature methodSignature = (MethodSignature) proceedingJoinPoint.getSignature();
    return reflectionUtil.proceed(proceedingJoinPoint, methodSignature);
}
"""

# Prepare input (we can prefix with "generate test case:" to help guide the model)
input_text = "generate test case: " + code_snippet

# Tokenize
inputs = tokenizer(input_text, return_tensors="pt", truncation=True, max_length=512)

# Generate output
output_ids = model.generate(
    **inputs,
    max_length=256,
    num_beams=5,
    early_stopping=True
)

# Decode prediction
generated_test = tokenizer.decode(output_ids[0], skip_special_tokens=True)
print("\n📌 Generated Test Case:\n")
print(generated_test)



📌 Generated Test Case:

methodSignature;



In [4]:
import json
import random

# Load your test.json
with open("/content/dataset_testcases.jsonl", "r") as f:
    data = [json.loads(line) for line in f]

# Shuffle for randomness
random.shuffle(data)

# Split ratios
train_size = int(0.8 * len(data))
valid_size = int(0.1 * len(data))

train_data = data[:train_size]
valid_data = data[train_size:train_size+valid_size]
test_data = data[train_size+valid_size:]

# Save splits
def save_jsonl(filename, dataset):
    with open(filename, "w") as f:
        for item in dataset:
            f.write(json.dumps(item) + "\n")

save_jsonl("train.jsonl", train_data)
save_jsonl("valid.jsonl", valid_data)
save_jsonl("test.jsonl", test_data)

print(f"✅ Created splits: train={len(train_data)}, valid={len(valid_data)}, test={len(test_data)}")


✅ Created splits: train=50485, valid=6310, test=6312


In [None]:
%%javascript
function ClickConnect(){
console.log("Working");
document.querySelector("colab-toolbar-button#connect").click()
}setInterval(ClickConnect,60000)

In [None]:
import json
from datasets import load_dataset
from transformers import RobertaTokenizer, T5ForConditionalGeneration, DataCollatorForSeq2Seq, Trainer, TrainingArguments

# --------------------
# Load Dataset
# --------------------
dataset = load_dataset("json", data_files={"train": "train.jsonl", "validation": "valid.jsonl"})

# --------------------
# Load Model & Tokenizer
# --------------------
model_name = "Salesforce/codet5-small"
tokenizer = RobertaTokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

# --------------------
# Preprocess Function
# --------------------
max_input_len = 512
max_output_len = 256

def preprocess(examples):
    inputs = [ex for ex in examples["input"]]
    targets = [ex for ex in examples["output"]]

    model_inputs = tokenizer(inputs, max_length=max_input_len, truncation=True, padding="max_length")
    labels = tokenizer(targets, max_length=max_output_len, truncation=True, padding="max_length")

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized = dataset.map(preprocess, batched=True, remove_columns=["input", "output"])

# --------------------
# Data Collator (for padding dynamically)
# --------------------
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# --------------------
# Training Arguments
# --------------------
training_args = TrainingArguments(
    output_dir="./codet5-testgen",
    eval_strategy="steps",
    eval_steps=500,
    save_steps=500,
    logging_steps=100,
    save_total_limit=2,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=2,
    learning_rate=3e-5,
    num_train_epochs=5,
    warmup_steps=500,
    weight_decay=0.01,
    lr_scheduler_type="linear",
    fp16=True,  # ✅ Mixed precision for speed
    report_to="none",
)

# --------------------
# Trainer
# --------------------
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# --------------------
# Train
# --------------------
trainer.train()

# --------------------
# Save Final Model
# --------------------
trainer.save_model("./codet5-testgen-final")
tokenizer.save_pretrained("./codet5-testgen-final")


Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/50485 [00:00<?, ? examples/s]

Map:   0%|          | 0/6310 [00:00<?, ? examples/s]

  trainer = Trainer(


Step,Training Loss,Validation Loss
500,1.3197,1.136146
1000,1.1396,1.017182
1500,1.1185,0.973749
2000,1.0723,0.945347
