In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = (os.path.join(dirname, filename) for filename in filenames)
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!pip install transformers accelerate datasets


In [None]:
from datasets import load_dataset

# Load the MBPP dataset (train split) once; repeating the import and the
# load_dataset call in the same cell only re-reads the same data.
dataset = load_dataset("mbpp", split="train")

# Check column names and first example
print("Columns:", dataset.column_names)
print("First example:", dataset[0])


In [None]:
from datasets import load_dataset

# `dataset` already holds the MBPP train split loaded in the previous cell,
# so reuse it instead of loading the same split again.
# Print column names
print(dataset.column_names)


In [None]:
# Build the text-to-text pairs (task description -> reference solution) and
# drop every original MBPP column in the same pass, so only 'input_text' and
# 'target_text' remain.
train_data = dataset.map(
    lambda row: {'input_text': row['text'], 'target_text': row['code']},
    remove_columns=dataset.column_names,
)

# Show the first converted record as a sanity check
print(train_data[0])


In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Checkpoint shared by the tokenizer and the seq2seq model
model_name = "Salesforce/codet5-base"

# Fetch the model weights and the matching tokenizer
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

print(f"Model {model_name} loaded successfully!")


In [None]:
# Tokenize input and target for CodeT5
def preprocess(example):
    """Tokenize prompt/solution text and mask label padding for the loss.

    Args:
        example: Mapping with 'input_text' and 'target_text'. The values may
            be single strings or lists of strings (the latter when called
            through ``Dataset.map(..., batched=True)``).

    Returns:
        The tokenizer's encoding, including a ``labels`` entry built from the
        target text. Every sequence is truncated/padded to 256 tokens, and
        padding positions inside ``labels`` are replaced by -100.
    """
    encoded = tokenizer(
        example['input_text'],
        text_target=example['target_text'],
        truncation=True,
        padding='max_length',
        max_length=256
    )

    # Labels are padded to max_length as well. -100 is the label value the
    # seq2seq loss ignores; without this replacement the model would also be
    # trained to predict the padding tokens.
    pad_id = tokenizer.pad_token_id
    labels = encoded["labels"]
    if labels and isinstance(labels[0], (list, tuple)):
        # Batched call: one label sequence per example.
        encoded["labels"] = [
            [-100 if token == pad_id else token for token in sequence]
            for sequence in labels
        ]
    else:
        # Single example: one flat label sequence.
        encoded["labels"] = [-100 if token == pad_id else token for token in labels]
    return encoded

# Run the tokenizer over the whole dataset, one batch of rows at a time
train_data = train_data.map(function=preprocess, batched=True)

# Show one tokenized record as a sanity check
print(train_data[0])


In [None]:
from transformers import Trainer, TrainingArguments, TrainerCallback
from datetime import datetime

# Optional: small subset for faster fine-tuning
train_subset = train_data.select(range(300))  # first 300 examples only

training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=2,
    per_device_train_batch_size=2,
    # 300 examples at batch size 2 for 2 epochs is only about 300 optimizer
    # steps on a single device, which is below the default logging interval.
    # Log every 10 steps so the loss (and the timestamp callback) is actually
    # reported while training runs.
    logging_steps=10,
    save_total_limit=1,   # keep at most one checkpoint on disk
    report_to=[],         # no external experiment trackers
)


# Callback that stamps every Trainer log event with the current local time
class TimeLoggingCallback(TrainerCallback):
    """Print the global step and the logged values, prefixed by a timestamp."""

    def on_log(self, args, state, control, logs=None, **kwargs):
        # Invoked by the Trainer each time it emits a log entry
        timestamp = f"{datetime.now():%Y-%m-%d %H:%M:%S}"
        print(f"[{timestamp}] Step {state.global_step}: {logs}")

# Wire model, arguments and data together; the callback timestamps each log event
timestamp_logger = TimeLoggingCallback()
trainer = Trainer(
    args=training_args,
    model=model,
    train_dataset=train_subset,
    callbacks=[timestamp_logger],
)

# Run the fine-tuning loop
trainer.train()


In [None]:
from transformers import pipeline

# `model` is the in-memory object that was just fine-tuned by the Trainer, so
# make sure it is in inference mode (dropout disabled) before generating.
model.eval()

# Create the text-to-text generation pipeline around the fine-tuned CodeT5 model
gen = pipeline("text2text-generation", model=model, tokenizer=tokenizer)
print("Code generation pipeline ready!")


In [None]:
from datetime import datetime

# Prompt handed to generate_code() below: the task is described in comment
# lines (including one worked example) and ends with the signature of the
# function to be completed.
problem = """
# Python function
# Task: generate all even integers between a and b inclusive
# The function should return a list in ascending order
# Example: generate_integers(2, 6) -> [2, 4, 6]
def generate_integers(a, b):
"""

def generate_code(problem_description):
    """Run the generation pipeline on a prompt.

    Returns a ``(generated_text, timestamp)`` pair: the text of the first
    candidate produced by ``gen`` and the local time right after generation,
    formatted as ``YYYY-MM-DD HH:MM:SS``.
    """
    outputs = gen(problem_description, max_new_tokens=256)
    first_candidate = outputs[0]
    # Stamp the moment generation finished
    timestamp = f"{datetime.now():%Y-%m-%d %H:%M:%S}"
    return first_candidate['generated_text'], timestamp

# Run the generator on the prompt and show the result under its timestamp
generated_code, timestamp = generate_code(problem)
print(f"Generated Code at {timestamp}:", generated_code, sep="\n")


In [None]:
import pandas as pd

# Destination of the CSV inside the Kaggle working directory
out_path = "/kaggle/working/codet5_mbpp_results.csv"

# One row per task, holding the code generated for it
results = [{"task": "generate_integers", "generated_code": generated_code}]
df_results = pd.DataFrame(data=results)

# Write the table without the index column and report where it went
df_results.to_csv(out_path, index=False)
print("Results saved to:", out_path)


In [None]:
# Second prompt (palindrome check). Same layout as `problem`: the task is
# described in comment lines with one example, followed by the signature of
# the function to complete and a placeholder comment for the body.
new_problem = """
# Python function
# Task: check if a number is a palindrome
# The function should return True if number is palindrome, otherwise False
# Example: is_palindrome(121) -> True
def is_palindrome(n):
    # Your code here
"""


In [None]:
# Feed the custom prompt to the generator and show the result under its timestamp
generated_code, timestamp = generate_code(new_problem)
print(f"Generated Code at {timestamp}:", generated_code, sep="\n")


In [None]:
# Number of alternative completions to request for the prompt
NUM_CANDIDATES = 3

# Plain greedy decoding cannot return more than one sequence per prompt, so
# use beam search with as many beams as requested candidates.
results = gen(
    new_problem,
    max_new_tokens=256,
    num_return_sequences=NUM_CANDIDATES,
    num_beams=NUM_CANDIDATES,
)
for i, r in enumerate(results):
    print(f"Candidate {i+1}:\n{r['generated_text']}\n")


In [None]:
from datetime import datetime

def generate_code(problem_description, num_candidates=1):
    """Generate one or more code candidates for a prompt.

    Note: this rebinds the name ``generate_code`` defined in an earlier cell.
    That version returned a single ``(code, timestamp)`` pair; this one
    always returns a list of such pairs.

    Args:
        problem_description: str, the problem text including function signature.
        num_candidates: int >= 1, number of code versions to generate.

    Returns:
        A list of ``(generated_text, timestamp)`` tuples, one per candidate.
        All candidates of one call share the same timestamp (local time right
        after generation, ``YYYY-MM-DD HH:MM:SS``).

    Raises:
        ValueError: if ``num_candidates`` is smaller than 1.
    """
    if num_candidates < 1:
        raise ValueError(f"num_candidates must be >= 1, got {num_candidates}")

    generation_kwargs = {
        "max_new_tokens": 256,
        "num_return_sequences": num_candidates,
    }
    if num_candidates > 1:
        # Greedy decoding cannot return several sequences; beam search with
        # one beam per requested candidate can. Single-candidate calls keep
        # the pipeline's default decoding.
        generation_kwargs["num_beams"] = num_candidates

    results = gen(problem_description, **generation_kwargs)
    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    # Return a list of tuples (code, timestamp)
    return [(r['generated_text'], timestamp) for r in results]


In [None]:
# Prompt for the "largest of three numbers" task: comment-style description,
# one example, then the signature to complete.
new_problem = """
# Python function
# Task: find the largest number among three numbers
# The function should return the largest of the three numbers
# Example: max_of_three(3, 7, 5) -> 7
def max_of_three(a, b, c):
    # Your code here
"""

# Request a single completion for the prompt
generated_results = generate_code(new_problem, 1)

# Show each completion under the time it was generated
for code, ts in generated_results:
    print(f"Generated Code at {ts}:\n{code}")


In [None]:
# Request three alternative completions and print each one followed by a divider
generated_results = generate_code(new_problem, 3)
for i, (code, ts) in enumerate(generated_results):
    print(f"Candidate {i+1} at {ts}:\n{code}\n{'-'*50}")


In [None]:
# Batch of prompts for the generation loop in the next cell. Each entry has:
#   "task"        - short identifier printed next to the generated code
#   "description" - the prompt text sent to the model (comment-style task
#                   statement, one example, and the function signature)
problems = [
    {
        "task": "palindrome_check",
        "description": """
# Python function
# Task: check if a number is palindrome
# Example: is_palindrome(121) -> True
def is_palindrome(n):
    # Your code here
"""
    },
    {
        "task": "fibonacci_n",
        "description": """
# Python function
# Task: return nth Fibonacci number
# Example: fibonacci_n(6) -> 8
def fibonacci_n(n):
    # Your code here
"""
    },
    {
        "task": "factorial",
        "description": """
# Python function
# Task: return factorial of n
# Example: factorial(5) -> 120
def factorial(n):
    # Your code here
"""
    },
]


In [None]:
from datetime import datetime

# Generate one short completion (at most 64 new tokens) per prompt and print
# it under a header line, followed by a divider.
for p in problems:
    generated_result = gen(p["description"], max_new_tokens=64, num_return_sequences=1)
    timestamp = f"{datetime.now():%Y-%m-%d %H:%M:%S}"
    code = generated_result[0]['generated_text']

    print(f"Task: {p['task']} | Generated at {timestamp}", code, "-"*50, sep="\n")
