Step 1: Install Required Libraries

In [3]:
!pip install kaggle transformers datasets torch accelerate evaluate rouge_score
!pip install sentencepiece  # For T5/CodeT5 tokenization

Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Downloading evaluate-0.4.6-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=5302d9fcbe305682ad0dfb62c50be47a7c3d1013d40d66c5815163e2dc8f02f2
  Stored in directory: /root/.cache/pip/wheels/85/9d/af/01feefbe7d55ef5468796f0c68225b6788e85d9d0a281e7a70
Successfully built rouge_score
Installing collected packages: rouge_score, evaluate
Successfully installed evaluate-0.4.6 rouge_score-0.1.2


In [4]:
from google.colab import files
files.upload()  # Upload kaggle.json here

Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"manshujaiswal47","key":"a4acd51cab42148107c492df5e3adcc6"}'}

Step 2: Download the Kaggle Dataset

In [5]:
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json
!kaggle datasets download -d mukesh663/code-optimisation-dataset
!unzip code-optimisation-dataset.zip

Dataset URL: https://www.kaggle.com/datasets/mukesh663/code-optimisation-dataset
License(s): MIT
Downloading code-optimisation-dataset.zip to /content
  0% 0.00/30.0k [00:00<?, ?B/s]
100% 30.0k/30.0k [00:00<00:00, 107MB/s]
Archive:  code-optimisation-dataset.zip
  inflating: code-optimization/data.csv  
  inflating: code-optimization/optimized/1.txt  
  inflating: code-optimization/optimized/10.txt  
  inflating: code-optimization/optimized/11.txt  
  inflating: code-optimization/optimized/12.txt  
  inflating: code-optimization/optimized/13.txt  
  inflating: code-optimization/optimized/14.txt  
  inflating: code-optimization/optimized/15.txt  
  inflating: code-optimization/optimized/16.txt  
  inflating: code-optimization/optimized/17.txt  
  inflating: code-optimization/optimized/18.txt  
  inflating: code-optimization/optimized/19.txt  
  inflating: code-optimization/optimized/2.txt  
  inflating: code-optimization/optimized/20.txt  
  inflating: code-optimization/optimized/21.txt

Step 3: Load and Explore the Dataset

In [6]:
import pandas as pd
from datasets import Dataset

# Load the CSV (adjust filename if needed)
df = pd.read_csv('code-optimization/data.csv')  # Corrected file path and name
print(f"Dataset shape: {df.shape}")
print(df.head())  # View sample rows
print(df.columns) # Print column names to debug

# Convert to Hugging Face Dataset (we'll use only code pairs for training)
dataset = Dataset.from_pandas(df[['Unoptimized code ', 'Optimized code ']])
print(f"Sample input: {dataset[0]['Unoptimized code '][:100]}...")  # Preview
print(f"Sample output: {dataset[0]['Optimized code '][:100]}...")

Dataset shape: (28, 6)
   Unoptimized code   Optimized code  Time complexity (unoptimized)  \
0  unoptimized\1.txt  optimized\1.txt                          O(1)   
1  unoptimized\2.txt  optimized\2.txt                          O(n)   
2  unoptimized\3.txt  optimized\3.txt                          O(n)   
3  unoptimized\4.txt  optimized\4.txt                          O(1)   
4  unoptimized\5.txt  optimized\5.txt                          O(1)   

  Time complexity (optimized) Language   \
0                        O(1)    python   
1                        O(1)      java   
2                        O(n)      java   
3                        O(1)      java   
4                        O(1)      java   

                                         Description  
0           Combined conditions to streamline logic.  
1  Used @PostMapping and returned the created ent...  
2  Extracted the repeated logic for initializing ...  
3  Created a helper method setupPopover to avoid ...  
4  Introduced a 

Step 4: Preprocess the Data

In [7]:
import os
from transformers import AutoTokenizer  # Use AutoTokenizer to automatically select the correct tokenizer
from datasets import DatasetDict

# Define a function to read code from a file
def read_code_from_file(filename):
    try:
        # Assuming the files are in the 'code-optimization' directory
        file_path = os.path.join('code-optimization', filename)
        with open(file_path, 'r', encoding='utf-8') as f:
            return f.read()
    except FileNotFoundError:
        return "File not found"
    except Exception as e:
        return f"Error reading file: {e}"

# Apply the function to load the code content into new columns
# Create a temporary dataset with the new columns first
dataset_with_code = dataset.map(
    lambda example: {
        'unoptimized_code_text': read_code_from_file(example['Unoptimized code '].strip()),
        'optimized_code_text': read_code_from_file(example['Optimized code '].strip())
    }
)

# Remove the old columns containing only filenames from the temporary dataset
dataset_with_code = dataset_with_code.remove_columns(['Unoptimized code ', 'Optimized code '])

# Now, reassign the temporary dataset to the original variable name
dataset = dataset_with_code

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained('Salesforce/codet5-base')

def preprocess_function(examples):
    # Use the column names with code content
    inputs = ["optimize python: " + code for code in examples['unoptimized_code_text']]
    targets = examples['optimized_code_text']
    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding='max_length')
    labels = tokenizer(targets, max_length=512, truncation=True, padding='max_length')
    model_inputs['labels'] = labels['input_ids']
    return model_inputs

# Tokenize the dataset
tokenized_dataset = dataset.map(preprocess_function, batched=True, remove_columns=dataset.column_names)
tokenized_dataset = tokenized_dataset.train_test_split(test_size=0.1)  # 90/10 split
print(tokenized_dataset)

Map:   0%|          | 0/28 [00:00<?, ? examples/s]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

Map:   0%|          | 0/28 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 25
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 3
    })
})


Step 5: Fine-Tune the Model

In [8]:
from transformers import T5ForConditionalGeneration, Trainer, TrainingArguments
import torch

model = T5ForConditionalGeneration.from_pretrained('Salesforce/codet5-base')

training_args = TrainingArguments(
    output_dir='./code-optimizer',
    num_train_epochs=3,
    per_device_train_batch_size=4,  # Adjust if OOM error
    per_device_eval_batch_size=4,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    save_strategy='epoch',
    load_best_model_at_end=False,  # Disabled to resolve conflict
    fp16=True,  # For GPU efficiency
    report_to = None, # Disable WANDB integration
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['test'],
    tokenizer=tokenizer,
)

trainer.train()  # Start training!
trainer.save_model()  # Save the fine-tuned model

config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/892M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

  trainer = Trainer(
  | |_| | '_ \/ _` / _` |  _/ -_)
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
[34m[1mwandb[0m: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss


In [9]:
%env WANDB_DISABLED=true

env: WANDB_DISABLED=true


Now, let's try fine-tuning the model again with WANDB explicitly disabled.

In [10]:
from transformers import T5ForConditionalGeneration, Trainer, TrainingArguments, AutoTokenizer
import torch
import os
from datasets import Dataset, DatasetDict # Import Dataset as it's used here
import pandas as pd # Import pandas as it's used here

# Load the CSV (copied from o1wMsj5yfV54)
df = pd.read_csv('code-optimization/data.csv')
# Convert to Hugging Face Dataset (copied from o1wMsj5yfV54)
dataset = Dataset.from_pandas(df[['Unoptimized code ', 'Optimized code ']])


# Define a function to read code from a file (copied from aVJzRjKvfdIK)
def read_code_from_file(filename):
    try:
        # Assuming the files are in the 'code-optimization' directory
        file_path = os.path.join('code-optimization', filename)
        with open(file_path, 'r', encoding='utf-8') as f:
            return f.read()
    except FileNotFoundError:
        return "File not found"
    except Exception as e:
        return f"Error reading file: {e}"

# Apply the function to load the code content into new columns (copied from aVJzRjKvfdIK)
# Create a temporary dataset with the new columns first
dataset_with_code = dataset.map(
    lambda example: {
        'unoptimized_code_text': read_code_from_file(example['Unoptimized code '].strip()),
        'optimized_code_text': read_code_from_file(example['Optimized code '].strip())
    }
)

# Remove the old columns containing only filenames from the temporary dataset (copied from aVJzRjKvfdIK)
dataset_with_code = dataset_with_code.remove_columns(['Unoptimized code ', 'Optimized code '])

# Now, reassign the temporary dataset to the original variable name
dataset = dataset_with_code

# Load the tokenizer (copied from aVJzRjKvfdIK)
tokenizer = AutoTokenizer.from_pretrained('Salesforce/codet5-base')

# Define preprocess_function (copied from aVJzRjKvfdIK)
def preprocess_function(examples):
    # Use the column names with code content
    inputs = ["optimize python: " + code for code in examples['unoptimized_code_text']]
    targets = examples['optimized_code_text']
    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding='max_length')
    labels = tokenizer(targets, max_length=512, truncation=True, padding='max_length')
    model_inputs['labels'] = labels['input_ids']
    return model_inputs

# Tokenize the dataset (copied from aVJzRjKvfdIK)
tokenized_dataset = dataset.map(preprocess_function, batched=True, remove_columns=dataset.column_names)
tokenized_dataset = tokenized_dataset.train_test_split(test_size=0.1)  # 90/10 split

# Now proceed with model loading and training
model = T5ForConditionalGeneration.from_pretrained('Salesforce/codet5-base')

training_args = TrainingArguments(
    output_dir='./code-optimizer',
    num_train_epochs=3,
    per_device_train_batch_size=4,  # Adjust if OOM error
    per_device_eval_batch_size=4,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    save_strategy='epoch',
    load_best_model_at_end=False,  # Disabled to resolve conflict
    fp16=True,  # For GPU efficiency
    report_to = None, # Disable WANDB integration
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['test'],
    tokenizer=tokenizer,
)

trainer.train()  # Start training!
trainer.save_model()  # Save the fine-tuned model

Map:   0%|          | 0/28 [00:00<?, ? examples/s]

Map:   0%|          | 0/28 [00:00<?, ? examples/s]

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(


Step,Training Loss


Step 7: Evaluate the Model

In [16]:
from evaluate import load
import numpy as np
import torch

rouge = load('rouge')

# Get input IDs for the test set
test_input_ids = tokenized_dataset['test']['input_ids']
test_labels = tokenized_dataset['test']['labels']

# Generate predictions using model.generate()
model.eval() # Set model to evaluation mode
with torch.no_grad():
    generated_ids = model.generate(
        torch.tensor(test_input_ids).to(model.device), # Move input IDs to model's device
        max_length=512,
        num_beams=4,
        early_stopping=True,
        # Removed temperature as it's not a valid generation flag
    )

# Decode generated predictions and reference labels
decoded_preds = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
decoded_labels = tokenizer.batch_decode(test_labels, skip_special_tokens=True)

# Compute ROUGE scores
results = rouge.compute(predictions=decoded_preds, references=decoded_labels)
print(results)

{'rouge1': np.float64(0.0), 'rouge2': np.float64(0.0), 'rougeL': np.float64(0.0), 'rougeLsum': np.float64(0.0)}


Step 8: Inference – User Input for Optimized Code

In [19]:
def optimize_code(input_code):
    # Prefix for inference
    input_text = "optimize python: " + input_code
    inputs = tokenizer(input_text, return_tensors='pt', max_length=512, truncation=True)

    # Generate optimized code
    with torch.no_grad():
        # Move input tensor to the same device as the model
        input_ids = inputs.input_ids.to(model.device)
        attention_mask = inputs.attention_mask.to(model.device) if inputs.attention_mask is not None else None

        outputs = model.generate(
            input_ids,
            attention_mask=attention_mask,
            max_length=512,
            num_beams=4,  # Beam search for better outputs
            early_stopping=True,
            temperature=0.7,  # Creativity balance
        )

    optimized = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return optimized

# Example usage
user_code = """
# Unoptimized example
result = []
for i in range(1000):
    if i % 2 == 0:
        result.append(i * 2)
"""
print("Optimized Code:\n", optimize_code(user_code))

Optimized Code:
 result.append(i*2)result.append(i*2)result.append(i*2)
