In [1]:
import pandas as pd
import json
import torch
from datasets import Dataset
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer
)
from sklearn.model_selection import train_test_split
from peft import (
    get_peft_model,
    LoraConfig,
    TaskType,
    prepare_model_for_kbit_training
)
import bitsandbytes as bnb
from transformers import BitsAndBytesConfig

'NoneType' object has no attribute 'cadam32bit_grad_fp32'


  warn("The installed version of bitsandbytes was compiled without GPU support. "


In [2]:
# Locations of the raw JSONL splits, keyed by split name.
files = dict(
    test='../data/raw/test.jsonl',
    train='../data/raw/train.jsonl',
)

def readDataset(file):
    """Load a JSON Lines dataset and flatten it to one row per record.

    Every non-blank line of ``file`` is parsed as a JSON object. For each
    object the ``docstring`` and ``code`` are taken from the LAST entry of its
    ``version_data`` list, alongside the top-level ``function``,
    ``file_path`` and ``filename`` fields.

    Args:
        file: Path of the JSONL file to read.

    Returns:
        pandas.DataFrame with the columns ``docstring``, ``code``,
        ``function``, ``file_path`` and ``filename`` (in that order). A file
        without records yields an empty frame that still has these columns.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks one of the expected fields.
        IndexError: If a record's ``version_data`` list is empty.
    """
    columns = ['docstring', 'code', 'function', 'file_path', 'filename']
    records = []

    # Read as UTF-8 explicitly so parsing does not depend on the platform's
    # default encoding; the handle gets its own name so it no longer shadows
    # the ``file`` argument.
    with open(file, 'r', encoding='utf-8') as handle:
        for line in handle:
            if not line.strip():
                # Blank lines (e.g. a trailing newline) carry no record.
                continue
            data_point = json.loads(line)
            latest = data_point['version_data'][-1]
            records.append({
                'docstring': latest['docstring'],
                'code': latest['code'],
                'function': data_point['function'],
                'file_path': data_point['file_path'],
                'filename': data_point['filename'],
            })

    return pd.DataFrame(records, columns=columns)

# Read every recognised split once, then bind the two frames. A split that is
# missing from `files` falls back to an empty DataFrame.
loaded = {}
for split, location in files.items():
    if split in ('test', 'train'):
        loaded[split] = readDataset(location)

test_df = loaded.get('test', pd.DataFrame())
train_df = loaded.get('train', pd.DataFrame())

# Only the final bare expression of a cell is rendered as its output.
test_df
train_df

Unnamed: 0,docstring,code,function,file_path,filename
0,fetches information on multiple orders made by...,"def fetch_orders(self, symbol: Str = None, sin...",oceanex.fetch_orders,python/ccxt/oceanex.py,oceanex.py
1,Test if the accelerator is set to `tpu` when d...,def test_accelerator_set_when_using_tpu(device...,test_accelerator_set_when_using_tpu,tests/tests_pytorch/models/test_tpu.py,test_tpu.py
2,Initialize data storage.,"def __init__(self, hass: HomeAssistant, legacy...",StoredData.__init__,homeassistant/components/feedreader/__init__.py,__init__.py
3,Fetches the history of funding rates\n ...,"def fetch_funding_rate_history(self, symbol=No...",bitmex.fetch_funding_rate_history,python/ccxt/bitmex.py,bitmex.py
4,see https://www.bitmex.com/api/explorer/#not /...,"def fetch_my_trades(self, symbol: Optional[str...",bitmex.fetch_my_trades,python/ccxt/bitmex.py,bitmex.py
...,...,...,...,...,...
2295,Inspect any Python object.\n\n * inspect(<O...,"def inspect(\n obj: Any,\n *,\n conso...",inspect,rich/__init__.py,__init__.py
2296,Pretty prints JSON. Output will be valid JSON....,def print_json(\n json: Optional[str] = Non...,print_json,rich/__init__.py,__init__.py
2297,Return true if safety issue detected.,def is_on(self):\n \n parent_is_...,HomematicipSecuritySensorGroup.is_on,homeassistant/components/homematicip_cloud/bin...,binary_sensor.py
2298,validates an input instance before a convex-hu...,def _validate_input(points):\n \n \n ...,_validate_input,divide_and_conquer/convex_hull.py,convex_hull.py


In [3]:
# 1. Load and prepare your dataset
def load_dataset(file_path, train_file='../data/raw/train.jsonl',
                 test_file='../data/raw/test.jsonl', val_size=0.1, seed=42):
    """Load the JSONL splits and return train/validation/test datasets.

    The validation set is held out from the training file, so evaluation
    during training (and best-checkpoint selection) is not performed on
    examples the model was trained on.

    Args:
        file_path: Not used by this function; kept so existing callers that
            pass a dataset path keep working.
        train_file: Path of the JSON Lines file with the training records.
        test_file: Path of the JSON Lines file with the test records.
        val_size: Portion of the training records held out for validation;
            forwarded as ``test_size`` to ``train_test_split``.
        seed: Random state for the train/validation split, making the split
            reproducible across runs.

    Returns:
        Tuple ``(train_dataset, val_dataset, test_dataset)`` of
        ``datasets.Dataset`` objects.

    Raises:
        ValueError: Propagated from ``train_test_split`` when the training
            file has too few rows to be split with ``val_size``.
    """
    test_df = pd.read_json(test_file, lines=True)
    train_df = pd.read_json(train_file, lines=True)

    # Carve the validation split out of the training data instead of reusing
    # the full training frame for both roles.
    train_df, val_df = train_test_split(
        train_df, test_size=val_size, random_state=seed
    )

    # Reset the shuffled indices so Dataset.from_pandas does not carry the
    # old row index along as an extra column.
    train_dataset = Dataset.from_pandas(train_df.reset_index(drop=True))
    val_dataset = Dataset.from_pandas(val_df.reset_index(drop=True))
    test_dataset = Dataset.from_pandas(test_df)

    return train_dataset, val_dataset, test_dataset

# 2. Tokenization functions
def preprocess_function(examples, tokenizer, max_input_length=512, max_target_length=128):
    """Tokenize a batch of code/docstring pairs for seq2seq training.

    Args:
        examples: Batch mapping with the keys ``'diff_code'`` and
            ``'diff_docstring'``, each holding a list of strings.
        tokenizer: Callable tokenizer; invoked with a list of texts plus
            ``max_length``, ``truncation`` and ``padding`` and expected to
            return a mapping containing ``"input_ids"``.
        max_input_length: Length the prompt + code inputs are truncated and
            padded to.
        max_target_length: Length the target docstrings are truncated and
            padded to.

    Returns:
        The tokenizer output for the inputs, with an added ``"labels"`` entry
        holding the target token ids. Padding positions in the labels are set
        to -100.
    """
    inputs = examples['diff_code']
    targets = examples['diff_docstring']

    # Add a prefix to help the model understand the task
    model_inputs = tokenizer(
        ["Generate a comment for this code: " + code for code in inputs],
        max_length=max_input_length,
        truncation=True,
        padding="max_length",
    )

    # Tokenize targets
    labels = tokenizer(
        targets,
        max_length=max_target_length,
        truncation=True,
        padding="max_length"
    )

    # The targets are padded to a fixed length here, so the padding ids must be
    # replaced by -100 (the index ignored by the loss); otherwise the model is
    # also trained to predict padding tokens.
    pad_id = getattr(tokenizer, "pad_token_id", None)
    model_inputs["labels"] = [
        [(token if token != pad_id else -100) for token in sequence]
        for sequence in labels["input_ids"]
    ]
    return model_inputs

# 3. Main fine-tuning function with LoRA and quantization
def finetune_code_comment_model(dataset_path, model_name="Salesforce/codet5p-220m", output_dir="./code-comment-model"):
    """Fine-tune a seq2seq checkpoint with 4-bit quantization and LoRA.

    Steps, in order: load the three dataset splits, load the tokenizer and a
    4-bit quantized model, wrap the model with LoRA adapters, tokenize the
    train/validation splits, train with ``Seq2SeqTrainer``, save the model and
    tokenizer to ``output_dir`` and finally print a few sample generations
    from the test split.

    Args:
        dataset_path: Forwarded to ``load_dataset``.
        model_name: Checkpoint identifier given to the tokenizer and model
            loaders.
        output_dir: Directory used for trainer checkpoints and for the final
            saved model/tokenizer.

    Returns:
        Tuple ``(model, tokenizer)`` with the LoRA-wrapped model after
        training and the tokenizer.
    """
    train_split, val_split, test_split = load_dataset(dataset_path)

    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # 4-bit NF4 weights with double quantization; compute runs in float16.
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=torch.float16,
    )
    base_model = AutoModelForSeq2SeqLM.from_pretrained(
        model_name,
        quantization_config=bnb_config,
        device_map="auto",
    )
    base_model = prepare_model_for_kbit_training(base_model)

    # LoRA adapters (rank 8) on the listed module names.
    lora_config = LoraConfig(
        task_type=TaskType.SEQ_2_SEQ_LM,
        inference_mode=False,
        r=8,
        lora_alpha=32,
        lora_dropout=0.1,
        target_modules=["q", "v", "k", "o", "wi", "wo"],
    )
    model = get_peft_model(base_model, lora_config)
    model.print_trainable_parameters()

    def tokenize_batch(batch):
        """Tokenize one batch with the default length limits."""
        return preprocess_function(batch, tokenizer)

    tokenized_train = train_split.map(tokenize_batch, batched=True)
    tokenized_val = val_split.map(tokenize_batch, batched=True)

    collator = DataCollatorForSeq2Seq(
        tokenizer=tokenizer,
        model=model,
        pad_to_multiple_of=8,
    )

    training_args = Seq2SeqTrainingArguments(
        output_dir=output_dir,
        # Evaluate and checkpoint on the same schedule.
        evaluation_strategy="epoch",
        save_strategy="epoch",
        save_total_limit=3,
        load_best_model_at_end=True,
        # Optimisation settings.
        learning_rate=1e-4,
        weight_decay=0.01,
        num_train_epochs=3,
        # Batch of 8 per device, gradients accumulated over 4 steps.
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        gradient_accumulation_steps=4,
        fp16=True,
        predict_with_generate=True,
        # Logging.
        report_to="tensorboard",
        logging_steps=50,
    )

    trainer = Seq2SeqTrainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_val,
        tokenizer=tokenizer,
        data_collator=collator,
    )
    trainer.train()

    # Persist the trained model and the tokenizer side by side.
    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)

    print("Evaluating fine-tuned model...")
    test_results = evaluate_model(model, tokenizer, test_split)
    print(f"Test Results: {test_results}")

    return model, tokenizer

# 4. Evaluation function
def evaluate_model(model, tokenizer, test_dataset, num_examples=5,
                   max_input_length=512, max_target_length=128):
    """Print generated comments for the first few test examples.

    This is a qualitative smoke test only: no metric is computed.

    Args:
        model: Model exposing ``generate``, ``device``, ``eval`` and ``train``.
        tokenizer: Tokenizer used to encode the prompt and decode the output.
        test_dataset: Dataset supporting ``len`` and ``select``; each example
            is a mapping with ``'diff_code'`` and ``'diff_docstring'``.
        num_examples: Maximum number of examples to generate for.
        max_input_length: Prompts are truncated to this many tokens, matching
            the default truncation length of ``preprocess_function``.
        max_target_length: ``max_length`` passed to ``model.generate``.

    Returns:
        ``{"status": "Evaluation complete"}``.
    """
    def generate_comment(code):
        # Truncate the prompt so inference sees inputs no longer than the ones
        # produced for training.
        inputs = tokenizer(
            "Generate a comment for this code: " + code,
            return_tensors="pt",
            truncation=True,
            max_length=max_input_length,
        )
        inputs = {k: v.to(model.device) for k, v in inputs.items()}

        outputs = model.generate(**inputs, max_length=max_target_length)
        return tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Generate in eval mode (dropout disabled) and restore the previous mode
    # afterwards so the caller's model is left as it was.
    was_training = getattr(model, "training", False)
    model.eval()
    try:
        # Generate comments for a few examples
        examples = test_dataset.select(range(min(num_examples, len(test_dataset))))
        for i, example in enumerate(examples):
            code = example['diff_code']
            expected = example['diff_docstring']
            generated = generate_comment(code)

            print(f"Example {i+1}:")
            print(f"Code: {code[:100]}...")
            print(f"Expected: {expected}")
            print(f"Generated: {generated}")
            print("-" * 50)
    finally:
        if was_training:
            model.train()

    # Here you would implement proper evaluation metrics
    return {"status": "Evaluation complete"}

# 5. Inference with the trained model
def load_and_use_model(model_path):
    """Load a saved model in 8-bit and generate a comment for a sample snippet.

    Args:
        model_path: Directory (or identifier) handed to the tokenizer and
            model ``from_pretrained`` loaders.

    Returns:
        Tuple ``(model, tokenizer)`` of the loaded objects.

    NOTE(review): ``finetune_code_comment_model`` saves a PEFT-wrapped model
    into its output directory; confirm the installed transformers/peft
    versions can load that directory through ``AutoModelForSeq2SeqLM``.
    """
    # Load the fine-tuned model and tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_path)

    # For inference, we can load in 8-bit to save even more memory
    quantization_config = BitsAndBytesConfig(load_in_8bit=True)

    # device_map is an argument of from_pretrained, not of the quantization
    # config, so it is passed here where it takes effect.
    model = AutoModelForSeq2SeqLM.from_pretrained(
        model_path,
        quantization_config=quantization_config,
        device_map="auto",
    )

    # Example usage
    code_snippet = """
    def calculate_average(numbers):
        total = sum(numbers)
        return total / len(numbers)
    """

    # Generate comment; the encoded inputs are moved to the model's device
    # first, as evaluate_model does, so generate does not hit a device mismatch.
    inputs = tokenizer("Generate a comment for this code: " + code_snippet, return_tensors="pt")
    inputs = {k: v.to(model.device) for k, v in inputs.items()}
    outputs = model.generate(**inputs, max_length=128)
    generated_comment = tokenizer.decode(outputs[0], skip_special_tokens=True)

    print(f"Generated comment: {generated_comment}")
    return model, tokenizer

# 6. Usage example
if __name__ == "__main__":
    # Placeholder location: point this at your own dataset.
    # NOTE: load_dataset reads fixed JSONL paths and does not use this value.
    dataset_path = "path/to/your/code_comment_dataset.csv"

    # Train with the 770m checkpoint; "Salesforce/codet5p-2b" is another
    # option mentioned for this memory-saving setup.
    model, tokenizer = finetune_code_comment_model(
        dataset_path,
        "Salesforce/codet5p-770m",
        "./code-comment-lora-model",
    )

    # Reload what was just saved and run the sample inference.
    load_and_use_model(model_path="./code-comment-lora-model")

ImportError: Using `bitsandbytes` 4-bit quantization requires the latest version of bitsandbytes: `pip install -U bitsandbytes`

In [6]:
# %pip installs into the environment of the running kernel, whereas a shell
# `pip3` may resolve to a different interpreter.
%pip install --pre torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/nightly/cpu

Looking in indexes: https://pypi.org/simple, https://download.pytorch.org/whl/nightly/cpu
Collecting sympy==1.13.1 (from torch)
  Downloading https://download.pytorch.org/whl/nightly/sympy-1.13.1-py3-none-any.whl (6.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.2/6.2 MB[0m [31m17.2 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting torch
  Using cached torch-2.6.0-cp312-none-macosx_11_0_arm64.whl.metadata (28 kB)
Using cached torch-2.6.0-cp312-none-macosx_11_0_arm64.whl (66.5 MB)
Installing collected packages: sympy, torch
  Attempting uninstall: sympy
    Found existing installation: sympy 1.13.3
    Uninstalling sympy-1.13.3:
      Successfully uninstalled sympy-1.13.3
  Attempting uninstall: torch
    Found existing installation: torch 2.6.0.dev20241112
    Uninstalling torch-2.6.0.dev20241112:
      Successfully uninstalled torch-2.6.0.dev20241112
Successfully installed sympy-1.13.1 torch-2.6.0
