In [1]:
import pandas as pd
import torch
from datasets import Dataset
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer
)
from sklearn.model_selection import train_test_split
from peft import (
    get_peft_model,
    LoraConfig,
    TaskType,
    prepare_model_for_kbit_training
)
import bitsandbytes as bnb
from transformers import BitsAndBytesConfig

'NoneType' object has no attribute 'cadam32bit_grad_fp32'


  warn("The installed version of bitsandbytes was compiled without GPU support. "


In [2]:
test_file = '../data/raw/test.jsonl'
train_file = '../data/raw/train.jsonl'

test_df = pd.read_json(test_file, lines=True)
train_df = pd.read_json(train_file, lines=True)

train_df
test_df

Unnamed: 0,file,function,version_data,diff_code,diff_docstring,whitespace_only_code,whitespace_only_docstring,file_path,filename,project,owner
0,tests_test_database.py,test_timewindows,[{'commit_date_time': '2021-08-09 21:00:11+02:...,def test_timewindows(database):\n \n-...,- unit tests for addNewTW and getFirstTWforPro...,False,False,tests/test_database.py,test_database.py,StratosphereLinuxIPS,stratosphereips
1,tests_test_database.py,test_getProfileIdFromIP,[{'commit_date_time': '2024-01-27 22:28:53+02:...,def test_getProfileIdFromIP():\n \n \n...,- unit test for addProfile and getProfileIdFro...,False,False,tests/test_database.py,test_database.py,StratosphereLinuxIPS,stratosphereips
2,erpnext_stock_doctype_delivery_trip_delivery_t...,DeliveryTrip.get_directions,[{'commit_date_time': '2021-11-05 11:16:29+05:...,"def get_directions(self, route, optimize):\n...",Retrieve map directions for a given route an...,False,False,erpnext/stock/doctype/delivery_trip/delivery_t...,delivery_trip.py,erpnext,frappe
3,torch_ao_quantization_fx__model_report_detecto...,PerChannelDetector.determine_observer_insert_p...,[{'commit_date_time': '2022-06-22 12:41:22+00:...,"- def determine_observer_insert_points(self, m...","r""""""\n- There is no observers insert...",False,False,torch/ao/quantization/fx/_model_report/detecto...,detector.py,pytorch,pytorch
4,torch_ao_quantization_fx__model_report_detecto...,InputWeightEqualizationDetector.generate_detec...,[{'commit_date_time': '2022-12-16 08:09:33+00:...,"def generate_detector_report(self, model: Gr...","r""""""\n Determines whether input wei...",False,False,torch/ao/quantization/fx/_model_report/detecto...,detector.py,pytorch,pytorch
...,...,...,...,...,...,...,...,...,...,...,...
2268,zipline_algorithm.py,TradingAlgorithm.order_target_value,[{'commit_date_time': '2015-06-11 10:14:07-04:...,"def order_target_value(self, sid, target,\n ...",Place an order to adjust a position to a tar...,False,False,zipline/algorithm.py,algorithm.py,zipline,quantopian
2269,zipline_algorithm.py,TradingAlgorithm._create_generator,[{'commit_date_time': '2013-03-20 12:12:33-04:...,"- def _create_generator(self, sim_params):\n+ ...",Create a basic generator setup using the sou...,False,False,zipline/algorithm.py,algorithm.py,zipline,quantopian
2270,zipline_algorithm.py,TradingAlgorithm.symbol,[{'commit_date_time': '2018-07-18 10:14:07-04:...,"- def symbol(self, symbol_str):\n+ def symbol(...",Lookup an Equity by its ticker symbol.\n \n...,False,False,zipline/algorithm.py,algorithm.py,zipline,quantopian
2271,zipline_algorithm.py,TradingAlgorithm.continuous_future,[{'commit_date_time': '2017-03-29 08:49:12-04:...,"def continuous_future(self,\n ...",Create a specifier for a continuous contract...,False,False,zipline/algorithm.py,algorithm.py,zipline,quantopian


In [3]:
# 1. Load and prepare your dataset
def load_dataset(file_path):
    test_file = '../data/raw/test.jsonl'
    train_file = '../data/raw/train.jsonl'

    test_df = pd.read_json(test_file, lines=True)
    train_df = pd.read_json(train_file, lines=True)    

    # # Split into train/validation/test
    # train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
    # train_df, val_df = train_test_split(train_df, test_size=0.1, random_state=42)
    
    # Convert to Hugging Face datasets
    # train_dataset = Dataset.from_pandas(train_df)
    # val_dataset = Dataset.from_pandas(val_df)
    # test_dataset = Dataset.from_pandas(test_df)
    train_dataset = Dataset.from_pandas(train_df)
    val_dataset = Dataset.from_pandas(train_df)
    test_dataset = Dataset.from_pandas(test_df)
    
    return train_dataset, val_dataset, test_dataset

# 2. Tokenization functions
def preprocess_function(examples, tokenizer, max_input_length=512, max_target_length=128):
    inputs = examples['diff_code']
    targets = examples['diff_docstring']
    
    # Add a prefix to help the model understand the task
    model_inputs = tokenizer(
        ["Generate a comment for this code: " + code for code in inputs],
        max_length=max_input_length,
        truncation=True,
        padding="max_length",
    )
    
    # Tokenize targets
    labels = tokenizer(
        targets, 
        max_length=max_target_length, 
        truncation=True,
        padding="max_length"
    )
    
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# 3. Main fine-tuning function with LoRA and quantization
def finetune_code_comment_model(dataset_path, model_name="Salesforce/codet5p-220m", output_dir="./code-comment-model"):
    # Load datasets
    train_dataset, val_dataset, test_dataset = load_dataset(dataset_path)
    
    # Load tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    
    # Configure quantization
    quantization_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4"
    )
    
    # Load model with quantization
    model = AutoModelForSeq2SeqLM.from_pretrained(
        model_name,
        quantization_config=quantization_config,
        device_map="auto"
    )
    
    # Prepare model for k-bit training
    model = prepare_model_for_kbit_training(model)
    
    # Define LoRA configuration
    peft_config = LoraConfig(
        task_type=TaskType.SEQ_2_SEQ_LM,
        inference_mode=False,
        r=8,  # rank
        lora_alpha=32,
        lora_dropout=0.1,
        # Target modules depend on the model architecture
        # For CodeT5, we typically want to adapt attention layers
        target_modules=["q", "v", "k", "o", "wi", "wo"],
    )
    
    # Apply LoRA
    model = get_peft_model(model, peft_config)
    
    # Print trainable parameters
    model.print_trainable_parameters()
    
    # Preprocess datasets
    tokenized_train = train_dataset.map(
        lambda examples: preprocess_function(examples, tokenizer),
        batched=True
    )
    tokenized_val = val_dataset.map(
        lambda examples: preprocess_function(examples, tokenizer),
        batched=True
    )
    
    # Data collator
    data_collator = DataCollatorForSeq2Seq(
        tokenizer=tokenizer,
        model=model,
        pad_to_multiple_of=8
    )
    
    # Training arguments
    training_args = Seq2SeqTrainingArguments(
        output_dir=output_dir,
        evaluation_strategy="epoch",
        save_strategy="epoch",  # Match evaluation_strategy
        learning_rate=1e-4,  # Slightly higher learning rate for LoRA
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        gradient_accumulation_steps=4,  # Further reduce memory requirements
        weight_decay=0.01,
        save_total_limit=3,
        num_train_epochs=3,
        predict_with_generate=True,
        fp16=True,  # Mixed precision training
        report_to="tensorboard",
        load_best_model_at_end=True,
        logging_steps=50,
    )
    
    # Initialize trainer
    trainer = Seq2SeqTrainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_val,
        tokenizer=tokenizer,
        data_collator=data_collator,
    )
    
    # Fine-tune the model
    trainer.train()
    
    # Save the adapter model
    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)
    
    # Evaluate on test set
    print("Evaluating fine-tuned model...")
    test_results = evaluate_model(model, tokenizer, test_dataset)
    print(f"Test Results: {test_results}")
    
    return model, tokenizer

# 4. Evaluation function
def evaluate_model(model, tokenizer, test_dataset):
    # A simple evaluation function
    def generate_comment(code):
        inputs = tokenizer("Generate a comment for this code: " + code, return_tensors="pt")
        inputs = {k: v.to(model.device) for k, v in inputs.items()}
        
        outputs = model.generate(**inputs, max_length=128)
        return tokenizer.decode(outputs[0], skip_special_tokens=True)
    
    # Generate comments for a few examples
    examples = test_dataset.select(range(min(5, len(test_dataset))))
    for i, example in enumerate(examples):
        code = example['diff_code']
        expected = example['diff_docstring']
        generated = generate_comment(code)
        
        print(f"Example {i+1}:")
        print(f"Code: {code[:100]}...")
        print(f"Expected: {expected}")
        print(f"Generated: {generated}")
        print("-" * 50)
    
    # Here you would implement proper evaluation metrics
    return {"status": "Evaluation complete"}

# 5. Inference with the trained model
def load_and_use_model(model_path):
    # Load the fine-tuned model and tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    
    # For inference, we can load in 8-bit to save even more memory
    quantization_config = BitsAndBytesConfig(
        load_in_8bit=True,
        device_map="auto"
    )
    
    model = AutoModelForSeq2SeqLM.from_pretrained(
        model_path,
        quantization_config=quantization_config
    )
    
    # Example usage
    code_snippet = """
    def calculate_average(numbers):
        total = sum(numbers)
        return total / len(numbers)
    """
    
    # Generate comment
    inputs = tokenizer("Generate a comment for this code: " + code_snippet, return_tensors="pt")
    outputs = model.generate(**inputs, max_length=128)
    generated_comment = tokenizer.decode(outputs[0], skip_special_tokens=True)
    
    print(f"Generated comment: {generated_comment}")
    return model, tokenizer

# 6. Usage example
if __name__ == "__main__":
    # Replace with your dataset path
    dataset_path = "path/to/your/code_comment_dataset.csv"
    
    # You can use a larger model with these memory savings
    # "Salesforce/codet5p-770m" or even "Salesforce/codet5p-2b" 
    model, tokenizer = finetune_code_comment_model(
        dataset_path=dataset_path,
        model_name="Salesforce/codet5p-770m",
        output_dir="./code-comment-lora-model"
    )
    
    # Load and use the model for inference
    load_and_use_model("./code-comment-lora-model")

ImportError: Using `bitsandbytes` 4-bit quantization requires the latest version of bitsandbytes: `pip install -U bitsandbytes`

In [6]:
!pip3 install --pre torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/nightly/cpu

Looking in indexes: https://pypi.org/simple, https://download.pytorch.org/whl/nightly/cpu
Collecting sympy==1.13.1 (from torch)
  Downloading https://download.pytorch.org/whl/nightly/sympy-1.13.1-py3-none-any.whl (6.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.2/6.2 MB[0m [31m17.2 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting torch
  Using cached torch-2.6.0-cp312-none-macosx_11_0_arm64.whl.metadata (28 kB)
Using cached torch-2.6.0-cp312-none-macosx_11_0_arm64.whl (66.5 MB)
Installing collected packages: sympy, torch
  Attempting uninstall: sympy
    Found existing installation: sympy 1.13.3
    Uninstalling sympy-1.13.3:
      Successfully uninstalled sympy-1.13.3
  Attempting uninstall: torch
    Found existing installation: torch 2.6.0.dev20241112
    Uninstalling torch-2.6.0.dev20241112:
      Successfully uninstalled torch-2.6.0.dev20241112
Successfully installed sympy-1.13.1 torch-2.6.0
