In [9]:
import pandas as pd
import json
import torch
from datasets import Dataset
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer
)
from sklearn.model_selection import train_test_split
from peft import (
    get_peft_model,
    LoraConfig,
    TaskType
)
import os
import platform

In [None]:
files = {
    'test': '../data/raw/test.jsonl',
    'train': '../data/raw/train.jsonl'
}

def readDataset(file):
    test_set, train_set = [], []
    # Open and parse the JSONL file
    with open(file, 'r') as file:
        dataset = [json.loads(line) for line in file]

    for data_point in dataset:
        test_set.append({
            'docstring': data_point['version_data'][-1]['docstring'],
            'code': data_point['version_data'][-1]['code'],
            'function': data_point['function'],
            'file_path': data_point['file_path'],
            'filename': data_point['filename'],
        })
    return pd.DataFrame(test_set)

test_df, train_df = pd.DataFrame(), pd.DataFrame()
for name, path in files.items():
    if name == 'test':
        test_df = readDataset(path)
    elif name == 'train':
        train_df = readDataset(path)

test_df
train_df

Unnamed: 0,docstring,code,function,file_path,filename
0,fetches information on multiple orders made by...,"def fetch_orders(self, symbol: Str = None, sin...",oceanex.fetch_orders,python/ccxt/oceanex.py,oceanex.py
1,Test if the accelerator is set to `tpu` when d...,def test_accelerator_set_when_using_tpu(device...,test_accelerator_set_when_using_tpu,tests/tests_pytorch/models/test_tpu.py,test_tpu.py
2,Initialize data storage.,"def __init__(self, hass: HomeAssistant, legacy...",StoredData.__init__,homeassistant/components/feedreader/__init__.py,__init__.py
3,Fetches the history of funding rates\n ...,"def fetch_funding_rate_history(self, symbol=No...",bitmex.fetch_funding_rate_history,python/ccxt/bitmex.py,bitmex.py
4,see https://www.bitmex.com/api/explorer/#not /...,"def fetch_my_trades(self, symbol: Optional[str...",bitmex.fetch_my_trades,python/ccxt/bitmex.py,bitmex.py
...,...,...,...,...,...
2295,Inspect any Python object.\n\n * inspect(<O...,"def inspect(\n obj: Any,\n *,\n conso...",inspect,rich/__init__.py,__init__.py
2296,Pretty prints JSON. Output will be valid JSON....,def print_json(\n json: Optional[str] = Non...,print_json,rich/__init__.py,__init__.py
2297,Return true if safety issue detected.,def is_on(self):\n \n parent_is_...,HomematicipSecuritySensorGroup.is_on,homeassistant/components/homematicip_cloud/bin...,binary_sensor.py
2298,validates an input instance before a convex-hu...,def _validate_input(points):\n \n \n ...,_validate_input,divide_and_conquer/convex_hull.py,convex_hull.py


In [7]:
# Check if running on Apple Silicon
is_apple_silicon = platform.processor() == 'arm' or "Apple" in platform.processor()
print(f"Running on Apple Silicon: {is_apple_silicon}")

# Check if MPS is available
mps_available = hasattr(torch.backends, "mps") and torch.backends.mps.is_available()
device = torch.device("mps" if mps_available else "cpu")
print(f"Using device: {device}")

# 1. Load and prepare your dataset
def load_dataset(file_path):
    test_file = '../data/raw/test.jsonl'
    train_file = '../data/raw/train.jsonl'

    test_df = pd.read_json(test_file, lines=True)
    train_df = pd.read_json(train_file, lines=True)    

    # # Split into train/validation/test
    # train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
    # train_df, val_df = train_test_split(train_df, test_size=0.1, random_state=42)
    
    # Convert to Hugging Face datasets
    # train_dataset = Dataset.from_pandas(train_df)
    # val_dataset = Dataset.from_pandas(val_df)
    # test_dataset = Dataset.from_pandas(test_df)
    train_dataset = Dataset.from_pandas(train_df)
    val_dataset = Dataset.from_pandas(train_df)
    test_dataset = Dataset.from_pandas(test_df)
    
    return train_dataset, val_dataset, test_dataset

# 2. Tokenization functions
def preprocess_function(examples, tokenizer, max_input_length=512, max_target_length=128):
    inputs = examples['diff_code']
    targets = examples['diff_docstring']
    
    # Add a prefix to help the model understand the task
    model_inputs = tokenizer(
        ["Generate a comment for this code: " + code for code in inputs],
        max_length=max_input_length,
        truncation=True,
        padding="max_length",
    )
    
    # Tokenize targets
    labels = tokenizer(
        targets, 
        max_length=max_target_length, 
        truncation=True,
        padding="max_length"
    )
    
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# 3. Main fine-tuning function using standard MPS for Apple Silicon
def finetune_code_comment_model(dataset_path, model_name="Salesforce/codet5p-220m", output_dir="./code-comment-model"):
    # Load datasets
    train_dataset, val_dataset, test_dataset = load_dataset(dataset_path)
    
    # Load tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    
    # Load model with appropriate settings for Apple Silicon
    if mps_available:
        print("Loading model with MPS acceleration...")
        # Load model and convert to half precision for better performance on MPS
        model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
        model = model.half().to(device)  # Half precision (fp16) for better MPS performance
    else:
        print("Loading standard model...")
        # model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)
        model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
        model = model.half().to(device)
    
    # Define LoRA configuration
    peft_config = LoraConfig(
        task_type=TaskType.SEQ_2_SEQ_LM,
        inference_mode=False,
        r=8,  # rank (smaller rank = less memory)
        lora_alpha=32,
        lora_dropout=0.1,
        # Target modules depend on the model architecture
        target_modules=["q", "v", "k", "o", "wi", "wo"],
    )
    
    # Apply LoRA - wrap in try/except in case of compatibility issues
    try:
        model = get_peft_model(model, peft_config)
        print("LoRA applied successfully")
        model.print_trainable_parameters()
    except Exception as e:
        print(f"Couldn't apply LoRA: {e}")
        print("Continuing with full fine-tuning")
    
    # Preprocess datasets
    tokenized_train = train_dataset.map(
        lambda examples: preprocess_function(examples, tokenizer),
        batched=True
    )
    tokenized_val = val_dataset.map(
        lambda examples: preprocess_function(examples, tokenizer),
        batched=True
    )
    
    # Data collator
    data_collator = DataCollatorForSeq2Seq(
        tokenizer=tokenizer,
        model=model,
        pad_to_multiple_of=8
    )
    
    # Apple Silicon optimized training arguments
    training_args = Seq2SeqTrainingArguments(
        output_dir=output_dir,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        learning_rate=5e-5,
        per_device_train_batch_size=4,  # Smaller batch size for Mac
        per_device_eval_batch_size=4,
        gradient_accumulation_steps=8,  # Larger gradient accumulation
        weight_decay=0.01,
        save_total_limit=3,
        num_train_epochs=3,
        predict_with_generate=True,
        # fp16=mps_available,  # Use fp16 if MPS is available
        fp16=False,  # Use fp16 if MPS is available
        report_to="tensorboard",
        load_best_model_at_end=True,
        logging_steps=10,
        # Mac-specific optimizations
        # dataloader_num_workers=max(1, os.cpu_count() // 2),
        dataloader_num_workers=0,  # Change from max(1, os.cpu_count() // 2) to 0
        dataloader_pin_memory=False,  # Disable for MPS
    )
    
    # Initialize trainer
    trainer = Seq2SeqTrainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_val,
        tokenizer=tokenizer,
        data_collator=data_collator,
    )
    
    # Fine-tune the model
    trainer.train()
    
    # Save the model
    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)
    
    # Evaluate on test set
    print("Evaluating fine-tuned model...")
    test_results = evaluate_model(model, tokenizer, test_dataset, device)
    print(f"Test Results: {test_results}")
    
    return model, tokenizer

# 4. Evaluation function
def evaluate_model(model, tokenizer, test_dataset, device):
    # A simple evaluation function
    def generate_comment(code):
        inputs = tokenizer("Generate a comment for this code: " + code, return_tensors="pt")
        inputs = {k: v.to(device) for k, v in inputs.items()}
        
        outputs = model.generate(**inputs, max_length=128)
        return tokenizer.decode(outputs[0], skip_special_tokens=True)
    
    # Generate comments for a few examples
    examples = test_dataset.select(range(min(5, len(test_dataset))))
    for i, example in enumerate(examples):
        code = example['diff_code']
        expected = example['diff_docstring']
        generated = generate_comment(code)
        
        print(f"Example {i+1}:")
        print(f"Code: {code[:100]}...")
        print(f"Expected: {expected}")
        print(f"Generated: {generated}")
        print("-" * 50)
    
    # Here you could implement proper evaluation metrics
    return {"status": "Evaluation complete"}

# 5. Load and use the model for inference
def load_and_use_model(model_path):
    # Check for MPS availability
    mps_available = hasattr(torch.backends, "mps") and torch.backends.mps.is_available()
    device = torch.device("mps" if mps_available else "cpu")
    
    # Load tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    
    # Load model with appropriate settings for Apple Silicon
    if mps_available:
        print("Loading model with MPS acceleration...")
        # For inference, using half precision offers good performance on MPS
        model = AutoModelForSeq2SeqLM.from_pretrained(model_path).half().to(device)
    else:
        print("Loading standard model...")
        model = AutoModelForSeq2SeqLM.from_pretrained(model_path).to(device)
    
    # Example usage
    code_snippet = """
    def calculate_average(numbers):
        total = sum(numbers)
        return total / len(numbers)
    """
    
    # Generate comment
    inputs = tokenizer("Generate a comment for this code: " + code_snippet, return_tensors="pt")
    inputs = {k: v.to(device) for k, v in inputs.items()}
    
    outputs = model.generate(**inputs, max_length=128)
    generated_comment = tokenizer.decode(outputs[0], skip_special_tokens=True)
    
    print(f"Generated comment: {generated_comment}")
    return model, tokenizer

# 6. Install required packages
def install_requirements():
    try:
        import pip
        
        # Install base requirements
        pip.main(['install', 'transformers', 'datasets', 'peft', 'tensorboard', 'scikit-learn'])
        
        # Check for Apple Silicon
        is_apple_silicon = platform.processor() == 'arm' or "Apple" in platform.processor()
        
        if is_apple_silicon:
            # Install PyTorch with MPS support if not already installed
            import torch
            if not hasattr(torch.backends, "mps"):
                print("Installing PyTorch with MPS support...")
                pip.main(['install', 'torch', 'torchvision', 'torchaudio'])
                
    except Exception as e:
        print(f"Error installing requirements: {e}")
        print("Please manually install required packages")

# 7. Usage example
if __name__ == "__main__":
    # Install required packages
    install_requirements()
    
    # Replace with your dataset path
    dataset_path = "path/to/your/code_comment_dataset.csv"
    
    # For Apple Silicon M1/M2/M3, use a smaller model for better performance
    model, tokenizer = finetune_code_comment_model(
        dataset_path=dataset_path,
        model_name="Salesforce/codet5p-220m",  # Use smaller model for better Mac performance
        output_dir="./code-comment-mac-model"
    )
    
    # Load and use the model for inference
    load_and_use_model("./code-comment-mac-model")

Running on Apple Silicon: True
Using device: mps


Please see https://github.com/pypa/pip/issues/5599 for advice on fixing the underlying issue.
To avoid this problem you can invoke Python with '-m pip' instead of running pip directly.


Loading model with MPS acceleration...
LoRA applied successfully
trainable params: 3,244,032 || all params: 226,126,080 || trainable%: 1.4346


Map:   0%|          | 0/2300 [00:00<?, ? examples/s]

Map:   0%|          | 0/2300 [00:00<?, ? examples/s]

  trainer = Seq2SeqTrainer(
No label_names provided for model class `PeftModelForSeq2SeqLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss
0,3.8099,3.652344
1,3.5813,3.455078
2,3.5247,3.361328


Evaluating fine-tuned model...
Example 1:
Code:   def test_timewindows(database):
        
-     # clear the database before running this test
-    ...
Expected: - unit tests for addNewTW and getFirstTWforProfile
+ unit tests for addNewTW ,getLastTWforProfile and getFirstTWforProfile
?                         +++++++++++++++++++++

Generated: - Tests the timewindows for a profile.
+ Tests the timewindows for a profile.
?                            ++++++  ++++++

      Tests the timewindows for a profile.
?                            ++++++  ++++++

      Tests the timewindows for a profile.
?                            ++++++  ++++++

      Tests the timewindows for a profile.
?                            ++++++  ++++++

      Tests the timewindows for a profile.
?                            ++++++  ++++++

      Tests
--------------------------------------------------
Example 2:
Code:   def test_getProfileIdFromIP():
      
  
      # clear the database before running this test
     

Token indices sequence length is longer than the specified maximum sequence length for this model (951 > 512). Running this sequence through the model will result in indexing errors


Example 4:
Code: - def determine_observer_insert_points(self, model):
+ def determine_observer_insert_points(self, mo...
Expected:   r"""
-         There is no observers inserted for the PerChannelDetector
+         There is no observers inserted for the PerChannelDetector.
?                                                                  +

+ 
+         Returns an empty dictionary since no observers are added or needed
Generated:   Determine the number of observers inserted in the PerChannelDetector.
  
          Parameters
          ----------
          model : nn.Module
              The model to be detected.
  
          Returns
          -------
          num_observers : int
              The number of observers inserted in the PerChannelDetector.
  
          Raises
          ------
          ValueError
              If the model is not a valid model.
  
          Raises
          ------
          ValueError
              If the model is not a valid model.
  
          Raises
  