In [3]:
from datasets import Dataset

# Load the preprocessed JSON file into a Hugging Face Dataset.
# The paths are raw strings, so every Windows separator is written exactly once;
# the previous r"C:\\Users\\..." form put doubled backslashes into the path itself.
json_path = r"C:\Users\Gattupalli Saketh\OneDrive\Desktop\Preprocessed\cuda_dataset.json"
dataset = Dataset.from_json(json_path)

# Hold out 10% of the rows as the test split; the fixed seed keeps the shuffle repeatable.
dataset = dataset.train_test_split(test_size=0.1, shuffle=True, seed=42)

# Persist both splits to disk.
output_dir = r"C:\Users\Gattupalli Saketh\OneDrive\Desktop\wide\fim_dataset"
dataset.save_to_disk(output_dir)

# Report where the data went and how large each split is.
print(f"Dataset saved to {output_dir}")
print(f"Train size: {len(dataset['train'])}")
print(f"Test size: {len(dataset['test'])}")

Generating train split: 897 examples [00:00, 4200.53 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 807/807 [00:00<00:00, 23870.60 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 90/90 [00:00<00:00, 11333.23 examples/s]

Dataset saved to C:\\Users\\Gattupalli Saketh\\OneDrive\Desktop\wide\\fim_dataset
Train size: 807
Test size: 90





In [None]:


import os
import json
import torch
from datasets import Dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments,DataCollatorForLanguageModeling
from peft import LoraConfig
from trl import SFTTrainer

# Step 4: Hugging Face authentication.
# The token is deliberately NOT written into the notebook: export HF_TOKEN in the
# environment before starting the kernel. Assigning a literal here would both leak
# the credential and overwrite a valid token that is already set.
if not os.environ.get("HF_TOKEN"):
    print("HF_TOKEN is not set; gated or private models will only load if a cached Hugging Face login exists.")

# Step 5: Load and prepare the JSON dataset
def load_json_dataset(file_path):
    """Read a UTF-8 JSON file and return its top-level array.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The parsed top-level JSON array as a Python list.

    Raises:
        json.JSONDecodeError: The file is not valid JSON (a message is printed first).
        FileNotFoundError: The file does not exist (a message is printed first).
        ValueError: The file parsed correctly but its top-level value is not a list.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as handle:
            records = json.load(handle)  # Parse the whole document in one go
    except json.JSONDecodeError as e:
        print(f"Error parsing JSON file: {e}")
        raise
    except FileNotFoundError:
        print(f"File not found: {file_path}")
        raise

    # Anything other than a top-level array is rejected.
    if not isinstance(records, list):
        raise ValueError("Expected a JSON array of objects")
    return records

def format_gemma_fim(data_point):
    """Turn one raw record into an SFT training row.

    Args:
        data_point: Mapping that may contain a "fim_text" entry.

    Returns:
        {"text": <fim_text>} with the value unchanged when "fim_text" is a
        string containing at least one non-whitespace character; otherwise None.
    """
    candidate = data_point.get("fim_text")
    # Missing keys, None and non-string values are all rejected here.
    if not isinstance(candidate, str):
        return None
    # Whitespace-only text counts as empty.
    if candidate.strip() == "":
        return None
    return {"text": candidate}

# Load and format dataset
json_file = "cuda_dataset.json"  # Update with your actual file path if different
raw_data = load_json_dataset(json_file)

# Keep only the records with a usable fim_text (format_gemma_fim returns None for the rest).
formatted_data = [
    formatted_item
    for formatted_item in map(format_gemma_fim, raw_data)
    if formatted_item is not None
]
print(f"Loaded {len(formatted_data)} valid samples from {len(raw_data)} total samples.")

# Fail early with a clear message rather than an opaque error from the split below.
if len(formatted_data) < 2:
    raise ValueError(
        f"Need at least 2 valid samples to build train/test splits, found {len(formatted_data)}"
    )

# Now create Dataset
dataset = Dataset.from_list(formatted_data)

# Split dataset into train and test; the fixed seed makes the 90/10 split reproducible
# across kernel restarts.
train_test_split = dataset.train_test_split(test_size=0.1, seed=42)
train_dataset = train_test_split["train"]
eval_dataset = train_test_split["test"]


# Load the CodeGemma 2B base model and its tokenizer.
# NOTE: no quantization_config is passed here, so the weights are loaded with the
# default from_pretrained settings rather than in 4-bit.
model_id = "google/codegemma-2b"  # Use CodeGemma 2B model for code generation
model = AutoModelForCausalLM.from_pretrained(model_id)
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Only fall back to EOS for padding when the tokenizer has no pad token of its own.
# DataCollatorForLanguageModeling replaces every pad-token label with -100, so reusing
# EOS as the pad token would also hide real end-of-sequence tokens from the loss.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"  # Prevent warnings

#  LoRA adapter settings for parameter-efficient fine-tuning
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=64,  # LoRA rank
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    # Modules that receive LoRA adapters.
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ],
)

#  Set up training arguments
training_args = TrainingArguments(
    output_dir="./gemma-cuda-finetuned",
    per_device_train_batch_size=1,  # Adjust based on GPU memory
    gradient_accumulation_steps=4,  # Accumulate gradients to simulate larger batch size
    warmup_steps=10,
    max_steps=100,  # Adjust based on dataset size
    learning_rate=2e-4,
    fp16=True,  # Mixed precision training
    logging_steps=1,
    optim="paged_adamw_8bit",
    save_strategy="epoch",
    # `evaluation_strategy` was renamed to `eval_strategy` in newer Transformers releases.
    # A strategy must be set explicitly: the default is "no", in which case `eval_steps`
    # is ignored and the eval dataset is never evaluated during training.
    eval_strategy="steps",
    eval_steps=20,  # Evaluate every 20 optimizer steps
    push_to_hub=False,  # Set to False to avoid pushing to Hub
    report_to="none"  # Disable wandb logging
)

#  Build the supervised fine-tuning trainer
# mlm=False selects the causal-LM (next-token) collation mode rather than masked-LM.
causal_lm_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)
trainer = SFTTrainer(
    model=model,
    args=training_args,
    peft_config=lora_config,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    data_collator=causal_lm_collator,
)

#  Run the fine-tuning loop
trainer.train()

#  Save the fine-tuned LoRA adapter (plus tokenizer) on its own
new_model = "gemma-cuda-finetuned"
trainer.model.save_pretrained(new_model)
tokenizer.save_pretrained(new_model)

#  Merge LoRA weights with base model (optional)
# merge_and_unload() folds the adapter deltas into the base weights and returns the
# plain base model. Simply reloading the adapter directory with from_pretrained does
# not perform that merge, so the "-merged" directory would not hold merged weights.
model = trainer.model.merge_and_unload()
model.eval()  # Leave training mode before the model is used for generation
model.save_pretrained(f"{new_model}-merged")
tokenizer.save_pretrained(f"{new_model}-merged")

#  Test the fine-tuned model
def generate_cuda_code_fim(prefix, suffix, max_tokens=500):
    """Generate the missing middle of a code snippet with the module-level model.

    Builds a fill-in-the-middle prompt from ``prefix`` and ``suffix``, samples a
    continuation from the global ``model`` using the global ``tokenizer``, and
    returns only the newly generated text.

    Args:
        prefix: Code that precedes the gap.
        suffix: Code that follows the gap.
        max_tokens: Upper bound on the number of newly generated tokens.

    Returns:
        The generated middle section as a stripped string, cut at the first FIM
        marker the model emits after it (if any).
    """
    # Use whichever device the model lives on instead of assuming "cuda:0";
    # mismatched model/input devices make generate() fail.
    device = model.device
    # NOTE(review): these marker strings must match the format used in the training
    # data's fim_text; CodeGemma's own FIM tokens are spelled "<|fim_prefix|>" etc.
    # -- confirm against the dataset.
    formatted_prompt = f"<fim_prefix>{prefix}<fim_suffix>{suffix}<fim_middle>"
    inputs = tokenizer(formatted_prompt, return_tensors="pt").to(device)
    outputs = model.generate(
        **inputs,
        max_new_tokens=max_tokens,
        eos_token_id=tokenizer.eos_token_id,
        do_sample=True,
        temperature=0.7,
        top_p=0.95
    )
    # generate() returns prompt + continuation for a causal LM. Decoding the whole
    # sequence and slicing between "<fim_middle>" and "<fim_suffix>" always produced
    # an empty string, because the prompt's "<fim_suffix>" precedes "<fim_middle>".
    # Decode only the tokens that come after the prompt instead.
    prompt_length = inputs["input_ids"].shape[-1]
    completion = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

    # If the model keeps going and emits another FIM marker, keep only the text before it.
    cut = len(completion)
    for marker in ("<fim_prefix>", "<fim_suffix>", "<fim_middle>"):
        position = completion.find(marker)
        if position != -1:
            cut = min(cut, position)
    return completion[:cut].strip()

# Smoke test: ask the model to fill the gap between the start and the end of a
# small CUDA program.
# Code preceding the gap.
test_prefix = """#include <errno.h>
#include <cuda.h>
#include <stdio.h>
#define BLOCKS  1
#define THREADS 1
__global__ void add(int *a, int *b, int *c);
int main(void)
{
    int a, b, c;"""
# Code following the gap.
# NOTE(review): the suffix starts mid-identifier ("Memcpy"), so presumably the
# generated middle is meant to end with "cuda" -- confirm this split is intentional.
test_suffix = """Memcpy(d_a, &a, size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, &b, size, cudaMemcpyHostToDevice);
    add<<< BLOCKS, THREADS >>>(d_a, d_b, d_c);
    cudaMemcpy(&c, d_c, size, cudaMemcpyDeviceToHost);
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);
    return(EXIT_SUCCESS);
}
__global__ void add(int *a, int *b, int *c)
{
    *c = *a + *b;
}"""
generated_middle = generate_cuda_code_fim(test_prefix, test_suffix)
# Heading and generated text on consecutive lines.
print("Generated CUDA Code (Middle):", generated_middle, sep="\n")

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


Loaded 894 valid samples from 897 total samples.


Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]