## Install Dependencies

In [None]:
# !pip install torch transformers datasets accelerate peft bitsandbytes

## Training Pipeline

In [None]:
!rm -r /kaggle/working/wandb/*

In [None]:
from kaggle_secrets import UserSecretsClient
import wandb
# Read the Weights & Biases API key from the Kaggle secret labelled
# "wandb_apikey" so the key never appears in the notebook itself.
user_secrets = UserSecretsClient()
secret_value_0 = user_secrets.get_secret("wandb_apikey")

# Authenticate this session with W&B using the retrieved key.
wandb.login(key=secret_value_0)

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer, DataCollatorForLanguageModeling
from datasets import load_dataset
from peft import LoraConfig, get_peft_model
import os

# Hub identifier of the base checkpoint that will be fine-tuned
MODEL_NAME = "deepseek-ai/deepseek-coder-1.3b-base"

# Load the tokenizer that matches the base checkpoint
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token # reuse the EOS token as the padding token

# Load the model weights in float16; device_map="auto" leaves device
# placement to the library.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.float16,
    device_map="auto"
)

# Load the JSON-lines files into a dataset with a "train" and a "test" split.
# format_prompt (below) reads the 'prompt' and 'completion' fields of each record.
TRAINING_DATA= "/kaggle/input/terraform-aws-custom-train/terraform_aws_docs_training_data2 (1).jsonl"
TEST_DATA ="/kaggle/input/validated-terraform/validated_terraform_data.jsonl"
dataset =  load_dataset("json", data_files={"train": TRAINING_DATA, "test": TEST_DATA})


# Format dataset for training
def format_prompt(example):
    """Render one record as a single instruction/response training string.

    Args:
        example (dict): Record providing 'prompt' and 'completion' entries.

    Returns:
        dict: A one-key mapping, 'input', holding the formatted text.
    """
    instruction = example['prompt']
    response = example['completion']
    text = f"### Instruction:\n{instruction}\n\n### Response:\n{response}"
    return {"input": text}

# Run format_prompt over the dataset so each record gains the combined 'input' text.
dataset = dataset.map(format_prompt)

# tokenize the dataset
def tokenize_function(example):
    """
    Encode the example's 'input' text with the module-level tokenizer.

    The text is truncated and padded so the encoding is always exactly
    151 tokens long, and a 'labels' entry is added as a copy of
    'input_ids'.

    Args:
        example (dict): Record holding the text to encode under 'input'.

    Returns:
        dict: The tokenizer output with an extra 'labels' entry.
    """
    sequence_length = 151  # fixed length every example is cut/padded to
    encoded = tokenizer(
        example['input'],
        max_length=sequence_length,
        truncation=True,
        padding='max_length',
    )
    # Causal LM objective: the targets are the input tokens themselves.
    encoded["labels"] = encoded["input_ids"].copy()
    return encoded

# Tokenize every record and drop the raw 'input' text column afterwards;
# only 'input' is removed here, any other original columns are kept.
tokenized_datasets = dataset.map(tokenize_function, remove_columns=['input'])

# Define LoRA configuration: rank-8 adapters (alpha 32, dropout 0.1) attached
# to the modules named q_proj and v_proj.
# task_type="CAUSAL_LM" makes get_peft_model return the causal-LM specific
# wrapper, whose forward() has an explicit signature (input_ids,
# attention_mask, labels, ...). The generic wrapper only exposes
# forward(*args, **kwargs), which can leave Trainer(remove_unused_columns=True)
# unable to tell which dataset columns to keep on some library versions.
lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
    target_modules=["q_proj", "v_proj"],
    task_type="CAUSAL_LM",
)

# Apply LoRA to model (rebinds `model` to the PEFT-wrapped model)
model = get_peft_model(model, lora_config)

# Define training arguments
# 4 sequences per device x 8 accumulation steps = 32 sequences per optimizer
# step on each device.
training_args = TrainingArguments(
    per_device_train_batch_size=4,
    gradient_accumulation_steps=8,
    warmup_steps=100,
    num_train_epochs=1.25,
    logging_steps=5,
    save_strategy="epoch", # write a checkpoint at each epoch boundary
    output_dir="./fine_tuned_deepseek",
    remove_unused_columns=True # let the Trainer drop dataset columns the model does not take
)

# Create the data collator.
# mlm=False because this is causal (not masked) language modeling.
# pad_to_multiple_of=151 mirrors the fixed length used in tokenize_function;
# every example is already padded to 151 tokens there.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False, pad_to_multiple_of=151)

# Build the Trainer; training itself is started later by trainer.train().
# Evaluation on the 'test' split is currently disabled (eval_dataset is commented out).
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    #eval_dataset=tokenized_datasets['test'],
    data_collator=data_collator
)

# Before Fine-tuning

In [None]:
# Baseline sample from the model before any training step has run.
prompt = "Create a Terraform template with AWS Lambda function which reads S3"

# Put the model in inference mode so dropout layers are inactive while sampling.
model.eval()

# Send the encoded prompt to the device the model actually lives on rather
# than assuming a device literally named "cuda".
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

# Generate response (max_length caps prompt + generated tokens at 1000)
output = model.generate(**inputs, max_length=1000)
print(tokenizer.decode(output[0], skip_special_tokens=True))

In [None]:
# Start fine-tuning with the Trainer configured above.
trainer.train()

# After Fine-tuning

In [None]:
# Sample from the fine-tuned model.
instruction = "Create a Terraform template with an AWS Lambda function which reads S3 bucket"

# Use the same "### Instruction / ### Response" layout that format_prompt
# produced for the training data; the fine-tuned model only ever saw
# requests in this form.
prompt = f"### Instruction:\n{instruction}\n\n### Response:\n"

# trainer.train() leaves the model in training mode, which keeps the LoRA
# dropout active; switch to inference mode before generating.
model.eval()

# Send the encoded prompt to the device the model actually lives on.
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

# Generate response (max_length caps prompt + generated tokens at 1000)
output = model.generate(**inputs, max_length=1000)
print(tokenizer.decode(output[0], skip_special_tokens=True))

In [None]:
# Example input
fine_tuned_model = model  # alias only: both names refer to the same object
instruction = "Create Terraform template with AWS db cluster snapshot resource"

# Use the same "### Instruction / ### Response" layout that format_prompt
# produced for the training data; the fine-tuned model only ever saw
# requests in this form.
prompt = f"### Instruction:\n{instruction}\n\n### Response:\n"

# Make sure dropout (including the LoRA dropout) is disabled while generating.
fine_tuned_model.eval()

# Send the encoded prompt to the device the model actually lives on.
inputs = tokenizer(prompt, return_tensors="pt").to(fine_tuned_model.device)

# Generate response (max_length caps prompt + generated tokens at 1000)
output = fine_tuned_model.generate(**inputs, max_length=1000)
print(tokenizer.decode(output[0], skip_special_tokens=True))

# Save the trained model

In [None]:
# Empty the output directory before saving the final model.
# NOTE(review): this is presumably the same directory as the Trainer's
# output_dir ("./fine_tuned_deepseek"), so any epoch checkpoints written
# during training are deleted here - confirm that is intended.
!rm -r /kaggle/working/fine_tuned_deepseek/*

In [None]:
# Persist the trained model and its tokenizer side by side in one directory.
# NOTE(review): `model` is the PEFT-wrapped model here, so this presumably
# writes only the LoRA adapter files rather than full model weights - confirm.
model.save_pretrained("./fine_tuned_deepseek")
tokenizer.save_pretrained("./fine_tuned_deepseek")

In [None]:
# Directory the fine-tuned model and tokenizer were saved to.
BEST_MODEL_DIR = "./fine_tuned_deepseek"

# Reload model and tokenizer from disk to check that the saved artifacts are usable.
# This rebinds `model` and `tokenizer`, replacing the in-memory objects used for training.
model = AutoModelForCausalLM.from_pretrained(BEST_MODEL_DIR, torch_dtype=torch.float16, device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(BEST_MODEL_DIR)

In [None]:
# Bundle the saved model directory into a single archive (best_model.zip).
!zip -r best_model.zip /kaggle/working/fine_tuned_deepseek