In [None]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
%cd /content/gdrive/MyDrive/transformer

/content/gdrive/MyDrive/transformer


In [None]:
!pip install --upgrade datasets transformers

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting transformers
  Downloading transformers-4.46.3-py3-none-any.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.1/44.1 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m33.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloadi

In [5]:
# Importing required libraries
import json
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
from transformers import DataCollatorForLanguageModeling
import torch

# Define a function to load and prepare the dataset
def load_and_prepare_data(file_path):
    """
    Loads JSON data from the given file path and prepares it with prompts for fine-tuning.
    """
    # Prompt engineering template
    prompt_template = (
        "You are a Transformer-based assistant specializing in helping beginners understand both coding and conceptual questions "
        "related to machine learning and Transformers. Below is a question. Provide a detailed explanation or relevant code example.\n\n"
        "Question:\n{query}\n\n"
        "Answer:"
    )

    # Load the JSON data
    with open(file_path, "r", encoding="utf-8") as f:
        raw_data = json.load(f)

    # Format the data
    formatted_data = [
        {
            "prompt": prompt_template.format(query=entry["content"]),
            "completion": "Provide a comprehensive explanation or solution."
        }
        for entry in raw_data if "content" in entry
    ]

    return Dataset.from_list(formatted_data)

# Load the training and validation datasets
train_dataset = load_and_prepare_data("/content/gdrive/MyDrive/transformer/datasets/train_data.json")
val_dataset = load_and_prepare_data("/content/gdrive/MyDrive/transformer/datasets/val_data.json")

# Define the pretrained model path
pretrained_model_path = "/content/gdrive/MyDrive/transformer/Qwen2.5-Coder-0.5B-Instruct"

# Initialize tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_path, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(pretrained_model_path, trust_remote_code=True)

# Tokenize the datasets
def tokenize_data(examples):
    """
    Tokenizes the dataset using the provided tokenizer.
    """
    inputs = tokenizer(examples["prompt"], truncation=True, padding="max_length", max_length=512)
    outputs = tokenizer(examples["completion"], truncation=True, padding="max_length", max_length=512)
    inputs["labels"] = outputs["input_ids"]
    return inputs

train_dataset = train_dataset.map(tokenize_data, batched=True)
val_dataset = val_dataset.map(tokenize_data, batched=True)

# Data collator for training
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Training arguments
training_args = TrainingArguments(
    output_dir="./tuned_qwen2.5-coder",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="steps",
    logging_steps=100,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    learning_rate=5e-5,
    save_total_limit=2,
    push_to_hub=False,
    load_best_model_at_end=True,
    fp16=torch.cuda.is_available(),
)

# Trainer setup
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# Start training
trainer.train()

# Save the fine-tuned model
model.save_pretrained("/content/gdrive/MyDrive/transformer/tuned_qwen2.5-coder")
tokenizer.save_pretrained("/content/gdrive/MyDrive/transformer/tuned_qwen2.5-coder")

print("Fine-tuning complete. Model saved!")


Map:   0%|          | 0/1228 [00:00<?, ? examples/s]

Map:   0%|          | 0/308 [00:00<?, ? examples/s]

  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,1.4513,1.469843
2,0.7852,1.541184
3,0.3246,1.774942


There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


Fine-tuning complete. Model saved!
