In [None]:
from datasets import load_dataset
import json
from transformers import AutoTokenizer

# Load the tokenizer; its EOS token is appended to every formatted example.
# Replace the checkpoint name with your own LLaMA tokenizer path if needed.
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B")

# Load the GSM8K dataset ("main" configuration).
dataset = load_dataset("openai/gsm8k", "main")

# Alpaca-style prompt template. The three positional `{}` slots are filled, in
# order, with the instruction, the input and the response.
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

# EOS token appended to each example so the model learns where to stop.
EOS_TOKEN = tokenizer.eos_token
if EOS_TOKEN is None:
    # Fail here with a clear message instead of a `str + None` TypeError
    # raised later from inside `Dataset.map`.
    raise ValueError(
        "The loaded tokenizer defines no EOS token; set one (or use a "
        "tokenizer that has it) before formatting the dataset."
    )

# Convert dataset to Alpaca format
def formatting_prompts_func(examples):
    """Render a batch of rows as Alpaca-style training strings.

    Args:
        examples: Batch mapping with parallel "question" and "answer"
            sequences (this function is applied via `map(..., batched=True)`).

    Returns:
        A dict with a single "text" key holding one formatted prompt per
        row, each terminated with EOS_TOKEN.
    """
    shared_instruction = "Solve the following math problem"
    questions = examples["question"]
    answers = examples["answer"]

    # Every row uses the same instruction; only the problem and its solution vary.
    rendered = [
        alpaca_prompt.format(shared_instruction, question, answer) + EOS_TOKEN
        for question, answer in zip(questions, answers)
    ]
    return {"text": rendered}

# Attach the formatted "text" column to each split.
for split_name in ("train", "test"):
    dataset[split_name] = dataset[split_name].map(formatting_prompts_func, batched=True)

# Wrap every formatted prompt in a {"text": ...} record for JSON export.
train_data = [{"text": prompt} for prompt in dataset["train"]["text"]]
test_data = [{"text": prompt} for prompt in dataset["test"]["text"]]

# Write one JSON file per split, keeping non-ASCII characters unescaped.
exports = (
    ("gsm8k_train_alpaca.json", train_data),
    ("gsm8k_test_alpaca.json", test_data),
)
for out_path, records in exports:
    with open(out_path, "w", encoding="utf-8") as out_file:
        json.dump(records, out_file, indent=4, ensure_ascii=False)

print("Dataset successfully converted to Alpaca-style format for LLaMA 3.1 8B fine-tuning!")
