In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer


In [2]:
# Checkpoint name handed to both `from_pretrained` calls below, so the weights
# and the tokenizer always come from the same place.
model_name = "EleutherAI/gpt-neo-1.3B"

# The two loads are independent of each other; both only depend on `model_name`.
model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [3]:
import json
from datasets import Dataset

In [4]:
# JSON text is UTF-8 by specification. Naming the encoding explicitly keeps the
# read from depending on the platform's default locale encoding (which would
# mis-decode non-ASCII training text on, e.g., a cp1252 Windows machine).
with open("data.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# Convert data to Hugging Face dataset format.
# `data` is iterated as a sequence of records, each indexed by the "input" and
# "output" keys; a record missing either key raises KeyError here.
dataset = Dataset.from_dict({
    "input": [item["input"] for item in data],
    "output": [item["output"] for item in data],
})


In [5]:
from transformers import TrainingArguments, Trainer

In [6]:

def tokenize_function(examples, max_length=None, separator="\n", tok=None):
    """Tokenize a batch of prompt/completion pairs for causal-LM fine-tuning.

    A causal language model is trained on ONE sequence: the model shifts
    ``labels`` against its own ``input_ids`` internally. Tokenizing the prompt
    and the completion as two separately padded sequences therefore makes the
    loss compare unrelated positions. This function instead builds
    ``prompt + separator + completion + EOS`` per example, uses those ids as
    both inputs and labels, and sets every padded label position to ``-100``
    so padding does not contribute to the loss.

    Args:
        examples: Batch mapping with parallel lists under the ``"input"`` and
            ``"output"`` keys (the shape ``Dataset.map(..., batched=True)``
            passes in).
        max_length: Length to pad/truncate to. ``None`` leaves the choice to
            the tokenizer (its configured maximum), as before; pass a smaller
            value to cut memory use.
        separator: Text placed between prompt and completion.
            NOTE(review): "\\n" is an assumption about the data format - adjust
            if the prompts already end with their own delimiter.
        tok: Tokenizer to use. Defaults to the notebook-level ``tokenizer``;
            exposed mainly so the function can be exercised with a stand-in.

    Returns:
        Dict with ``"input_ids"``, ``"attention_mask"`` and ``"labels"``, each
        a list with one row per example.
    """
    tok = tokenizer if tok is None else tok
    eos = tok.eos_token or ""

    # One training sequence per example; the trailing EOS marks where a
    # completion ends.
    texts = [
        f"{prompt}{separator}{completion}{eos}"
        for prompt, completion in zip(examples["input"], examples["output"])
    ]
    encoded = tok(texts, padding="max_length", truncation=True, max_length=max_length)

    # The pad token may share its id with EOS, so padding is identified via the
    # attention mask rather than by token id: real EOS stays in the loss,
    # padding positions become -100.
    labels = [
        [token_id if attended else -100 for token_id, attended in zip(ids, mask)]
        for ids, mask in zip(encoded["input_ids"], encoded["attention_mask"])
    ]

    return {
        "input_ids": encoded["input_ids"],
        "attention_mask": encoded["attention_mask"],
        "labels": labels,
    }


In [7]:
# Reuse the end-of-sequence token as the padding token. tokenize_function asks
# for padding="max_length", so this has to run before dataset.map is called.
# NOTE(review): presumably this checkpoint's tokenizer ships without a pad token
# of its own - confirm; this assignment overwrites one if it exists.
tokenizer.pad_token = tokenizer.eos_token

In [8]:
# Apply tokenize_function to the dataset; batched=True hands it a batch of rows
# per call instead of a single row.
tokenized_dataset = dataset.map(function=tokenize_function, batched=True)

Map:   0%|          | 0/2 [00:00<?, ? examples/s]

In [9]:
# Hyperparameters for the fine-tuning run.
# The former `evaluation_strategy="no"` argument is intentionally omitted: "no"
# is already the default, and recent transformers releases renamed the keyword
# to `eval_strategy`, so passing the old name fails there. Leaving it out
# behaves the same on old and new versions (no evaluation during training).
training_args = TrainingArguments(
    output_dir="./fine_tuned_gptneo",  # checkpoints are written under this directory
    learning_rate=5e-5,
    per_device_train_batch_size=2,
    num_train_epochs=3,
    save_steps=100,       # checkpoint interval, in training steps
    logging_steps=10,     # logging interval, in training steps
    save_total_limit=2,   # cap on how many checkpoints are kept on disk
)




In [10]:
# Bundle the model, the run configuration and the tokenized examples into a
# Trainer. Only a training set is supplied; no evaluation set is passed.
trainer = Trainer(model=model, args=training_args, train_dataset=tokenized_dataset)