# Installs and Imports

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments
from datasets import load_dataset
import torch


# Data

In [None]:
model_name = "distilgpt2"

# Pretrained causal language model for the checkpoint named above.
model = AutoModelForCausalLM.from_pretrained(model_name)

# Matching tokenizer. The end-of-sequence token doubles as the padding
# token so that batches can be padded to a fixed length.
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token


In [None]:
# Maximum sequence length in tokens; passed to the tokenizer as max_length
# for both truncation and padding.
MAX_LENGTH = 512

# # Preprocess the data
# def tokenize_function(examples):
#     tokenized = tokenizer(
#         examples["text"],
#         truncation=True,
#         padding="max_length",
#         max_length=MAX_LENGTH,
#         return_tensors=None
#     )
#     # Set labels equal to input_ids for causal language modeling
#     tokenized["labels"] = tokenized["input_ids"].copy()
#     return tokenized

def tokenize_function(examples):
    """Tokenize a batch of prompt+response texts and build loss-masked labels.

    Every text is truncated/padded to exactly ``MAX_LENGTH`` tokens.
    ``labels`` mirrors ``input_ids`` except that positions which must not
    contribute to the loss are set to -100:

    * padding positions (``attention_mask == 0``) -- the pad token is the
      EOS token, so padding cannot be recognised by token id alone;
    * the prompt, i.e. everything up to and including the
      ``### Response:`` marker line.

    Texts that do not contain the response marker keep all of their
    non-padding tokens as labels.

    NOTE: a later cell defines another ``tokenize_function``; whichever
    cell ran last is the one in effect.

    Args:
        examples: Batch mapping with a ``"text"`` list of strings.

    Returns:
        The tokenizer output with an added ``"labels"`` entry holding one
        list of label ids per example.
    """
    response_marker = "### Response:\n"
    texts = examples["text"]

    # A single padded/truncated pass is enough; every row comes back with
    # exactly MAX_LENGTH entries.
    tokenized = tokenizer(
        texts,
        truncation=True,
        padding="max_length",
        max_length=MAX_LENGTH,
        return_tensors=None,
    )

    labels = []
    for text, input_ids, attention_mask in zip(
        texts, tokenized["input_ids"], tokenized["attention_mask"]
    ):
        # Build a fresh list per example. A shallow copy of the batch would
        # share its inner lists with input_ids, so masking labels in place
        # would also write -100 into the model inputs.
        row = [
            token_id if attended else -100
            for token_id, attended in zip(input_ids, attention_mask)
        ]

        marker_at = text.find(response_marker)
        if marker_at != -1:
            prompt_text = text[: marker_at + len(response_marker)]
            prompt_len = len(
                tokenizer(
                    prompt_text, truncation=True, max_length=MAX_LENGTH
                )["input_ids"]
            )
            # Clamp so the slice assignment can never change the row length.
            prompt_len = min(prompt_len, len(row))
            # NOTE(review): assumes right-side padding, so the prompt starts
            # at index 0 -- confirm tokenizer.padding_side.
            row[:prompt_len] = [-100] * prompt_len

        labels.append(row)

    tokenized["labels"] = labels
    return tokenized

In [None]:
# # Load both train and evaluation datasets
# train_dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split='train[:20%]')
# eval_dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split='validation[:20%]')

# # Map the tokenization function to both datasets
# tokenized_train = train_dataset.map(
#     tokenize_function,
#     batched=True,
#     remove_columns=train_dataset.column_names
# )
# tokenized_eval = eval_dataset.map(
#     tokenize_function,
#     batched=True,
#     remove_columns=eval_dataset.column_names
# )

# # Set the tensor type
# tokenized_train.set_format("torch")
# tokenized_eval.set_format("torch")

import json
import torch
from datasets import Dataset

# 1. Load the data
def load_alpaca_data(file_path):
    """Read the UTF-8 JSON file at ``file_path`` and return the parsed content."""
    with open(file_path, encoding="utf-8") as handle:
        return json.loads(handle.read())

# 2. Format the data into prompt-completion pairs
def format_alpaca_prompt(example):
    """Build the prompt text that precedes the response.

    The ``### Input:`` section is included only when ``example["input"]``
    is truthy. The prompt always ends with the ``### Response:`` header
    followed by a newline.
    """
    if not example["input"]:
        # Instruction-only record: leave out the Input section.
        return f"### Instruction:\n{example['instruction']}\n\n### Response:\n"
    return f"### Instruction:\n{example['instruction']}\n\n### Input:\n{example['input']}\n\n### Response:\n"

def prepare_dataset(data):
    """Turn Alpaca records into a Dataset with a single ``"text"`` column.

    Each row's text is the formatted prompt immediately followed by the
    record's ``"output"`` value.
    """
    records = [
        {"text": format_alpaca_prompt(entry) + entry["output"]}
        for entry in data
    ]
    return Dataset.from_list(records)

# 3. Modify the tokenization function
def tokenize_function(examples):
    """Tokenize a batch of prompt+response texts and build loss-masked labels.

    Every text is truncated/padded to exactly ``MAX_LENGTH`` tokens.
    ``labels`` mirrors ``input_ids`` except that positions which must not
    contribute to the loss are set to -100:

    * padding positions (``attention_mask == 0``) -- the pad token is the
      EOS token, so padding cannot be recognised by token id alone;
    * the prompt, i.e. everything up to and including the
      ``### Response:`` marker line.

    Texts that do not contain the response marker keep all of their
    non-padding tokens as labels.

    Args:
        examples: Batch mapping with a ``"text"`` list of strings.

    Returns:
        The tokenizer output with an added ``"labels"`` entry holding one
        list of label ids per example.
    """
    response_marker = "### Response:\n"
    texts = examples["text"]

    tokenized = tokenizer(
        texts,
        truncation=True,
        padding="max_length",
        max_length=MAX_LENGTH,
        return_tensors=None,
    )

    labels = []
    for text, input_ids, attention_mask in zip(
        texts, tokenized["input_ids"], tokenized["attention_mask"]
    ):
        # Build a fresh list per example. A shallow copy of the batch would
        # share its inner lists with input_ids, so masking labels in place
        # would also write -100 into the model inputs.
        row = [
            token_id if attended else -100
            for token_id, attended in zip(input_ids, attention_mask)
        ]

        marker_at = text.find(response_marker)
        if marker_at != -1:
            prompt_text = text[: marker_at + len(response_marker)]
            # Truncate the prompt the same way as the full text so the
            # count can never exceed the padded row length.
            prompt_len = len(
                tokenizer(
                    prompt_text, truncation=True, max_length=MAX_LENGTH
                )["input_ids"]
            )
            # Clamp so the slice assignment can never change the row length.
            prompt_len = min(prompt_len, len(row))
            # NOTE(review): assumes right-side padding, so the prompt starts
            # at index 0 -- confirm tokenizer.padding_side.
            row[:prompt_len] = [-100] * prompt_len

        labels.append(row)

    tokenized["labels"] = labels
    return tokenized

# Load the Alpaca JSON and hold out the last 10% of records for evaluation.
alpaca_data = load_alpaca_data('alpaca_data.json')
train_size = int(0.9 * len(alpaca_data))
train_data, eval_data = alpaca_data[:train_size], alpaca_data[train_size:]

# Wrap each split in a Dataset.
train_dataset, eval_dataset = prepare_dataset(train_data), prepare_dataset(eval_data)

# Tokenize both splits in batches, dropping the original columns.
tokenized_train = train_dataset.map(
    tokenize_function, batched=True, remove_columns=train_dataset.column_names
)
tokenized_eval = eval_dataset.map(
    tokenize_function, batched=True, remove_columns=eval_dataset.column_names
)

# Switch both tokenized splits to the "torch" output format.
tokenized_train.set_format("torch")
tokenized_eval.set_format("torch")

# Fine Tuning Setup

In [None]:
# training_args = TrainingArguments(
#     output_dir="./results",
#     eval_strategy="epoch",
#     learning_rate=2e-5,
#     per_device_train_batch_size=4,
#     per_device_eval_batch_size=4,
#     num_train_epochs=3,
#     weight_decay=0.01,
#     logging_dir="./logs",
#     # Reduce logging frequency
#     logging_steps=100,  # Changed from 1
#     save_total_limit=2,
#     report_to="none",
#     # Add gradient clipping
#     max_grad_norm=1.0,
#     # Add warmup steps
#     warmup_steps=500,
#     # Add gradient accumulation
#     gradient_accumulation_steps=4,
#     # Add fp16 training if GPU available
#     fp16=True if torch.cuda.is_available() else False,
#     disable_tqdm=False,
#     logging_first_step=False,
#     logging_strategy="steps"
# )

# Hyperparameters for the fine-tuning run.
training_args = TrainingArguments(
    # Output and logging
    output_dir="./results",
    logging_dir="./logs",
    logging_steps=100,
    save_total_limit=2,
    # Schedule and batching
    num_train_epochs=3,
    eval_strategy="epoch",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    # Optimisation
    learning_rate=2e-5,  # consider lowering to 1e-5
    weight_decay=0.01,
    warmup_steps=100,  # warmup for stability
)

trainer = Trainer(
    args=training_args,
    model=model,
    eval_dataset=tokenized_eval,
    train_dataset=tokenized_train,
)

# Fine Tune

In [None]:
# Start fine-tuning with the Trainer built in the setup cell.
trainer.train()

In [1]:
# Save the model (via the trainer) and the tokenizer into the same directory.
# NOTE: `trainer` and `tokenizer` must already exist in the session; running
# this cell in a fresh kernel, before the setup and training cells, raises
# NameError.
model_save_path = "./fine_tuned_distilgpt2"
trainer.save_model(model_save_path)
tokenizer.save_pretrained(model_save_path)

NameError: name 'trainer' is not defined