In [None]:
# Refined code

# Install necessary libraries if running in Google Colab
!pip install transformers datasets -q
!pip install torch scikit-learn -q

import json
from datasets import Dataset
from sklearn.model_selection import train_test_split
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments
import torch

# Step 1: Load the instruction and answer data
# The encoding is given explicitly so the platform default is never used to
# decode the JSON files.
with open("Real_Data.json", "r", encoding="utf-8") as f:
    instructions = json.load(f)

with open("Real_Data_Ans.json", "r", encoding="utf-8") as f:
    answers = json.load(f)

# Step 2: Combine instructions and answers into a list of dictionaries
# Both files are indexed by the same keys; each key holds a list, and the two
# lists are paired element by element.
dataset_list = []
for key in instructions:
    if key in answers:
        instr_list = instructions[key]
        ans_list = answers[key]
        # Ensure the lists have matching lengths
        if len(instr_list) == len(ans_list):
            dataset_list.extend(
                [{"instruction": instr, "response": ans} for instr, ans in zip(instr_list, ans_list)]
            )
        else:
            print(f"Warning: Key {key} has mismatched lengths between instructions and answers!")
    else:
        print(f"Warning: Key {key} not found in answers!")

# An 80/20 split needs at least two examples; fail here with a clear message
# instead of a less specific error from train_test_split.
if len(dataset_list) < 2:
    raise ValueError(
        f"Need at least 2 instruction/response pairs to build train and validation sets, got {len(dataset_list)}"
    )

# Step 3: Split dataset into training and validation sets
# random_state pins the shuffle so the split is the same on every run.
train_list, val_list = train_test_split(dataset_list, train_size=0.8, random_state=42)
train_dataset = Dataset.from_list(train_list)
val_dataset = Dataset.from_list(val_list)

# Step 4: Load the pretrained causal language model and its tokenizer
model_name = "facebook/opt-1.3b"  # Replace with a valid Hugging Face model
model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Step 5: Tokenize datasets
def tokenize_data(example):
    """Tokenize one instruction/response pair for causal-LM fine-tuning.

    Args:
        example: Mapping with string values under "instruction" and "response".

    Returns:
        Dict with "input_ids", "attention_mask" and "labels", each a list of
        512 integers. "labels" equals "input_ids" on real tokens and is -100 on
        padding positions so that padding does not contribute to the loss.
    """
    # Combine instruction and response into one input string with EOS token
    input_text = f"### Instruction:\n{example['instruction']}\n### Response:\n{example['response']}{tokenizer.eos_token}"
    tokenized = tokenizer(
        input_text,
        truncation=True,
        padding="max_length",
        max_length=512,
    )
    input_ids = list(tokenized["input_ids"])
    attention_mask = list(tokenized["attention_mask"])
    # Every example is padded to max_length; copying input_ids verbatim would
    # train the model to predict pad tokens. -100 marks positions the loss skips.
    labels = [token if keep else -100 for token, keep in zip(input_ids, attention_mask)]
    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": labels,
    }

# Run the tokenizer over both splits; afterwards only the model inputs are
# exposed, as PyTorch tensors
train_dataset = train_dataset.map(tokenize_data)
val_dataset = val_dataset.map(tokenize_data)
tensor_columns = ["input_ids", "attention_mask", "labels"]
for split in (train_dataset, val_dataset):
    split.set_format(type="torch", columns=tensor_columns)

# Step 6: Collect the training configuration
training_options = {
    # Where checkpoints and logs go
    "output_dir": "./output",
    "logging_dir": "./logs",
    "logging_steps": 100,
    # Schedule and batch sizes
    "num_train_epochs": 3,
    "per_device_train_batch_size": 8,
    "per_device_eval_batch_size": 8,
    "gradient_accumulation_steps": 16,
    # Mixed precision is switched on only when a CUDA device is present
    "fp16": torch.cuda.is_available(),
    # Evaluate and checkpoint once per epoch, keeping at most two checkpoints
    "evaluation_strategy": "epoch",
    "save_strategy": "epoch",
    "save_total_limit": 2,
    # Restore the best checkpoint when training finishes
    "load_best_model_at_end": True,
}
training_args = TrainingArguments(**training_options)

# Step 7: Build the Trainer from the model, the arguments and both splits
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
)

# Step 8: Run fine-tuning
trainer.train()

# Step 9: Write the fine-tuned model and its tokenizer to the same directory
save_dir = "fine_tuned_llama_model"
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)

# Step 10: Try the fine-tuned model on a sample prompt
model.eval()  # evaluation mode before generating
input_prompt = "### Instruction:\nWhat are the benefits of using LoRa?\n### Response:\n"
input_ids = tokenizer.encode(input_prompt, return_tensors="pt").to(model.device)

# Generate, decode and show the completion
generated_ids = model.generate(input_ids, max_length=100)
generated_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
print(generated_text)


In [None]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 480.6/480.6 kB 7.6 MB/s eta 0:00:00
Downloading dill-0.3.8-py3-none-any.whl (116 kB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 116.3/116.3 kB 7.2 MB/s eta 0:00:00
Downloading fsspec-2024.9.0-py3-none-any.whl (1

In [None]:
# Old code

import json
from datasets import Dataset
from sklearn.model_selection import train_test_split
from transformers import LlamaForCausalLM, LlamaTokenizer, Trainer, TrainingArguments

# Step 1: Load the instruction and answer data
# The encoding is given explicitly so the platform default is never used to
# decode the JSON files.
with open("Real_Data.json", "r", encoding="utf-8") as f:
    instructions = json.load(f)

with open("Real_Data_Ans.json", "r", encoding="utf-8") as f:
    answers = json.load(f)

# Step 2: Combine instructions and answers into a list of dictionaries
# Both files are indexed by the same keys; each key holds a list, and the two
# lists are paired element by element.
dataset_list = []
for key in instructions:
    if key in answers:
        instr_list = instructions[key]
        ans_list = answers[key]
        # Ensure the lists have matching lengths
        if len(instr_list) == len(ans_list):
            dataset_list.extend(
                [{"instruction": instr, "response": ans} for instr, ans in zip(instr_list, ans_list)]
            )
        else:
            print(f"Warning: Key {key} has mismatched lengths between instructions and answers!")
    else:
        # Report keys that have no answers instead of dropping them silently.
        print(f"Warning: Key {key} not found in answers!")

# Step 3: Split dataset into training and validation sets
# random_state pins the shuffle so the split is the same on every run.
train_list, val_list = train_test_split(dataset_list, train_size=0.8, random_state=42)
train_dataset = Dataset.from_list(train_list)
val_dataset = Dataset.from_list(val_list)

# Step 4: Load Llama model and tokenizer
# Imported here because this cell's import line only brings in the Llama-specific
# classes; the sentencepiece-based LlamaTokenizer does not load the Llama 3.1
# tokenizer, while AutoTokenizer picks the matching tokenizer class.
from transformers import AutoTokenizer

# Full Hub repository id (a bare "llama-3.1-8b" is not one).
# NOTE(review): this repository is gated and needs an authenticated account with
# access; if a local checkpoint directory was intended, point model_name at it.
model_name = "meta-llama/Llama-3.1-8B"
model = LlamaForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
# tokenize_data pads to a fixed length, which requires a pad token; fall back to
# EOS when the tokenizer does not define one.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Step 5: Tokenize datasets
def tokenize_data(example):
    """Tokenize instruction/response pairs for causal-LM fine-tuning.

    Accepts either a single example (string values) or a batch as passed by
    ``Dataset.map(..., batched=True)`` (lists of strings).

    Args:
        example: Mapping with "instruction" and "response" entries.

    Returns:
        The tokenizer output with an added "labels" entry. "labels" mirrors
        "input_ids" on real tokens and is -100 on padding positions so padding
        does not contribute to the loss. For a single example the values are
        flat lists; for a batch they are lists of lists.
    """
    instructions = example["instruction"]
    responses = example["response"]
    is_single = isinstance(instructions, str)
    if is_single:
        instructions, responses = [instructions], [responses]

    # A causal LM is trained on one sequence whose labels line up with its
    # input_ids, so prompt and answer are joined into one text. The template
    # matches the prompt used for generation at the end of this cell.
    eos = tokenizer.eos_token or ""
    texts = [
        f"### Instruction:\n{instruction}\n### Response:\n{response}{eos}"
        for instruction, response in zip(instructions, responses)
    ]
    encoded = tokenizer(
        texts,
        truncation=True,
        padding="max_length",
        max_length=512
    )
    # Mask by attention_mask rather than by pad id: the pad token may be the same
    # token as EOS, and the real EOS at the end of the answer must stay a target.
    encoded["labels"] = [
        [token if keep else -100 for token, keep in zip(ids, mask)]
        for ids, mask in zip(encoded["input_ids"], encoded["attention_mask"])
    ]
    if is_single:
        return {name: values[0] for name, values in encoded.items()}
    return encoded

# Tokenize both splits in batches
train_dataset = train_dataset.map(tokenize_data, batched=True)
val_dataset = val_dataset.map(tokenize_data, batched=True)

# Step 6: Define training arguments
training_args = TrainingArguments(
    output_dir="./output",  # Directory to save the model and logs
    num_train_epochs=3,  # Number of epochs
    per_device_train_batch_size=8,  # Batch size for training
    per_device_eval_batch_size=8,  # Batch size for evaluation
    gradient_accumulation_steps=16,  # Adjust based on memory constraints
    fp16=True,  # Enable mixed-precision training
    evaluation_strategy="epoch",  # Evaluate at the end of each epoch
    save_strategy="epoch",  # Save model after each epoch
    logging_dir="./logs",  # Directory for logging
    logging_steps=100,  # Log every 100 steps
    save_total_limit=2,  # Keep only the last 2 checkpoints
    load_best_model_at_end=True  # Load the best model after training
)

# Step 7: Set up the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer
)

# Step 8: Fine-tune the model
trainer.train()

# Step 9: Save the fine-tuned model and tokenizer
model.save_pretrained("fine_tuned_llama_model")
tokenizer.save_pretrained("fine_tuned_llama_model")

# Step 10: Test the fine-tuned model
input_prompt = "### Instruction:\nWhat are the benefits of using LoRa?\n### Response:\n"
# After training the model may live on an accelerator; the prompt tensor has to
# be on the same device or generate() fails with a device mismatch.
input_ids = tokenizer.encode(input_prompt, return_tensors="pt").to(model.device)

# Generate the response
model.eval()  # Switch off training-only behaviour such as dropout
generated_ids = model.generate(input_ids, max_length=100)
generated_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
print(generated_text)


FileNotFoundError: [Errno 2] No such file or directory: 'Real_Data.json'

In [None]:
# New code

import json
from datasets import Dataset
from transformers import LlamaForCausalLM, LlamaTokenizer, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split

# Step 1: Load JSON data
# The encoding is given explicitly so the platform default is never used to
# decode the JSON files.
with open("Training_Data.json", "r", encoding="utf-8") as data_file:
    data = json.load(data_file)

with open("Training_Data_Ans.json", "r", encoding="utf-8") as answers_file:
    answers = json.load(answers_file)

# Step 2: Convert data to instruction-response pairs
# Index the answers by their stringified id once instead of rescanning the whole
# list for every game. setdefault keeps the first entry when an id repeats,
# which is what a first-match scan would select.
answers_by_id = {}
for answer in answers:
    answers_by_id.setdefault(str(answer["id"]), answer)

dataset_list = []
for game_id, words in data.items():
    answer = answers_by_id.get(game_id)
    if answer is None:
        # Report games without an answer instead of dropping them silently.
        print(f"Warning: Game {game_id} not found in answers!")
        continue
    correct_groups = answer["answers"]
    # Add the game as an instruction-response pair
    instruction = (
        f"Group the following words into 4 categories with 4 words each:\n{', '.join(words)}"
    )
    # One numbered line per group, listing that group's members
    response = "\n".join(
        f"{i + 1}. {', '.join(group['members'])}" for i, group in enumerate(correct_groups)
    )
    dataset_list.append({"instruction": instruction, "response": response})

# Step 3: Split into train and validation datasets
# random_state pins the shuffle so the split is the same on every run.
train_list, val_list = train_test_split(dataset_list, train_size=0.8, random_state=42)
train_dataset = Dataset.from_list(train_list)
val_dataset = Dataset.from_list(val_list)

# Step 4: Load LLaMA model and tokenizer
# Imported here because this cell's import line only brings in the Llama-specific
# classes; the sentencepiece-based LlamaTokenizer does not load the Llama 3.1
# tokenizer, while AutoTokenizer picks the matching tokenizer class.
from transformers import AutoTokenizer

# Full Hub repository id ("huggingface/llama-3.1-8b" is not an existing repository).
# NOTE(review): this repository is gated and needs an authenticated account with
# access; if a local checkpoint directory was intended, point model_name at it.
model_name = "meta-llama/Llama-3.1-8B"  # LLaMA model
model = LlamaForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
# tokenize_data pads to a fixed length, which requires a pad token; fall back to
# EOS when the tokenizer does not define one.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Step 5: Tokenize the data
def tokenize_data(example):
    """Tokenize instruction/response pairs for causal-LM fine-tuning.

    Accepts either a single example (string values) or a batch as passed by
    ``Dataset.map(..., batched=True)`` (lists of strings). In the batched case
    each pair is formatted on its own; formatting the whole column with one
    f-string would embed the Python list repr in the text.

    Args:
        example: Mapping with "instruction" and "response" entries.

    Returns:
        The tokenizer output with an added "labels" entry. "labels" mirrors
        "input_ids" on real tokens and is -100 on padding positions so padding
        does not contribute to the loss. For a single example the values are
        flat lists; for a batch they are lists of lists.
    """
    instructions = example["instruction"]
    responses = example["response"]
    is_single = isinstance(instructions, str)
    if is_single:
        instructions, responses = [instructions], [responses]

    # EOS is appended so the model also learns where an answer ends.
    eos = tokenizer.eos_token or ""
    texts = [
        f"### Instruction:\n{instruction}\n### Response:\n{response}{eos}"
        for instruction, response in zip(instructions, responses)
    ]

    # Tokenize inputs; labels are derived from the same token ids so that they
    # line up position by position with input_ids.
    encoded = tokenizer(
        texts,
        truncation=True,
        padding="max_length",
        max_length=512,
    )
    # Mask by attention_mask rather than by pad id: the pad token may be the same
    # token as EOS, and the real EOS at the end of the answer must stay a target.
    encoded["labels"] = [
        [token if keep else -100 for token, keep in zip(ids, mask)]
        for ids, mask in zip(encoded["input_ids"], encoded["attention_mask"])
    ]
    if is_single:
        return {name: values[0] for name, values in encoded.items()}
    return encoded

# Tokenize both splits in batches
train_dataset = train_dataset.map(tokenize_data, batched=True)
val_dataset = val_dataset.map(tokenize_data, batched=True)

# Step 6: Define training arguments
training_args = TrainingArguments(
    output_dir="./llama-3.1-8b-output",  # Directory to save the model and logs
    num_train_epochs=3,  # Number of epochs
    per_device_train_batch_size=4,  # Batch size (adjust for memory)
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=8,  # Adjust for large models
    fp16=True,  # Use mixed precision for faster training
    evaluation_strategy="epoch",  # Evaluate after each epoch
    save_strategy="epoch",  # Save model checkpoint after each epoch
    logging_dir="./logs",  # Directory for logs
    logging_steps=100,
    save_total_limit=2,  # Save only the last 2 checkpoints
    load_best_model_at_end=True  # Load the best model after training
)

# Step 7: Set up the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer
)

# Step 8: Fine-tune the model
trainer.train()

# Step 9: Save the fine-tuned model and tokenizer
model.save_pretrained("fine_tuned_llama_3.1-8b")
tokenizer.save_pretrained("fine_tuned_llama_3.1-8b")

# Step 10: Test the fine-tuned model
# NOTE(review): this prompt omits "with 4 words each", which the training
# instructions built earlier in this cell include; confirm whether the
# difference between training and test wording is intended.
test_prompt = "### Instruction:\nGroup the following words into 4 categories:\nRACECAR, OPTION, TAB, JAZZ, LEVEL, NETS, SNOW, HEAT, BUCKS, MOM, RAIN, SHIFT, KAYAK, RETURN, SLEET, HAIL\n### Response:\n"
# After training the model may live on an accelerator; the prompt tensor has to
# be on the same device or generate() fails with a device mismatch.
input_ids = tokenizer.encode(test_prompt, return_tensors="pt").to(model.device)

# Generate a response
model.eval()  # Switch off training-only behaviour such as dropout
generated_ids = model.generate(input_ids, max_length=150)
generated_response = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
print(generated_response)



FileNotFoundError: [Errno 2] No such file or directory: 'Training_Data.json'