I apologize for the oversight. You're right, there is indeed a script for hyperparameter tuning. Let's use this `optimize_hyperparameters.py` script for our workflow. Here's how we can proceed with hyperparameter tuning, scaling test, and evaluation using this script in a Colab notebook:

### 1. Set up the Colab environment:

In [None]:
!git clone https://github.com/yourusername/arc-neural-reasoning-model.git
%cd arc-neural-reasoning-model
!pip install -r requirements.txt

### 2. Run hyperparameter tuning:

In [None]:
!python gpt2_arc/src/optimize_hyperparameters.py --n_trials 50 --storage sqlite:///optuna_results.db --n_embd_min 64 --n_embd_max 256 --n_head_min 2 --n_head_max 8 --n_layer_min 2 --n_layer_max 6 --batch_size_min 16 --batch_size_max 64 --learning_rate_min 1e-5 --learning_rate_max 1e-3 --max_epochs_min 10 --max_epochs_max 30

### 3. Get the best hyperparameters:

In [7]:
import optuna
import json

storage_name = "sqlite:////workspaces/arc-neural-reasoning-model/optuna_results.db"
study_name = "gpt2_arc_optimization"

try:
    # List all study names in the database
    study_summaries = optuna.study.get_all_study_summaries(storage=storage_name)
    print("Available studies in the database:")
    for study_summary in study_summaries:
        print(f"- {study_summary.study_name}")

    # Attempt to load the specified study
    study = optuna.load_study(study_name=study_name, storage=storage_name)
    best_params = study.best_params
    print("Best hyperparameters:")
    print(json.dumps(best_params, indent=2))
except KeyError as e:
    print("Error: The specified study does not exist in the database. Please ensure that the study name and storage path are correct.")
    print(f"Details: {str(e)}")
except Exception as e:
    print(f"An error occurred: {str(e)}")


Available studies in the database:
- gpt2_arc_optimization
Best hyperparameters:
{
  "n_embd": 87,
  "n_head": 3,
  "n_layer": 1,
  "batch_size": 56,
  "learning_rate": 0.0006206653045449178,
  "max_epochs": 10
}


### 4. Use the best hyperparameters for a scaling test (longer training):

In [10]:
from gpt2_arc.src.training.train import main
import argparse

# Create an argument parser with the best hyperparameters
parser = argparse.ArgumentParser()
parser.add_argument("--n_embd", type=int, default=best_params['n_embd'])
parser.add_argument("--n_head", type=int, default=best_params['n_head'])
parser.add_argument("--n_layer", type=int, default=best_params['n_layer'])
parser.add_argument("--batch_size", type=int, default=best_params['batch_size'])
parser.add_argument("--learning_rate", type=float, default=best_params['learning_rate'])
parser.add_argument("--max_epochs", type=int, default=100)  # Extend for longer training
parser.add_argument("--use_gpu", action="store_true")
parser.add_argument("--project", type=str, default="arc-scaling-test")
parser.add_argument("--log_level", type=str, default="INFO", help="Set the logging level")

args = parser.parse_args([])
args.use_gpu = True  # Enable GPU if available

# Run the training
main(args)

INFO:gpt2_arc.src.training.train:Data loaded successfully using arckit
INFO:gpt2_arc.src.training.train:Initializing model with new configuration
DEBUG:gpt2_arc.src.models.gpt2:Initialized Attention with n_embd=96, n_head=3
DEBUG:gpt2_arc.src.models.gpt2:Initialized FeedForward with n_embd=96
DEBUG:gpt2_arc.src.models.gpt2:Initialized TransformerBlock with n_embd=96, n_head=3
INFO:gpt2_arc.src.training.train:Initializing trainer with new configuration


DEBUG: Starting ARCDataset initialization
DEBUG: data_source type: <class 'arckit.data.TaskSet'>
DEBUG: data_source content: <TaskSet: 400 tasks>
DEBUG: Processed data length: 400
DEBUG: First item keys: dict_keys(['id', 'train', 'test'])
DEBUG: First train item: {'input': array([[0, 7, 7],
       [7, 7, 7],
       [0, 7, 7]]), 'output': array([[0, 0, 0, 0, 7, 7, 0, 7, 7],
       [0, 0, 0, 7, 7, 7, 7, 7, 7],
       [0, 0, 0, 0, 7, 7, 0, 7, 7],
       [0, 7, 7, 0, 7, 7, 0, 7, 7],
       [7, 7, 7, 7, 7, 7, 7, 7, 7],
       [0, 7, 7, 0, 7, 7, 0, 7, 7],
       [0, 0, 0, 0, 7, 7, 0, 7, 7],
       [0, 0, 0, 7, 7, 7, 7, 7, 7],
       [0, 0, 0, 0, 7, 7, 0, 7, 7]])}
Number of train samples: 1302
Number of test samples: 416
DEBUG: Starting ARCDataset initialization
DEBUG: data_source type: <class 'arckit.data.TaskSet'>
DEBUG: data_source content: <TaskSet: 400 tasks>
DEBUG: Processed data length: 400
DEBUG: First item keys: dict_keys(['id', 'train', 'test'])
DEBUG: First train item: {'input': ar

AttributeError: 'Namespace' object has no attribute 'no_logging'

### 5. Evaluate the trained model:

In [9]:
from gpt2_arc.src.evaluate import main as evaluate_main
import argparse
# Set up evaluation arguments
eval_parser = argparse.ArgumentParser()
eval_parser.add_argument("--model_checkpoint", type=str, required=True, help="Path to the model checkpoint")
eval_parser.add_argument("--batch_size", type=int, default=32)
eval_parser.add_argument("--output_dir", type=str, default="./evaluation_results")
eval_parser.add_argument("--wandb_project", type=str, default="arc-evaluation")
eval_parser.add_argument("--wandb_run_name", type=str, default="scaling-test-evaluation")

# Replace with the actual path to your trained model checkpoint
model_checkpoint_path = "/workspaces/arc-neural-reasoning-model/final_model_4fe9801e-c839-454f-a46c-6e94e3c04e81.pth"

eval_args = eval_parser.parse_args([
    "--model_checkpoint", model_checkpoint_path,
    "--batch_size", "32",
    "--output_dir", "./evaluation_results",
    "--wandb_project", "arc-evaluation",
    "--wandb_run_name", "scaling-test-evaluation"
])

# Run the evaluation
evaluate_main(eval_args)

DEBUG: Starting ARCDataset initialization
DEBUG: data_source type: <class 'arckit.data.TaskSet'>
DEBUG: data_source content: <TaskSet: 400 tasks>
DEBUG: Processed data length: 400
DEBUG: First item keys: dict_keys(['id', 'train', 'test'])
DEBUG: First train item: {'input': array([[8, 6],
       [6, 4]]), 'output': array([[8, 6, 8, 6, 8, 6],
       [6, 4, 6, 4, 6, 4],
       [6, 8, 6, 8, 6, 8],
       [4, 6, 4, 6, 4, 6],
       [8, 6, 8, 6, 8, 6],
       [6, 4, 6, 4, 6, 4]])}
Number of train samples: 1363
Number of test samples: 419


  checkpoint = torch.load(args.model_checkpoint)


ValueError: Model configuration not found in checkpoint

### 6. Analyze the results:

In [None]:
import json

# Load and print evaluation results
with open("./evaluation_results/scaling-test-evaluation_results.json", "r") as f:
    results = json.load(f)

print("Evaluation Results:")
print(json.dumps(results, indent=2))