Required: https://youtu.be/IDxrMbXPVTA?si=PHfGry-HQj__3Xne

# STUFF YOU CHANGE

In [1]:
# ========== User-Defined Parameters (Top of Notebook) ==========

# 1. Base directory for storing date-based experiment result folders in Google Drive.
parent_dir = "/workspaces/arc-neural-reasoning-model/EXPERIMENTAL/"

# 2. Boolean flag to choose between best hyperparameters from previous experiments (True) or manual parameters (False).
use_best_params = True  # Set to False to use manual parameters

# 3. Dictionary of manually set hyperparameters used when use_best_params is False.
#    Includes model architecture and training settings.
manual_params = {
    "n_embd": 32,     # Embedding dimension
    "n_head": 4,       # Number of attention heads
    "n_layer": 6,     # Number of transformer layers
    "batch_size": 64,  # Batch size for training
    "learning_rate": 0.0001,  # Learning rate
    "max_epochs": 50   # Maximum number of epochs for training
}

# Hyperparameter tuning search space settings
# 4. Number of Optuna trials for hyperparameter tuning.
#    More trials can lead to better optimization but increase computation time.
n_trials = 100

# 5. Range for embedding dimension in hyperparameter search space.
n_embd_min, n_embd_max = 64, 256

# 6. Range for number of attention heads in transformer model during tuning.
n_head_min, n_head_max = 2, 16

# 7. Range for number of transformer layers in model architecture during optimization.
n_layer_min, n_layer_max = 2, 16

# 8. Range of batch sizes to explore. Larger batches can speed up training but may require more memory.
batch_size_min, batch_size_max = 1, 256

# 9. Range for learning rate in hyperparameter search space. Crucial for model convergence and performance.
learning_rate_min, learning_rate_max = 1e-8, 1e-2

# 10. Range for number of training epochs to consider during hyperparameter optimization.
max_epochs_min, max_epochs_max = 3, 50

# These parameters allow flexible experimentation with different model configurations and training settings,
# enabling comprehensive exploration of the hyperparameter space for optimal model performance.

# Print configurations for verification
print("Current Configuration:")
print(f"Parent Directory: {parent_dir}")
print(f"Use Best Parameters: {use_best_params}")
print(f"Manual Parameters: {manual_params}")
print(f"Number of Optuna Trials: {n_trials}")
print("Hyperparameter Ranges:")
print(f"  n_embd: {n_embd_min} to {n_embd_max}")
print(f"  n_head: {n_head_min} to {n_head_max}")
print(f"  n_layer: {n_layer_min} to {n_layer_max}")
print(f"  batch_size: {batch_size_min} to {batch_size_max}")
print(f"  learning_rate: {learning_rate_min} to {learning_rate_max}")
print(f"  max_epochs: {max_epochs_min} to {max_epochs_max}")

# Save configuration
import json

def save_config(config, filename="config.json"):
    with open(filename, 'w') as f:
        json.dump(config, f, indent=4)

config = {
    "parent_dir": parent_dir,
    "use_best_params": use_best_params,
    "manual_params": manual_params,
    "tuning": {
        "n_trials": n_trials,
        "n_embd": (n_embd_min, n_embd_max),
        "n_head": (n_head_min, n_head_max),
        "n_layer": (n_layer_min, n_layer_max),
        "batch_size": (batch_size_min, batch_size_max),
        "learning_rate": (learning_rate_min, learning_rate_max),
        "max_epochs": (max_epochs_min, max_epochs_max)
    }
}

save_config(config)
print(f"Configuration saved to config.json")

# Check GPU availability
import torch
print("\nGPU Available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("GPU Name:", torch.cuda.get_device_name(0))

Current Configuration:
Parent Directory: /workspaces/arc-neural-reasoning-model/EXPERIMENTAL/
Use Best Parameters: True
Manual Parameters: {'n_embd': 32, 'n_head': 4, 'n_layer': 6, 'batch_size': 64, 'learning_rate': 0.0001, 'max_epochs': 50}
Number of Optuna Trials: 100
Hyperparameter Ranges:
  n_embd: 64 to 256
  n_head: 2 to 16
  n_layer: 2 to 16
  batch_size: 1 to 256
  learning_rate: 1e-08 to 0.01
  max_epochs: 3 to 50
Configuration saved to config.json

GPU Available: False


# CODE

### 1. Set up the Colab environment:

In [2]:
#from google.colab import drive
#drive.mount('/content/drive')

ModuleNotFoundError: No module named 'google.colab'

In [3]:
#!rm -rf /content/arc-neural-reasoning-model/
#!git clone https://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@github.com/ImmortalDemonGod/arc-neural-reasoning-model.git
!pip install -r /workspaces/arc-neural-reasoning-model/gpt2_arc/requirements.txt
!pip install optuna
#!pip install jupyterlab jupyterlab-optuna
# Install the required packages
!pip install optuna-dashboard pyngrok
!ngrok config add-authtoken 2NCEuxuUBMj6zsdTokQHkYJ4AZz_3E7e2pyW87otgFg3UdSC3
!pip install tensorboard
!pip install watchdog

[31mERROR: Could not open requirements file: [Errno 2] No such file or directory: '/content/arc-neural-reasoning-model/gpt2_arc/requirements.txt'[0m[31m
Collecting pyngrok
  Downloading pyngrok-7.2.0-py3-none-any.whl.metadata (7.4 kB)
Downloading pyngrok-7.2.0-py3-none-any.whl (22 kB)
Installing collected packages: pyngrok
Successfully installed pyngrok-7.2.0
Authtoken saved to configuration file: /home/codespace/.config/ngrok/ngrok.yml                      
Collecting watchdog
  Downloading watchdog-5.0.2-py3-none-manylinux2014_x86_64.whl.metadata (41 kB)
Downloading watchdog-5.0.2-py3-none-manylinux2014_x86_64.whl (78 kB)
Installing collected packages: watchdog
Successfully installed watchdog-5.0.2


In [4]:
%cd /workspaces/arc-neural-reasoning-model/EXPERIMENTAL/
#!rm -rf /content/arc-neural-reasoning-model/arc_sat_solver
#!rm -rf /content/arc-neural-reasoning-model/benchmark_results
#!rm -rf /content/arc-neural-reasoning-model/checkpoints
#!rm -rf /content/arc-neural-reasoning-model/tmp
#!pip install -e .

/workspaces/arc-neural-reasoning-model/EXPERIMENTAL


  self.shell.db['dhist'] = compress_dhist(dhist)[-100:]


### 2. Run hyperparameter tuning (in the background) click the **links** to see the dashboards:

make sure to set the directory in your drive to save the files in

In [5]:
import os
from datetime import datetime

# Get the current date in YYYYMMDD format
current_date = datetime.now().strftime('%Y%m%d')


# Create a folder named after the current date
date_folder = os.path.join(parent_dir, current_date)
if not os.path.exists(date_folder):
    os.makedirs(date_folder)

# Change into the newly created folder
%cd {date_folder}

# Now all your operations will save to /content/YYYYMMDD

/workspaces/arc-neural-reasoning-model/EXPERIMENTAL/20240924


In [None]:
# Now run the optimization command using the new date folder for storage in the background
# The user-defined hyperparameters are passed here dynamically
!nohup python /content/arc-neural-reasoning-model/gpt2_arc/src/optimize_hyperparameters.py \
  --n_trials {n_trials} \
  --storage sqlite:///{date_folder}/optuna_results.db \
  --n_embd_min {n_embd_min} --n_embd_max {n_embd_max} \
  --n_head_min {n_head_min} --n_head_max {n_head_max} \
  --n_layer_min {n_layer_min} --n_layer_max {n_layer_max} \
  --batch_size_min {batch_size_min} --batch_size_max {batch_size_max} \
  --learning_rate_min {learning_rate_min} --learning_rate_max {learning_rate_max} \
  --max_epochs_min {max_epochs_min} --max_epochs_max {max_epochs_max} > /dev/null 2>&1 &

# Import ngrok and set up the tunnel
from pyngrok import ngrok

# Start the ngrok tunnel for port 8081
public_url = ngrok.connect(8081)
print(f"Optuna dashboard is accessible at: {public_url}")

# Run the Optuna dashboard in the background without blocking
!nohup optuna-dashboard --port 8081 sqlite:///{date_folder}/optuna_results.db > /dev/null 2>&1 &

# Import necessary modules
import os

# Specify the folder for logs (using date_folder)
log_dir = f"{date_folder}/runs"

# Run TensorBoard in the background without blocking
!nohup tensorboard --logdir {log_dir} --port 6006 > /dev/null 2>&1 &

# Set up the ngrok tunnel for TensorBoard
public_url_tb = ngrok.connect(6006)
print(f"TensorBoard is accessible at: {public_url_tb}")

Optuna dashboard is accessible at: NgrokTunnel: "https://5b4a-35-240-146-94.ngrok-free.app" -> "http://localhost:8081"
TensorBoard is accessible at: NgrokTunnel: "https://90f7-35-240-146-94.ngrok-free.app" -> "http://localhost:6006"


### 3. Get the best hyperparameters:

In [None]:
import optuna
import json

# Set Optuna storage and study details
storage_name = f"sqlite:///{date_folder}/optuna_results.db"
study_name = "gpt2_arc_optimization"

try:
    # List all study names in the database
    study_summaries = optuna.study.get_all_study_summaries(storage=storage_name)
    print("Available studies in the database:")
    for study_summary in study_summaries:
        print(f"- {study_summary.study_name}")

    # Load the specified study
    study = optuna.load_study(study_name=study_name, storage=storage_name)
    best_params = study.best_params
    print("Best hyperparameters:")
    print(json.dumps(best_params, indent=2))

    # Save the best parameters to a JSON file
    with open(f"{date_folder}/best_hyperparameters.json", "w") as f:
        json.dump(best_params, f)

except KeyError as e:
    print("Error: The specified study does not exist in the database. Please ensure that the study name and storage path are correct.")
    print(f"Details: {str(e)}")
except Exception as e:
    print(f"An error occurred: {str(e)}")

Available studies in the database:
Error: The specified study does not exist in the database. Please ensure that the study name and storage path are correct.
Details: 'Record does not exist.'


### 4. Use the best hyperparameters for longer training (manually set max epochs!):

In [8]:
import json
import subprocess
import os

# Load best hyperparameters from the JSON file if use_best_params is True
if use_best_params:
    try:
        with open(f"{date_folder}/best_hyperparameters.json", "r") as f:
            best_params = json.load(f)
        # Extract hyperparameters from the JSON file
        params = {
            "n_embd": best_params.get("n_embd", manual_params["n_embd"]),
            "n_head": best_params.get("n_head", manual_params["n_head"]),
            "n_layer": best_params.get("n_layer", manual_params["n_layer"]),
            "batch_size": best_params.get("batch_size", manual_params["batch_size"]),
            "learning_rate": best_params.get("learning_rate", manual_params["learning_rate"]),
            "max_epochs": best_params.get("max_epochs", manual_params["max_epochs"])
        }
    except FileNotFoundError:
        print(f"Warning: {date_folder}/best_hyperparameters.json not found. Using manual parameters.")
        params = manual_params
else:
    # Use manually defined parameters
    params = manual_params

# Build the arguments for the training command
train_args = [
    "python", "/workspaces/arc-neural-reasoning-model/gpt2_arc/src/training/train.py",
    "--n-embd", str(params["n_embd"]),
    "--n-head", str(params["n_head"]),
    "--n-layer", str(params["n_layer"]),
    "--batch-size", str(params["batch_size"]),
    "--learning-rate", str(params["learning_rate"]),
    "--max-epochs", str(params["max_epochs"]),
    "--use-gpu",
    "--project", "arc-scaling-test"
]

# Execute the training script in the background
print("Starting training process in the background...")
with open(f"{date_folder}/training_output.log", "w") as log_file:
    process = subprocess.Popen(train_args, stdout=log_file, stderr=subprocess.STDOUT)

print(f"Training process started with PID: {process.pid}")
print(f"You can monitor the training progress in {date_folder}/training_output.log")

Starting training process in the background...
Training process started with PID: 7126
You can monitor the training progress in /workspaces/arc-neural-reasoning-model/EXPERIMENTAL/20240924/training_output.log


### 5. Evaluate the trained model:

In [12]:
import os
import wandb
import time
from watchdog.observers import Observer
from watchdog.events import FileSystemEventHandler
import subprocess

# Set W&B API key (replace with your actual API key)
wandb_api_key = "2b06e99af167044b281668f6edd388c633aba1a0"  # Replace with your W&B API key
os.environ["WANDB_API_KEY"] = wandb_api_key

# Define the date_folder variable appropriately
# For example, you can set it based on the current date
from datetime import datetime
date_folder = datetime.now().strftime("%Y%m%d")

# Directory containing the model files
model_dir = "/workspaces/arc-neural-reasoning-model/EXPERIMENTAL/"  # Ensure this path is correct
print(f"Watching for new models in directory: {model_dir}")
output_dir = f"{date_folder}/evaluation_results"
os.makedirs(output_dir, exist_ok=True)
wandb_project = "arc-evaluation"

# Set of evaluated models
evaluated_models = set()

class CheckpointHandler(FileSystemEventHandler):
    def on_created(self, event):
        if event.is_directory:
            return
        if event.src_path.endswith('.ckpt') or event.src_path.endswith('.pth'):
            print(f"New checkpoint detected: {event.src_path}")
            self.evaluate_model(event.src_path)

    def evaluate_model(self, model_path):
        model_file = os.path.basename(model_path)

        if model_file in evaluated_models:
            print(f"Skipping already evaluated model: {model_file}")
            return  # Skip if the model was already evaluated

        # Extract epoch and val_loss from the filename for run_name
        # Assuming the filename pattern arc_model-epoch=<number>-val_loss=<number>.ckpt
        try:
            parts = model_file.replace('.ckpt', '').split('-')
            epoch = None
            val_loss = None
            for part in parts:
                if part.startswith('epoch='):
                    epoch = part.split('=')[1]
                elif part.startswith('val_loss='):
                    val_loss = part.split('=')[1]
            if epoch is not None and val_loss is not None:
                run_name = f"scaling-test-evaluation-epoch{epoch}-val_loss{val_loss}"
            else:
                run_name = f"scaling-test-evaluation-{model_file}"
        except Exception as e:
            print(f"Error parsing run name from filename {model_file}: {e}")
            run_name = f"scaling-test-evaluation-{model_file}"

        eval_command = [
            "python", "/workspaces/arc-neural-reasoning-model/gpt2_arc/src/evaluate.py",
            "--model_checkpoint", model_path,
            "--batch_size", "32",
            "--output_dir", output_dir,
            "--wandb_project", wandb_project,
            "--wandb_run_name", run_name
        ]

        print(f"Evaluating model: {model_file} with command: {eval_command}")
        try:
            # Run the evaluation command and capture stdout and stderr
            result = subprocess.run(
                eval_command,
                check=True,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                text=True  # Automatically decode bytes to string
            )
            print(f"Successfully evaluated model: {model_file}")
            print("Evaluation Output:")
            print(result.stdout)  # Print the standard output from evaluate.py
            if result.stderr:
                print("Evaluation Errors/Warnings:")
                print(result.stderr)  # Print any errors or warnings from evaluate.py
        except subprocess.CalledProcessError as e:
            print(f"Error during evaluation of {model_file}: {e}")
            print("Standard Output:")
            print(e.stdout)  # Print the standard output even if there's an error
            print("Standard Error:")
            print(e.stderr)  # Print the error messages
        except Exception as ex:
            print(f"An unexpected error occurred while evaluating {model_file}: {ex}")

        evaluated_models.add(model_file)

def get_all_checkpoint_files(directory):
    print(f"Checking directory for .ckpt and .pth files: {directory}")
    checkpoint_files = []
    for root, _, files in os.walk(directory):
        checkpoint_files.extend([os.path.join(root, f) for f in files if f.endswith('.ckpt') or f.endswith('.pth')])
    print(f"Found checkpoint files: {checkpoint_files}")
    return checkpoint_files

# Set up and start the watchdog observer
event_handler = CheckpointHandler()
observer = Observer()
observer.schedule(event_handler, model_dir, recursive=True)
observer.start()

print("Watching for new checkpoints and final models in all subdirectories...")
print("This script will continue running until you stop it manually.")
print("You can stop it by interrupting the process when training is complete.")

try:
    while True:
        time.sleep(10)

        # Check for any new models
        current_models = set(f for f in get_all_checkpoint_files(model_dir))
        new_models = current_models - evaluated_models

        print(f"Current models: {current_models}")
        print(f"New models to evaluate: {new_models}")

        for model_path in new_models:
            event_handler.evaluate_model(model_path)

except KeyboardInterrupt:
    observer.stop()

observer.join()
print("Checkpoint and final model evaluation completed.")


Watching for new models in directory: /workspaces/arc-neural-reasoning-model/EXPERIMENTAL/
Watching for new checkpoints and final models in all subdirectories...
This script will continue running until you stop it manually.
You can stop it by interrupting the process when training is complete.
Checking directory for .ckpt and .pth files: /workspaces/arc-neural-reasoning-model/EXPERIMENTAL/
Found checkpoint files: []
Current models: set()
New models to evaluate: set()
Checking directory for .ckpt and .pth files: /workspaces/arc-neural-reasoning-model/EXPERIMENTAL/
Found checkpoint files: []
Current models: set()
New models to evaluate: set()
Checking directory for .ckpt and .pth files: /workspaces/arc-neural-reasoning-model/EXPERIMENTAL/
Found checkpoint files: []
Current models: set()
New models to evaluate: set()
Checking directory for .ckpt and .pth files: /workspaces/arc-neural-reasoning-model/EXPERIMENTAL/
Found checkpoint files: []
Current models: set()
New models to evaluate: set

### 6. Analyze the results:

In [None]:
#import json

# Load and print evaluation results
#with open("./evaluation_results/scaling-test-evaluation_results.json", "r") as f:
#    results = json.load(f)

#print("Evaluation Results:")
#print(json.dumps(results, indent=2))