In [None]:
class DataLoaderLite:
    """Sharded token loader that hands out (input, target) batches of shape (B, T).

    Each process reads its own slice of the current shard, offset by
    ``process_rank`` and strided by ``num_processes``. The read cursor
    (shard index + position) can be exported and restored for checkpointing.

    NOTE(review): the visible code reads ``self.B``, ``self.T``,
    ``self.process_rank``, ``self.num_processes``, ``self.shards`` and
    ``self.tokens``; they are presumably set by the elided initialization
    code -- confirm. ``load_tokens`` is defined elsewhere in the project.
    """

    def __init__(self, B, T, process_rank, num_processes, split):
        # ... (other initialization code)
        self.current_shard = 0
        self.current_position = self.B * self.T * self.process_rank

    def next_batch(self):
        """Return the next ``(x, y)`` pair and move the cursor forward.

        ``y`` is ``x`` shifted by one token. After reading, the cursor jumps
        past the slices of all processes; if a further full step would not fit
        in the current shard, the next shard (wrapping around) is loaded and
        the cursor is reset to this process's starting offset.
        """
        tokens_per_batch = self.B * self.T
        start = self.current_position
        # One extra token so that the targets can be shifted by one position.
        window = self.tokens[start : start + tokens_per_batch + 1]
        x = window[:-1].view(self.B, self.T)  # inputs
        y = window[1:].view(self.B, self.T)  # targets

        # Skip over the slices consumed by every process in this step.
        stride = tokens_per_batch * self.num_processes
        self.current_position = start + stride

        # Roll over to the next shard when another full step would overrun this one.
        if self.current_position + stride + 1 > len(self.tokens):
            next_shard = (self.current_shard + 1) % len(self.shards)
            self.current_shard = next_shard
            self.tokens = load_tokens(self.shards[next_shard])
            self.current_position = tokens_per_batch * self.process_rank
        return x, y

    def set_state(self, state):
        """Restore the cursor from a dict produced by ``get_state`` and reload that shard."""
        self.current_shard = state['current_shard']
        self.current_position = state['current_position']
        shard_path = self.shards[self.current_shard]
        self.tokens = load_tokens(shard_path)

    def get_state(self):
        """Return the current shard index and position as a plain dict."""
        return dict(
            current_shard=self.current_shard,
            current_position=self.current_position,
        )

# In the training loop:
if args.resume_from_checkpoint:
    # os.listdir() returns bare file names; join them with log_dir first so that
    # os.path.getctime() stats the real files instead of names relative to the CWD.
    checkpoint_candidates = [
        os.path.join(log_dir, name)
        for name in os.listdir(log_dir)
        if name.startswith("checkpoint")
    ]
    if not checkpoint_candidates:
        # Fail with a clear message instead of max()'s "empty sequence" ValueError.
        raise FileNotFoundError(f"No checkpoint files found in {log_dir}")
    # Most recently created/changed checkpoint wins.
    checkpoint_path = max(checkpoint_candidates, key=os.path.getctime)
    latest_checkpoint = os.path.basename(checkpoint_path)
    # load_checkpoint restores model/optimizer/scheduler/loader state in place and
    # returns the step to continue from (helper defined elsewhere in the project).
    starting_step, val_loss, run_name = load_checkpoint(checkpoint_path, raw_model, optimizer, lr_scheduler, train_loader, val_loader)
    print(f"Resuming from checkpoint at step {starting_step}")
else:
    starting_step = 0
    # ... (setup for new run)

# Main optimization loop; starts at `starting_step` so a resumed run continues
# where the checkpoint left off.
for step in range(starting_step, max_steps):
    # Fetch this step's inputs and targets.
    x, y = train_loader.next_batch()

    # ... (rest of the training loop)

    # Checkpoint every 2000 steps and on the final step.
    on_save_interval = step % 2000 == 0
    if on_save_interval or last_step:
        current_val_loss = val_loss_accum.item()
        train_state = train_loader.get_state()
        val_state = val_loader.get_state()
        checkpoint_path = save_checkpoint(
            raw_model,
            optimizer,
            lr_scheduler,
            step,
            current_val_loss,
            run_name,
            train_state,
            val_state,
        )
        # ... (push to HuggingFace, etc.)

# Moving hyperparameters into a config dictionary for train_gpt2.py

To modify your train_gpt2.py file so that the main hyperparameters are controlled from the config dictionary instead of having integer variables scattered throughout the code, you can make the following changes:

1. Update the config dictionary to include all the main hyperparameters:

```python
# Central place for the main training hyperparameters.
config = {
    "batch_size": 64,
    "weight_decay": 0.1,
    "learning_rate": 6e-4,  # same value as "max_lr" below; keep the two in sync
    "lr_scheduler_type": "cosine",
    "num_warmup_steps": 715,
    "max_train_steps": 19073,
    "max_eval_steps": 20,
    "seq_length": 1024,
    "seed": 1,
    "eval_interval": 250,
    "save_interval": 2000,
    "total_batch_size": 524288,  # 2**19
    "max_lr": 6e-4,
    "min_lr": 6e-5,
}
# Derived from the entries above rather than re-hard-coding 2**19, 64 and 1024,
# so changing batch_size / seq_length / total_batch_size cannot leave it stale.
config["gradient_accumulation_steps"] = config["total_batch_size"] // (
    config["batch_size"] * config["seq_length"] * ddp_world_size
)
```

2. Replace the hardcoded values in your code with references to the config dictionary. Here are some examples:

```python
# Replace
B = 64
T = 1024

# With
B = config["batch_size"]
T = config["seq_length"]

# Replace
total_batch_size = 524288

# With
total_batch_size = config["total_batch_size"]

# Replace
grad_accum_steps = total_batch_size // (B * T * ddp_world_size)

# With
grad_accum_steps = config["gradient_accumulation_steps"]

# Replace
max_lr = 6e-4
min_lr = max_lr * 0.1
warmup_steps = 715
max_steps = 19073

# With
max_lr = config["max_lr"]
min_lr = config["min_lr"]
warmup_steps = config["num_warmup_steps"]
max_steps = config["max_train_steps"]

# Replace
if step % 250 == 0 or last_step:

# With
if step % config["eval_interval"] == 0 or last_step:

# Replace
if step % 2000 == 0 or last_step:

# With
if step % config["save_interval"] == 0 or last_step:
```

3. Update the learning rate scheduler function:

```python
def get_lr(it):
    """Return the learning rate for step ``it`` (linear warmup, then cosine decay).

    Reads ``num_warmup_steps``, ``max_train_steps``, ``max_lr`` and ``min_lr``
    from the module-level ``config`` dict.
    """
    warmup = config["num_warmup_steps"]
    # Phase 1: linear ramp towards max_lr; (it + 1) keeps step 0 above zero.
    if it < warmup:
        return config["max_lr"] * (it + 1) / warmup

    # Phase 3: past the schedule, hold at the floor.
    horizon = config["max_train_steps"]
    if it > horizon:
        return config["min_lr"]

    # Phase 2: cosine decay from max_lr down to min_lr.
    progress = (it - warmup) / (horizon - warmup)
    assert 0 <= progress <= 1
    cosine_weight = 0.5 * (1.0 + math.cos(math.pi * progress))  # goes 1 -> 0
    return config["min_lr"] + cosine_weight * (config["max_lr"] - config["min_lr"])
```

4. Update the optimizer configuration:

```python
optimizer = raw_model.configure_optimizers(
    weight_decay=config["weight_decay"],
    learning_rate=config["learning_rate"],
    device_type=device_type
)
```

5. Update the evaluation loop:

```python
val_loss_steps = config["max_eval_steps"]
for _ in range(val_loss_steps):
    # ... (rest of the evaluation loop)
```

By making these changes, you'll centralize all the main hyperparameters in the config dictionary, making it easier to modify and experiment with different settings. This approach also makes your code more maintainable and reduces the risk of inconsistencies between different parts of your script.

You're right, and I apologize for the oversight. The `load_checkpoint` function you defined isn't being used in the code snippet you've shown. This is likely the reason why your training isn't resuming from the checkpoint as expected.

To fix this, you should modify your code to use the `load_checkpoint` function. Here's how you can integrate it into your existing code:

```python
if args.resume_from_checkpoint:
    checkpoint_dir = "./log"
    checkpoint_pattern = os.path.join(checkpoint_dir, "checkpoint_*.pt")
    checkpoint_files = glob.glob(checkpoint_pattern)

    if not checkpoint_files:
        print(f"No checkpoint files found in {checkpoint_dir}")
        # Nothing to resume from: start from scratch so that the training loop's
        # range(starting_step, max_steps) still has a defined start.
        # NOTE(review): logger / run_name setup for a fresh run is not shown in
        # this snippet -- make sure the new-run setup also executes on this path.
        starting_step = 0
    else:
        # glob returns paths that already include checkpoint_dir, so they can be
        # handed to os.path.getctime directly.
        latest_checkpoint = max(checkpoint_files, key=os.path.getctime)
        checkpoint_path = latest_checkpoint

        # Use the load_checkpoint function here
        starting_step, val_loss, run_name = load_checkpoint(checkpoint_path, raw_model, optimizer, train_loader, val_loader)

        # Only the wandb run id is needed from this second read, so map the
        # tensors to CPU rather than onto the device they were saved from.
        checkpoint = torch.load(checkpoint_path, map_location="cpu")
        wandb_id = checkpoint['wandb_id']

        logger, run_name = resume_logging(project_name.split("/")[1], wandb_id, args)
        print(f"Resuming from checkpoint: {checkpoint_path}")
        print(f"Weights and Biases run name: {run_name}")
        print(f"Resuming from step: {starting_step}")

        if master_process:
            # Initialize HuggingFace repository <-- UNSURE IF NEEDED
            new_branch_name = run_name
            create_branch(project_name, repo_type="model", branch=new_branch_name)
            hf_repo = Repository("./", clone_from=project_name, revision=run_name)

            print(f"Resuming from checkpoint at step {starting_step}")
```

This modification does the following:

1. It uses your `load_checkpoint` function to load the model state, optimizer state, and data loader states.
2. It retrieves the starting step, validation loss, and run name from the checkpoint.
3. It still loads the full checkpoint to get the `wandb_id`.

Also, make sure that in your training loop, you're starting from `starting_step` instead of 0:

```python
for step in range(starting_step, max_steps):
    # Your training code here
```

This should ensure that your training resumes from the correct step when loading a checkpoint.

Remember to also update your `save_checkpoint` function to include all necessary information:

```python
def save_checkpoint(model, optimizer, step, val_loss, run_name, train_loader_state, val_loader_state, wandb_id, log_dir="./log"):
    """Write a training checkpoint to ``<log_dir>/checkpoint_<step>.pt``.

    Args:
        model: Object whose ``state_dict()`` is stored under ``'model'``.
        optimizer: Object whose ``state_dict()`` is stored under ``'optimizer'``.
        step: Training step; also used (zero-padded to 5 digits) in the file name.
        val_loss: Validation loss value to record.
        run_name: Run name to record.
        train_loader_state: Train data loader state to record.
        val_loader_state: Validation data loader state to record.
        wandb_id: Weights & Biases run id to record.
        log_dir: Directory to write into; created if it does not exist.

    Returns:
        The path of the checkpoint file that was written.
    """
    checkpoint = {
        'model': model.state_dict(),
        'optimizer': optimizer.state_dict(),
        'step': step,
        'val_loss': val_loss,
        'run_name': run_name,
        'train_loader_state': train_loader_state,
        'val_loader_state': val_loader_state,
        'wandb_id': wandb_id
    }
    # torch.save does not create missing parent directories.
    os.makedirs(log_dir, exist_ok=True)
    checkpoint_path = os.path.join(log_dir, f"checkpoint_{step:05d}.pt")
    torch.save(checkpoint, checkpoint_path)
    return checkpoint_path
```

These changes should help your training resume correctly from the checkpoint.

torchrun --standalone --nproc_per_node=1 train_gpt2-trail.py

torchrun --standalone --nproc_per_node=6 train_gpt2-final.py

# Tasks for tomorrow

- get model converted and usable by HF
- fix and clear training files
- get RMSN trained
- check PN

---

- close GPU when leaving