# VLM_Pretrain



### Requirements

- Python 3.10.16
- CUDA 12.2
- [requirements.txt](./requirements.txt)



### Start Training



#### Step 0: Download the CLIP model and the language model
```bash
# Download the CLIP model into the ./model/vision_model directory
git clone https://huggingface.co/openai/clip-vit-base-patch16 ./model/vision_model/clip-vit-base-patch16
# or
git clone https://www.modelscope.cn/models/openai-mirror/clip-vit-base-patch16 ./model/vision_model/clip-vit-base-patch16
```

```bash
# Download the pure language model weights to the ./out directory (as the base language model for training VLM)
wget -P ./out https://huggingface.co/jingyaogong/MiniMind2-V-PyTorch/resolve/main/lm_512.pth
```

#### Step 1: Install the dependencies

```bash
pip install -r requirements.txt
```

#### Step 2: Prepare the dataset

Download the dataset from [here](https://huggingface.co/datasets/jingyaogong/minimind-v_dataset) and put it in the `./dataset` directory.

```bash
git clone https://huggingface.co/datasets/jingyaogong/minimind-v_dataset
```

The `*.jsonl` files contain the question-answer data and the `*images` archives contain the corresponding images; the image archives must be decompressed after downloading.

#### Step 3: Run the code!

In [7]:
# First import the necessary libraries
import os
import sys
import torch
import time
import math
from contextlib import nullcontext
from torch import optim, nn
from torch.utils.data import DataLoader
from transformers import AutoTokenizer

# Add path
sys.path.append('..')
from model.model_vlm import MiniMindVLM, VLMConfig
from dataset.lm_dataset import VLMDataset

print("Environment setup completed!")


Environment setup completed!


In [8]:
# Configuration
class Config:
    """Plain container holding every tunable setting used by this notebook.

    All values are fixed defaults assigned in ``__init__``; edit them here
    (or on the ``args`` instance) before running the cells below.
    """

    def __init__(self):
        # Runtime: first CUDA device when CUDA is usable, otherwise the CPU.
        if torch.cuda.is_available():
            self.device = "cuda:0"
        else:
            self.device = "cpu"

        # Optimisation schedule
        self.epochs = 1
        self.batch_size = 24
        self.learning_rate = 4e-4
        self.accumulation_steps = 1
        self.grad_clip = 1.0

        # Logging / checkpoint cadence, counted in steps
        self.log_interval = 100
        self.save_interval = 1000

        # Model configuration
        self.hidden_size = 512
        self.num_hidden_layers = 8
        self.max_seq_len = 640
        self.use_moe = False

        # Data paths
        self.data_path = "dataset/pretrain_data.jsonl"
        self.images_path = "dataset/pretrain_images"
        self.save_dir = "out"


args = Config()
print(
    f"Using device: {args.device}",
    f"Training epochs: {args.epochs}",
    f"Batch size: {args.batch_size}",
    sep="\n",
)


Using device: cuda:0
Training epochs: 1
Batch size: 24


In [9]:
# Initialize model
def init_model():
    """Build the VLM, load the base language-model weights and freeze all but the projector.

    Reads the notebook-level ``args`` for model sizes, the MoE switch, the
    target device and the checkpoint directory.

    Returns:
        tuple: ``(model, tokenizer, preprocess, model_config)`` where ``model``
        has been moved to ``args.device`` and ``preprocess`` is ``model.processor``.

    Raises:
        FileNotFoundError: if the base checkpoint ``{save_dir}/lm_{hidden_size}[_moe].pth``
            does not exist (raised by ``torch.load``).
    """
    vision_model_path = "model/vision_model/clip-vit-base-patch16"

    # Create model configuration. `use_moe` is forwarded so the Config switch is
    # not silently ignored; it also drives the checkpoint-name suffix below.
    model_config = VLMConfig(
        hidden_size=args.hidden_size,
        num_hidden_layers=args.num_hidden_layers,
        max_seq_len=args.max_seq_len,
        use_moe=args.use_moe,
    )

    # Load tokenizer
    # NOTE(review): the tokenizer is read from the CLIP directory; confirm this
    # directory really holds the language-model tokenizer the checkpoint expects.
    tokenizer = AutoTokenizer.from_pretrained(vision_model_path, use_fast=True)

    # Create VLM model
    model = MiniMindVLM(model_config, vision_model_path=vision_model_path)

    # Load pre-trained language model weights
    moe_path = '_moe' if model_config.use_moe else ''
    ckp = f'{args.save_dir}/lm_{model_config.hidden_size}{moe_path}.pth'
    # NOTE(review): torch.load unpickles the file - only load checkpoints that
    # come from a trusted source.
    state_dict = torch.load(ckp, map_location=args.device)
    # strict=False: keys missing from the language-model checkpoint are left at
    # their freshly initialised values instead of raising.
    model.load_state_dict(state_dict, strict=False)

    # Only train vision_proj layer, freeze other parameters
    for name, param in model.named_parameters():
        if 'vision_proj' not in name:
            param.requires_grad = False

    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad) / 1e6
    print(f'VLM trainable parameters: {trainable_params:.3f}M')

    preprocess = model.processor
    return model.to(args.device), tokenizer, preprocess, model_config

# Initialize
model, tokenizer, preprocess, model_config = init_model()
print("Model initialization completed!")


VLM trainable parameters: 0.394M
Model initialization completed!


In [10]:
# Create dataset and data loader
train_ds = VLMDataset(
    args.data_path,
    args.images_path,
    tokenizer,
    max_length=args.max_seq_len,
    image_special_token=model_config.image_special_token,
    preprocess=preprocess,
)

train_loader = DataLoader(
    train_ds,
    batch_size=args.batch_size,
    shuffle=True,
    drop_last=False,
    num_workers=4,
    pin_memory=True,
)

print(f"Dataset size: {len(train_ds)}")
print(f"Number of batches: {len(train_loader)}")

# Initialize training components
os.makedirs(args.save_dir, exist_ok=True)

# Mixed precision is only used off-CPU; on CPU the context is a no-op and the
# gradient scaler is disabled.
if args.device == "cpu":
    ctx = nullcontext()
else:
    ctx = torch.cuda.amp.autocast()
scaler = torch.cuda.amp.GradScaler(enabled=(args.device != "cpu"))

# Hand only the still-trainable parameters to the optimizer.
optimizer = optim.AdamW(
    (p for p in model.parameters() if p.requires_grad),
    lr=args.learning_rate,
)

print("Data loader and optimizer setup completed!")


Dataset size: 595375
Number of batches: 24808
Data loader and optimizer setup completed!


In [11]:
# Helper functions
def get_lr(current_step, total_steps, lr):
    """Cosine learning-rate schedule with a floor of ``lr / 10``.

    Args:
        current_step: Index of the step being scheduled.
        total_steps: Length of the schedule; must be non-zero.
        lr: Base learning rate.

    Returns:
        float: ``1.1 * lr`` at ``current_step == 0``, decaying along a half
        cosine to ``0.1 * lr`` at ``current_step == total_steps``.
    """
    floor = lr / 10
    cosine = math.cos(math.pi * current_step / total_steps)
    return floor + 0.5 * lr * (1 + cosine)

# Simplified training function
def train_step(X, Y, loss_mask, pixel_values, step, epoch, total_steps):
    """Run one forward/backward pass and, on accumulation boundaries, one optimizer update.

    Uses the notebook-level ``model``, ``optimizer``, ``scaler``, ``ctx`` and ``args``.

    Args:
        X: Input token ids passed to the model.
        Y: Target token ids; the per-token loss is reshaped to ``Y.size()``.
        loss_mask: Per-token weights multiplied element-wise into the loss.
        pixel_values: Image tensor forwarded to the model.
        step: Global step index; drives the LR schedule and the accumulation boundary.
        epoch: Unused; kept so existing call sites keep working.
        total_steps: Total number of steps used by the LR schedule.

    Returns:
        tuple[float, float]: the batch loss *before* division by
        ``args.accumulation_steps`` (so logged values are comparable across
        accumulation settings) and the learning rate applied for this step.
    """
    loss_fct = nn.CrossEntropyLoss(reduction='none')

    # Adjust learning rate
    lr = get_lr(step, total_steps, args.learning_rate)
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr

    # Forward pass
    with ctx:
        res = model(X, pixel_values=pixel_values)
        loss = loss_fct(
            res.logits.view(-1, res.logits.size(-1)),
            Y.view(-1)
        ).view(Y.size())

        # Masked mean over the tokens selected by loss_mask.
        loss = (loss * loss_mask).sum() / loss_mask.sum()
        # Auxiliary loss reported by the model (presumably the MoE balancing term - confirm).
        loss += res.aux_loss
        # Keep the unscaled value for reporting; only the backward pass needs
        # the loss divided by the number of accumulated micro-batches.
        reported_loss = loss.detach()
        loss = loss / args.accumulation_steps

    # Backward pass
    scaler.scale(loss).backward()

    # Gradient update
    if (step + 1) % args.accumulation_steps == 0:
        # Gradients must be unscaled before clipping so the threshold applies
        # to their true magnitude.
        scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.grad_clip)
        scaler.step(optimizer)
        scaler.update()
        optimizer.zero_grad(set_to_none=True)

    return reported_loss.item(), lr

print("Training function definition completed!")


Training function definition completed!


In [None]:
# Start training
print("=" * 50)
print("Starting VLM pre-training")
print("=" * 50)

model.train()
iter_per_epoch = len(train_loader)
total_steps = args.epochs * iter_per_epoch

for epoch in range(args.epochs):
    start_time = time.time()
    epoch_loss = 0.0

    for step, (X, Y, loss_mask, pixel_values) in enumerate(train_loader):
        # Move data to device
        X = X.to(args.device)
        Y = Y.to(args.device)
        loss_mask = loss_mask.to(args.device)
        pixel_values = pixel_values.to(args.device)

        # Training step (global_step keeps the LR schedule continuous across epochs)
        global_step = epoch * iter_per_epoch + step
        loss, lr = train_step(X, Y, loss_mask, pixel_values, global_step, epoch, total_steps)
        epoch_loss += loss

        # Print logs
        if step % args.log_interval == 0 and step > 0:
            spend_time = time.time() - start_time
            avg_loss = epoch_loss / (step + 1)
            # Calculate estimated time remaining for the epoch
            steps_remaining_epoch = iter_per_epoch - (step + 1)
            time_per_step = spend_time / (step + 1)
            estimated_time_epoch = steps_remaining_epoch * time_per_step

            # Calculate estimated total time remaining
            steps_remaining_total = total_steps - global_step - 1
            estimated_time_total = steps_remaining_total * time_per_step

            print(f'Epoch:[{epoch+1}/{args.epochs}]({step}/{iter_per_epoch}) '
                  f'loss:{loss:.3f} avg_loss:{avg_loss:.3f} lr:{lr:.7f} '
                  f'Epoch ETA: {estimated_time_epoch:.2f}s Total ETA: {estimated_time_total:.2f}s')

        # Save model (the same file is overwritten at every save interval)
        if (step + 1) % args.save_interval == 0:
            model.eval()
            moe_path = '_moe' if model_config.use_moe else ''
            ckp = f'{args.save_dir}/pretrain_vlm_{model_config.hidden_size}{moe_path}.pth'
            state_dict = model.state_dict()
            # Only save non-vision_encoder parameters
            clean_state_dict = {
                key: value for key, value in state_dict.items()
                if not key.startswith('vision_encoder.')
            }
            # Store the checkpoint in half precision
            clean_state_dict = {k: v.half() for k, v in clean_state_dict.items()}
            torch.save(clean_state_dict, ckp)
            print(f"Model saved to: {ckp}")
            model.train()

    # Statistics after each epoch
    epoch_time = time.time() - start_time
    avg_loss = epoch_loss / iter_per_epoch
    print(f'Epoch {epoch+1} completed! Average loss: {avg_loss:.3f}, Time: {epoch_time:.2f}s')

print("Training completed!")

In [None]:
# Final model save
model.eval()
if model_config.use_moe:
    moe_path = '_moe'
else:
    moe_path = ''
final_ckp = f'{args.save_dir}/pretrain_vlm_{model_config.hidden_size}{moe_path}_final.pth'

# Drop the vision-encoder weights and store everything else in half precision.
state_dict = model.state_dict()
clean_state_dict = {
    k: v.half()
    for k, v in state_dict.items()
    if not k.startswith('vision_encoder.')
}
torch.save(clean_state_dict, final_ckp)

print(f"Final model saved to: {final_ckp}")
print("All training processes completed!")


In [None]:
# Try to chat with the model

import os
from PIL import Image
from transformers import TextStreamer

# Prints generated text as it is produced; the prompt and special tokens are skipped.
streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

def chat_with_vlm(prompt, pixel_values, image_names):
    """Ask the model one question about an image and stream the answer to stdout.

    Uses the notebook-level ``tokenizer``, ``model``, ``streamer`` and ``args``.

    Args:
        prompt: User message; expected to already contain the image placeholder tokens.
        pixel_values: Image tensor forwarded to ``model.generate``.
        image_names: Label printed before the answer (only used for display).

    Returns:
        str: The decoded reply with the prompt tokens and special tokens removed.
    """
    messages = [{"role": "user", "content": prompt}]
    # tokenize=False yields the rendered chat text; the slice keeps only its
    # last ``max_seq_len - 1`` characters (a character cut, not a token cut).
    new_prompt = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )[-args.max_seq_len + 1:]

    inputs = tokenizer(
        new_prompt,
        return_tensors="pt",
        truncation=True
    ).to(args.device)

    print(f'[Image]: {image_names}')
    print('🤖️: ', end='')
    generated_ids = model.generate(
        inputs["input_ids"],
        max_new_tokens=args.max_seq_len,
        num_return_sequences=1,
        do_sample=True,
        attention_mask=inputs["attention_mask"],
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id,
        streamer=streamer,
        top_p=0.95,
        temperature=0.6,
        pixel_values=pixel_values
    )

    # Strip the echoed prompt tokens so only the newly generated reply is decoded.
    prompt_length = inputs["input_ids"].shape[1]
    response = tokenizer.decode(generated_ids[0][prompt_length:], skip_special_tokens=True)
    print('\n\n')
    return response


image_dir = './dataset/eval_images/'
prompt = f"{model.params.image_special_token}\nPlease describe the content of the image."

# os.listdir returns entries in arbitrary order; sort so repeated runs visit
# the images in the same sequence.
for image_file in sorted(os.listdir(image_dir)):
    # Close the file handle deterministically; convert() returns an independent copy.
    with Image.open(os.path.join(image_dir, image_file)) as raw_image:
        image = raw_image.convert('RGB')
    # unsqueeze(0) adds a leading dimension in front of the tensor returned by image2tensor.
    pixel_tensors = MiniMindVLM.image2tensor(image, preprocess).to(args.device).unsqueeze(0)
    chat_with_vlm(prompt, pixel_tensors, image_file)

## Reference

- [Minimind-v](https://github.com/jingyaogong/minimind-v)