Mini VLA-Agent: Vision + Language → Action

In [2]:
# Install dependencies into the *kernel's* environment (%pip, not !pip).
# The extras spec is quoted so the shell cannot glob-expand the brackets.
# TODO: pin versions once the known-good set is recorded.
%pip install -q gymnasium "imageio[ffmpeg]" transformers torch torchvision accelerate sentencepiece


In [1]:

import gymnasium as gym
import torch
import torch.nn as nn
import torchvision.transforms as T
from transformers import CLIPProcessor, CLIPModel, DistilBertModel, DistilBertTokenizer
from PIL import Image
import numpy as np
import matplotlib.pyplot as plt
import imageio.v2 as imageio
from IPython.display import Video, HTML
import json
import imageio
from base64 import b64encode


In [2]:
class MiniVLAAgent(nn.Module):
    """Tiny vision-language-action policy.

    An image is embedded with the ``openai/clip-vit-base-patch16`` CLIP
    checkpoint and an instruction with ``distilbert-base-uncased``. The two
    embeddings are concatenated, passed through a 256-unit ReLU layer, and
    mapped to one logit per action.

    Both pretrained encoders are evaluated under ``torch.no_grad()``, so
    gradients can only reach ``fuse`` and ``policy_head``.

    Args:
        action_dim: Number of output logits (size of the action space).
    """

    def __init__(self, action_dim):
        super().__init__()
        # Pretrained backbones (downloaded / loaded from the local HF cache).
        self.clip = CLIPModel.from_pretrained("openai/clip-vit-base-patch16")
        self.text_encoder = DistilBertModel.from_pretrained("distilbert-base-uncased")

        # Width of the concatenated [vision | text] feature vector.
        vision_width = self.clip.config.projection_dim
        text_width = self.text_encoder.config.dim

        # Trainable fusion layer followed by the action-logit head.
        self.fuse = nn.Linear(vision_width + text_width, 256)
        self.policy_head = nn.Linear(256, action_dim)

    def forward(self, image, text_input):
        """Compute action logits for a batch of (image, instruction) pairs.

        Args:
            image: Mapping of keyword arguments for
                ``CLIPModel.get_image_features`` (e.g. the output of a
                ``CLIPProcessor`` call).
            text_input: Mapping of keyword arguments for the DistilBERT
                forward pass (e.g. the output of its tokenizer).

        Returns:
            Tensor of action logits whose last dimension is ``action_dim``.
        """
        # Encoders are used as fixed feature extractors: no autograd graph.
        with torch.no_grad():
            image_features = self.clip.get_image_features(**image)
            token_states = self.text_encoder(**text_input).last_hidden_state
            # Hidden state at the first token position summarises the text.
            first_token_features = token_states[:, 0, :]

        joint = torch.cat([image_features, first_token_features], dim=-1)
        hidden = torch.relu(self.fuse(joint))
        return self.policy_head(hidden)


# Set up the environment

In [3]:
# Fixed seed so the randomly initialised fusion/policy layers (and therefore
# the chosen actions) are the same on every Restart & Run All.
SEED = 0
torch.manual_seed(SEED)

# CartPole rendered to RGB arrays: the agent acts on pixels, not on `obs`.
env = gym.make("CartPole-v1", render_mode="rgb_array")
n_actions = env.action_space.n
device = "cuda" if torch.cuda.is_available() else "cpu"

# The agent is only used for inference in this notebook, so switch it (and
# every sub-module) to eval mode explicitly.
agent = MiniVLAAgent(n_actions).to(device).eval()

# Pre-processing that matches the two pretrained encoders inside the agent.
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch16")
text_tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


In [4]:
def vla_action(frame, instruction):
    """Pick the greedy action for one rendered frame and a text instruction.

    Args:
        frame: Image array accepted by ``PIL.Image.fromarray`` (here: the
            RGB frame returned by ``env.render()``).
        instruction: Natural-language instruction string.

    Returns:
        The index of the highest-scoring action as a Python scalar.
    """
    image = clip_processor(images=Image.fromarray(frame), return_tensors="pt").to(device)
    text_input = text_tokenizer(
        instruction, return_tensors="pt", truncation=True, padding=True
    ).to(device)

    with torch.no_grad():
        logits = agent(image, text_input)
        # Softmax is monotonic, so the arg-max of the logits is already the
        # most probable action; no need to normalise first.
        action = torch.argmax(logits, dim=-1).item()
    return action

In [7]:


MAX_STEPS = 1000    # hard cap on rollout length (the env may end the episode sooner)
EPISODE_SEED = 0    # fixed seed so the initial state is reproducible

frames = []  # rendered RGB frames, one per step, for the video export
logs = []    # structured per-step records, dumped to JSON below

instruction = "Keep the pole balanced"

try:
    obs, _ = env.reset(seed=EPISODE_SEED)
    print(f"🎯 Instruction for VLA: '{instruction}'\n")

    for step in range(MAX_STEPS):
        # The frame is rendered *before* stepping: it is what the agent sees.
        frame = env.render()
        action = vla_action(frame, instruction)  # vision-language reasoning

        obs, reward, done, trunc, info = env.step(action)
        frames.append(frame)

        # --- Logging for understanding ---
        print(f"🌀 Step {step}")
        print(f"Observation (simplified): {str(obs)[:120]}...")
        print(f"Action taken: {action}")
        print(f"Reward received: {reward:.3f}")
        if 'clip_score' in info:
            print(f"CLIP similarity score: {info['clip_score']:.3f}")
        print("-" * 60)

        # --- Save to structured log list ---
        logs.append({
            "step": step,
            "instruction": instruction,
            "action": str(action),
            "reward": float(reward),
            # Coerce to built-in bool so json.dump never sees a NumPy bool.
            "done": bool(done),
            "truncated": bool(trunc),
            "info": {k: float(v) if isinstance(v, (int, float)) else str(v) for k, v in info.items()}
        })

        if done or trunc:
            print("\n✅ Task completed or terminated early.")
            break
finally:
    # Release the environment even if the policy or the env raised mid-rollout.
    env.close()

# --- Optional: export log file for GitHub transparency ---
with open("vla_run_log.json", "w") as f:
    json.dump(logs, f, indent=4)

print("\n📄 Saved structured log to 'vla_run_log.json'")


🎯 Instruction for VLA: 'Keep the pole balanced'

🌀 Step 0
Observation (simplified): [ 0.02522683  0.20051472  0.04357047 -0.27844906]...
Action taken: 1
Reward received: 1.000
------------------------------------------------------------
🌀 Step 1
Observation (simplified): [ 0.02923713  0.3949889   0.03800149 -0.5570778 ]...
Action taken: 1
Reward received: 1.000
------------------------------------------------------------
🌀 Step 2
Observation (simplified): [ 0.0371369   0.58955735  0.02685993 -0.83755   ]...
Action taken: 1
Reward received: 1.000
------------------------------------------------------------
🌀 Step 3
Observation (simplified): [ 0.04892805  0.78430235  0.01010893 -1.1216663 ]...
Action taken: 1
Reward received: 1.000
------------------------------------------------------------
🌀 Step 4
Observation (simplified): [ 0.06461409  0.9792903  -0.01232439 -1.4111613 ]...
Action taken: 1
Reward received: 1.000
------------------------------------------------------

In [8]:

# --- Save frames as video ---
# `frames` is the list of RGB arrays collected during the rollout cell.
video_path = "vla_cartpole_run.mp4"
imageio.mimsave(video_path, frames, fps=30)
print(f"🎬 Saved video to: {video_path}")

# --- Display inline in Colab ---
# Read the file through a context manager so the handle is closed promptly,
# then embed the bytes as a base64 data URL so the player needs no file server.
with open(video_path, "rb") as video_file:
    mp4 = video_file.read()
data_url = "data:video/mp4;base64," + b64encode(mp4).decode()
HTML(f"""
<video width=480 controls>
    <source src="{data_url}" type="video/mp4">
</video>
""")




🎬 Saved video to: vla_cartpole_run.mp4
