In [2]:
# Cell 1: Install Dependencies
# Run this once at the start
!pip install -q torch transformers datasets accelerate pillow sentencepiece

# If using 4-bit quantization (recommended for free Colab/T4 GPU)
!pip install -q bitsandbytes

In [3]:
# Cell 2: Imports & Configuration
import os
import re
import json
import torch
import numpy as np
from PIL import Image, ImageDraw
from typing import List, Dict, Tuple, Any, Optional
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM
from tqdm.notebook import tqdm  # Use notebook version of tqdm

# Configuration
# On Colab, we use the remote HuggingFace hub path directly
MODEL_PATH = "THUDM/cogagent-9b-20241220" 
DATASET_NAME = "google/android_in_the_wild"
SUBSET = "general" 
SPLIT = "test"
MAX_SAMPLES = 10  # Start small to test the pipeline!

# Constants
ACTION_TYPE_MAP = {
    0: "touch", 1: "lift", 2: "type", 3: "scroll",
    4: "press_back", 5: "press_home", 6: "press_enter"
}
COGA_RES = 1000.0 
DIST_THRESHOLD = 0.20 

print("Environment Configured.")

Environment Configured.


In [4]:
# Cell 3: Helper Functions & Visualization

def calculate_centroid(box: List[int]) -> Tuple[float, float]:
    if not box or len(box) != 4:
        return (0.0, 0.0)
    cx = (box[0] + box[2]) / 2.0
    cy = (box[1] + box[3]) / 2.0
    return (cx / COGA_RES, cy / COGA_RES)

def euclidean_distance(p1: Tuple[float, float], p2: Tuple[float, float]) -> float:
    return np.sqrt((p1[0] - p2[0])**2 + (p1[1] - p2[1])**2)

def parse_cogagent_output(text: str) -> Dict[str, Any]:
    result = {"action": "unknown", "point": None, "text": None, "raw_box": None}
    text = text.strip()
    
    # Extract Box
    box_match = re.search(r"\[\[(\d+),(\d+),(\d+),(\d+)\]\]", text)
    if box_match:
        coords = [int(c) for c in box_match.groups()]
        result["raw_box"] = coords
        result["point"] = calculate_centroid(coords)
    
    # Determine Action
    lower_text = text.lower()
    if "click" in lower_text or "tap" in lower_text:
        result["action"] = "touch"
    elif "type" in lower_text:
        result["action"] = "type"
        text_match = re.search(r"(?:type|text)\s*[:=]?\s*['\"]([^'\"]+)['\"]", lower_text)
        if text_match: result["text"] = text_match.group(1)
    elif "scroll" in lower_text: result["action"] = "scroll"
    elif "back" in lower_text: result["action"] = "press_back"
    elif "home" in lower_text: result["action"] = "press_home"
    elif "enter" in lower_text: result["action"] = "press_enter"
        
    return result

def build_prompt(goal: str, history: List[str]) -> str:
    history_str = "History steps:\n" + ("None\n" if not history else "".join([f"{i}. {h}\n" for i, h in enumerate(history[-3:])]))
    return f"Task: {goal}\n\n{history_str}\n(Platform: Android)\n\n(Answer in Action-Operation-Sensitive format with Grounded Operation.)"

def visualize_prediction(image, goal, pred, gt):
    """Draws GT (Green) and Pred (Red) on the image for debugging"""
    img_viz = image.copy()
    draw = ImageDraw.Draw(img_viz)
    w, h = img_viz.size
    
    # Draw GT Point (Green Circle)
    if gt['touch_point'] != (0.0, 0.0):
        gt_x, gt_y = gt['touch_point'][0] * w, gt['touch_point'][1] * h
        draw.ellipse((gt_x-10, gt_y-10, gt_x+10, gt_y+10), outline="green", width=3)
        
    # Draw Pred Box (Red Rectangle)
    if pred['raw_box']:
        # CogAgent box is 0-1000, need to scale to image size
        p_box = [c / 1000.0 for c in pred['raw_box']]
        draw.rectangle(
            [p_box[0]*w, p_box[1]*h, p_box[2]*w, p_box[3]*h], 
            outline="red", width=3
        )
        
    display(img_viz) # Jupyter/Colab function to show image
    print(f"Goal: {goal}")
    print(f"GT: {gt['action_type']} | Pred: {pred['action']}")

In [5]:
# Cell 3: Helper Functions & Visualization

def calculate_centroid(box: List[int]) -> Tuple[float, float]:
    if not box or len(box) != 4:
        return (0.0, 0.0)
    cx = (box[0] + box[2]) / 2.0
    cy = (box[1] + box[3]) / 2.0
    return (cx / COGA_RES, cy / COGA_RES)

def euclidean_distance(p1: Tuple[float, float], p2: Tuple[float, float]) -> float:
    return np.sqrt((p1[0] - p2[0])**2 + (p1[1] - p2[1])**2)

def parse_cogagent_output(text: str) -> Dict[str, Any]:
    result = {"action": "unknown", "point": None, "text": None, "raw_box": None}
    text = text.strip()
    
    # Extract Box
    box_match = re.search(r"\[\[(\d+),(\d+),(\d+),(\d+)\]\]", text)
    if box_match:
        coords = [int(c) for c in box_match.groups()]
        result["raw_box"] = coords
        result["point"] = calculate_centroid(coords)
    
    # Determine Action
    lower_text = text.lower()
    if "click" in lower_text or "tap" in lower_text:
        result["action"] = "touch"
    elif "type" in lower_text:
        result["action"] = "type"
        text_match = re.search(r"(?:type|text)\s*[:=]?\s*['\"]([^'\"]+)['\"]", lower_text)
        if text_match: result["text"] = text_match.group(1)
    elif "scroll" in lower_text: result["action"] = "scroll"
    elif "back" in lower_text: result["action"] = "press_back"
    elif "home" in lower_text: result["action"] = "press_home"
    elif "enter" in lower_text: result["action"] = "press_enter"
        
    return result

def build_prompt(goal: str, history: List[str]) -> str:
    history_str = "History steps:\n" + ("None\n" if not history else "".join([f"{i}. {h}\n" for i, h in enumerate(history[-3:])]))
    return f"Task: {goal}\n\n{history_str}\n(Platform: Android)\n\n(Answer in Action-Operation-Sensitive format with Grounded Operation.)"

def visualize_prediction(image, goal, pred, gt):
    """Draws GT (Green) and Pred (Red) on the image for debugging"""
    img_viz = image.copy()
    draw = ImageDraw.Draw(img_viz)
    w, h = img_viz.size
    
    # Draw GT Point (Green Circle)
    if gt['touch_point'] != (0.0, 0.0):
        gt_x, gt_y = gt['touch_point'][0] * w, gt['touch_point'][1] * h
        draw.ellipse((gt_x-10, gt_y-10, gt_x+10, gt_y+10), outline="green", width=3)
        
    # Draw Pred Box (Red Rectangle)
    if pred['raw_box']:
        # CogAgent box is 0-1000, need to scale to image size
        p_box = [c / 1000.0 for c in pred['raw_box']]
        draw.rectangle(
            [p_box[0]*w, p_box[1]*h, p_box[2]*w, p_box[3]*h], 
            outline="red", width=3
        )
        
    display(img_viz) # Jupyter/Colab function to show image
    print(f"Goal: {goal}")
    print(f"GT: {gt['action_type']} | Pred: {pred['action']}")

In [6]:
# Cell 4: Load Model
print(f"ü§ñ Loading CogAgent from {MODEL_PATH}")

tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, trust_remote_code=True)

# OPTION 1: High-RAM GPU (A100/L4)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_PATH,
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    device_map="auto"
).eval()

# OPTION 2: Low-RAM GPU (T4 - Free Colab) - Uncomment below if needed
# model = AutoModelForCausalLM.from_pretrained(
#     MODEL_PATH,
#     trust_remote_code=True,
#     load_in_4bit=True, 
#     device_map="auto"
# ).eval()

print("Model Loaded!")

ü§ñ Loading CogAgent from THUDM/cogagent-9b-20241220


Error while fetching `HF_TOKEN` secret value from your vault: 'Requesting secret HF_TOKEN timed out. Secrets can only be fetched when running from the Colab UI.'.
You are not authenticated with the Hugging Face Hub in this notebook.
If the error persists, please let us know by opening an issue on GitHub (https://github.com/huggingface/huggingface_hub/issues/new).


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenization_chatglm.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/THUDM/cogagent-9b-20241220:
- tokenization_chatglm.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


tokenizer.model:   0%|          | 0.00/2.62M [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

configuration_chatglm.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/THUDM/cogagent-9b-20241220:
- configuration_chatglm.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
`torch_dtype` is deprecated! Use `dtype` instead!


modeling_chatglm.py: 0.00B [00:00, ?B/s]

visual.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/THUDM/cogagent-9b-20241220:
- visual.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/THUDM/cogagent-9b-20241220:
- modeling_chatglm.py
- visual.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 6 files:   0%|          | 0/6 [00:00<?, ?it/s]

model-00002-of-00006.safetensors:   0%|          | 0.00/4.90G [00:00<?, ?B/s]

model-00006-of-00006.safetensors:   0%|          | 0.00/3.12G [00:00<?, ?B/s]

model-00001-of-00006.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00004-of-00006.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00003-of-00006.safetensors:   0%|          | 0.00/4.90G [00:00<?, ?B/s]

model-00005-of-00006.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/6 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/211 [00:00<?, ?B/s]

Model Loaded!


In [7]:
# Cell 5: Debug Mode Evaluation Loop (Deep Logging)
import os, io, re, json, threading, torch, types, time, gc
import numpy as np
from typing import Any, List, Dict, Tuple
from PIL import Image
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModel, TextIteratorStreamer
from tqdm.notebook import tqdm

# ================= üîß Ë∞ÉËØïÂºÄÂÖ≥ =================
DEBUG_MODE = True  # ÂºÄÂêØËØ¶ÁªÜÊó•Âøó

# ================= Configuration =================
MODEL_PATH = "THUDM/cogagent-9b-20241220"
DATASET_NAME = "cjfcsjt/AITW_Single"
SUBSET = "unseen_subject"
TARGET_SPLIT = "test"
LOG_FILE = "/content/drive/MyDrive/cogagent_aitw_eval_log_debug.jsonl" 

# ================= üöë COMPATIBILITY PATCH üöë =================
def _fix_compatibility(model):
    print("üîß Applying compatibility patches...")
    def _manual_extract_past(self, outputs, standardized_output_keys=None):
        if hasattr(outputs, "past_key_values"): return "past_key_values", outputs.past_key_values
        elif isinstance(outputs, (tuple, list)) and len(outputs) > 1: return "past_key_values", outputs[1]
        return "past_key_values", None
    
    if not hasattr(model, "_extract_past_from_model_output"):
        model._extract_past_from_model_output = types.MethodType(_manual_extract_past, model)
    
    try:
        if not hasattr(model.config, 'num_hidden_layers'):
            model.config.num_hidden_layers = model.config.num_layers
    except AttributeError: pass
    
    model.config.use_cache = False
    print("‚úÖ Patches applied.")

# ================= Helper Functions =================
COGA_RES = 1000.0
DIST_THRESHOLD = 0.20 
ACTION_TYPE_MAP = {0: "touch", 1: "lift", 2: "type", 3: "scroll", 4: "press_back", 5: "press_home", 6: "press_enter"}

def calculate_centroid(box: List[int]) -> Tuple[float, float]:
    if not box or len(box) != 4: return (0.0, 0.0)
    cx = (box[0] + box[2]) / 2.0
    cy = (box[1] + box[3]) / 2.0
    return (cx / COGA_RES, cy / COGA_RES)

def euclidean_distance(p1: Tuple[float, float], p2: Tuple[float, float]) -> float:
    return np.sqrt((p1[0] - p2[0])**2 + (p1[1] - p2[1])**2)

def build_prompt(goal: str) -> str:
    return f"Task: {goal}\n(Platform: Android)\nAnswer in Action-Operation-Sensitive format with Grounded Operation."

def parse_output(text: str) -> Dict[str, Any]:
    result = {"action": "unknown", "point": None, "raw_box": None}
    text = text.strip().lower()
    box_match = re.search(r"\[\[(\d+),(\d+),(\d+),(\d+)\]\]", text)
    if box_match:
        coords = [int(c) for c in box_match.groups()]
        result["raw_box"] = coords
        result["point"] = calculate_centroid(coords)
    if "click" in text or "tap" in text: result["action"] = "touch"
    elif "type" in text: result["action"] = "type"
    elif "scroll" in text: result["action"] = "scroll"
    elif "back" in text: result["action"] = "press_back"
    elif "home" in text: result["action"] = "press_home"
    elif "enter" in text: result["action"] = "press_enter"
    return result

# ================= Inference Engine (With Debug Prints) =================
@torch.inference_mode()
def cog_generate(model, tokenizer, img, query):
    if DEBUG_MODE:
        print(f"    [DEBUG] Input Query: {query}")
        print(f"    [DEBUG] Image Size: {img.size}")
    
    # ÊûÑÈÄ†ËæìÂÖ•
    start_time = time.time()
    inputs = tokenizer.apply_chat_template(
        [{"role": "user", "image": img, "content": query}],
        add_generation_prompt=True, tokenize=True, return_tensors="pt", return_dict=True
    ).to(model.device)
    
    if 'images' in inputs: 
        inputs['images'] = inputs['images'].to(torch.bfloat16)
        if DEBUG_MODE: print(f"    [DEBUG] Image Tensor Shape: {inputs['images'].shape}")

    inputs.pop("use_cache", None)
    inputs.pop("token_type_ids", None)

    if DEBUG_MODE: print(f"    [DEBUG] Pre-processing took: {time.time() - start_time:.4f}s")

    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    
    def run(): 
        if DEBUG_MODE: print("    [DEBUG] Thread started, entering model.generate...")
        try:
            model.generate(**inputs, max_new_tokens=128, do_sample=False, streamer=streamer, use_cache=False)
        except Exception as e:
            print(f"    [DEBUG] ‚ùå Generation Thread Error: {e}")

    thread = threading.Thread(target=run)
    thread.start()
    
    generated_text = ""
    if DEBUG_MODE: print("    [DEBUG] Streaming tokens: ", end="")
    
    for t in streamer:
        if DEBUG_MODE: print(f"{t}", end="", flush=True)
        generated_text += t
        
    if DEBUG_MODE: print("\n    [DEBUG] Generation finished.")
    thread.join()
    
    del inputs
    return generated_text.strip()

# ================= MAIN EXECUTION =================
def main():
    # 1. ÊòæÂ≠òÊ£ÄÊü•
    print("üîç Checking GPU Status before loading...")
    print(torch.cuda.get_device_name(0))
    print(f"Memory Allocated: {torch.cuda.memory_allocated(0) / 1024**3:.2f} GB")
    print(f"Memory Reserved:  {torch.cuda.memory_reserved(0) / 1024**3:.2f} GB")
    
    # 2. Âä†ËΩΩÊ®°Âûã (Â∞ùËØïÂº∫Âà∂Âä†ËΩΩÂà∞ CUDA)
    print(f"ü§ñ Loading CogAgent from {MODEL_PATH}")
    tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, trust_remote_code=True)
    
    try:
        # ‚ö†Ô∏è Âº∫Âà∂‰ΩøÁî® cuda:0ÔºåÂ¶ÇÊûúÊòæÂ≠ò‰∏çÂ§üÁõ¥Êé•ÁÇ∏Âá∫Êù•ÔºåËÄå‰∏çÊòØÂÅ∑ÂÅ∑offload
        print("‚ö° Attempting to load model fully onto GPU (device_map='cuda')...")
        model = AutoModel.from_pretrained(
            MODEL_PATH, 
            trust_remote_code=True, 
            torch_dtype=torch.bfloat16, 
            device_map="cuda" 
        ).eval()
    except Exception as e:
        print(f"‚ùå Load failed with device_map='cuda': {e}")
        print("‚ö†Ô∏è Falling back to device_map='auto' (May be slow)...")
        model = AutoModel.from_pretrained(
            MODEL_PATH, 
            trust_remote_code=True, 
            torch_dtype=torch.bfloat16, 
            device_map="auto" 
        ).eval()

    _fix_compatibility(model)

    # 3. Êï∞ÊçÆÈõÜ
    print(f"üìÇ Loading Dataset {DATASET_NAME}...")
    try: ds = load_dataset(DATASET_NAME, SUBSET, split=TARGET_SPLIT, streaming=True)
    except: ds = load_dataset(DATASET_NAME, SUBSET, split="train", streaming=True)

    print("üöÄ Starting Debug Evaluation...")
    
    for i, sample in tqdm(enumerate(ds)):
        if i >= 5: # Debug Ê®°ÂºèÂè™Ë∑ë 5 Êù°
            print("üõë Debug limit reached (5 samples).")
            break

        print(f"\n===== SAMPLE {i} START =====")
        
        # --- Data Prep ---
        raw_img = sample.get("image_encoded")
        if raw_img is None: continue
        try: 
            img = raw_img if isinstance(raw_img, Image.Image) else Image.open(raw_img).convert("RGB")
            # Áº©Â∞è‰∏ÄÁÇπ‰ª•Âä†Âø´ debug ÈÄüÂ∫¶
            if max(img.size) > 1000: img.thumbnail((1000, 1000))
        except: continue
        
        goal = sample.get("goal_info", "")
        print(f"[DEBUG] Goal: {goal}")

        # --- Inference ---
        start_t = time.time()
        try: 
            response = cog_generate(model, tokenizer, img, build_prompt(goal))
        except Exception as e:
            print(f"‚ùå Error: {e}")
            continue
        end_t = time.time()
        
        print(f"[DEBUG] Full Inference Time: {end_t - start_t:.2f}s")
        print(f"[DEBUG] Final Response: {response}")

if __name__ == "__main__":
    main()

üîç Checking GPU Status before loading...
NVIDIA A100-SXM4-40GB
Memory Allocated: 25.90 GB
Memory Reserved:  25.91 GB
ü§ñ Loading CogAgent from THUDM/cogagent-9b-20241220
‚ö° Attempting to load model fully onto GPU (device_map='cuda')...


Loading checkpoint shards:   0%|          | 0/6 [00:00<?, ?it/s]

‚ùå Load failed with device_map='cuda': CUDA out of memory. Tried to allocate 214.00 MiB. GPU 0 has a total capacity of 39.56 GiB of which 198.88 MiB is free. Process 29089 has 39.35 GiB memory in use. Of the allocated memory 38.94 GiB is allocated by PyTorch, and 5.03 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
‚ö†Ô∏è Falling back to device_map='auto' (May be slow)...




Loading checkpoint shards:   0%|          | 0/6 [00:00<?, ?it/s]

üîß Applying compatibility patches...
‚úÖ Patches applied.
üìÇ Loading Dataset cjfcsjt/AITW_Single...


README.md: 0.00B [00:00, ?B/s]

Resolving data files:   0%|          | 0/21 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/21 [00:00<?, ?it/s]

üöÄ Starting Debug Evaluation...


0it [00:00, ?it/s]

The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.



===== SAMPLE 0 START =====
[DEBUG] Goal: Go to amazon search bar
    [DEBUG] Input Query: Task: Go to amazon search bar
(Platform: Android)
Answer in Action-Operation-Sensitive format with Grounded Operation.
    [DEBUG] Image Size: (474, 1000)
    [DEBUG] Image Tensor Shape: torch.Size([1, 3, 1120, 1120])
    [DEBUG] Pre-processing took: 0.0576s
    [DEBUG] Thread started, entering model.generate...
    [DEBUG] Streaming tokens: 

KeyboardInterrupt: 

In [None]:
!nvidia-smi
