In [17]:
import sys
from pathlib import Path
from PIL import Image
import torch
from transformers import AutoModelForCausalLM, AutoProcessor, GenerationConfig

parent_dir = Path.cwd().parent
sys.path.append(str(parent_dir))

from molmo_utils import load_model, get_coordinates, refactor_coordinates_to_where_to_place
from utils import extract_all, visualize_points_on_image, plot_euclidean_bplot

In [2]:
model_name = 'allenai/Molmo-7B-D-0924'
model, processor = load_model(model_name)

Loading checkpoint shards:   0%|          | 0/7 [00:00<?, ?it/s]

In [3]:
prompt_start = "You are the robot in the picture. Your task is to: "
prompt_1 = " To complete this task, do two things. 1. Find the relevant object and point to it. 2. If your task somehow includes moving or putting the object somewhere in the image point to that location."
prompt_end = " Your output format should be like this: (x1, y1), (x2, y2), with (x1, y1) beeing the coordinates of the object to move and (x2, y2) the place to move it to. Dont output anything else."

In [43]:
def get_output(image_path, prompt, model, processor, temperature=0.2):
    with Image.open(image_path) as image:
        inputs = processor.process(
            images=image,
            text=prompt
        )
        inputs = {k: v.to(model.device).unsqueeze(0) for k, v in inputs.items()}
        with torch.autocast(device_type="cuda", enabled=True, dtype=torch.bfloat16):
                # Generate output
                output = model.generate_from_batch(
                    inputs,
                    GenerationConfig(max_new_tokens=200, stop_strings="<|endoftext|>", temperature=temperature, do_sample=True),
                    tokenizer=processor.tokenizer,
                    return_dict_in_generate=True,
                    output_logits=True
                )
        return output, inputs

def get_indeces_of_coords(generated_tokens, processor, inputs):
    index_of_coords = []
    for i, token in enumerate(reversed(generated_tokens)):
        char = processor.tokenizer.decode(token, skip_special_tokens=True)
        if ("(" in char) or (")" in char):
            index_of_coords.append(len(generated_tokens) - i - 1)
    return index_of_coords[::-1]

def get_logits(generated_logits, indeces):
    return generated_logits[indeces[0]:indeces[1]]
    

In [50]:
tasks, image_paths, ground_truths, _ = extract_all("../data/first_experiments")
softmax_fn = torch.nn.Softmax(dim=-1)

for i in range(len(tasks)):
    runs_with_two_coords = 0
    outputs = []
    image_path = image_paths[i]
    task = tasks[i]

    print(f"Task: {task}")
    print("")
    with Image.open(image_path) as image:
        image_width = image.width
        image_height = image.height
    for j in range(5):
        
        # Probability for two Coordinates alone
        prompt_start = "You are the robot in the picture. Your task is to: "
        prompt_1 = " To complete this task, do two things. 1. Find the relevant object and point to it. 2. If your task somehow includes moving or putting the object somewhere in the image point to that location."
        prompt_end = " Your output format should be like this: (x1, y1), (x2, y2), with (x1, y1) beeing the coordinates of the object to move and (x2, y2) the place to move it to. Dont output anything else."
        prompt = prompt_start + task + prompt_1 + prompt_end
        
        output, inputs = get_output(image_path, prompt, model, processor)
        logits = output.logits
        generated_logits = torch.cat(logits, dim=0)
        generated_tokens = output.sequences[0, inputs['input_ids'].size(1):]
        print(processor.tokenizer.decode(generated_tokens, skip_special_tokens=True))
        
        index_of_coords = get_indeces_of_coords(generated_tokens, processor, inputs)
        coord_1_logits = get_logits(generated_logits, index_of_coords[:2])
        coord_2_logits = get_logits(generated_logits, index_of_coords[2:4])
        
        logit_probs_1 = softmax_fn(coord_1_logits)
        highest_probs_1, _ = torch.max(logit_probs_1, dim=-1)
        output_probs_1 = torch.prod(highest_probs_1)

        logit_probs_2 = softmax_fn(coord_2_logits)
        highest_probs_2, _ = torch.max(logit_probs_2, dim=-1)
        output_probs_2 = torch.prod(highest_probs_2)


        # Probability for the first Coordinates alone
        prompt_start = "You are the robot in the picture. Your task is to: "
        prompt_1 = " To complete this task, do one thing. 1. Point to the location where the relevant object should be moved to."
        prompt_end = " Your output format should be like this: (x1, y1) with (x1, y1) beeing the coordinates of the object to move. Dont output anything else."
        prompt = prompt_start + task + prompt_1 + prompt_end
        
        output, inputs = get_output(image_path, prompt, model, processor)
        logits = output.logits
        generated_logits = torch.cat(logits, dim=0)
        generated_tokens = output.sequences[0, inputs['input_ids'].size(1):]
        print(processor.tokenizer.decode(generated_tokens, skip_special_tokens=True))
        
        index_of_coords = get_indeces_of_coords(generated_tokens, processor, inputs)
        coord_logits = get_logits(generated_logits, [index_of_coords[0], index_of_coords[-1]])

        logit_probs = softmax_fn(coord_logits)
        highest_probs, _ = torch.max(logit_probs, dim=-1)
        output_probs_object = torch.prod(highest_probs)


        # Probability for the second Coordinates alone
        prompt_start = "You are the robot in the picture. Your task is to: "
        prompt_1 = " To complete this task, do one thing. 1. Find the relevant object and point to it."
        prompt_end = " Your output format should be like this: (x1, y1) with (x1, y1) beeing the coordinates where the relevant object should be moved to. Dont output anything else."
        prompt = prompt_start + task + prompt_1 + prompt_end
        
        output, inputs = get_output(image_path, prompt, model, processor)
        logits = output.logits
        generated_logits = torch.cat(logits, dim=0)
        generated_tokens = output.sequences[0, inputs['input_ids'].size(1):]
        print(processor.tokenizer.decode(generated_tokens, skip_special_tokens=True))
        
        index_of_coords = get_indeces_of_coords(generated_tokens, processor, inputs)
        coord_logits = get_logits(generated_logits, [index_of_coords[0], index_of_coords[-1]])

        logit_probs = softmax_fn(coord_logits)
        highest_probs, _ = torch.max(logit_probs, dim=-1)
        output_probs_to_move = torch.prod(highest_probs)

        print("Prob if both together vs as single inferences:")
        print(f"Coordinate 1: {output_probs_1:.6f}, Coordinate 2: {output_probs_2:.6f}")
        print(f"Coordinate 1: {output_probs_object:.6f}, Coordinate 2: {output_probs_to_move:.6f}")

        print("")

Task: Move the can from the top left of the burner to the bottom left of the burner

 (27.3, 25.5), (27.3, 50.0)
 (24.0, 57.8)
 (25.9, 58.2)
Prob if both together vs as single inferences:
Coordinate 1: 0.000850, Coordinate 2: 0.005949
Coordinate 1: 0.000123, Coordinate 2: 0.000097

 (27.4, 29.0), (27.4, 29.0)
 (25.6, 58.4)
 (25.4, 58.5)
Prob if both together vs as single inferences:
Coordinate 1: 0.000916, Coordinate 2: 0.039595
Coordinate 1: 0.000102, Coordinate 2: 0.000081

 (27.8, 25.4), (27.8, 43.3)
 (25.3, 91.7)
 (25.9, 57.4)
Prob if both together vs as single inferences:
Coordinate 1: 0.000835, Coordinate 2: 0.001683
Coordinate 1: 0.000165, Coordinate 2: 0.000099

 (27.5, 27.3), (27.5, 52.0)
 (25.0, 50.0)
 (25.2, 57.3)
Prob if both together vs as single inferences:
Coordinate 1: 0.001104, Coordinate 2: 0.001935
Coordinate 1: 0.000262, Coordinate 2: 0.000102

 (27.5, 28.9), (28.2, 50.3)
 (25.2, 58.0)
 (25.8, 58.8)
Prob if both together vs as single inferences:
Coordinate 1: 0.0008

In [None]:
prompt_start = "You are the robot in the picture. Your task is to: "
prompt_1 = " To complete this task, do one thing. 1. Find the relevant object and point to it.
prompt_end = " Your output format should be like this: (x1, y1) with (x1, y1) beeing the coordinates of the object to move. Dont output anything else."

In [None]:
for i in range(len(tasks)):
    runs_with_two_coords = 0
    outputs = []
    image_path = image_paths[i]
    task = tasks[i]
    prompt = prompt_start + task + prompt_1 + prompt_end

    print(f"Task: {task}")
    with Image.open(image_path) as image:
        image_width = image.width
        image_height = image.height
    for j in range(10):
        output, inputs = get_output(image_path, prompt, model, processor)
        logits = output.logits
        generated_logits = torch.cat(logits, dim=0)
        generated_tokens = output.sequences[0, inputs['input_ids'].size(1):]
        
        index_of_coords = get_indeces_of_coords(generated_tokens, processor, inputs)
        coord_logits = get_logits(generated_logits, [index_of_coords[0], index_of_coords[-1]])
        
        softmax_fn = torch.nn.Softmax(dim=-1)

        logit_probs = softmax_fn(coord_logits)
        highest_probs, _ = torch.max(logit_probs, dim=-1)
        output_probs = torch.prod(highest_probs)

        print(f"Coordinate 1: {output_probs_1}, Coordinate 2: {output_probs_2}")