In [1]:
import numpy as np
import torch
import math
import json
import random
import matplotlib.pyplot as plt
import torch.nn.functional as F
import torch.nn as nn
from sentence_transformers import SentenceTransformer
from torch.utils.data import DataLoader
from tqdm import tqdm
import cv2

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def set_seed(seed=50):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)

set_seed()

In [3]:
path = "/Users/ibrahimbaldediallo/Documents/Code/Jarvis_project/vocab/instruction.txt"
model_path = "/Users/ibrahimbaldediallo/Documents/Code/Jarvis_project/Models/actor4.pth"
vision_path = "/Users/ibrahimbaldediallo/Documents/Code/Jarvis_project/Models/mousse_net.pth"
id_to_action_path = "/Users/ibrahimbaldediallo/Documents/Code/Jarvis_project/Models/id_to_action.json"
action_to_id_path = "/Users/ibrahimbaldediallo/Documents/Code/Jarvis_project/Models/action_to_id.json"

In [4]:
with open(id_to_action_path) as f:
    id_to_action_raw = json.load(f)

id_to_action = {int(k): v for k, v in id_to_action_raw.items()}

with open(action_to_id_path) as f:
    action_to_id_raw = json.load(f)

action_to_id = {k: int(v) for k, v in action_to_id_raw.items()}

In [5]:
vocab_size = len(action_to_id)
print(vocab_size)

1949


In [6]:
encoder = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

In [7]:
device = torch.device('mps' if torch.mps.is_available() else 'cpu')

In [8]:
IMG_SIZE = 448 # 448 est peut etre la meilleur taille pour l'instant mais 896 fait le taffe √©galement

In [9]:
# Load image in grayscale
path = "/Users/ibrahimbaldediallo/Documents/Code/Jarvis_project/Project/image.png"
# 1. Charger l'image en niveaux de gris (1 canal)
img = cv2.imread(path, cv2.IMREAD_GRAYSCALE)

# 2. Pr√©traitement
blur = cv2.GaussianBlur(img, (5, 5), 1.4)
edges = cv2.Canny(blur, threshold1=100, threshold2=200)  # Utiliser blur pour de meilleurs r√©sultats
edges = cv2.resize(edges, (IMG_SIZE, IMG_SIZE))  # Taille attendue par MouseNet (64x64)

# 3. Ajouter les dimensions manquantes
# - Convertir en float32 et normaliser [0, 255] -> [0, 1]
edges = edges.astype(np.float32) / 255.0

# - Ajouter les dimensions: [Hauteur, Largeur] -> [Canaux, Hauteur, Largeur]
edges = np.expand_dims(edges, axis=0)  # Maintenant shape (1, 64, 64)
 
#cv2.waitKey(0)
#cv2.destroyAllWindows()
img_arr = torch.tensor(edges, dtype=torch.float32).unsqueeze(0).to(device)

In [10]:
class ResidualFFN(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim, num_blocks=2):
        super(ResidualFFN, self).__init__()
        
        # Projection initiale
        self.input_proj = nn.Linear(input_dim, hidden_dim)
        
        # Blocs r√©siduels
        self.res_blocks = nn.ModuleList([
            ResidualBlock(hidden_dim) for _ in range(num_blocks)
        ])
        
        # Projection finale
        self.output_proj = nn.Linear(hidden_dim, output_dim)
        
    def forward(self, x):
        x = self.input_proj(x)
        
        # Appliquer les blocs r√©siduels
        for block in self.res_blocks:
            x = block(x)
            
        return self.output_proj(x)
        
class ResidualBlock(nn.Module):
    def __init__(self, dim, dropout=0.3):
        super(ResidualBlock, self).__init__()
        self.layers = nn.Sequential(
            nn.Linear(dim, dim * 4),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(dim * 4, dim)
        )
        self.norm = nn.LayerNorm(dim)
        
    def forward(self, x):
        return self.norm(x + self.layers(x))
    

class PositionalEncoding(nn.Module):
    def __init__(self, dim, max_len=512):
        super().__init__()
        pe = torch.zeros(max_len, dim)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, dim, 2).float() * (-math.log(10000.0) / dim))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(1)  # (max_len, 1, dim)
        self.register_buffer('pe', pe)

    def forward(self, x):
        # x shape: (seq_len, batch_size, dim)
        x = x + self.pe[:x.size(0)]
        return x

class Actor(nn.Module):
    def __init__(self, encoder, dim, hidden, vocab_size, max_len=128):
        super().__init__()
        self.encoder = encoder  # pretrained SentenceTransformer
        self.rffn = ResidualFFN(384, hidden, dim)
        self.embedding = nn.Embedding(vocab_size, dim)
        self.pos_encoding = PositionalEncoding(dim, max_len=max_len)
        self.decoder_layer = nn.TransformerDecoderLayer(d_model=dim, nhead=16, dim_feedforward=hidden, dropout=0.3)
        self.transformer_decoder = nn.TransformerDecoder(self.decoder_layer, num_layers=2)  # R√©duction de 6 √† 2 couches
        self.final_projection = nn.Linear(dim, vocab_size)
        self.max_len = max_len
        self.dim = dim
        self.vocab_size = vocab_size
        self.load_state_dict(torch.load(model_path))

    def forward(self, x_texts, tgt):
        """
        x_texts: list of strings, len = batch_size
        tgt: tensor of shape (batch_size, seq_len)
        """
        batch_size = len(x_texts)
        
        # Encode input texts
        with torch.no_grad():
            x = self.encoder.encode(x_texts, convert_to_tensor=True)  # shape: (batch_size, 384)
        x = self.rffn(x)  # shape: (batch_size, dim)
       

        # Prepare target sequence
        tgt = tgt.to(device)
        tgt = self.embedding(tgt)  # (batch_size, seq_len, dim)
        tgt = tgt.permute(1, 0, 2)  # (seq_len, batch_size, dim)
        tgt = self.pos_encoding(tgt)  # add positional encoding

        # Create mask for autoregressive decoding
        seq_len = tgt.size(0)
        tgt_mask = nn.Transformer.generate_square_subsequent_mask(seq_len).to(tgt.device)
        x = x.unsqueeze(0).repeat(seq_len, 1, 1)  # (seq_len, batch_size, dim)
        # Decode
        z = self.transformer_decoder(tgt, x, tgt_mask=tgt_mask)  # (seq_len, batch_size, dim)
        z = self.final_projection(z)  # (seq_len, batch_size, vocab_size)
        z = z.permute(1, 0, 2)  # (batch_size, seq_len, vocab_size)

        return z
    
    def forward_training(self, x, tgt):
        """
        x: encoder output (batch_size, dim)
        tgt: tensor of shape (batch_size, seq_len)
        """

        # Projette x dans le bon espace si n√©cessaire
        x = self.rffn(x)  # (batch_size, dim)

        # Embedding + Positional encoding
        tgt = self.embedding(tgt)  # (batch_size, seq_len, dim)
        tgt = tgt.permute(1, 0, 2)  # (seq_len, batch_size, dim)
        tgt = self.pos_encoding(tgt)

        # Memory (encoder output) doit √™tre (seq_len_enc, batch_size, dim)
        # Ici on suppose x est global, donc on le r√©p√®te
        x = x.unsqueeze(0)  # (1, batch_size, dim)

        # Masque auto-r√©gressif pour le d√©codeur
        seq_len = tgt.size(0)
        tgt_mask = nn.Transformer.generate_square_subsequent_mask(seq_len).to(tgt.device)

        # Transformer decoder
        z = self.transformer_decoder(tgt, x, tgt_mask=tgt_mask)  # (seq_len, batch_size, dim)
        z = self.final_projection(z)  # (seq_len, batch_size, vocab_size)
        z = z.permute(1, 0, 2)  # (batch_size, seq_len, vocab_size)

        return z

    
    @torch.no_grad()
    def generate(self, x_text:list[str], max_len=32, start_token_id=1, end_token_id=2):
        """
        x_text : liste de string
        Retourne une liste de listes contenant les ID g√©n√©r√©s
        """
        # Encode input texts
        with torch.no_grad():
            x = self.encoder.encode(x_text, convert_to_tensor=True)
        # Encoder: passe par rffn si n√©cessaire
        x = self.rffn(x)  # (batch_size, dim)
        memory = x.unsqueeze(0)  # (1, batch_size, dim)

        batch_size = x.size(0)
        device = x.device

        # Initialiser avec <BOS>
        generated = torch.full((batch_size, 1), start_token_id, dtype=torch.long, device=device)

        for _ in range(max_len):
            # Embed + position
            tgt_embed = self.embedding(generated)  # (batch_size, seq_len, dim)
            tgt_embed = tgt_embed.permute(1, 0, 2)  # (seq_len, batch_size, dim)
            tgt_embed = self.pos_encoding(tgt_embed)

            # Masque causal
            seq_len = generated.size(1)
            tgt_mask = nn.Transformer.generate_square_subsequent_mask(seq_len).to(device)

            # Decode
            output = self.transformer_decoder(tgt_embed, memory, tgt_mask=tgt_mask)
            logits = self.final_projection(output)  # (seq_len, batch_size, vocab_size)
            next_token_logits = logits[-1, :, :]  # dernier pas de temps ‚Üí (batch_size, vocab_size)

            # Greedy : choisir l'indice du max
            next_token = torch.argmax(next_token_logits, dim=-1, keepdim=True)  # (batch_size, 1)

            # Ajouter √† la s√©quence
            generated = torch.cat([generated, next_token], dim=1)

            # Option d'arr√™t : si tous les batchs ont g√©n√©r√© <EOS>
            if (next_token == end_token_id).all():
                break

        return generated  # (batch_size, seq_len_generated)

In [11]:
class MouseNet(nn.Module):
    def __init__(self, in_channels, num_classes=2, input_size=IMG_SIZE):
        super(MouseNet, self).__init__()
        self.input_size = input_size
        
        # Couches convolutionnelles
        self.conv1 = nn.Conv2d(in_channels=in_channels, out_channels=8, kernel_size=3, padding=1)
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2)
        self.conv2 = nn.Conv2d(in_channels=8, out_channels=16, kernel_size=3, padding=1)
        self.conv3 = nn.Conv2d(in_channels=16, out_channels=32, kernel_size=3, padding=1)
        
        # Calcul de la taille apr√®s convolutions
        # input_size -> /2 -> /2 = input_size // 4
        self.feature_size = self._get_conv_output_size(in_channels, input_size)
        
        # Couches fully connected
        self.fc1 = nn.Linear(self.feature_size, 64)
        self.fc2 = nn.Linear(64, 128)
        self.out = nn.Linear(128, num_classes)
        self.load_state_dict(torch.load(vision_path))
        
    def _get_conv_output_size(self, in_channels, input_size):
        """Calcule la taille de sortie des couches convolutionnelles"""
        # Simulation d'un passage dans les conv layers
        x = torch.randn(1, in_channels, input_size, input_size)
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = F.relu(self.conv3(x))
        return x.view(1, -1).size(1)

    def forward(self, x, features_only=False):
        # x shape attendue: (batch_size, in_channels, height, width)
        batch_size = x.size(0)
        
        # Couches convolutionnelles
        x = F.relu(self.conv1(x))  # (batch, 8, H, W)
        x = self.pool(x)           # (batch, 8, H/2, W/2)
        x = F.relu(self.conv2(x))  # (batch, 16, H/2, W/2)
        x = self.pool(x)           # (batch, 16, H/4, W/4)
        x = F.relu(self.conv3(x))  # (batch, 32, H/4, W/4)
        
        # Aplatissement pour les couches fully connected
        x = x.view(batch_size, -1)  # (batch, 32 * H/4 * W/4)
        
        # Couches fully connected
        x = F.relu(self.fc1(x))    # (batch, 64)
        x = F.relu(self.fc2(x))    # (batch, 128)
        if features_only==True:
            return x
        else:
            x = self.out(x)            # (batch, num_classes)
            return torch.sigmoid(x).to(torch.float32)

In [None]:
"""
Objectif:

(actions_seq)     ‚îÄ‚îÄ‚ñ∂ Embedding + GRU ‚îÄ‚îÄ‚îê
(instruction)     ‚îÄ‚îÄ‚ñ∂ SentenceEncoder ‚îÄ‚î§‚îÄ‚îÄ‚ñ∂ Fusion (attention) ‚îÄ‚ñ∂ Decoder
(screenshot)      ‚îÄ‚îÄ‚ñ∂ MouseNet        ‚îÄ‚îò
"""

In [12]:
class VisionActor(nn.Module):
    def __init__(self, in_channels, encoder, dim, hidden, vocab_size):
        super(VisionActor, self).__init__()
        self.in_channels = in_channels
        self.encoder = encoder
        self.dim = dim
        self.hidden = hidden
        self.vocab_size = vocab_size
        self.actor = Actor(self.encoder, self.dim, self.hidden, self.vocab_size)
        self.mousenet = MouseNet(self.in_channels)
        self.vision_rffn = ResidualFFN(self.mousenet.out.in_features, hidden, dim)
        self.rnn = nn.GRU(input_size=dim, hidden_size=dim, num_layers=1, batch_first=True)
        self.norm = nn.LayerNorm(dim)
        self.fusion = nn.MultiheadAttention(dim, num_heads=8)
        self.pointer_head = nn.Sequential(
            nn.Linear(dim, 128),
            nn.ReLU(),
            self.mousenet.out,  # x, y normalis√©s
            nn.Sigmoid()       # born√© entre [0, 1]
        )

    def forward(self, x, tgt, screenshots=None, action_history=None):
        """
        x: encoder output (batch_size, dim),

        tgt: tensor of shape (batch_size, seq_len),

        screenshot: image converti en niveau de gris,
        
        Retourne une liste de listes contenant les ID g√©n√©r√©s ainsi que des coordonn√©es (x, y)
        """
        if screenshots is not None:
            # encoder le text
            txt_encoded = self.actor.rffn(x) # shape: (batch_size, dim)

            # encoder l'image
            vision_features = self.mousenet.forward(screenshots, features_only=True) # shape: (batch_size, 128)
            vision_encoded = self.vision_rffn(vision_features) # shape: (batch_size, dim)
            pointer_out = self.pointer_head(vision_encoded.squeeze(0))  # (batch_size, 2)
            
            if action_history is not None:
                # encoder les actions pass√©
                embed_action = self.actor.embedding(action_history) # shape: (seq_len, dim)
                _, action_encoded = self.rnn(embed_action) # shape: (batch_size, dim)
                fusion_input = torch.stack([txt_encoded, vision_encoded, action_encoded], dim=0) # shape: (3, batch_size, dim)

                fused_output, _ = self.fusion(fusion_input, fusion_input, fusion_input)  # shape: (3, batch_size, dim)
                fused_rep = fused_output.mean(dim=0).unsqueeze(0) # shape: (1, batch_size, dim)
                
            else:
                fused_rep, _ = self.fusion(txt_encoded.unsqueeze(0), vision_encoded.unsqueeze(0), vision_encoded.unsqueeze(0)) # shape (1, batch_size, dim)

            norm_fused_rep = self.norm(fused_rep)

            batch_size = fused_rep.size(0)
            device = fused_rep.device

            # traiter target sequence
            tgt = tgt.to(device)
            tgt = self.actor.embedding(tgt)  # (batch_size, seq_len, dim)
            tgt = tgt.permute(1, 0, 2)  # (seq_len, batch_size, dim)
            tgt = self.actor.pos_encoding(tgt)  #  positional encoding

            # Masque auto-r√©gressif pour le d√©codeur
            seq_len = tgt.size(0)
            tgt_mask = nn.Transformer.generate_square_subsequent_mask(seq_len).to(tgt.device)

            # passage au decoder
            z = self.actor.transformer_decoder(tgt, norm_fused_rep, tgt_mask=tgt_mask)  # (seq_len, batch_size, dim)
            z = self.actor.final_projection(z)  # (seq_len, batch_size, vocab_size)
            z = z.permute(1, 0, 2)  # (batch_size, seq_len, vocab_size)
            
            return z, pointer_out
        else:
            return self.actor.forward_training(x, tgt), screenshots


    @torch.no_grad()
    def generate(self, x_text:list[str], screenshots=None, action_history=None, max_len=32, start_token_id=1, end_token_id=2):
        """
        x_text : liste de string
        Retourne une liste de listes contenant les ID g√©n√©r√©s
        """
        if screenshots is not None:
            # encoder le text
            with torch.no_grad():
                txt_features = self.encoder.encode(x_text, convert_to_tensor=True) # shape: (batch_size, 384)
            txt_encoded = self.actor.rffn(txt_features) # shape: (batch_size, dim)

            # encoder l'image
            vision_features = self.mousenet.forward(screenshots, features_only=True) # shape: (batch_size, 128)
            vision_encoded = self.vision_rffn(vision_features) # shape: (batch_size, dim)
            pointer_out = self.pointer_head(vision_encoded.squeeze(0))  # (batch_size, 2)
            
            if action_history is not None:
                # encoder les actions pass√©
                embed_action = self.actor.embedding(action_history) # shape: (seq_len, dim)
                _, action_encoded = self.rnn(embed_action) # shape: (batch_size, dim)
                fusion_input = torch.stack([txt_encoded, vision_encoded, action_encoded], dim=0) # shape: (3, batch_size, dim)

                fused_output, _ = self.fusion(fusion_input, fusion_input, fusion_input)  # shape: (3, batch_size, dim)
                fused_rep = fused_output.mean(dim=0).unsqueeze(0) # shape: (1, batch_size, dim)
                
            else:
                fused_rep, _ = self.fusion(txt_encoded.unsqueeze(0), vision_encoded.unsqueeze(0), vision_encoded.unsqueeze(0)) # shape (1, batch_size, dim)

            norm_fused_rep = self.norm(fused_rep)

            batch_size = fused_rep.size(0)
            device = fused_rep.device

            # Initialiser avec <BOS>
            generated = torch.full((batch_size, 1), start_token_id, dtype=torch.long, device=device)

            for _ in range(max_len):
                # Embed + position
                tgt_embed = self.actor.embedding(generated)  # (batch_size, seq_len, dim)
                tgt_embed = tgt_embed.permute(1, 0, 2)  # (seq_len, batch_size, dim)
                tgt_embed = self.actor.pos_encoding(tgt_embed) # shape: (1, 1, 512)

                # Masque causal
                seq_len = generated.size(1)
                tgt_mask = nn.Transformer.generate_square_subsequent_mask(seq_len).to(device)

                # Decode
                output = self.actor.transformer_decoder(tgt_embed, norm_fused_rep, tgt_mask=tgt_mask)
                logits = self.actor.final_projection(output)  # (seq_len, batch_size, vocab_size)
                next_token_logits = logits[-1, :, :]  # dernier pas de temps ‚Üí (batch_size, vocab_size)

                # Greedy : choisir l'indice du max
                next_token = torch.argmax(next_token_logits, dim=-1, keepdim=True)  # (batch_size, 1)

                # Ajouter √† la s√©quence
                generated = torch.cat([generated, next_token], dim=1)

                # Option d'arr√™t : si tous les batchs ont g√©n√©r√© <EOS>
                if (next_token == end_token_id).all():
                    break

            return generated, pointer_out  # ((batch_size, seq_len_generated), (batch_size, 2))
        else:
            return self.actor.generate(x_text), screenshots


In [13]:
dim, hidden, in_channels = 512, 512, 1
model = VisionActor(in_channels, encoder, dim, hidden, vocab_size).to(device)

In [14]:
def convert_to_id(seq):
    len_seq = len(seq)
    new_seq = []
    for i in range(len_seq):
        for n in seq[i]:
            a = action_to_id[n]
            new_seq.append(a)
    return torch.tensor(new_seq, dtype=torch.int32, device=device)

In [15]:
action_history = [["cmd+space", "safari", "enter"]]
id_history = convert_to_id(action_history)
print(id_history)

tensor([ 791,    8, 1603], device='mps:0', dtype=torch.int32)


In [16]:
z, coordinate = model.generate(["go on spotify"], img_arr, id_history)
print(z)
print(coordinate)

tensor([[   1,  791,   96, 1603,    2]], device='mps:0')
tensor([0.5190, 0.5338], device='mps:0')


In [17]:
def translate(seq):
    return [id_to_action[n] for n in seq.tolist() if n!=1 and n!=2 and n!=3]

translation = translate(z[0])
print(f"sequence: {translation}")

sequence: ['cmd+space', 'coda', 'enter']


In [41]:
def collate_fn(batch):
    """Fonction de collation pour le DataLoader"""
    max_seq_len = max(sample['target_sequence'].shape[0] for sample in batch)
    
    # Padding des s√©quences
    padded_sequences = []
    for sample in batch:
        seq = sample['target_sequence']
        padded = torch.nn.functional.pad(seq, (0, max_seq_len - seq.shape[0]), value=0)
        padded_sequences.append(padded)
    
    return {
        'text': [s['text'] for s in batch],
        'screenshots': torch.stack([s['screenshot'] for s in batch]),  # (B, 1, H, W)
        'target_sequences': torch.stack(padded_sequences),  # (B, max_seq_len)
        'target_coords': torch.stack([s['target_coords'] for s in batch]),  # (B, 2)
        'action_histories': [s['action_history'] for s in batch],
        'urls': [s['url'] for s in batch],
        'element_texts': [s['element_text'] for s in batch]
    }

In [50]:
tensor_path = "/Users/ibrahimbaldediallo/Documents/Code/Jarvis_project/web_scrapper/vision_actor_real_dataset.pt"

dataset = torch.load(tensor_path, weights_only=False)
print(f"\nüìä Dataset info:")
print(f"  - Type: {type(dataset)}")
print(f"  - Length: {len(dataset)}")
print(f"  - Vocab size: {dataset.vocab_size}")

print(dataset[10]["action_history"])


üìä Dataset info:
  - Type: <class '__main__.VisionActorRealDataset'>
  - Length: 11
  - Vocab size: 50
tensor([5, 6, 6, 6, 6, 6, 6, 6])


In [None]:
dataloader = DataLoader(
    dataset,
    batch_size=4,
    shuffle=True,
    collate_fn=collate_fn,
    num_workers=0
)

In [55]:
import random
import json

# Define comprehensive action pools
search_terms = [
    "python tutorials", "javascript", "machine learning", "web development",
    "data science", "react", "docker", "kubernetes", "git", "sql", "nodejs",
    "weather", "news", "stock prices", "recipe ideas", "travel deals",
    "how to fix bugs", "debugging tips", "api documentation", "rest api",
    "database design", "system design", "software architecture"
]

websites = [
    "https://stackoverflow.com",
    "https://github.com",
    "https://www.wikipedia.org",
    "https://www.google.com",
    "https://www.youtube.com",
    "https://www.reddit.com",
    "https://docs.python.org",
    "https://developer.mozilla.org",
    "https://www.medium.com",
    "https://www.w3schools.com",
    "https://www.amazon.com",
    "https://www.github.com",
    "https://www.twitter.com",
    "https://www.linkedin.com",
    None
]

terminal_commands = [
    "ls", "pwd", "cd ~", "mkdir test", "touch file.txt", "cat file.txt",
    "grep text file", "find . -name '*.py'", "git status", "git add .",
    "git commit -m 'fix'", "git push", "npm install", "npm start",
    "python script.py", "node app.js", "docker ps", "curl http://api.com",
    "wget file.zip", "tar -xzf file.tar.gz", "rm file.txt", "cp file1 file2",
    "mv file oldname", "chmod 755 script.sh", "sudo apt-get update",
    "pip install package", "python -m venv env", "source env/bin/activate",
    "git clone repo", "git log", "git diff", "make build"
]

text_operations = [
    "select all", "copy", "paste", "cut", "delete", "undo", "redo",
    "find and replace", "format", "bold", "italic", "underline"
]

gui_clicks = [
    "click(100,100)", "click(200,300)", "click(50,50)", "click(500,500)",
    "click(150,200)", "click(300,400)", "click(75,125)", "click(450,350)",
    "click(250,250)", "click(180,220)", "click(380,450)"
]

modifiers = [
    "cmd+space", "cmd+tab", "cmd+c", "cmd+v", "cmd+a", "cmd+z", "cmd+x",
    "cmd+s", "cmd+w", "ctrl+l", "alt+tab", "cmd+q", "cmd+n", "cmd+o"
]

apps = [
    "safari", "terminal", "vscode", "chrome", "finder", "mail", "notes",
    "calculator", "atom", "sublime", "intellij", "postman", "slack"
]

def generate_web_search_situation():
    """Generate web search situations"""
    search = random.choice(search_terms)
    website = random.choice(websites)
    
    # Previous actions (open search engine)
    prev_t2 = ['cmd+space', 'safari', 'enter'] if random.random() > 0.3 else ['cmd+space', 'chrome', 'enter']
    prev_t1 = random.choice([['google.com', 'enter'], ['duckduckgo.com', 'enter'], ['search bar click']])
    
    # Target action
    target = [search, 'enter']
    
    instruction = f"Search for {search}"
    return (instruction, [prev_t2, prev_t1], target, website)

def generate_terminal_situation():
    """Generate terminal command situations"""
    command = random.choice(terminal_commands)
    
    # Previous actions (open terminal)
    prev_t2 = ['cmd+space']
    prev_t1 = ['terminal', 'enter']
    
    # Target action
    target = [command, 'enter']
    
    instruction = f"Run: {command}"
    return (instruction, [prev_t2, prev_t1], target, None)

def generate_text_edit_situation():
    """Generate text editing situations"""
    operation = random.choice(text_operations)
    
    # Previous actions
    if operation == "find and replace":
        prev_t2 = ['cmd+space', 'vscode', 'enter']
        prev_t1 = ['cmd+h']
    elif operation in ["bold", "italic", "underline"]:
        prev_t2 = ['cmd+space', 'notes', 'enter']
        prev_t1 = ['cmd+a']
    else:
        prev_t2 = ['open file']
        prev_t1 = ['type some text']
    
    # Target action
    if operation == "copy":
        target = ['cmd+c']
    elif operation == "paste":
        target = ['cmd+v']
    elif operation == "cut":
        target = ['cmd+x']
    elif operation == "undo":
        target = ['cmd+z']
    elif operation == "redo":
        target = ['cmd+shift+z']
    elif operation == "select all":
        target = ['cmd+a']
    elif operation == "find and replace":
        target = ['search term', 'tab', 'replacement term']
    elif operation == "delete":
        target = ['delete']
    else:
        target = [operation]
    
    instruction = f"Text operation: {operation}"
    return (instruction, [prev_t2, prev_t1], target, None)

def generate_app_launch_situation():
    """Generate app launching situations"""
    app = random.choice(apps)
    
    # Previous actions
    prev_t2 = ['cmd+space']
    prev_t1 = random.choice([['spotlight open'], ['launchpad']])
    
    # Target action
    target = [app, 'enter']
    
    instruction = f"Open {app}"
    return (instruction, [prev_t2, prev_t1], target, None)

def generate_file_operation_situation():
    """Generate file operation situations"""
    operations = ["create file", "delete file", "rename file", "move file", "copy file"]
    op = random.choice(operations)
    
    prev_t2 = ['cmd+space', 'terminal', 'enter']
    prev_t1 = ['pwd']
    
    if op == "create file":
        target = ['touch newfile.txt', 'enter']
    elif op == "delete file":
        target = ['rm oldfile.txt', 'enter']
    elif op == "rename file":
        target = ['mv oldname newname', 'enter']
    elif op == "move file":
        target = ['mv file /path/to/dest', 'enter']
    else:
        target = ['cp source dest', 'enter']
    
    instruction = f"File operation: {op}"
    return (instruction, [prev_t2, prev_t1], target, None)

def generate_navigation_situation():
    """Generate navigation situations"""
    actions = [
        ("Go back", ['left_arrow'], "https://www.example.com"),
        ("Go forward", ['right_arrow'], None),
        ("Refresh page", ['cmd+r'], "https://www.example.com"),
        ("Open new tab", ['cmd+t'], None),
        ("Close tab", ['cmd+w'], None),
        ("Switch tab", ['cmd+tab'], None),
    ]
    
    action_name, target, site = random.choice(actions)
    
    prev_t2 = ['safari open']
    prev_t1 = ['navigate to site']
    
    instruction = action_name
    return (instruction, [prev_t2, prev_t1], target, site)

def generate_code_editing_situation():
    """Generate code editing situations"""
    languages = ["python", "javascript", "java", "cpp", "go"]
    lang = random.choice(languages)
    
    prev_t2 = ['cmd+space', 'vscode', 'enter']
    prev_t1 = ['cmd+o', 'select file']
    
    target = ['type code', 'cmd+s']
    
    instruction = f"Edit {lang} file"
    return (instruction, [prev_t2, prev_t1], target, None)

def generate_browser_action_situation():
    """Generate browser action situations"""
    actions = [
        ("Bookmark page", ['cmd+d']),
        ("Open downloads", ['cmd+shift+j']),
        ("View history", ['cmd+y']),
        ("Inspect element", ['cmd+option+i']),
        ("Print page", ['cmd+p']),
        ("Save page", ['cmd+s']),
        ("Zoom in", ['cmd+plus']),
        ("Zoom out", ['cmd+minus']),
    ]
    
    action_name, target = random.choice(actions)
    
    prev_t2 = ['safari open']
    prev_t1 = ['navigate to website']
    
    website = random.choice(websites)
    
    instruction = action_name
    return (instruction, [prev_t2, prev_t1], target, website)

def generate_git_situation():
    """Generate git workflow situations"""
    workflows = [
        ("Commit changes", ['git add .', 'enter'], ['git commit -m "message"', 'enter']),
        ("Push to remote", ['git status', 'enter'], ['git push', 'enter']),
        ("Create branch", ['git branch list', 'enter'], ['git checkout -b newbranch', 'enter']),
        ("View log", ['git log', 'enter'], ['q']),
        ("Stash changes", ['git status', 'enter'], ['git stash', 'enter']),
    ]
    
    workflow, prev_t1, target = random.choice(workflows)
    prev_t2 = ['cd project', 'enter']
    
    instruction = f"Git: {workflow}"
    return (instruction, [prev_t2, prev_t1], target, None)

def generate_npm_situation():
    """Generate npm/node situations"""
    operations = [
        ("Install dependencies", ['ls', 'enter'], ['npm install', 'enter']),
        ("Start dev server", ['npm install', 'enter'], ['npm start', 'enter']),
        ("Build project", ['npm run build', 'enter'], ['ls dist', 'enter']),
        ("Run tests", ['npm test', 'enter'], ['q']),
    ]
    
    op, prev_t1, target = random.choice(operations)
    prev_t2 = ['cd project', 'enter']
    
    instruction = f"NPM: {op}"
    return (instruction, [prev_t2, prev_t1], target, None)

def generate_random_workflow():
    """Generate a random realistic workflow"""
    workflows = [
        lambda: generate_web_search_situation(),
        lambda: generate_terminal_situation(),
        lambda: generate_text_edit_situation(),
        lambda: generate_app_launch_situation(),
        lambda: generate_file_operation_situation(),
        lambda: generate_navigation_situation(),
        lambda: generate_code_editing_situation(),
        lambda: generate_browser_action_situation(),
        lambda: generate_git_situation(),
        lambda: generate_npm_situation(),
    ]
    
    return random.choice(workflows)()

# Generate dataset
print("Generating 50000+ situations...")
situations = []

# Ensure variety by generating different types
num_each_type = 4545  # 4545 * 11 = 49995, then add a few more

situation_generators = [
    generate_web_search_situation,
    generate_terminal_situation,
    generate_text_edit_situation,
    generate_app_launch_situation,
    generate_file_operation_situation,
    generate_navigation_situation,
    generate_code_editing_situation,
    generate_browser_action_situation,
    generate_git_situation,
    generate_npm_situation,
    generate_random_workflow,
]

for generator in situation_generators:
    for _ in range(num_each_type):
        situations.append(generator())

# Add some extra random ones
for _ in range(100):
    situations.append(generate_random_workflow())

print(f"Generated {len(situations)} situations")

# Write to file
with open('situations_dataset.txt', 'w') as f:
    for i, (instruction, prev_actions, target, site) in enumerate(situations, 1):
        # Format as specified
        prev_str = '[' + ', '.join([f"['{x}']" if isinstance(x, str) else str(x) for x in prev_actions]) + ']'
        target_str = str(target)
        site_str = str(site)
        
        line = f"({repr(instruction)}; {prev_str}; {target_str}; {site_str})\n"
        f.write(line)

print(f"Dataset written to 'situations_dataset.txt'")
print(f"Total entries: {len(situations)}")


Generating 50000+ situations...
Generated 50095 situations
Dataset written to 'situations_dataset.txt'
Total entries: 50095


In [57]:
import torch
import numpy as np
from torch.utils.data import Dataset, DataLoader
from PIL import Image
import json
import os
from typing import List, Tuple, Dict, Optional
import logging
from dataclasses import dataclass
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import re

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

@dataclass
class InteractionSample:
    """Repr√©sente une interaction g√©n√©r√©e"""
    instruction: str
    previous_actions: List[List[str]]  # Nested list of action sequences
    target_actions: List[str]
    url: str
    screenshot: Optional[torch.Tensor] = None
    target_coords: Optional[torch.Tensor] = None
    element_type: str = 'unknown'


class SyntheticDataParser:
    """Parser pour les donn√©es synth√©tiques g√©n√©r√©es"""
    
    ACTION_VOCAB = {
        # Navigation
        'click': 3,
        'double_click': 8,
        'hover': 7,
        'scroll_up': 4,
        'scroll_down': 5,
        
        # Saisie
        'type': 6,
        'backspace': 9,
        'delete': 10,
        'paste': 11,
        'copy': 12,
        'enter': 13,
        
        # Clavier syst√®me
        'cmd+space': 18,
        'cmd+c': 16,
        'cmd+v': 17,
        'cmd+a': 19,
        
        # Navigateurs
        'chrome': 20,
        'safari': 21,
        'firefox': 22,
        'edge': 23,
        
        # Sites web
        'search bar click': 24,
        'google.com': 25,
        'duckduckgo.com': 26,
    }
    
    def __init__(self):
        self.samples = []
    
    def parse_line(self, line: str) -> Optional[InteractionSample]:
        """Parse une ligne de donn√©es synth√©tiques
        
        Format: ('Instruction'; [['action1', 'action2'], ['action3']]; ['target1', 'target2']; url)
        """
        try:
            # Nettoyer la ligne
            line = line.strip()
            if line.startswith('(') and line.endswith(')'):
                line = line[1:-1]
            
            # Split par ; en gardant les structures imbriqu√©es
            parts = self.split_by_semicolon(line)
            
            if len(parts) != 4:
                logger.warning(f"Invalid format (expected 4 parts): {line[:50]}...")
                return None
            
            # Parser chaque partie
            instruction = self.parse_instruction(parts[0])
            previous_actions = self.parse_nested_actions(parts[1])
            target_actions = self.parse_actions(parts[2])
            url = self.parse_url(parts[3])
            
            if not instruction or not target_actions or not url:
                return None
            
            # D√©terminer le type d'√©l√©ment
            element_type = self.infer_element_type(instruction)
            
            sample = InteractionSample(
                instruction=instruction,
                previous_actions=previous_actions,
                target_actions=target_actions,
                url=url,
                element_type=element_type
            )
            
            return sample
        
        except Exception as e:
            logger.debug(f"Error parsing line: {e}")
            return None
    
    def split_by_semicolon(self, line: str) -> List[str]:
        """Split par ; en respectant les structures imbriqu√©es"""
        parts = []
        current = ""
        depth = 0
        
        for char in line:
            if char in '[({':
                depth += 1
                current += char
            elif char in '])}':
                depth -= 1
                current += char
            elif char == ';' and depth == 0:
                parts.append(current.strip())
                current = ""
            else:
                current += char
        
        if current:
            parts.append(current.strip())
        
        return parts
    
    def parse_instruction(self, instr: str) -> str:
        """Parse l'instruction (enl√®ve les quotes)"""
        instr = instr.strip()
        if instr.startswith("'") and instr.endswith("'"):
            return instr[1:-1]
        elif instr.startswith('"') and instr.endswith('"'):
            return instr[1:-1]
        return instr
    
    def parse_nested_actions(self, actions_str: str) -> List[List[str]]:
        """Parse les actions imbriqu√©es
        
        Format: [['action1', 'action2'], ['action3']]
        """
        nested_actions = []
        
        # Remove outer brackets
        actions_str = actions_str.strip()
        if actions_str.startswith('[') and actions_str.endswith(']'):
            actions_str = actions_str[1:-1]
        
        # Split par les sous-listes
        current_list = []
        depth = 0
        current_str = ""
        
        for char in actions_str:
            if char == '[':
                depth += 1
                if depth == 1:
                    current_str = ""
                else:
                    current_str += char
            elif char == ']':
                depth -= 1
                if depth == 0:
                    # Fin d'une sous-liste
                    actions = self.parse_actions(current_str)
                    if actions:
                        nested_actions.append(actions)
                    current_str = ""
                else:
                    current_str += char
            elif depth > 0:
                current_str += char
        
        return nested_actions
    
    def parse_actions(self, actions_str: str) -> List[str]:
        """Parse une liste plate d'actions
        
        Format: ['action1', 'action2', 'action3']
        """
        actions = []
        
        # Extraire toutes les strings entre quotes
        pattern = r"['\"]([^'\"]+)['\"]"
        matches = re.findall(pattern, actions_str)
        
        for match in matches:
            action = match.strip().lower()
            # Valider l'action
            if action in self.ACTION_VOCAB or self._is_text_input(action):
                actions.append(action)
        
        return actions
    
    def _is_text_input(self, text: str) -> bool:
        """V√©rifie si le texte est une saisie utilisateur"""
        # Si ce n'est pas un action vocab et pas une URL, c'est du texte
        return not any(x in text.lower() for x in ['cmd', 'ctrl', 'shift', 'enter', 'click'])
    
    def parse_url(self, url_str: str) -> str:
        """Parse l'URL"""
        url_str = url_str.strip()
        if url_str.startswith("'") and url_str.endswith("'"):
            return url_str[1:-1]
        elif url_str.startswith('"') and url_str.endswith('"'):
            return url_str[1:-1]
        return url_str
    
    def infer_element_type(self, instruction: str) -> str:
        """Inf√®re le type d'√©l√©ment bas√© sur l'instruction"""
        instr_lower = instruction.lower()
        
        if 'search' in instr_lower:
            return 'search'
        elif 'click' in instr_lower:
            return 'button'
        elif 'type' in instr_lower or 'enter' in instr_lower:
            return 'input'
        elif 'navigate' in instr_lower or 'go' in instr_lower:
            return 'link'
        else:
            return 'element'
    
    def convert_to_tokens(self, actions: List[str]) -> torch.Tensor:
        """Convertit les actions en tokens"""
        tokens.append(2)  # EOS
        return torch.tensor(tokens, dtype=torch.long)
    
    def load_from_file(self, filename: str) -> List[InteractionSample]:
        """Charge les donn√©es synth√©tiques depuis un fichier"""
        samples = []
        
        try:
            with open(filename, 'r') as f:
                lines = f.readlines()
            
            logger.info(f"üìÇ Loading {len(lines)} lines from {filename}")
            
            for i, line in enumerate(lines):
                if not line.strip():
                    continue
                
                sample = self.parse_line(line)
                if sample:
                    samples.append(sample)
                
                if (i + 1) % 1000 == 0:
                    logger.info(f"  ‚úì Parsed {i + 1} lines ({len(samples)} valid samples)")
            
            logger.info(f"‚úÖ Loaded {len(samples)} samples")
            self.samples = samples
            return samples
        
        except Exception as e:
            logger.error(f"‚ùå Error loading file: {e}")
            return []


class RealWebsiteCollector:
    """Collecteur de donn√©es √† partir de sites r√©els"""
    
    def __init__(self, headless=True):
        """Initialise le collecteur avec Selenium"""
        options = webdriver.ChromeOptions()
        if headless:
            options.add_argument('--headless')
        options.add_argument('--no-sandbox')
        options.add_argument('--disable-dev-shm-usage')
        options.add_argument('--window-size=1280,720')
        
        try:
            self.driver = webdriver.Chrome(options=options)
            logger.info("‚úÖ Selenium WebDriver initialized")
        except Exception as e:
            logger.error(f"‚ùå Failed to initialize WebDriver: {e}")
            self.driver = None
    
    def normalize_screenshot(self, screenshot_bytes) -> Optional[torch.Tensor]:
        """Convertit un screenshot en tensor grayscale normalis√©"""
        try:
            import io
            img = Image.open(io.BytesIO(screenshot_bytes)).convert('L')
            img_array = np.array(img) / 255.0
            img_tensor = torch.from_numpy(img_array).float().unsqueeze(0)
            return img_tensor
        except Exception as e:
            logger.debug(f"Error normalizing screenshot: {e}")
            return None
    
    def find_search_bar_coordinates(self) -> Optional[Tuple[float, float]]:
        """Trouve les coordonn√©es de la barre de recherche"""
        try:
            # Chercher les √©l√©ments de recherche communs
            search_selectors = [
                'input[type="search"]',
                'input[placeholder*="search" i]',
                'input[placeholder*="Search" i]',
                'input[aria-label*="search" i]',
                '.search-input',
                '#search',
            ]
            
            for selector in search_selectors:
                elements = self.driver.find_elements(By.CSS_SELECTOR, selector)
                if elements:
                    element = elements[0]
                    location = element.location
                    size = element.size
                    
                    x = (location['x'] + size['width'] / 2) / 1280
                    y = (location['y'] + size['height'] / 2) / 720
                    
                    return (np.clip(x, 0, 1), np.clip(y, 0, 1))
            
            return None
        except Exception as e:
            logger.debug(f"Error finding search bar: {e}")
            return None
    
    def collect_from_website(self, url: str, num_samples: int = 5) -> List[InteractionSample]:
        """Collecte les coordonn√©es d'un site web r√©el"""
        samples = []
        
        if not self.driver:
            return samples
        
        try:
            logger.info(f"üåê Collecting from {url}")
            self.driver.get(url)
            time.sleep(2)
            
            # Prendre un screenshot
            screenshot_bytes = self.driver.get_screenshot_as_png()
            screenshot = self.normalize_screenshot(screenshot_bytes)
            
            if screenshot is None:
                logger.warning(f"Failed to capture screenshot from {url}")
                return samples
            
            # Trouver la barre de recherche
            search_coords = self.find_search_bar_coordinates()
            
            if search_coords:
                logger.info(f"‚úì Found search bar at {search_coords}")
            else:
                logger.warning(f"Could not find search bar at {url}")
                search_coords = (0.5, 0.1)  # Valeur par d√©faut
            
            # Cr√©er des samples avec les coordonn√©es r√©elles
            for i in range(num_samples):
                sample = InteractionSample(
                    instruction=f"Search on {url}",
                    previous_actions=[],
                    target_actions=['enter'],
                    url=url,
                    screenshot=screenshot,
                    target_coords=torch.tensor(search_coords, dtype=torch.float32),
                    element_type='search'
                )
                samples.append(sample)
            
            logger.info(f"‚úì Created {num_samples} samples from {url}")
            
        except Exception as e:
            logger.error(f"‚ùå Error collecting from {url}: {e}")
        
        return samples
    
    def close(self):
        """Ferme le driver"""
        if self.driver:
            self.driver.quit()
            logger.info("‚úÖ WebDriver closed")


class VisionActorRealDataset(Dataset):
    """Dataset combinant donn√©es synth√©tiques et r√©elles"""
    
    def __init__(self, synthetic_samples: List[InteractionSample], 
                 real_samples: List[InteractionSample] = None):
        """
        Initialise le dataset
        
        Args:
            synthetic_samples: Samples g√©n√©r√©s synth√©tiquement (texte + actions)
            real_samples: Samples avec donn√©es r√©elles du scraping (coordonn√©es)
        """
        self.synthetic_samples = synthetic_samples
        self.real_samples = real_samples or []
        self.all_samples = synthetic_samples + self.real_samples
        
        self.vocab_size = 1030
        self.BOS_TOKEN = 1
        self.EOS_TOKEN = 2
        self.parser = SyntheticDataParser()
        
        logger.info(f"üìä Dataset initialized:")
        logger.info(f"   - Synthetic samples: {len(self.synthetic_samples)}")
        logger.info(f"   - Real samples: {len(self.real_samples)}")
        logger.info(f"   - Total: {len(self.all_samples)}")
    
    def __len__(self):
        return len(self.all_samples)
    
    def __getitem__(self, idx):
        sample = self.all_samples[idx]
        
        # Convertir les actions
        prev_actions_flat = []
        for action_seq in sample.previous_actions:
            prev_actions_flat.extend(action_seq)
        
        prev_tokens = self.parser.convert_to_tokens(prev_actions_flat)
        target_tokens = self.parser.convert_to_tokens(sample.target_actions)
        
        # G√©n√©rer des coordonn√©es si absent
        if sample.target_coords is None:
            sample.target_coords = torch.tensor([np.random.random(), np.random.random()], dtype=torch.float32)
        
        # G√©n√©rer un screenshot dummy si absent
        if sample.screenshot is None:
            sample.screenshot = torch.randn(1, 720, 1280)
        
        return {
            'text': sample.instruction,
            'screenshot': sample.screenshot,  # (1, H, W)
            'previous_actions': prev_tokens,
            'target_sequence': target_tokens,
            'target_coords': sample.target_coords,  # (2,)
            'url': sample.url,
            'element_type': sample.element_type
        }


def collate_fn(batch):
    """Fonction de collation pour le DataLoader"""
    
    # Padding des s√©quences d'actions
    max_prev_len = max(item['previous_actions'].shape[0] for item in batch)
    max_target_len = max(item['target_sequence'].shape[0] for item in batch)
    
    padded_prev = []
    padded_target = []
    
    for item in batch:
        prev = torch.nn.functional.pad(
            item['previous_actions'], 
            (0, max_prev_len - item['previous_actions'].shape[0]), 
            value=0
        )
        target = torch.nn.functional.pad(
            item['target_sequence'], 
            (0, max_target_len - item['target_sequence'].shape[0]), 
            value=0
        )
        padded_prev.append(prev)
        padded_target.append(target)
    
    return {
        'text': [item['text'] for item in batch],
        'screenshots': torch.stack([item['screenshot'] for item in batch]),
        'previous_actions': torch.stack(padded_prev),
        'target_sequences': torch.stack(padded_target),
        'target_coords': torch.stack([item['target_coords'] for item in batch]),
        'urls': [item['url'] for item in batch],
        'element_types': [item['element_type'] for item in batch]
    }


# ========== SCRIPT PRINCIPAL ==========

if __name__ == "__main__":
    
    print("üöÄ VisionActor Dataset Generator (Hybrid Synthetic + Real)\n")
    
    # Parser les donn√©es synth√©tiques
    print("üìÇ Loading synthetic data...")
    synthetic_parser = SyntheticDataParser()
    
  
    
    synthetic_samples = synthetic_parser.load_from_file('/Users/ibrahimbaldediallo/Documents/Code/Jarvis_project/test/situations_dataset.txt')
    
    print(f"‚úÖ Loaded {len(synthetic_samples)} synthetic samples\n")
    
    # Afficher quelques exemples
    print("üìä Sample Synthetic Data:")
    for i, sample in enumerate(synthetic_samples[:3]):
        print(f"\n{i+1}. Instruction: {sample.instruction}")
        print(f"   Previous actions: {sample.previous_actions}")
        print(f"   Target actions: {sample.target_actions}")
        print(f"   URL: {sample.url}")
        print(f"   Element type: {sample.element_type}")
    
    # Cr√©er le dataset
    print(f"\nüìä Creating dataset...")
    dataset = VisionActorRealDataset(synthetic_samples=synthetic_samples)
    
    # DataLoader
    dataloader = DataLoader(
        dataset,
        batch_size=4,
        shuffle=True,
        collate_fn=collate_fn,
        num_workers=0
    )
    
    # Afficher un batch
    print(f"\nüìà Sample batch:")
    batch = next(iter(dataloader))
    
    print(f"  üìù Texts: {batch['text'][:2]}")
    print(f"  üñºÔ∏è  Screenshots: {batch['screenshots'].shape}")
    print(f"  üìã Previous actions: {batch['previous_actions'].shape}")
    print(f"  üéØ Target sequences: {batch['target_sequences'].shape}")
    print(f"  üìç Target coords: {batch['target_coords'].shape}")
    print(f"  üåê URLs: {batch['urls'][:2]}")
    print(f"  üè∑Ô∏è  Element types: {batch['element_types'][:2]}")
    
    print(f"\n‚úÖ Dataset ready for training!")


INFO:__main__:üìÇ Loading 50095 lines from /Users/ibrahimbaldediallo/Documents/Code/Jarvis_project/test/situations_dataset.txt




üöÄ VisionActor Dataset Generator (Hybrid Synthetic + Real)

üìÇ Loading synthetic data...


INFO:__main__:  ‚úì Parsed 1000 lines (0 valid samples)
INFO:__main__:  ‚úì Parsed 2000 lines (0 valid samples)
INFO:__main__:  ‚úì Parsed 3000 lines (0 valid samples)
INFO:__main__:  ‚úì Parsed 4000 lines (666 valid samples)
INFO:__main__:  ‚úì Parsed 5000 lines (1666 valid samples)
INFO:__main__:  ‚úì Parsed 6000 lines (2666 valid samples)
INFO:__main__:  ‚úì Parsed 7000 lines (3666 valid samples)
INFO:__main__:  ‚úì Parsed 8000 lines (4666 valid samples)
INFO:__main__:  ‚úì Parsed 9000 lines (5666 valid samples)
INFO:__main__:  ‚úì Parsed 10000 lines (6420 valid samples)
INFO:__main__:  ‚úì Parsed 11000 lines (7156 valid samples)
INFO:__main__:  ‚úì Parsed 12000 lines (7897 valid samples)
INFO:__main__:  ‚úì Parsed 13000 lines (8641 valid samples)
INFO:__main__:  ‚úì Parsed 14000 lines (9496 valid samples)
INFO:__main__:  ‚úì Parsed 15000 lines (10496 valid samples)
INFO:__main__:  ‚úì Parsed 16000 lines (11496 valid samples)
INFO:__main__:  ‚úì Parsed 17000 lines (12496 valid sampl

‚úÖ Loaded 37139 synthetic samples

üìä Sample Synthetic Data:

1. Instruction: Search for weather
   Previous actions: [['cmd+space', 'safari', 'enter'], ['search bar click']]
   Target actions: ['weather', 'enter']
   URL: https://www.w3schools.com
   Element type: search

2. Instruction: Search for weather
   Previous actions: [['cmd+space', 'safari', 'enter'], ['search bar click']]
   Target actions: ['weather', 'enter']
   URL: https://www.google.com
   Element type: search

3. Instruction: Search for database design
   Previous actions: [['cmd+space', 'chrome', 'enter'], ['duckduckgo.com', 'enter']]
   Target actions: ['database design', 'enter']
   URL: https://www.linkedin.com
   Element type: search

üìä Creating dataset...

üìà Sample batch:


NameError: name 'tokens' is not defined