In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import confusion_matrix, classification_report
import numpy as np
import pandas as pd
import wandb
import gc
from sklearn.model_selection import train_test_split
from timm.models.layers import DropPath, to_2tuple, trunc_normal_
from timm.models.registry import register_model
from timm.models.vision_transformer import _cfg, Mlp, Block

# Create a custom Vision Transformer model
# Create dataset classes (using your BalancedDataset approach) and training function
class BalancedDataset(Dataset):
    def __init__(self, X, y, limit_per_label=1600):
        self.X = X
        self.y = y
        self.limit_per_label = limit_per_label
        self.classes = np.unique(y)
        self.indices = self.balance_classes()

    def balance_classes(self):
        indices = []
        for cls in self.classes:
            cls_indices = np.where(self.y == cls)[0]
            if len(cls_indices) > self.limit_per_label:
                cls_indices = np.random.choice(cls_indices, self.limit_per_label, replace=False)
            indices.extend(cls_indices)
        np.random.shuffle(indices)
        return indices

    def re_sample(self):
        self.indices = self.balance_classes()

    def __len__(self):
        return len(self.indices)

    def __getitem__(self, idx):
        index = self.indices[idx]
        return self.X[index], self.y[index]
# Custom Dataset for validation with limit per class
class BalancedValidationDataset(Dataset):
    def __init__(self, X, y, limit_per_label=400):
        self.X = X
        self.y = y
        self.limit_per_label = limit_per_label
        self.classes = np.unique(y)
        self.indices = self.balance_classes()

    def balance_classes(self):
        indices = []
        for cls in self.classes:
            cls_indices = np.where(self.y == cls)[0]
            if len(cls_indices) > self.limit_per_label:
                cls_indices = np.random.choice(cls_indices, self.limit_per_label, replace=False)
            indices.extend(cls_indices)
        np.random.shuffle(indices)
        return indices
    
    def __len__(self):
        return len(self.indices)

    def __getitem__(self, idx):
        index = self.indices[idx]
        return self.X[index], self.y[index]


# Training function with learning rate scheduler
def train_model_vit(model, train_loader, val_loader, test_loader, num_epochs=500, lr=1e-4, max_patience=20, device='cuda'):
    model = model.to(device)
    optimizer = optim.AdamW(model.parameters(), lr=lr)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=int(max_patience/5), verbose=True)
    criterion = nn.CrossEntropyLoss()
    best_test_loss = float('inf')
    patience = max_patience
    
    for epoch in range(num_epochs):
        train_loader.dataset.re_sample()
        model.train()
        train_loss, train_accuracy = 0.0, 0.0
        
        for X_batch, y_batch in train_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            optimizer.zero_grad()
            outputs = model(X_batch)
            loss = criterion(outputs, y_batch)
            loss.backward()
            optimizer.step()
            train_loss += loss.item() * X_batch.size(0)
            train_accuracy += (outputs.argmax(dim=1) == y_batch).float().mean().item()
        
        # Validation phase
        model.eval()
        val_loss, val_accuracy = 0.0, 0.0
        with torch.no_grad():
            for X_val, y_val in val_loader:
                X_val, y_val = X_val.to(device), y_val.to(device)
                outputs = model(X_val)
                loss = criterion(outputs, y_val)
                val_loss += loss.item() * X_val.size(0)
                val_accuracy += (outputs.argmax(dim=1) == y_val).float().mean().item()
        
        # Test phase
        test_loss, test_accuracy = 0.0, 0.0
        y_true, y_pred = [], []
        with torch.no_grad():
            for X_test, y_test in test_loader:
                X_test, y_test = X_test.to(device), y_test.to(device)
                outputs = model(X_test)
                loss = criterion(outputs, y_test)
                test_loss += loss.item() * X_test.size(0)
                test_accuracy += (outputs.argmax(dim=1) == y_test).float().mean().item()
                y_true.extend(y_test.cpu().numpy())
                y_pred.extend(outputs.argmax(dim=1).cpu().numpy())
        
        # Scheduler step
        scheduler.step(test_loss / len(test_loader.dataset))

        # Log metrics to WandB
        wandb.log({
            "epoch": epoch,
            "train_loss": train_loss / len(train_loader.dataset),
            "val_loss": val_loss / len(val_loader.dataset),
            "train_accuracy": train_accuracy / len(train_loader),
            "val_accuracy": val_accuracy / len(val_loader),
            "learning_rate": optimizer.param_groups[0]['lr'],
            "test_loss": test_loss / len(test_loader.dataset),
            "test_accuracy": test_accuracy / len(test_loader),
            "confusion_matrix": wandb.plot.confusion_matrix(probs=None,
                y_true=y_true, preds=y_pred, class_names=np.unique(y_true)),
            "classification_report": classification_report(y_true, y_pred, target_names=label_mapping.keys())
        })
        
        # Early stopping
        if test_loss < best_test_loss:
            best_test_loss = test_loss
            patience = max_patience
        else:
            patience -= 1
            if patience <= 0:
                print("Early stopping triggered.")
                break

    return model
def init_rope_frequencies(dim, num_heads, theta, rotate=False):
    # Adjust the size of `mag` to match the per-head dimension
    per_head_dim = dim // ( num_heads)
    mag = 1 / (theta ** (torch.arange(0, per_head_dim).float() / (dim // num_heads))).unsqueeze(0)

    # Adjust `angles` accordingly
    angles = torch.rand(num_heads, per_head_dim//2) * 2 * torch.pi if rotate else torch.zeros(num_heads, per_head_dim//2)

    # Compute `freq_x` and `freq_y` with matching dimensions
    freq_x = mag * torch.cat([torch.cos(angles), torch.cos(torch.pi / 2 + angles)], dim=-1)
    freq_y = mag * torch.cat([torch.sin(angles), torch.sin(torch.pi / 2 + angles)], dim=-1)

    return torch.stack([freq_x, freq_y], dim=0)


def apply_rotary_position_embeddings(freqs, q, k):
    # Ensure `cos` and `sin` have the same shape as `q` and `k` by adding unsqueeze
    cos, sin = freqs[0].unsqueeze(1), freqs[1].unsqueeze(1)    
    
    # Broadcast `cos` and `sin` to match `q` and `k` dimensions
    q_rot = (q * cos) + (torch.roll(q, shifts=1, dims=-1) * sin)
    k_rot = (k * cos) + (torch.roll(k, shifts=1, dims=-1) * sin)
    
    return q_rot, k_rot
class CrossAttention(nn.Module):
    def __init__(self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0., proj_drop=0., theta=10000):
        super().__init__()
        self.num_heads = num_heads
        self.dim = dim
        self.scale = qk_scale or (dim // num_heads) ** -0.5
        self.theta = theta

        self.wq = nn.Linear(dim, dim, bias=qkv_bias)
        self.wk = nn.Linear(dim, dim, bias=qkv_bias)
        self.wv = nn.Linear(dim, dim, bias=qkv_bias)
        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(dim, dim)
        self.proj_drop = nn.Dropout(proj_drop)
        
        # Initialize rotary frequencies
        self.freqs = init_rope_frequencies(dim, num_heads, theta)

    def forward(self, x):
        B, N, C = x.shape
        q = self.wq(x[:, 0:1, ...]).view(B, 1, self.num_heads, C // self.num_heads).permute(0, 2, 1, 3)
        k = self.wk(x).view(B, N, self.num_heads, C // self.num_heads).permute(0, 2, 1, 3)
        v = self.wv(x).view(B, N, self.num_heads, C // self.num_heads).permute(0, 2, 1, 3)

        # Apply rotary position embedding
        q_rot, k_rot = apply_rotary_position_embeddings(self.freqs.to(x.device), q, k)

        # Attention calculation with rotated embeddings
        attn = (q_rot @ k_rot.transpose(-2, -1)) * self.scale
        attn = attn.softmax(dim=-1)
        attn = self.attn_drop(attn)
        

        x = (attn @ v).transpose(1, 2).reshape(B, 1, C)
        x = self.proj(x)
        x = self.proj_drop(x)
        return x
    
class CrossAttentionBlock(nn.Module):
    def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop=0., attn_drop=0., theta=10.0,
                 drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm, has_mlp=True):
        super().__init__()
        self.norm1 = norm_layer(dim)
        self.attn = CrossAttention(
            dim, num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale, attn_drop=attn_drop, proj_drop=drop, theta=theta)
        self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
        self.has_mlp = has_mlp
        if has_mlp:
            self.norm2 = norm_layer(dim)
            mlp_hidden_dim = int(dim * mlp_ratio)
            self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop)

    def forward(self, x):
        x = x[:, 0:1, ...] + self.drop_path(self.attn(self.norm1(x)))
        if self.has_mlp:
            x = x + self.drop_path(self.mlp(self.norm2(x)))

        return x

class VisionTransformer1D(nn.Module):
    def __init__(self, input_size, num_classes=4, patch_sizes=[20, 40], overlap=0.5, dim=128, depth=6, heads=8, mlp_dim=256, dropout=0.2, theta = 10.0):
        super(VisionTransformer1D, self).__init__()
        if isinstance(patch_sizes, int):
            patch_sizes = [patch_sizes]
        self.num_branches = len(patch_sizes)
        self.dim = dim
        self.overlap = overlap
        self.branches = nn.ModuleList()
        
        # Set up branches for different patch sizes
        for patch_size in patch_sizes:
            stride = int(patch_size * (1 - overlap))
            max_patches = (input_size - patch_size) // stride + 1
            max_patches = (input_size // patch_size) ** 2
            patch_embed = nn.Linear(patch_size, dim)
            pos_embedding = nn.Embedding(max_patches + 1, dim)  # "+ 1" to account for class token
            transformer = nn.TransformerEncoder(
                nn.TransformerEncoderLayer(dim, heads, mlp_dim, dropout), depth
            )
            self.branches.append(nn.ModuleDict({
                'patch_embed': patch_embed,
                'pos_embedding': pos_embedding,
                'transformer': transformer
            }))

        # Learnable class token
        self.class_token = nn.Parameter(torch.zeros(1, 1, dim))

        # Cross-Attention for fusion of multiple patch sizes
        self.cross_attention = CrossAttentionBlock(dim, heads, theta=theta)

        # Classification head
        self.fc = nn.Sequential(
            nn.LayerNorm(dim),
            nn.Linear(dim, num_classes)
        )

    def forward(self, x):
        batch_size, seq_len = x.shape
        branch_outputs = []
        
        # Extract patches, embed, and process with transformer for each branch
        for branch in self.branches:
            patch_size = branch['patch_embed'].in_features
            stride = int(patch_size * (1 - self.overlap))
            num_patches = (seq_len - patch_size) // stride + 1
            patches = [x[:, i * stride : i * stride + patch_size] for i in range(num_patches)]
            x_branch = torch.stack(patches, dim=1)
            x_branch = branch['patch_embed'](x_branch)
            
            # Append class token and add positional embeddings
            class_token = self.class_token.expand(batch_size, -1, -1)
            x_branch = torch.cat((class_token, x_branch), dim=1)
            x_branch = x_branch + branch['pos_embedding'](torch.arange(num_patches + 1, device=x.device)).unsqueeze(0)
            x_branch = branch['transformer'](x_branch)
            branch_outputs.append(x_branch)

        # Apply cross-attention to combine the representations from each branch
        x_fused = torch.cat(branch_outputs, dim=1)
        x_fused = self.cross_attention(x_fused)

        # Classification based on the class token representation
        x = self.fc(x_fused[:, 0])  # Use the class token at position 0 for classification
        return x



In [2]:
batch_size = 128


# Main script to load data and train the model
if __name__ == "__main__":
    # Load and preprocess data
    X = pd.read_pickle("Pickles/train.pkl")
    y = X["label"]
    label_mapping = {'star': 0, 'binary_star': 1, 'galaxy': 2, 'agn': 3}
    y = y.map(label_mapping).values
    columns = ["parallax", "ra", "dec", "ra_error", "dec_error", "parallax_error", "pmra", "pmdec", "pmra_error", "pmdec_error", 
           "phot_g_mean_flux", "flagnopllx", "phot_g_mean_flux_error", "phot_bp_mean_flux", "phot_rp_mean_flux", 
           "phot_bp_mean_flux_error", "phot_rp_mean_flux_error"]
    X = X[columns]
    
    # Read test data
    X_test = pd.read_pickle("Pickles/test.pkl")
    y_test = X_test["label"].map(label_mapping).values
    X_test = X_test[columns]

    # Convert test data to torch tensors
    X_test_tensor = torch.tensor(X_test.values, dtype=torch.float32)  # Convert DataFrame to NumPy array first
    y_test_tensor = torch.tensor(y_test, dtype=torch.long)

    # Create the test dataset without the unsqueeze
    test_dataset = BalancedValidationDataset(X_test_tensor, y_test_tensor)

    # Create the DataLoader
    test_loader = DataLoader(test_dataset, batch_size=512, shuffle=False)

    
    # Split data
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Clear memory
    del X, y
    gc.collect()

    # Convert to torch tensors and create datasets
    X_train = torch.tensor(X_train.values, dtype=torch.float32)
    X_val = torch.tensor(X_val.values, dtype=torch.float32)
    y_train = torch.tensor(y_train, dtype=torch.long)
    y_val = torch.tensor(y_val, dtype=torch.long)
    train_dataset = BalancedDataset(X_train, y_train)
    val_dataset = BalancedValidationDataset(X_val, y_val)
    
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

In [3]:
# Define the hyperparameters
num_classes = 4
input_size = 17
patch_sizes=[1]
dim = 32
depth = 4
heads = 8
mlp_dim = 128
dropout = 0.1
lr = 0.0001
patience = 30
num_epochs = 200
theta = 1.0
model_vit = VisionTransformer1D(num_classes=num_classes, input_size=input_size, patch_sizes=patch_sizes, dim=dim, depth=depth, heads=heads, mlp_dim=mlp_dim, dropout=dropout, overlap=0, theta=theta)
print(model_vit)
# print number of parameters of multihead attention
#print(sum(p.numel() for p in model_vit.attn.parameters()))
trained_model = train_model_vit(model_vit, train_loader, val_loader, test_loader, num_epochs=num_epochs, lr=lr, max_patience=patience)




VisionTransformer1D(
  (branches): ModuleList(
    (0): ModuleDict(
      (patch_embed): Linear(in_features=1, out_features=32, bias=True)
      (pos_embedding): Embedding(290, 32)
      (transformer): TransformerEncoder(
        (layers): ModuleList(
          (0-3): 4 x TransformerEncoderLayer(
            (self_attn): MultiheadAttention(
              (out_proj): NonDynamicallyQuantizableLinear(in_features=32, out_features=32, bias=True)
            )
            (linear1): Linear(in_features=32, out_features=128, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
            (linear2): Linear(in_features=128, out_features=32, bias=True)
            (norm1): LayerNorm((32,), eps=1e-05, elementwise_affine=True)
            (norm2): LayerNorm((32,), eps=1e-05, elementwise_affine=True)
            (dropout1): Dropout(p=0.1, inplace=False)
            (dropout2): Dropout(p=0.1, inplace=False)
          )
        )
      )
    )
  )
  (cross_attention): CrossAttentionBlock(


  attn_output = scaled_dot_product_attention(q, k, v, attn_mask, dropout_p, is_causal)
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Error: You must call wandb.init() before wandb.log()

In [4]:
# Define the hyperparameters
num_classes = 4
input_size = 17
patch_sizes=[1]
dim = 32
depth = 4
heads = 8
mlp_dim = 128
dropout = 0.1
lr = 0.0001
patience = 30
num_epochs = 200
theta = 1.0


# Define the config dictionary object
config = {"num_classes": num_classes, "patch_size": patch_sizes, "dim": dim, "depth": depth, "heads": heads, "mlp_dim": mlp_dim, 
          "dropout": dropout, "batch_size": batch_size, "lr": lr, "patience": patience}

# Initialize WandB project
wandb.init(project="gaia-crossvit", entity="joaoc-university-of-southampton", config=config)
# Initialize and train the model
model_vit = VisionTransformer1D(num_classes=num_classes, input_size=input_size, patch_sizes=patch_sizes, dim=dim, depth=depth, heads=heads, mlp_dim=mlp_dim, dropout=dropout, overlap=0, theta=theta)
trained_model = train_model_vit(model_vit, train_loader, val_loader, test_loader, num_epochs=num_epochs, lr=lr, max_patience=patience)

# Save the model and finish WandB session
wandb.finish()

wandb: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
wandb: Currently logged in as: joaoc (joaoc-university-of-southampton). Use `wandb login --relogin` to force relogin


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


KeyboardInterrupt: 

In [42]:
# Define the hyperparameters
num_classes = 4
input_size = 17
patch_sizes=[1]
dim = 32
depth = 4
heads = 8
mlp_dim = 128
dropout = 0.1
lr = 0.0001
patience = 30
num_epochs = 200
theta = 1.0


# Define the config dictionary object
config = {"num_classes": num_classes, "patch_size": patch_sizes, "dim": dim, "depth": depth, "heads": heads, "mlp_dim": mlp_dim, 
          "dropout": dropout, "batch_size": batch_size, "lr": lr, "patience": patience, "theta": theta}

# Initialize WandB project
wandb.init(project="gaia-crossvit", entity="joaoc-university-of-southampton", config=config)
# Initialize and train the model
model_vit = VisionTransformer1D(num_classes=num_classes, input_size=input_size, patch_sizes=patch_sizes, dim=dim, depth=depth, heads=heads, mlp_dim=mlp_dim, dropout=dropout, overlap=0, theta=theta)
trained_model = train_model_vit(model_vit, train_loader, val_loader, test_loader, num_epochs=num_epochs, lr=lr, max_patience=patience)

# Save the model and finish WandB session
wandb.finish()

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Early stopping triggered.


VBox(children=(Label(value='0.143 MB of 0.143 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
epoch,▁▁▁▂▂▂▃▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▆▇▇▇▇▇▇▇███
learning_rate,████████████████████▄▄▄▄▃▃▃▃▂▂▂▂▂▂▁▁▁▁▁▁
test_accuracy,▁▃▄▆▇▇▇▇████████████████████████████████
test_loss,█▄▃▃▃▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train_accuracy,▁▃▅▆▆▆▇▇▇▇▇▇▇█▇█████████████████████████
train_loss,█▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
val_accuracy,▁▅▅▆▆▆▆▇▇▇▇▇▇▇▇█████████████████████████
val_loss,██▆▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
classification_report,precis...
epoch,139
learning_rate,0.0
test_accuracy,0.82598
test_loss,0.39565
train_accuracy,0.8265
train_loss,0.40827
val_accuracy,0.84428
val_loss,0.37131


In [45]:
# Define the hyperparameters
num_classes = 4
input_size = 17
patch_sizes=[3]
dim = 128
depth = 4
heads = 8
mlp_dim = 512
dropout = 0.1
lr = 0.0001
patience = 30
num_epochs = 200
theta = 1.0


# Define the config dictionary object
config = {"num_classes": num_classes, "patch_size": patch_sizes, "dim": dim, "depth": depth, "heads": heads, "mlp_dim": mlp_dim, 
          "dropout": dropout, "batch_size": batch_size, "lr": lr, "patience": patience, "theta": theta}

# Initialize WandB project
wandb.init(project="gaia-crossvit-Rope", entity="joaoc-university-of-southampton", config=config)
# Initialize and train the model
model_vit = VisionTransformer1D(num_classes=num_classes, input_size=input_size, patch_sizes=patch_sizes, dim=dim, depth=depth, heads=heads, mlp_dim=mlp_dim, dropout=dropout, overlap=0, theta=theta)
trained_model = train_model_vit(model_vit, train_loader, val_loader, test_loader, num_epochs=num_epochs, lr=lr, max_patience=patience)

# Save the model and finish WandB session
wandb.finish()

VBox(children=(Label(value='0.023 MB of 0.023 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
epoch,▁▂▃▄▅▆▇█
learning_rate,▁▁▁▁▁▁▁▁
test_accuracy,▁▂▅▇▇▆██
test_loss,█▄▃▂▂▂▁▁
train_accuracy,▁▅▆▇█▇██
train_loss,█▃▂▂▁▁▁▁
val_accuracy,▁▃▅▆▇▆██
val_loss,█▅▄▃▂▂▁▁

0,1
classification_report,precis...
epoch,7
learning_rate,0.0001
test_accuracy,0.80303
test_loss,0.47316
train_accuracy,0.78978
train_loss,0.48522
val_accuracy,0.82281
val_loss,0.44641


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Early stopping triggered.


VBox(children=(Label(value='0.143 MB of 0.143 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
epoch,▁▁▁▂▂▂▂▂▃▃▃▃▃▃▃▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇▇███
learning_rate,█████████████▄▄▄▄▄▄▄▃▃▃▃▃▃▃▃▃▃▂▂▂▂▁▁▁▁▁▁
test_accuracy,▂▁▃▂▅▅▄▅▅▄▅▅▆▅▅▆▆▇▆▆▇▆▇▇▆▆▇▇█▇▇▆▇█▇█▇▇▇▇
test_loss,█▄▄▃▃▂▃▃▂▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train_accuracy,▁▅▆▇▇▇▇▇▇▇██████████████████████████████
train_loss,█▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
val_accuracy,▁▃▅▆▆▇▆▇▇▇▇▇▇▇█▇████████████████████████
val_loss,█▇▇▆▄▄▄▃▃▃▃▂▂▂▂▂▂▁▁▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
classification_report,precis...
epoch,140
learning_rate,0.0
test_accuracy,0.85225
test_loss,0.34034
train_accuracy,0.87775
train_loss,0.30894
val_accuracy,0.89248
val_loss,0.29572


In [47]:
# Define the hyperparameters
num_classes = 4
input_size = 17
patch_sizes=[3]
dim = 128
depth = 4
heads = 8
mlp_dim = 512
dropout = 0.1
lr = 0.0001
patience = 30
num_epochs = 200
theta = 100.0


# Define the config dictionary object
config = {"num_classes": num_classes, "patch_size": patch_sizes, "dim": dim, "depth": depth, "heads": heads, "mlp_dim": mlp_dim, 
          "dropout": dropout, "batch_size": batch_size, "lr": lr, "patience": patience, "theta": theta}

# Initialize WandB project
wandb.init(project="gaia-crossvit-Rope", entity="joaoc-university-of-southampton", config=config)
# Initialize and train the model
model_vit = VisionTransformer1D(num_classes=num_classes, input_size=input_size, patch_sizes=patch_sizes, dim=dim, depth=depth, heads=heads, mlp_dim=mlp_dim, dropout=dropout, overlap=0, theta=theta)
trained_model = train_model_vit(model_vit, train_loader, val_loader, test_loader, num_epochs=num_epochs, lr=lr, max_patience=patience)

# Save the model and finish WandB session
wandb.finish()

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


VBox(children=(Label(value='0.221 MB of 0.221 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
epoch,▁▁▁▁▂▂▂▂▂▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇▇▇▇█████
learning_rate,█████████████████▄▃▃▃▃▃▃▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁
test_accuracy,▂▁▄▅▆▆▆▆▆▇▇▇▇▆▇▇▇▇▇▇▇▇▇▇████████████████
test_loss,█▆▅▃▃▃▃▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train_accuracy,▁▂▂▂▅▅▆▅▆▅▆▇▇▇▇▇▇▇▇▇▇▇▇▇▆▇▇▇▇▇▇██▇███▇▇▇
train_loss,███▆▆▅▅▅▄▃▃▂▂▂▂▂▂▂▂▂▂▂▁▂▂▂▂▁▁▁▁▂▁▁▁▁▁▁▁▁
val_accuracy,▁▁▃▅▆▇▆▇▇▇▇▇▇▇▇▇▇▇▇█████████████████████
val_loss,█▆▅▅▄▃▃▃▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
classification_report,precis...
epoch,199
learning_rate,0.0
test_accuracy,0.86045
test_loss,0.32758
train_accuracy,0.87778
train_loss,0.29804
val_accuracy,0.89118
val_loss,0.29069


In [48]:
# Define the hyperparameters
num_classes = 4
input_size = 17
patch_sizes=[3]
dim = 128
depth = 4
heads = 8
mlp_dim = 512
dropout = 0.1
lr = 0.0001
patience = 30
num_epochs = 200
theta = 10000.0


# Define the config dictionary object
config = {"num_classes": num_classes, "patch_size": patch_sizes, "dim": dim, "depth": depth, "heads": heads, "mlp_dim": mlp_dim, 
          "dropout": dropout, "batch_size": batch_size, "lr": lr, "patience": patience, "theta": theta}

# Initialize WandB project
wandb.init(project="gaia-crossvit-Rope", entity="joaoc-university-of-southampton", config=config)
# Initialize and train the model
model_vit = VisionTransformer1D(num_classes=num_classes, input_size=input_size, patch_sizes=patch_sizes, dim=dim, depth=depth, heads=heads, mlp_dim=mlp_dim, dropout=dropout, overlap=0, theta=theta)
trained_model = train_model_vit(model_vit, train_loader, val_loader, test_loader, num_epochs=num_epochs, lr=lr, max_patience=patience)

# Save the model and finish WandB session
wandb.finish()

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Early stopping triggered.


VBox(children=(Label(value='0.169 MB of 0.169 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
epoch,▁▂▂▂▂▂▂▂▂▃▃▃▃▃▃▃▄▄▄▄▄▄▅▅▅▆▆▆▆▆▆▆▆▇▇▇▇▇▇█
learning_rate,███████████████▄▄▄▄▄▃▃▃▃▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁
test_accuracy,▁▂▄▅▅▆▇▇▇▇▇▇▇▇▇▇▇▇▇█▇█▇▇▇█▇████▇████████
test_loss,█▆▆▆▅▄▄▄▃▃▃▃▂▂▂▂▂▂▂▂▁▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train_accuracy,▁▆▇▇▇▇██████████████████████████████████
train_loss,█▃▂▂▂▂▂▁▂▁▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
val_accuracy,▁▅▅▆▆▆▇▆▇▇▇▇▇▇▇▇▇▇▇▇████████████████████
val_loss,█▆▅▄▃▃▃▃▂▂▃▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
classification_report,precis...
epoch,159
learning_rate,0.0
test_accuracy,0.84844
test_loss,0.34021
train_accuracy,0.86792
train_loss,0.31313
val_accuracy,0.8827
val_loss,0.30769


In [46]:
# Define the hyperparameters
num_classes = 4
input_size = 17
patch_sizes=[1,17]
dim = 128
depth = 4
heads = 8
mlp_dim = 512
dropout = 0.1
lr = 0.0001
patience = 30
num_epochs = 200
theta = 1.0


# Define the config dictionary object
config = {"num_classes": num_classes, "patch_size": patch_sizes, "dim": dim, "depth": depth, "heads": heads, "mlp_dim": mlp_dim, 
          "dropout": dropout, "batch_size": batch_size, "lr": lr, "patience": patience, "theta": theta}

# Initialize WandB project
wandb.init(project="gaia-crossvit-Rope", entity="joaoc-university-of-southampton", config=config)
# Initialize and train the model
model_vit = VisionTransformer1D(num_classes=num_classes, input_size=input_size, patch_sizes=patch_sizes, dim=dim, depth=depth, heads=heads, mlp_dim=mlp_dim, dropout=dropout, overlap=0, theta=theta)
trained_model = train_model_vit(model_vit, train_loader, val_loader, test_loader, num_epochs=num_epochs, lr=lr, max_patience=patience)

# Save the model and finish WandB session
wandb.finish()



Early stopping triggered.


VBox(children=(Label(value='0.166 MB of 0.166 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
epoch,▁▁▁▁▂▂▂▂▂▂▂▂▃▃▃▃▃▃▃▃▃▄▄▄▄▄▄▅▅▅▆▆▆▆▇▇▇▇██
learning_rate,████████████████████▄▄▄▃▃▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁
test_accuracy,▁▅▆▆▆▇▇▇▇▇▇▇▇▇▇▇█▇▇▇▇▇██▇███████████████
test_loss,█▄▃▃▂▂▂▂▂▂▁▁▁▁▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train_accuracy,▁▅▅▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇██████████▇███████████
train_loss,█▃▃▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
val_accuracy,▁▃▅▇▇▇▇▇▇▇▇█▇▇█▇████████████████████████
val_loss,█▆▅▄▄▄▃▃▃▃▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
classification_report,precis...
epoch,128
learning_rate,0.0
test_accuracy,0.85615
test_loss,0.34836
train_accuracy,0.8724
train_loss,0.31043
val_accuracy,0.88532
val_loss,0.30701


In [40]:
# Define the hyperparameters
num_classes = 4
input_size = 17
patch_sizes=[1]
dim = 32
depth = 4
heads = 8
mlp_dim = 128
dropout = 0.1
lr = 0.0001
patience = 30
num_epochs = 200
theta = 100.0


# Define the config dictionary object
config = {"num_classes": num_classes, "patch_size": patch_sizes, "dim": dim, "depth": depth, "heads": heads, "mlp_dim": mlp_dim, 
          "dropout": dropout, "batch_size": batch_size, "lr": lr, "patience": patience, "theta": theta}

# Initialize WandB project
wandb.init(project="gaia-crossvit", entity="joaoc-university-of-southampton", config=config)
# Initialize and train the model
model_vit = VisionTransformer1D(num_classes=num_classes, input_size=input_size, patch_sizes=patch_sizes, dim=dim, depth=depth, heads=heads, mlp_dim=mlp_dim, dropout=dropout, overlap=0, theta=theta)
trained_model = train_model_vit(model_vit, train_loader, val_loader, test_loader, num_epochs=num_epochs, lr=lr, max_patience=patience)

# Save the model and finish WandB session
wandb.finish()

0,1
epoch,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇▇██
learning_rate,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
test_accuracy,▁▄▅▅▅▅▅▅▅▅▇▇▇▇▇▇▇▇▇▇██▇███▇▇█▇██████████
test_loss,█▄▃▃▃▃▃▃▂▂▂▂▂▂▂▂▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train_accuracy,▁▆▅▆▆▆▆▇▇▇▇▇▇█▇█████████████████████████
train_loss,█▄▃▃▃▃▂▂▂▂▂▂▂▁▂▂▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
val_accuracy,▁▃▄▄▅▅▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇██████████████████
val_loss,█▅▄▄▄▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
classification_report,precis...
epoch,52
learning_rate,0.0001
test_accuracy,0.86309
test_loss,0.36658
train_accuracy,0.82528
train_loss,0.40337
val_accuracy,0.83579
val_loss,0.39756


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Early stopping triggered.


VBox(children=(Label(value='0.183 MB of 0.183 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
epoch,▁▁▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇▇███
learning_rate,████████████████████████▄▄▄▄▄▄▄▄▄▄▄▃▃▂▁▁
test_accuracy,▁▅▅▅▆▆▇▇▇▇▇█████████████████████████████
test_loss,█▇▆▅▅▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train_accuracy,▁▆▆▆▇▇██████████████████████████████████
train_loss,█▇▇▆▆▄▄▃▃▃▃▃▃▃▂▃▃▂▂▂▂▂▂▁▁▂▁▂▁▁▂▁▁▂▁▁▁▁▁▁
val_accuracy,▁▁▂▄▅▆▇▇▇▇▇▇█▇██▇▇▇█████████████████████
val_loss,██▄▃▃▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
classification_report,precis...
epoch,170
learning_rate,0.0
test_accuracy,0.83613
test_loss,0.40052
train_accuracy,0.84077
train_loss,0.38896
val_accuracy,0.84951
val_loss,0.36845


In [46]:
# Define the hyperparameters
num_classes = 4
input_size = 17
patch_sizes=[3]
dim = 64
depth = 4
heads = 16
mlp_dim = 4*dim
dropout = 0.1
lr = 0.0001
patience = 30
num_epochs = 200
theta = 1.0


# Define the config dictionary object
config = {"num_classes": num_classes, "patch_size": patch_sizes, "dim": dim, "depth": depth, "heads": heads, "mlp_dim": mlp_dim, 
          "dropout": dropout, "batch_size": batch_size, "lr": lr, "patience": patience}

# Initialize WandB project
wandb.init(project="gaia-crossvit", entity="joaoc-university-of-southampton", config=config)
# Initialize and train the model
model_vit = VisionTransformer1D(num_classes=num_classes, input_size=input_size, patch_sizes=patch_sizes, dim=dim, depth=depth, heads=heads, mlp_dim=mlp_dim, dropout=dropout, overlap=0, theta=theta)
trained_model = train_model_vit(model_vit, train_loader, val_loader, test_loader, num_epochs=num_epochs, lr=lr, max_patience=patience)

# Save the model and finish WandB session
wandb.finish()



Early stopping triggered.


0,1
epoch,▁▁▂▂▂▂▂▂▃▃▃▃▃▃▃▃▃▃▄▄▄▅▅▅▅▆▆▆▆▆▆▆▇▇▇▇████
learning_rate,███████████████▄▃▃▃▃▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
test_accuracy,▁▅▅▆▇▇▇▇▇▇█▇████████████████████████████
test_loss,█▅▃▃▃▂▂▃▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train_accuracy,▁▁▂▂▃▄▅▅▅▆▅▆▇▆▆▇▇▇▇▆▇█▆▆▇█▇█▇██▇█▇▇██▇██
train_loss,█▃▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
val_accuracy,▁▄▅▅▆▇▇▇▇▇▇█▇███████▇▇██████████████████
val_loss,█▄▃▂▂▂▂▁▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
classification_report,precis...
epoch,176
learning_rate,0.0
test_accuracy,0.89043
test_loss,0.30069
train_accuracy,0.87274
train_loss,0.30805
val_accuracy,0.85408
val_loss,0.34795


In [39]:
# Define the hyperparameters
num_classes = 4
input_size = 17
patch_sizes=[1]
dim = 128
depth = 4
heads = 16
mlp_dim = 128
dropout = 0.1
lr = 0.0001
patience = 30
num_epochs = 200


# Define the config dictionary object
config = {"num_classes": num_classes, "patch_size": patch_sizes, "dim": dim, "depth": depth, "heads": heads, "mlp_dim": mlp_dim, 
          "dropout": dropout, "batch_size": batch_size, "lr": lr, "patience": patience}

# Initialize WandB project
wandb.init(project="gaia-crossvit", entity="joaoc-university-of-southampton", config=config)
# Initialize and train the model
model_vit = VisionTransformer1D(num_classes=num_classes, input_size=input_size, patch_sizes=patch_sizes, dim=dim, depth=depth, heads=heads, mlp_dim=mlp_dim, dropout=dropout, overlap=0)
trained_model = train_model_vit(model_vit, train_loader, val_loader, test_loader, num_epochs=num_epochs, lr=lr, max_patience=patience)

# Save the model and finish WandB session
wandb.finish()



Early stopping triggered.


VBox(children=(Label(value='0.180 MB of 0.180 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
epoch,▁▁▁▁▂▂▂▂▂▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▅▆▆▆▆▆▆▆▇▇▇▇▇███
learning_rate,███████████▄▄▄▄▄▄▃▃▃▃▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁
test_accuracy,▁▂▂▅▅▇▆▆▆▅▇▆█▇▇▇▇▇▇▇█▇▇▇▇▇█▇█▇▇▇▇█████▇█
test_loss,█▆▆▅▆▄▃▃▃▃▂▂▂▂▃▂▂▂▂▂▂▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train_accuracy,▁▅▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇█▇██▇█████████████████
train_loss,█▆▆▄▄▄▄▄▃▃▃▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▂▁▂▁▁▁▁▁
val_accuracy,▁▃▄▅▆▆▅▆▆▇▆▇▇▇▇▇▇▇▇▇▇██▇████████████████
val_loss,█▅▄▄▃▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
classification_report,precis...
epoch,130
learning_rate,0.0
test_accuracy,0.85205
test_loss,0.36469
train_accuracy,0.84278
train_loss,0.37306
val_accuracy,0.83577
val_loss,0.40119


In [40]:
# Define the hyperparameters
num_classes = 4
input_size = 17
patch_sizes=[1, 17]
dim = 128
depth = 4
heads = 16
mlp_dim = 128
dropout = 0.1
lr = 0.0001
patience = 30
num_epochs = 200


# Define the config dictionary object
config = {"num_classes": num_classes, "patch_size": patch_sizes, "dim": dim, "depth": depth, "heads": heads, "mlp_dim": mlp_dim, 
          "dropout": dropout, "batch_size": batch_size, "lr": lr, "patience": patience}

# Initialize WandB project
wandb.init(project="gaia-crossvit", entity="joaoc-university-of-southampton", config=config)
# Initialize and train the model
model_vit = VisionTransformer1D(num_classes=num_classes, input_size=input_size, patch_sizes=patch_sizes, dim=dim, depth=depth, heads=heads, mlp_dim=mlp_dim, dropout=dropout, overlap=0)
trained_model = train_model_vit(model_vit, train_loader, val_loader, test_loader, num_epochs=num_epochs, lr=lr, max_patience=patience)

# Save the model and finish WandB session
wandb.finish()



Early stopping triggered.


VBox(children=(Label(value='0.193 MB of 0.193 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
epoch,▁▁▁▁▁▂▂▂▂▃▃▃▃▃▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇▇███
learning_rate,██████████████▄▄▃▃▃▃▃▃▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
test_accuracy,▁▅▆▆▆▇▇▇▇▆▇█████████████████████████████
test_loss,█▇▅▄▃▃▃▂▂▁▂▂▁▁▁▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train_accuracy,▁▁▂▃▄▃▃▃▃▄▅▅▆▅▅▅▅▆▆▇▆▆▆█▇█▇██▇▇▆▆▇▇█▇█▇▇
train_loss,█▇▇▇▆▇▆▅▅▅▅▅▄▄▅▃▃▂▃▃▂▂▃▂▂▂▂▂▂▁▁▂▂▂▂▁▂▂▂▂
val_accuracy,▁▂▂▄▄▄▅▅▆▆▆▇▆▇▇███▇▇███████████████▇▇▇▇▇
val_loss,█▇▇▅▆▄▃▄▃▃▃▂▂▂▁▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
classification_report,precis...
epoch,162
learning_rate,0.0
test_accuracy,0.88906
test_loss,0.2835
train_accuracy,0.88105
train_loss,0.29405
val_accuracy,0.85732
val_loss,0.33419
