In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
# Cell 1: Import required libraries
import os
import yaml
import torch
import numpy as np
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt
import warnings
from dotenv import load_dotenv
warnings.filterwarnings('ignore')

# load env variables
load_dotenv('../../../.env')

# Import your model and data loading components
from dataloader.dataset_wrapper import create_wrapper_from_dataframe

print("Libraries imported successfully!")

Libraries imported successfully!


In [3]:
# First install ClearML if not already installed: pip install clearml
import clearml
from clearml import Task, Logger

# Initialize ClearML Task
task = Task.init(project_name='CSMP_thesis_project', task_name='CSMP_traning_phase_1', reuse_last_task_id=False)
logger = Logger.current_logger()

ClearML Task: created new task id=3ca7b102c6c8472b9a7638bd224ecfae
ClearML results page: https://app.clear.ml/projects/0fec81950d384f0294d2c713df3887db/experiments/3ca7b102c6c8472b9a7638bd224ecfae/output/log


ClearML Monitor: GPU monitoring failed getting GPU reading, switching off GPU monitoring


In [4]:
# Cell 2: Configuration and paths setup
CONFIG_PATH = "../../../configs/config.yaml"
TRAIN_CSV_PATH = "../../../data/traning_and_validation/train_deduplicated.csv"

# Device selection 
if torch.backends.mps.is_available():
    DEVICE = 'mps'
    print("Using MPS (Metal Performance Shaders) for GPU acceleration")
elif torch.cuda.is_available():
    DEVICE = 'cuda'
    print("Using CUDA for GPU acceleration")
else:
    DEVICE = 'cpu'
    print("Using CPU")

print(f"Using device: {DEVICE}")
print(f"Validation data path: {TRAIN_CSV_PATH}")

Using CUDA for GPU acceleration
Using device: cuda
Validation data path: ../../../data/traning_and_validation/train_deduplicated.csv


In [None]:
# Cell 3: Load configuration
print("Loading configuration...")
config = yaml.load(open(CONFIG_PATH, "r"), Loader=yaml.FullLoader)

print("Configuration loaded:")
print(f"- Batch size: {config.get('batch_size', 64)}")
print(f"- Model config keys: {list(config.get('model', {}).keys())}")
print(f"- Loss config: {config.get('loss', {})}")

Loading configuration...
Configuration loaded:
- Batch size: 256
- Model config keys: []
- Loss config: {'temperature': 0.1, 'use_cosine_similarity': True, 'alpha_weight': 0.75}


In [6]:
task.connect(config)

{'batch_size': 256,
 'epochs': 10,
 'eval_every_n_epochs': 2,
 'log_every_n_steps': 2,
 'learning_rate': '1e-06',
 'weight_decay': 0.0001,
 'fp16_precision': True,
 'truncation': True,
 'model_config': {'emb_dim': 256,
  'spec_embed_dim': 256,
  'embed_dim': 128,
  'feat_dim': 512,
  'num_layer': 5,
  'layers': 5,
  'drop_ratio': 0.1,
  'dropout': 0.1,
  'pool': 'mean'},
 'dataset': {'s': 1,
  'num_workers': 0,
  'valid_size': 0.1,
  'ms2_file': '/home/xieting/graph_transformer_esa_hpc/data/qotf_20.mgf',
  'smi_file': '/home/xieting/graph_transformer_esa_hpc/data/smi_qtof_20.npy'},
 'loss': {'temperature': 0.1,
  'use_cosine_similarity': True,
  'alpha_weight': 0.75}}

In [7]:
# Cell 4: Load and explore validation data
print("Loading train data...")
df_train = pd.read_csv(TRAIN_CSV_PATH)

print(f"Validation dataset shape: {df_train.shape}")
print(f"Columns: {list(df_train.columns)}")
print(f"Sample data:")
df_train.head()

Loading train data...
Validation dataset shape: (798444, 10)
Columns: ['peaks_json', 'ion_source', 'compound_source', 'instrument', 'adduct', 'precursor_mz', 'smiles', 'inchikey', 'ion_mode', 'molecular_formula']
Sample data:


Unnamed: 0,peaks_json,ion_source,compound_source,instrument,adduct,precursor_mz,smiles,inchikey,ion_mode,molecular_formula
0,"[[42.014248, 0.10199999999999998], [42.26601, ...",ESI,Crude,Orbitrap,[M+H]+,377.186,CC12CCC(C(=O)N(CNc3cc4c(cc3)c3ccccc3o4)C1=O)C2...,RNKMIWQDRWSWCD-UHFFFAOYSA-N,Positive,C23H24N2O3
1,"[[49.01717, 0.155], [49.020023, 0.253], [67.05...",ESI,Crude,Orbitrap,[M+H]+,377.186,CC12CCC(C(=O)N(CNc3cc4c(cc3)c3ccccc3o4)C1=O)C2...,RNKMIWQDRWSWCD-UHFFFAOYSA-N,Positive,C23H24N2O3
2,"[[49.017338, 0.242], [49.020237, 0.181], [67.0...",ESI,Crude,Orbitrap,[M+H]+,377.186,CC12CCC(C(=O)N(CNc3cc4c(cc3)c3ccccc3o4)C1=O)C2...,RNKMIWQDRWSWCD-UHFFFAOYSA-N,Positive,C23H24N2O3
3,"[[49.01701, 0.144], [49.019947, 0.244], [139.0...",ESI,Crude,Orbitrap,[M+H]+,377.186,CC12CCC(C(=O)N(CNc3cc4c(cc3)c3ccccc3o4)C1=O)C2...,RNKMIWQDRWSWCD-UHFFFAOYSA-N,Positive,C23H24N2O3
4,"[[49.017166, 0.155], [49.020008, 0.253], [139....",ESI,Crude,Orbitrap,[M+H]+,377.186,CC12CCC(C(=O)N(CNc3cc4c(cc3)c3ccccc3o4)C1=O)C2...,RNKMIWQDRWSWCD-UHFFFAOYSA-N,Positive,C23H24N2O3


In [8]:
df_train_sample = df_train.sample(n=10000,random_state=42).reset_index(drop=True)

In [None]:
# Cell 6: Prepare validation data loader
print("Preparing data loaders")

# Create data wrapper from DataFrame
wrapper = create_wrapper_from_dataframe(
    df=df_train_sample,
    batch_size=config.get('batch_size'),  
    num_workers=8,
    valid_size=config.get('valid_size'),  
    use_ddp=False,
    output_dir="../../../data/train_feature/",
    recompute=True
)

# Get the data loader
train_loader, val_loader = wrapper.get_data_loaders()

Preparing data loaders
Converting DataFrame to compatible files...
Processed 10000 valid spectra out of 10000 total entries.
Create data wrapper
calculating molecular graphs


  6%|▋         | 513/8000 [00:00<00:11, 629.28it/s]

SMILES [I-].O=C(OCC1=CC[N+]2(C)CCC(O)C12)C(O)(C(O)C)C(C)C calculation failure


 11%|█▏        | 901/8000 [00:01<00:12, 588.99it/s]

SMILES [Cl-].O=C(O)C=1C=CC=CC1C=2C=3C=CC(=CC3OC4=CC(C=CC42)=[N+](CC)CC)N(CC)CC calculation failure


 17%|█▋        | 1347/8000 [00:02<00:10, 623.07it/s]

SMILES [Na+].O=C(CCCCCCCCCCC)CC(O)S(=O)(=O)[O-] calculation failure


 21%|██        | 1666/8000 [00:02<00:10, 626.41it/s]

SMILES [Cl-].O=C1C(=COC2=C1C=C(C(O)=C2C[NH+](C)C)CC)C=3C=CC=4OCCOC4C3 calculation failure


 28%|██▊       | 2243/8000 [00:03<00:08, 640.32it/s]

SMILES [Cl-].O=C1C=2C=C(C(O)=C(C2OC(=C1C=3C=CC=4OCCOC4C3)C)C[NH+](C)C)CCC calculation failure


 38%|███▊      | 3018/8000 [00:04<00:07, 628.32it/s]

SMILES [Cl-].O=C1C2=CC=C(O)C(=C2OC(=C1C=3C=CC=4OCCCOC4C3)C)C[NH+](C)C calculation failure


 39%|███▉      | 3147/8000 [00:05<00:07, 634.90it/s]

SMILES [Na+].O=P([O-])(O)OCC1OC(N2C=NC=3C(=NC=NC32)N)C(O)C1O calculation failure


 45%|████▍     | 3591/8000 [00:05<00:07, 626.73it/s]

SMILES [Cl-].O=C1C2=CC=C(O)C(=C2OC(=C1C=3C=CC=4OCCCOC4C3)C)C[NH+](C)C calculation failure


 52%|█████▏    | 4162/8000 [00:06<00:06, 628.17it/s]

SMILES [I-].O=C(OCC1=CC[N+]2(C)CCC(O)C12)C(O)(C(O)C)C(C)C calculation failure
SMILES [Cl-].OC=1C=C(O)C=2C=C(OC3OC(CO)C(O)C(O)C3O)C(=[O+]C2C1)C=4C=CC(O)=C(O)C4 calculation failure


 58%|█████▊    | 4614/8000 [00:07<00:05, 629.15it/s]

SMILES [Na+].O=C(CCCCCCCCCCC)CC(O)S(=O)(=O)[O-] calculation failure


 62%|██████▏   | 4936/8000 [00:08<00:04, 627.69it/s]

SMILES CCC1=C(C2=NC1=CC3=C(C4=C([N-]3)C(=C5[C@H]([C@@H](C(=N5)C=C6C(=C(C(=C2)[N-]6)C=C)C)C)CCC(=O)OC/C=C(\C)/CCC[C@H](C)CCC[C@H](C)CCCC(C)C)[C@H](C4=O)C(=O)OC)C)C=O.[Mg+2] calculation failure


 64%|██████▍   | 5125/8000 [00:08<00:04, 620.24it/s]

SMILES [I-].O=C(OCC1=CC[N+]2(C)CCC(O)C12)C(O)(C(O)C)C(C)C calculation failure


 70%|██████▉   | 5570/8000 [00:09<00:03, 620.61it/s]

SMILES C1C(N(C2=C(N1)N=C(NC2=O)N)C=O)CNC3=CC=C(C=C3)C(=O)N[C@@H](CCC(=O)[O-])C(=O)[O-].[Ca+2] calculation failure


 77%|███████▋  | 6140/8000 [00:09<00:02, 629.24it/s]

SMILES [Na+].O=C([O-])C(CC)C1OC(C(=CC=CC2C=CC3CCCC3C2C(=O)C4=CC=CN4)CC)C(C)CC1 calculation failure
SMILES [K+].[K+].O=C([O-])C1OC(OC2C(OC(C(=O)[O-])C(O)C2O)OC3CCC4(C)C5C(=O)C=C6C7CC(C(=O)O)(C)CCC7(C)CCC6(C)C5(C)CCC4C3(C)C)C(O)C(O)C1O calculation failure


 89%|████████▉ | 7104/8000 [00:11<00:01, 626.92it/s]

SMILES [K+].O=C([O-])C12CCC(C(=C)C)C2C3CCC4C5(C)CCC(=O)C(C)(C)C5CCC4(C)C3(C)CC1 calculation failure


100%|██████████| 8000/8000 [00:12<00:00, 618.31it/s]


Calculated 6875 molecular graph-mass spectrometry pairs
calculating molecular graphs


  3%|▎         | 66/2000 [00:00<00:02, 657.66it/s]

SMILES [Na+].O=C(CCCCCCCCCCC)CC(O)S(=O)(=O)[O-] calculation failure


 16%|█▌        | 324/2000 [00:00<00:02, 629.50it/s]

SMILES [Br-].O=C(OC1CC2C3OC3C(C1)[N+]2(C)CCCC)C(C=4C=CC=CC4)CO calculation failure


 32%|███▏      | 646/2000 [00:01<00:02, 631.04it/s]

SMILES [I-].O=C(OCC1=CC[N+]2(C)CCC(O)C12)C(O)(C(O)C)C(C)C calculation failure
SMILES [K+].O=S(=O)([O-])ON=C(SC1OC(CO)C(O)C(O)C1O)CC=C.O calculation failure


 46%|████▌     | 914/2000 [00:01<00:01, 634.20it/s]

SMILES [Cl-].OC=1C=C(O)C=2C=C(OC3OC(CO)C(O)C(O)C3O)C(=[O+]C2C1)C=4C=CC(O)=C(O)C4 calculation failure


 69%|██████▊   | 1373/2000 [00:02<00:01, 620.42it/s]

SMILES [Cl-].O=C1C2=CC=C(O)C(=C2OC(=C1C=3C=CC=4OCCCOC4C3)C)C[NH+](C)C calculation failure
SMILES [K+].O=S(=O)([O-])ON=C(SC1OC(CO)C(O)C(O)C1O)CC=C.O calculation failure


100%|██████████| 2000/2000 [00:03<00:00, 633.62it/s]

Calculated 1697 molecular graph-mass spectrometry pairs





In [10]:
from model import ModelCLR

# Initialize model architecture
model = ModelCLR(**config["model_config"]).to(DEVICE)

In [11]:
model

ModelCLR(
  (Smiles_model): SmilesModel(
    (x_embedding1): Embedding(119, 256)
    (x_embedding2): Embedding(4, 256)
    (x_embedding3): Embedding(8, 256)
    (x_embedding4): Embedding(6, 256)
    (x_embedding5): Embedding(5, 256)
    (gnns): ModuleList(
      (0-4): 5 x GINEConv()
    )
    (batch_norms): ModuleList(
      (0-4): 5 x BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (feat_lin): Linear(in_features=256, out_features=512, bias=True)
    (out_lin): Sequential(
      (0): Linear(in_features=512, out_features=512, bias=True)
      (1): ReLU(inplace=True)
      (2): Linear(in_features=512, out_features=256, bias=True)
    )
  )
  (MS_model): MSModel(
    (mz_embedder): FourierEmbedder()
    (input_compress): Linear(in_features=257, out_features=256, bias=True)
    (peak_attn_layers): ModuleList(
      (0-4): 5 x TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in

In [12]:
# Parameter Count
print("PARAMETER ANALYSIS:")
print("-" * 40)
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f"Trainable Parameters: {trainable_params:,}")


PARAMETER ANALYSIS:
----------------------------------------
Trainable Parameters: 6,102,784


In [13]:
# Memory Usage (approximate)
print("MEMORY ANALYSIS:")
print("-" * 40)
param_size = sum(p.numel() * p.element_size() for p in model.parameters())
buffer_size = sum(b.numel() * b.element_size() for b in model.buffers())
model_size_mb = (param_size + buffer_size) / 1024 / 1024

print(f"Model Size: {model_size_mb:.2f} MB")
print(f"Parameter Memory: {param_size / 1024 / 1024:.2f} MB")
print(f"Buffer Memory: {buffer_size / 1024 / 1024:.2f} MB")


MEMORY ANALYSIS:
----------------------------------------
Model Size: 23.29 MB
Parameter Memory: 23.28 MB
Buffer Memory: 0.01 MB


In [14]:
from loss.nt_xent import NTXentLoss

# Initialize loss function
temperature = config.get('loss', {}).get('temperature', 0.1)
batch_size = config.get('batch_size', 512)
use_cosine_similarity = config.get('loss', {}).get('use_cosine_similarity', True)
alpha_weight = config.get('loss', {}).get('alpha_weight', 1.0)

criterion = NTXentLoss(
    device=DEVICE, 
    batch_size=batch_size, 
    temperature=temperature, 
    use_cosine_similarity=use_cosine_similarity, 
    alpha_weight=alpha_weight
)

In [15]:
# Cell 15: Training Setup and Optimizer
import torch.optim as optim

print("Setting up training components...")
OUTPUT_DIR = "../../../models/models_experiments/candidate_v1"

# Initialize optimizer
optimizer = optim.AdamW(
    model.parameters(),
    lr=float(config.get('learning_rate', 5e-6)),
    weight_decay=float(config.get('weight_decay', 1e-4))
)

# Training configuration
epochs = config.get('epochs', 100)
eval_every_n_epochs = config.get('eval_every_n_epochs', 5)
log_every_n_steps = config.get('log_every_n_steps', 2)

# Create checkpoint directory
checkpoint_dir = os.path.join(OUTPUT_DIR, "checkpoints")
os.makedirs(checkpoint_dir, exist_ok=True)

print(f"Training for {epochs} epochs")
print(f"Optimizer: AdamW with lr={config.get('learning_rate', 5e-6)}")
print(f"Checkpoint directory: {checkpoint_dir}")

Setting up training components...
Training for 10 epochs
Optimizer: AdamW with lr=1e-06
Checkpoint directory: ../../../models/models_experiments/candidate_v1/checkpoints


In [16]:
# Cell 16: Training and Evaluation Functions
def train_epoch(model, train_loader, criterion, optimizer, device, epoch):
    model.train()
    total_loss = 0
    num_batches = 0
    batch_losses = []
    
    progress_bar = tqdm(train_loader, desc=f"Training Epoch {epoch}")
    
    for batch_idx, (graphs, mzs, intensities, num_peaks) in enumerate(progress_bar):
        
        # Move data to device
        graphs = graphs.to(device)
        mzs = mzs.to(device)
        intensities = intensities.to(device)
        num_peaks = num_peaks.to(device)
        
        # Zero gradients
        optimizer.zero_grad()
        
        # Forward pass
        mol_features, spec_features = model(graphs, mzs, intensities, num_peaks)
        loss = criterion(mol_features, spec_features)
        
        # Backward pass
        loss.backward()
        optimizer.step()
        
        # Track loss
        batch_loss = loss.item()
        total_loss += batch_loss
        batch_losses.append(batch_loss)
        num_batches += 1
        
        # Update progress bar
        progress_bar.set_postfix({
            'Loss': f'{batch_loss:.4f}',
            'Avg Loss': f'{total_loss/num_batches:.4f}'
        })
        
        # Log every n steps
        if batch_idx % log_every_n_steps == 0:
            print(f"Epoch {epoch}, Batch {batch_idx}, Loss: {batch_loss:.4f}")
    
    avg_loss = total_loss / num_batches
    return avg_loss, batch_losses

In [17]:
def evaluate_model(model, val_loader, criterion, device, epoch):
    model.eval()
    total_loss = 0
    num_batches = 0
    
    molecular_features_list = []
    spectral_features_list = []
    
    with torch.no_grad():
        progress_bar = tqdm(val_loader, desc=f"Evaluating Epoch {epoch}")
        
        for batch_idx, (graphs, mzs, intensities, num_peaks) in enumerate(progress_bar):
            # Move data to device
            graphs = graphs.to(device)
            mzs = mzs.to(device)
            intensities = intensities.to(device)
            num_peaks = num_peaks.to(device)
            
            # Forward pass
            mol_features, spec_features = model(graphs, mzs, intensities, num_peaks)
            loss = criterion(mol_features, spec_features)
            
            total_loss += loss.item()
            num_batches += 1
            
            # Store features for retrieval metrics
            molecular_features_list.append(mol_features.cpu().numpy())
            spectral_features_list.append(spec_features.cpu().numpy())
            
            progress_bar.set_postfix({'Val Loss': f'{loss.item():.4f}'})
    
    avg_loss = total_loss / num_batches
    
    # Compute retrieval metrics
    all_mol_features = np.vstack(molecular_features_list)
    all_spec_features = np.vstack(spectral_features_list)
    
    # Compute cosine similarities
    cosine_similarities = np.sum(all_mol_features * all_spec_features, axis=1)
    mean_similarity = np.mean(cosine_similarities)
        
    return avg_loss, mean_similarity

In [18]:
def save_checkpoint(model, optimizer, epoch, train_loss, val_loss, checkpoint_dir):
    checkpoint = {
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'train_loss': train_loss,
        'val_loss': val_loss,
        'config': config
    }
    
    checkpoint_path = os.path.join(checkpoint_dir, f'checkpoint_epoch_{epoch}.pth')
    torch.save(checkpoint, checkpoint_path)
    print(f"Checkpoint saved: {checkpoint_path}")
    
    # Save best model
    best_checkpoint_path = os.path.join(checkpoint_dir, 'best_model.pth')
    if not os.path.exists(best_checkpoint_path):
        torch.save(checkpoint, best_checkpoint_path)
        print(f"Best model saved: {best_checkpoint_path}")
    else:
        best_checkpoint = torch.load(best_checkpoint_path)
        if val_loss < best_checkpoint['val_loss']:
            torch.save(checkpoint, best_checkpoint_path)
            print(f"New best model saved: {best_checkpoint_path}")

In [None]:
import time
import json

# Training history for plotting
train_history = {
    'epochs': [],
    'train_losses': [],
    'val_losses': [],
    'val_similarities': [],
}

best_val_loss = float('inf')
start_time = time.time()

print("Starting training...")
print("=" * 60)

for epoch in range(1, epochs + 1):
    epoch_start_time = time.time()
    
    # Training phase
    train_loss, batch_losses = train_epoch(
        model, train_loader, criterion, optimizer, DEVICE, epoch
    )
    
    # Log training loss every epoch
    logger.report_scalar("Loss", "Train", iteration=epoch, value=train_loss)
    
    
    # Validation phase (every n epochs)
    if epoch % eval_every_n_epochs == 0:
        val_loss, val_similarity = evaluate_model(
            model, val_loader, criterion, DEVICE, epoch
        )
        
            
        # Log results
        epoch_time = time.time() - epoch_start_time
        total_time = time.time() - start_time
        
        # Log validation metrics to ClearML
        logger.report_scalar("Loss", "Validation", iteration=epoch, value=val_loss)
        logger.report_scalar("Similarity", "Cosine Similarity", iteration=epoch, value=val_similarity)
        
        print(f"\nEpoch {epoch}/{epochs}")
        print(f"Train Loss: {train_loss:.6f}")
        print(f"Val Loss: {val_loss:.6f}")
        print(f"Val mean similarity: {val_similarity:.4f}")
        print(f"Learning Rate: {current_lr:.2e}")
        print(f"Epoch Time: {epoch_time:.2f}s, Total Time: {total_time:.2f}s")
        print("-" * 60)
        
        # Store history - convert numpy types to Python native types
        train_history['epochs'].append(int(epoch))
        train_history['train_losses'].append(float(train_loss))
        train_history['val_losses'].append(float(val_loss))
        train_history['val_similarities'].append(float(val_similarity))
        
        # Save checkpoint
        save_checkpoint(
            model, optimizer, epoch, 
            train_loss, val_loss, checkpoint_dir
        )
        
        if epoch % (eval_every_n_epochs * 2) == 0 and len(train_history['epochs']) > 1:
            # Create loss plot
            import matplotlib.pyplot as plt
            
            fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))
            
            # Loss plot
            epochs_list = train_history['epochs']
            ax1.plot(epochs_list, train_history['train_losses'], 'b-', label='Train Loss', linewidth=2)
            ax1.plot(epochs_list, train_history['val_losses'], 'r-', label='Val Loss', linewidth=2)
            ax1.set_xlabel('Epoch')
            ax1.set_ylabel('Loss')
            ax1.set_title('Training and Validation Loss')
            ax1.legend()
            ax1.grid(True, alpha=0.3)
            
            # Similarity plot
            ax2.plot(epochs_list, train_history['val_similarities'], 'g-', linewidth=2)
            ax2.set_xlabel('Epoch')
            ax2.set_ylabel('Cosine Similarity')
            ax2.set_title('Validation Cosine Similarity')
            ax2.grid(True, alpha=0.3)
            
            plt.tight_layout()
            
            # Log plot to ClearML
            logger.report_matplotlib_figure("Training Progress", "Loss and Similarity", iteration=epoch, figure=plt)
            plt.close()
    
    else:
        current_lr = optimizer.param_groups[0]['lr']
        epoch_time = time.time() - epoch_start_time
        
        print(f"Epoch {epoch}/{epochs} - Train Loss: {train_loss:.6f}, LR: {current_lr:.2e}, Time: {epoch_time:.2f}s")

# Final summary logging
print("\nTraining completed!")
total_training_time = (time.time() - start_time)/3600
print(f"Total training time: {total_training_time:.2f} hours")

Starting training...


Training Epoch 1:   0%|          | 0/26 [00:00<?, ?it/s]

Training Epoch 1:   4%|▍         | 1/26 [00:00<00:23,  1.07it/s, Loss=5.6384, Avg Loss=5.6384]

Epoch 1, Batch 0, Loss: 5.6384


Training Epoch 1:  12%|█▏        | 3/26 [00:01<00:11,  2.09it/s, Loss=5.6191, Avg Loss=5.6229]

Epoch 1, Batch 2, Loss: 5.6191


Training Epoch 1:  19%|█▉        | 5/26 [00:02<00:07,  2.95it/s, Loss=5.6001, Avg Loss=5.6147]

Epoch 1, Batch 4, Loss: 5.6001


Training Epoch 1:  27%|██▋       | 7/26 [00:02<00:05,  3.18it/s, Loss=5.5955, Avg Loss=5.6103]

Epoch 1, Batch 6, Loss: 5.5955


Training Epoch 1:  35%|███▍      | 9/26 [00:03<00:05,  3.11it/s, Loss=5.6143, Avg Loss=5.6091]

Epoch 1, Batch 8, Loss: 5.6143


Training Epoch 1:  42%|████▏     | 11/26 [00:04<00:05,  2.98it/s, Loss=5.5900, Avg Loss=5.6059]

Epoch 1, Batch 10, Loss: 5.5900


Training Epoch 1:  50%|█████     | 13/26 [00:04<00:03,  3.48it/s, Loss=5.6076, Avg Loss=5.6057]

Epoch 1, Batch 12, Loss: 5.6076


Training Epoch 1:  58%|█████▊    | 15/26 [00:05<00:03,  3.47it/s, Loss=5.6014, Avg Loss=5.6046]

Epoch 1, Batch 14, Loss: 5.6014


Training Epoch 1:  65%|██████▌   | 17/26 [00:05<00:02,  3.64it/s, Loss=5.5829, Avg Loss=5.6030]

Epoch 1, Batch 16, Loss: 5.5829


Training Epoch 1:  73%|███████▎  | 19/26 [00:06<00:02,  3.40it/s, Loss=5.5833, Avg Loss=5.6014]

Epoch 1, Batch 18, Loss: 5.5833


Training Epoch 1:  81%|████████  | 21/26 [00:06<00:01,  3.31it/s, Loss=5.5810, Avg Loss=5.6003]

Epoch 1, Batch 20, Loss: 5.5810


Training Epoch 1:  88%|████████▊ | 23/26 [00:07<00:00,  3.34it/s, Loss=5.5990, Avg Loss=5.5999]

Epoch 1, Batch 22, Loss: 5.5990


Training Epoch 1:  96%|█████████▌| 25/26 [00:08<00:00,  3.24it/s, Loss=5.5898, Avg Loss=5.5986]

Epoch 1, Batch 24, Loss: 5.5898


Training Epoch 1: 100%|██████████| 26/26 [00:08<00:00,  3.07it/s, Loss=5.5890, Avg Loss=5.5982]


Epoch 1/10 - Train Loss: 5.598230, LR: 1.00e-06, Time: 8.46s


Training Epoch 2:   4%|▍         | 1/26 [00:00<00:14,  1.67it/s, Loss=5.5798, Avg Loss=5.5798]

Epoch 2, Batch 0, Loss: 5.5798


Training Epoch 2:  12%|█▏        | 3/26 [00:01<00:09,  2.47it/s, Loss=5.6007, Avg Loss=5.5830]

Epoch 2, Batch 2, Loss: 5.6007


Training Epoch 2:  19%|█▉        | 5/26 [00:01<00:06,  3.22it/s, Loss=5.5816, Avg Loss=5.5829]

Epoch 2, Batch 4, Loss: 5.5816


Training Epoch 2:  27%|██▋       | 7/26 [00:02<00:05,  3.27it/s, Loss=5.5678, Avg Loss=5.5801]

Epoch 2, Batch 6, Loss: 5.5678


Training Epoch 2:  35%|███▍      | 9/26 [00:03<00:05,  3.25it/s, Loss=5.5833, Avg Loss=5.5814]

Epoch 2, Batch 8, Loss: 5.5833


Training Epoch 2:  42%|████▏     | 11/26 [00:03<00:04,  3.03it/s, Loss=5.5773, Avg Loss=5.5806]

Epoch 2, Batch 10, Loss: 5.5773


Training Epoch 2:  50%|█████     | 13/26 [00:04<00:03,  3.51it/s, Loss=5.5804, Avg Loss=5.5803]

Epoch 2, Batch 12, Loss: 5.5804


Training Epoch 2:  58%|█████▊    | 15/26 [00:04<00:03,  3.48it/s, Loss=5.5816, Avg Loss=5.5804]

Epoch 2, Batch 14, Loss: 5.5816


Training Epoch 2:  65%|██████▌   | 17/26 [00:05<00:02,  3.67it/s, Loss=5.5722, Avg Loss=5.5801]

Epoch 2, Batch 16, Loss: 5.5722


Training Epoch 2:  73%|███████▎  | 19/26 [00:05<00:02,  3.40it/s, Loss=5.5729, Avg Loss=5.5800]

Epoch 2, Batch 18, Loss: 5.5729


Training Epoch 2:  81%|████████  | 21/26 [00:06<00:01,  3.27it/s, Loss=5.5785, Avg Loss=5.5801]

Epoch 2, Batch 20, Loss: 5.5785


Training Epoch 2:  88%|████████▊ | 23/26 [00:07<00:00,  3.32it/s, Loss=5.5987, Avg Loss=5.5810]

Epoch 2, Batch 22, Loss: 5.5987


Training Epoch 2:  96%|█████████▌| 25/26 [00:07<00:00,  3.24it/s, Loss=5.5867, Avg Loss=5.5808]

Epoch 2, Batch 24, Loss: 5.5867


Training Epoch 2: 100%|██████████| 26/26 [00:08<00:00,  3.19it/s, Loss=5.5798, Avg Loss=5.5807]
Evaluating Epoch 2: 100%|██████████| 7/7 [00:01<00:00,  5.58it/s, Val Loss=5.1014]



Epoch 2/10
Train Loss: 5.580743
Val Loss: 5.498726
Val mean similarity: -0.0902
Learning Rate: 1.00e-06
Epoch Time: 9.42s, Total Time: 17.89s
------------------------------------------------------------
Checkpoint saved: ../../../models/models_experiments/candidate_v1/checkpoints/checkpoint_epoch_2.pth
Best model saved: ../../../models/models_experiments/candidate_v1/checkpoints/best_model.pth


Training Epoch 3:   4%|▍         | 1/26 [00:00<00:14,  1.70it/s, Loss=5.5821, Avg Loss=5.5821]

Epoch 3, Batch 0, Loss: 5.5821


Training Epoch 3:  12%|█▏        | 3/26 [00:01<00:09,  2.45it/s, Loss=5.5819, Avg Loss=5.5792]

Epoch 3, Batch 2, Loss: 5.5819


Training Epoch 3:  19%|█▉        | 5/26 [00:01<00:06,  3.15it/s, Loss=5.5743, Avg Loss=5.5786]

Epoch 3, Batch 4, Loss: 5.5743


Training Epoch 3:  27%|██▋       | 7/26 [00:02<00:05,  3.23it/s, Loss=5.5705, Avg Loss=5.5777]

Epoch 3, Batch 6, Loss: 5.5705


Training Epoch 3:  35%|███▍      | 9/26 [00:03<00:05,  3.26it/s, Loss=5.5766, Avg Loss=5.5772]

Epoch 3, Batch 8, Loss: 5.5766


Training Epoch 3:  42%|████▏     | 11/26 [00:03<00:04,  3.02it/s, Loss=5.5686, Avg Loss=5.5756]

Epoch 3, Batch 10, Loss: 5.5686


Training Epoch 3:  50%|█████     | 13/26 [00:04<00:03,  3.48it/s, Loss=5.5723, Avg Loss=5.5754]

Epoch 3, Batch 12, Loss: 5.5723


Training Epoch 3:  58%|█████▊    | 15/26 [00:04<00:03,  3.47it/s, Loss=5.5714, Avg Loss=5.5747]

Epoch 3, Batch 14, Loss: 5.5714


Training Epoch 3:  65%|██████▌   | 17/26 [00:05<00:02,  3.66it/s, Loss=5.5769, Avg Loss=5.5747]

Epoch 3, Batch 16, Loss: 5.5769


Training Epoch 3:  73%|███████▎  | 19/26 [00:05<00:02,  3.40it/s, Loss=5.5679, Avg Loss=5.5736]

Epoch 3, Batch 18, Loss: 5.5679


Training Epoch 3:  81%|████████  | 21/26 [00:06<00:01,  3.31it/s, Loss=5.5778, Avg Loss=5.5738]

Epoch 3, Batch 20, Loss: 5.5778


Training Epoch 3:  88%|████████▊ | 23/26 [00:07<00:00,  3.35it/s, Loss=5.5759, Avg Loss=5.5737]

Epoch 3, Batch 22, Loss: 5.5759


Training Epoch 3:  96%|█████████▌| 25/26 [00:07<00:00,  3.27it/s, Loss=5.5779, Avg Loss=5.5735]

Epoch 3, Batch 24, Loss: 5.5779


Training Epoch 3: 100%|██████████| 26/26 [00:08<00:00,  3.18it/s, Loss=5.5667, Avg Loss=5.5733]


Epoch 3/10 - Train Loss: 5.573266, LR: 1.00e-06, Time: 8.17s


Training Epoch 4:   4%|▍         | 1/26 [00:00<00:16,  1.55it/s, Loss=5.5773, Avg Loss=5.5773]

Epoch 4, Batch 0, Loss: 5.5773


Training Epoch 4:  12%|█▏        | 3/26 [00:01<00:09,  2.39it/s, Loss=5.5763, Avg Loss=5.5717]

Epoch 4, Batch 2, Loss: 5.5763


Training Epoch 4:  19%|█▉        | 5/26 [00:01<00:06,  3.12it/s, Loss=5.5629, Avg Loss=5.5674]

Epoch 4, Batch 4, Loss: 5.5629


Training Epoch 4:  27%|██▋       | 7/26 [00:02<00:05,  3.17it/s, Loss=5.5595, Avg Loss=5.5665]

Epoch 4, Batch 6, Loss: 5.5595


Training Epoch 4:  35%|███▍      | 9/26 [00:03<00:05,  3.23it/s, Loss=5.5761, Avg Loss=5.5669]

Epoch 4, Batch 8, Loss: 5.5761


Training Epoch 4:  42%|████▏     | 11/26 [00:03<00:04,  3.02it/s, Loss=5.5578, Avg Loss=5.5660]

Epoch 4, Batch 10, Loss: 5.5578


Training Epoch 4:  50%|█████     | 13/26 [00:04<00:03,  3.46it/s, Loss=5.5599, Avg Loss=5.5661]

Epoch 4, Batch 12, Loss: 5.5599


Training Epoch 4:  58%|█████▊    | 15/26 [00:04<00:03,  3.42it/s, Loss=5.5688, Avg Loss=5.5663]

Epoch 4, Batch 14, Loss: 5.5688


Training Epoch 4:  65%|██████▌   | 17/26 [00:05<00:02,  3.63it/s, Loss=5.5657, Avg Loss=5.5667]

Epoch 4, Batch 16, Loss: 5.5657


Training Epoch 4:  73%|███████▎  | 19/26 [00:06<00:02,  3.39it/s, Loss=5.5540, Avg Loss=5.5658]

Epoch 4, Batch 18, Loss: 5.5540


Training Epoch 4:  81%|████████  | 21/26 [00:06<00:01,  3.30it/s, Loss=5.5657, Avg Loss=5.5655]

Epoch 4, Batch 20, Loss: 5.5657


Training Epoch 4:  88%|████████▊ | 23/26 [00:07<00:00,  3.35it/s, Loss=5.5714, Avg Loss=5.5658]

Epoch 4, Batch 22, Loss: 5.5714


Training Epoch 4:  96%|█████████▌| 25/26 [00:07<00:00,  3.26it/s, Loss=5.5677, Avg Loss=5.5656]

Epoch 4, Batch 24, Loss: 5.5677


Training Epoch 4: 100%|██████████| 26/26 [00:08<00:00,  3.16it/s, Loss=5.5675, Avg Loss=5.5657]
Evaluating Epoch 4: 100%|██████████| 7/7 [00:01<00:00,  5.53it/s, Val Loss=5.0908]



Epoch 4/10
Train Loss: 5.565673
Val Loss: 5.489662
Val mean similarity: -0.0063
Learning Rate: 1.00e-06
Epoch Time: 9.50s, Total Time: 49.27s
------------------------------------------------------------
Checkpoint saved: ../../../models/models_experiments/candidate_v1/checkpoints/checkpoint_epoch_4.pth
New best model saved: ../../../models/models_experiments/candidate_v1/checkpoints/best_model.pth


Training Epoch 5:   4%|▍         | 1/26 [00:00<00:15,  1.62it/s, Loss=5.5691, Avg Loss=5.5691]

Epoch 5, Batch 0, Loss: 5.5691


Training Epoch 5:  12%|█▏        | 3/26 [00:01<00:09,  2.45it/s, Loss=5.5663, Avg Loss=5.5655]

Epoch 5, Batch 2, Loss: 5.5663


Training Epoch 5:  19%|█▉        | 5/26 [00:01<00:06,  3.17it/s, Loss=5.5570, Avg Loss=5.5637]

Epoch 5, Batch 4, Loss: 5.5570


Training Epoch 5:  27%|██▋       | 7/26 [00:02<00:05,  3.24it/s, Loss=5.5635, Avg Loss=5.5639]

Epoch 5, Batch 6, Loss: 5.5635


Training Epoch 5:  35%|███▍      | 9/26 [00:03<00:05,  3.27it/s, Loss=5.5726, Avg Loss=5.5655]

Epoch 5, Batch 8, Loss: 5.5726


Training Epoch 5:  42%|████▏     | 11/26 [00:03<00:04,  3.04it/s, Loss=5.5591, Avg Loss=5.5648]

Epoch 5, Batch 10, Loss: 5.5591


Training Epoch 5:  50%|█████     | 13/26 [00:04<00:03,  3.47it/s, Loss=5.5614, Avg Loss=5.5639]

Epoch 5, Batch 12, Loss: 5.5614


Training Epoch 5:  58%|█████▊    | 15/26 [00:04<00:03,  3.44it/s, Loss=5.5617, Avg Loss=5.5641]

Epoch 5, Batch 14, Loss: 5.5617


Training Epoch 5:  65%|██████▌   | 17/26 [00:05<00:02,  3.63it/s, Loss=5.5612, Avg Loss=5.5640]

Epoch 5, Batch 16, Loss: 5.5612


Training Epoch 5:  73%|███████▎  | 19/26 [00:05<00:02,  3.39it/s, Loss=5.5606, Avg Loss=5.5636]

Epoch 5, Batch 18, Loss: 5.5606


Training Epoch 5:  81%|████████  | 21/26 [00:06<00:01,  3.30it/s, Loss=5.5714, Avg Loss=5.5640]

Epoch 5, Batch 20, Loss: 5.5714


Training Epoch 5:  88%|████████▊ | 23/26 [00:07<00:00,  3.33it/s, Loss=5.5689, Avg Loss=5.5640]

Epoch 5, Batch 22, Loss: 5.5689


Training Epoch 5:  96%|█████████▌| 25/26 [00:07<00:00,  3.24it/s, Loss=5.5641, Avg Loss=5.5638]

Epoch 5, Batch 24, Loss: 5.5641


Training Epoch 5: 100%|██████████| 26/26 [00:08<00:00,  3.18it/s, Loss=5.5596, Avg Loss=5.5636]


Epoch 5/10 - Train Loss: 5.563597, LR: 1.00e-06, Time: 8.19s


Training Epoch 6:   4%|▍         | 1/26 [00:00<00:14,  1.70it/s, Loss=5.5693, Avg Loss=5.5693]

Epoch 6, Batch 0, Loss: 5.5693


Training Epoch 6:  12%|█▏        | 3/26 [00:01<00:09,  2.35it/s, Loss=5.5654, Avg Loss=5.5649]

Epoch 6, Batch 2, Loss: 5.5654


Training Epoch 6:  19%|█▉        | 5/26 [00:01<00:06,  3.07it/s, Loss=5.5568, Avg Loss=5.5630]

Epoch 6, Batch 4, Loss: 5.5568


Training Epoch 6:  27%|██▋       | 7/26 [00:02<00:05,  3.18it/s, Loss=5.5560, Avg Loss=5.5628]

Epoch 6, Batch 6, Loss: 5.5560


Training Epoch 6:  35%|███▍      | 9/26 [00:03<00:05,  3.25it/s, Loss=5.5649, Avg Loss=5.5625]

Epoch 6, Batch 8, Loss: 5.5649


Training Epoch 6:  42%|████▏     | 11/26 [00:03<00:04,  3.00it/s, Loss=5.5509, Avg Loss=5.5612]

Epoch 6, Batch 10, Loss: 5.5509


Training Epoch 6:  50%|█████     | 13/26 [00:04<00:03,  3.49it/s, Loss=5.5605, Avg Loss=5.5617]

Epoch 6, Batch 12, Loss: 5.5605


Training Epoch 6:  58%|█████▊    | 15/26 [00:04<00:03,  3.47it/s, Loss=5.5603, Avg Loss=5.5618]

Epoch 6, Batch 14, Loss: 5.5603


Training Epoch 6:  65%|██████▌   | 17/26 [00:05<00:02,  3.66it/s, Loss=5.5524, Avg Loss=5.5616]

Epoch 6, Batch 16, Loss: 5.5524


Training Epoch 6:  73%|███████▎  | 19/26 [00:06<00:02,  3.40it/s, Loss=5.5541, Avg Loss=5.5609]

Epoch 6, Batch 18, Loss: 5.5541


Training Epoch 6:  81%|████████  | 21/26 [00:06<00:01,  3.31it/s, Loss=5.5655, Avg Loss=5.5609]

Epoch 6, Batch 20, Loss: 5.5655


Training Epoch 6:  88%|████████▊ | 23/26 [00:07<00:00,  3.34it/s, Loss=5.5614, Avg Loss=5.5608]

Epoch 6, Batch 22, Loss: 5.5614


Training Epoch 6:  96%|█████████▌| 25/26 [00:07<00:00,  3.26it/s, Loss=5.5645, Avg Loss=5.5609]

Epoch 6, Batch 24, Loss: 5.5645


Training Epoch 6: 100%|██████████| 26/26 [00:08<00:00,  3.16it/s, Loss=5.5525, Avg Loss=5.5606]
Evaluating Epoch 6: 100%|██████████| 7/7 [00:01<00:00,  5.18it/s, Val Loss=5.0877]



Epoch 6/10
Train Loss: 5.560579
Val Loss: 5.486757
Val mean similarity: -0.0350
Learning Rate: 1.00e-06
Epoch Time: 9.59s, Total Time: 87.62s
------------------------------------------------------------
Checkpoint saved: ../../../models/models_experiments/candidate_v1/checkpoints/checkpoint_epoch_6.pth


Connecting multiple input models with the same name: `best_model`. This might result in the wrong model being used when executing remotely


New best model saved: ../../../models/models_experiments/candidate_v1/checkpoints/best_model.pth


Training Epoch 7:   4%|▍         | 1/26 [00:00<00:14,  1.76it/s, Loss=5.5598, Avg Loss=5.5598]

Epoch 7, Batch 0, Loss: 5.5598


Training Epoch 7:  12%|█▏        | 3/26 [00:01<00:09,  2.45it/s, Loss=5.5645, Avg Loss=5.5597]

Epoch 7, Batch 2, Loss: 5.5645


Training Epoch 7:  19%|█▉        | 5/26 [00:01<00:06,  3.18it/s, Loss=5.5549, Avg Loss=5.5592]

Epoch 7, Batch 4, Loss: 5.5549


Training Epoch 7:  27%|██▋       | 7/26 [00:02<00:05,  3.25it/s, Loss=5.5589, Avg Loss=5.5593]

Epoch 7, Batch 6, Loss: 5.5589


Training Epoch 7:  35%|███▍      | 9/26 [00:03<00:05,  3.27it/s, Loss=5.5635, Avg Loss=5.5588]

Epoch 7, Batch 8, Loss: 5.5635


Training Epoch 7:  42%|████▏     | 11/26 [00:03<00:05,  2.97it/s, Loss=5.5496, Avg Loss=5.5579]

Epoch 7, Batch 10, Loss: 5.5496


Training Epoch 7:  50%|█████     | 13/26 [00:04<00:03,  3.45it/s, Loss=5.5578, Avg Loss=5.5584]

Epoch 7, Batch 12, Loss: 5.5578


Training Epoch 7:  58%|█████▊    | 15/26 [00:04<00:03,  3.44it/s, Loss=5.5600, Avg Loss=5.5589]

Epoch 7, Batch 14, Loss: 5.5600


Training Epoch 7:  65%|██████▌   | 17/26 [00:05<00:02,  3.63it/s, Loss=5.5573, Avg Loss=5.5591]

Epoch 7, Batch 16, Loss: 5.5573


Training Epoch 7:  73%|███████▎  | 19/26 [00:05<00:02,  3.37it/s, Loss=5.5576, Avg Loss=5.5589]

Epoch 7, Batch 18, Loss: 5.5576


Training Epoch 7:  81%|████████  | 21/26 [00:06<00:01,  3.28it/s, Loss=5.5551, Avg Loss=5.5585]

Epoch 7, Batch 20, Loss: 5.5551


Training Epoch 7:  88%|████████▊ | 23/26 [00:07<00:00,  3.32it/s, Loss=5.5593, Avg Loss=5.5587]

Epoch 7, Batch 22, Loss: 5.5593


Training Epoch 7:  96%|█████████▌| 25/26 [00:07<00:00,  3.24it/s, Loss=5.5647, Avg Loss=5.5588]

Epoch 7, Batch 24, Loss: 5.5647


Training Epoch 7: 100%|██████████| 26/26 [00:08<00:00,  3.16it/s, Loss=5.5562, Avg Loss=5.5587]


Epoch 7/10 - Train Loss: 5.558659, LR: 1.00e-06, Time: 8.22s


Training Epoch 8:   4%|▍         | 1/26 [00:00<00:17,  1.39it/s, Loss=5.5544, Avg Loss=5.5544]

Epoch 8, Batch 0, Loss: 5.5544


Training Epoch 8:  12%|█▏        | 3/26 [00:01<00:10,  2.27it/s, Loss=5.5595, Avg Loss=5.5561]

Epoch 8, Batch 2, Loss: 5.5595


Training Epoch 8:  19%|█▉        | 5/26 [00:01<00:06,  3.04it/s, Loss=5.5606, Avg Loss=5.5583]

Epoch 8, Batch 4, Loss: 5.5606


Training Epoch 8:  27%|██▋       | 7/26 [00:02<00:06,  3.15it/s, Loss=5.5588, Avg Loss=5.5579]

Epoch 8, Batch 6, Loss: 5.5588


Training Epoch 8:  35%|███▍      | 9/26 [00:03<00:05,  3.21it/s, Loss=5.5620, Avg Loss=5.5583]

Epoch 8, Batch 8, Loss: 5.5620


Training Epoch 8:  42%|████▏     | 11/26 [00:03<00:04,  3.00it/s, Loss=5.5517, Avg Loss=5.5576]

Epoch 8, Batch 10, Loss: 5.5517


Training Epoch 8:  50%|█████     | 13/26 [00:04<00:03,  3.48it/s, Loss=5.5514, Avg Loss=5.5577]

Epoch 8, Batch 12, Loss: 5.5514


Training Epoch 8:  58%|█████▊    | 15/26 [00:04<00:03,  3.44it/s, Loss=5.5559, Avg Loss=5.5577]

Epoch 8, Batch 14, Loss: 5.5559


Training Epoch 8:  65%|██████▌   | 17/26 [00:05<00:02,  3.62it/s, Loss=5.5511, Avg Loss=5.5577]

Epoch 8, Batch 16, Loss: 5.5511


Training Epoch 8:  73%|███████▎  | 19/26 [00:06<00:02,  3.37it/s, Loss=5.5536, Avg Loss=5.5576]

Epoch 8, Batch 18, Loss: 5.5536


Training Epoch 8:  81%|████████  | 21/26 [00:06<00:01,  3.28it/s, Loss=5.5518, Avg Loss=5.5570]

Epoch 8, Batch 20, Loss: 5.5518


Training Epoch 8:  88%|████████▊ | 23/26 [00:07<00:00,  3.31it/s, Loss=5.5665, Avg Loss=5.5572]

Epoch 8, Batch 22, Loss: 5.5665


Training Epoch 8:  96%|█████████▌| 25/26 [00:08<00:00,  3.22it/s, Loss=5.5558, Avg Loss=5.5568]

Epoch 8, Batch 24, Loss: 5.5558


Training Epoch 8: 100%|██████████| 26/26 [00:08<00:00,  3.11it/s, Loss=5.5569, Avg Loss=5.5568]
Evaluating Epoch 8: 100%|██████████| 7/7 [00:01<00:00,  5.63it/s, Val Loss=5.0859]



Epoch 8/10
Train Loss: 5.556778
Val Loss: 5.485030
Val mean similarity: -0.0447
Learning Rate: 1.00e-06
Epoch Time: 9.62s, Total Time: 125.94s
------------------------------------------------------------
Checkpoint saved: ../../../models/models_experiments/candidate_v1/checkpoints/checkpoint_epoch_8.pth
New best model saved: ../../../models/models_experiments/candidate_v1/checkpoints/best_model.pth


Training Epoch 9:   4%|▍         | 1/26 [00:00<00:15,  1.65it/s, Loss=5.5595, Avg Loss=5.5595]

Epoch 9, Batch 0, Loss: 5.5595


Training Epoch 9:  12%|█▏        | 3/26 [00:01<00:09,  2.47it/s, Loss=5.5595, Avg Loss=5.5588]

Epoch 9, Batch 2, Loss: 5.5595


Training Epoch 9:  19%|█▉        | 5/26 [00:01<00:06,  3.22it/s, Loss=5.5578, Avg Loss=5.5581]

Epoch 9, Batch 4, Loss: 5.5578


Training Epoch 9:  27%|██▋       | 7/26 [00:02<00:05,  3.28it/s, Loss=5.5560, Avg Loss=5.5579]

Epoch 9, Batch 6, Loss: 5.5560


Training Epoch 9:  35%|███▍      | 9/26 [00:03<00:05,  3.28it/s, Loss=5.5496, Avg Loss=5.5562]

Epoch 9, Batch 8, Loss: 5.5496


Training Epoch 9:  42%|████▏     | 11/26 [00:03<00:04,  3.05it/s, Loss=5.5463, Avg Loss=5.5555]

Epoch 9, Batch 10, Loss: 5.5463


Training Epoch 9:  50%|█████     | 13/26 [00:04<00:03,  3.51it/s, Loss=5.5498, Avg Loss=5.5551]

Epoch 9, Batch 12, Loss: 5.5498


Training Epoch 9:  58%|█████▊    | 15/26 [00:04<00:03,  3.45it/s, Loss=5.5566, Avg Loss=5.5554]

Epoch 9, Batch 14, Loss: 5.5566


Training Epoch 9:  65%|██████▌   | 17/26 [00:05<00:02,  3.65it/s, Loss=5.5530, Avg Loss=5.5557]

Epoch 9, Batch 16, Loss: 5.5530


Training Epoch 9:  73%|███████▎  | 19/26 [00:05<00:02,  3.41it/s, Loss=5.5535, Avg Loss=5.5557]

Epoch 9, Batch 18, Loss: 5.5535


Training Epoch 9:  81%|████████  | 21/26 [00:06<00:01,  3.32it/s, Loss=5.5573, Avg Loss=5.5557]

Epoch 9, Batch 20, Loss: 5.5573


Training Epoch 9:  88%|████████▊ | 23/26 [00:07<00:00,  3.33it/s, Loss=5.5615, Avg Loss=5.5558]

Epoch 9, Batch 22, Loss: 5.5615


Training Epoch 9:  96%|█████████▌| 25/26 [00:07<00:00,  3.24it/s, Loss=5.5589, Avg Loss=5.5559]

Epoch 9, Batch 24, Loss: 5.5589


Training Epoch 9: 100%|██████████| 26/26 [00:08<00:00,  3.20it/s, Loss=5.5535, Avg Loss=5.5558]


Epoch 9/10 - Train Loss: 5.555764, LR: 1.00e-06, Time: 8.14s


Training Epoch 10:   4%|▍         | 1/26 [00:00<00:15,  1.63it/s, Loss=5.5557, Avg Loss=5.5557]

Epoch 10, Batch 0, Loss: 5.5557


Training Epoch 10:  12%|█▏        | 3/26 [00:01<00:09,  2.40it/s, Loss=5.5521, Avg Loss=5.5533]

Epoch 10, Batch 2, Loss: 5.5521


Training Epoch 10:  19%|█▉        | 5/26 [00:01<00:06,  3.14it/s, Loss=5.5477, Avg Loss=5.5529]

Epoch 10, Batch 4, Loss: 5.5477


Training Epoch 10:  27%|██▋       | 7/26 [00:02<00:05,  3.23it/s, Loss=5.5544, Avg Loss=5.5545]

Epoch 10, Batch 6, Loss: 5.5544


Training Epoch 10:  35%|███▍      | 9/26 [00:03<00:05,  3.22it/s, Loss=5.5586, Avg Loss=5.5551]

Epoch 10, Batch 8, Loss: 5.5586


Training Epoch 10:  42%|████▏     | 11/26 [00:03<00:04,  3.01it/s, Loss=5.5539, Avg Loss=5.5549]

Epoch 10, Batch 10, Loss: 5.5539


Training Epoch 10:  50%|█████     | 13/26 [00:04<00:03,  3.50it/s, Loss=5.5535, Avg Loss=5.5548]

Epoch 10, Batch 12, Loss: 5.5535


Training Epoch 10:  58%|█████▊    | 15/26 [00:04<00:03,  3.47it/s, Loss=5.5459, Avg Loss=5.5541]

Epoch 10, Batch 14, Loss: 5.5459


Training Epoch 10:  65%|██████▌   | 17/26 [00:05<00:02,  3.66it/s, Loss=5.5490, Avg Loss=5.5539]

Epoch 10, Batch 16, Loss: 5.5490


Training Epoch 10:  73%|███████▎  | 19/26 [00:06<00:02,  3.38it/s, Loss=5.5497, Avg Loss=5.5537]

Epoch 10, Batch 18, Loss: 5.5497


Training Epoch 10:  81%|████████  | 21/26 [00:06<00:01,  3.29it/s, Loss=5.5538, Avg Loss=5.5534]

Epoch 10, Batch 20, Loss: 5.5538


Training Epoch 10:  88%|████████▊ | 23/26 [00:07<00:00,  3.34it/s, Loss=5.5650, Avg Loss=5.5538]

Epoch 10, Batch 22, Loss: 5.5650


Training Epoch 10:  96%|█████████▌| 25/26 [00:07<00:00,  3.25it/s, Loss=5.5596, Avg Loss=5.5538]

Epoch 10, Batch 24, Loss: 5.5596


Training Epoch 10: 100%|██████████| 26/26 [00:08<00:00,  3.17it/s, Loss=5.5540, Avg Loss=5.5538]
Evaluating Epoch 10: 100%|██████████| 7/7 [00:01<00:00,  6.01it/s, Val Loss=5.0849]



Epoch 10/10
Train Loss: 5.553835
Val Loss: 5.483869
Val mean similarity: -0.0431
Learning Rate: 1.00e-06
Epoch Time: 9.37s, Total Time: 163.79s
------------------------------------------------------------
Checkpoint saved: ../../../models/models_experiments/candidate_v1/checkpoints/checkpoint_epoch_10.pth
New best model saved: ../../../models/models_experiments/candidate_v1/checkpoints/best_model.pth

Training completed!
Total training time: 0.05 hours


TypeError: report_single_value() takes 3 positional arguments but 4 were given

In [None]:
# Log final metrics
logger.report_single_value("Total Training Time (hours)", total_training_time)
logger.report_single_value("Best Validation Loss", min(train_history['val_losses']) if train_history['val_losses'] else float('inf'))
logger.report_single_value("Best Cosine Similarity", max(train_history['val_similarities']) if train_history['val_similarities'] else 0.0)
logger.report_single_value("Total Epochs", epochs)

# Close the task
task.close()