In [1]:
# If you are running this online (for example at Google Colab), 
# make sure you have the support files on the same folder
# Otherwise run this cell to download them

# NOTE: Downloading will take a while, be patient. You can refresh your folder from time to time to see when the files
# have been created.

import os, requests, zipfile, io 

files_url = "https://ideami.com/llm_align"

# Downloading proceeds if we detect that one of the key files to download is not present
if not os.path.exists(f"llm.py"):
    print("Downloading files using Python")
    response = requests.get(files_url)
    zipfile.ZipFile(io.BytesIO(response.content)).extractall(".")
else:
    print("you seem to have already downloaded the files. If you wish to re-download them, delete the llm.py file")


you seem to have already downloaded the files. If you wish to re-download them, delete the llm.py file


In [2]:
# Import libraries
import os, sys
import math 
from tqdm import tqdm
from datetime import datetime
import ipdb 
from typing import List, Dict, Union, Any, Tuple

# Pytorch
import torch
import torch.nn as nn
from torch.nn import functional as F

# Import some Hugging Face Libraries
import transformers
from datasets import load_dataset, load_from_disk

torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True

torch.cuda.empty_cache()

# Optional for debugging, if you want to see the full tensor
torch.set_printoptions(threshold=10_000)

In [3]:
#Training parameters
batch_size = 4 
epochs = 3 # 3 is good, more overfits
lr = 6e-5
lr_warmup_steps = 100
context = 1024
alpha = 0.5 
prompt_max_length = 512
compile = False
dtype = torch.bfloat16
log_iter = 50

# Hyperparameters
dropout = 0.
grad_clip = 1.0
weight_decay = 0.0

# Set the device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device: You are using ", device)


Device: You are using  cuda


In [4]:
# Logging 
project_name = "aligntest2"
wandb_log = True 
wandb_project = project_name
# ipdb.set_trace()
wandb_run_name = f"aligntest2_run_{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}"

if wandb_log:
    import wandb
    wandb.init(project=wandb_project, name=wandb_run_name)

wandb: Currently logged in as: mistigri-heriveau (mistigri-heriveau-universit-toulouse-capitole). Use `wandb login --relogin` to force relogin
wandb: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


In [5]:
path = os.getcwd() 
dataset_path = path + '\data2\orpo_dataset'
dataset_name = 'mlabonne/orpo-dpo-mix-40k'
tokenizer_path = path +'/tokenizers/tok16384'
checkpoint_dir = path +'/models/'

tokenizer = transformers.AutoTokenizer.from_pretrained(tokenizer_path)

# Set the tokenizer parameters
tokenizer.chat_template = "{% for message in messages %}{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\n' + message['content'] + eos_token }}\n{% endif %}{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>\n' }}\n{% endif %}\n{% endfor %}"

# Make padding token equal to the end of sentence token (wich has ID of 2 in our case)
tokenizer.pad_token = tokenizer.eos_token

if os.path.exists(dataset_path):
    dataset = load_from_disk(dataset_path)
    print("Dataset loaded from disk")
else:
    print("Dataset not found, loading from Hugging Face")
    dataset = load_dataset(dataset_name, split='all')
    # Optional: Filter out the toxic-dpo-v0.2 dataset
    dataset = dataset.filter(lambda x: x['source'] != "toxic-dpo-v0.2")
    
    def filter_dataset(examples):
        prompt_lenght = tokenizer.apply_chat_template(examples['chosen'][:-1], tokenize=True, add_generation_prompt=True, return_tensors='pt').size(-1)
        
        if prompt_lenght < prompt_max_length:
            return True
        else:
            return False
    
    
    def preprocess_dataset(example: Union[List, Dict]):
        # ipdb.set_trace()
        prompt = [tokenizer.apply_chat_template(item[:-1], tokenize=False, add_generation_prompt=True) for item in example['chosen']]
        chosen = [tokenizer.apply_chat_template(item, tokenize=False) for item in example['chosen']]
        rejected = [tokenizer.apply_chat_template(item, tokenize=False) for item in example['rejected']]
        
        inputs = tokenizer(prompt, max_length=context, padding="max_length", truncation=True, return_tensors="pt")
        pos_labels = tokenizer(chosen, max_length=context, padding="max_length", truncation=True, return_tensors="pt")
        neg_labels = tokenizer(rejected, max_length=context, padding="max_length", truncation=True, return_tensors="pt")
        
        inputs['positive_input_ids'] = pos_labels['input_ids']
        inputs['positive_attention_mask'] = pos_labels['attention_mask']
        
        inputs['negative_input_ids'] = neg_labels['input_ids']
        inputs['negative_attention_mask'] = neg_labels['attention_mask']
        
        return inputs
    
    dataset = dataset.filter(filter_dataset)
    
    
    dataset = dataset.map(preprocess_dataset, batched = True, num_proc=1, remove_columns=dataset.column_names)
    
    dataset.save_to_disk(dataset_path)
    
    

Dataset loaded from disk


In [6]:
tokenizer.decode(dataset[0]['positive_input_ids'])

'<|user|>\nHow many colors are traditionally recognized in a visible spectrum or optical rainbow?</s> \n<|assistant|>\nTraditionally, a visible spectrum or optical rainbow is said to consist of seven colors. The order of these colors is typically remembered using the acronym ROYGBIV - Red, Orange, Yellow, Green, Blue, Indigo, and Violet. However, it is important to note that the division of the spectrum into these seven constituent colors is largely a human construct. In reality, a rainbow encompasses a continuous spectrum of colors which blend seamlessly into one another, from red light, which has longer wavelengths, to violet light, which has shorter wavelengths. The specific selection of seven colors originates from the work of Sir Isaac Newton, who chose to divide the spectrum into seven colors to correlate with the seven notes in a western major scale of music.</s> \n<|user|>\nExplain the scientific reasoning behind the continuous spectrum of colors in a rainbow.</s> \n<|assistant

In [7]:
dataset = dataset.shuffle(42).train_test_split(test_size=0.05)
train_data = dataset['train']
val_data = dataset['test']

In [8]:
data_collector = transformers.DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

train_loader = torch.utils.data.DataLoader(train_data, batch_size=batch_size, collate_fn=data_collector, shuffle=False, num_workers=0)
val_loader = torch.utils.data.DataLoader(val_data, batch_size=batch_size, collate_fn=data_collector, shuffle=False, num_workers=0)

In [9]:
it = iter(train_loader)
batch = next(it)
# print (tokenizer.decode(batch['positive_input_ids'][0]))

In [10]:
from llm import Llama, ModelArgs

checkpoint = torch.load(os.path.join(checkpoint_dir, 'base_model.pt'))
config = checkpoint.pop("config")

model_args = ModelArgs(
    dim=config.hidden_size, 
    n_layers=config.num_hidden_layers, 
    n_heads=config.num_attention_heads, 
    n_kv_heads=config.num_key_value_heads, 
    vocab_size=config.vocab_size, 
    norm_eps=config.rms_norm_eps, 
    rope_theta=config.rope_theta,
    max_seq_len=context, 
    dropout=config.attention_dropout, 
    hidden_dim=config.intermediate_size,
    attention_bias=config.attention_bias,
    mlp_bias=config.mlp_bias
)

model = Llama(model_args)
model.load_state_dict(checkpoint)
model = model.to(dtype=dtype, device=device)
model.train()

if compile:
    print('[INFO] Compiling model')
    model = torch.compile(model)

print(sum(p.numel() for p in model.parameters())/1e6, 'M parameters')


138.431232 M parameters


In [11]:
# Optimizer

optimizer = torch.optim.AdamW(model.parameters(), lr=lr, betas=(0.9, 0.98), eps=1e-8, fused = device == 'cuda', weight_decay=weight_decay)

num_training_steps = len(train_loader) * epochs
print(f"num_training_steps: {num_training_steps}")

# Scheduler for lr: first 100 steps warmup, then decay
def lr_lambda(step):
    if step < lr_warmup_steps:
        return float(step) / float(max(1, lr_warmup_steps))
    else:
        progress = float(step - lr_warmup_steps) / float(max(1, num_training_steps - lr_warmup_steps))
        return max(0.0, math.cos(math.pi * float(0.5) * 2.0 * progress))
    

scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda, last_epoch=-1)

num_training_steps: 54921


In [12]:
def compute_logps(prompt_attention_mask, chosen_inputs, chosen_attention_mask, logits):
    mask = chosen_attention_mask[:,:-1] - prompt_attention_mask[:,1:]
    per_token_lops = torch.gather(logits[:,:-1,:].log_softmax(-1), dim=2, 
                                  index=(mask * chosen_inputs[:,1:]).unsqueeze(2)).squeeze(2)
    return torch.mul(per_token_lops, mask.to(dtype)).sum(dim=1).to(dtype) / mask.sum(dim=1).to(dtype)

In [14]:
try :
    for e in range (epochs):
        for i, batch in tqdm(enumerate(train_loader), total=len(train_loader), dynamic_ncols=True):
            optimizer.zero_grad(set_to_none=True)
            batch['positive_input_ids'] = batch['positive_input_ids'].to(device)
            batch['positive_attention_mask'] = batch['positive_attention_mask'].to(device)
            batch['negative_input_ids'] = batch['negative_input_ids'].to(device)
            batch['negative_attention_mask'] = batch['negative_attention_mask'].to(device)
            batch['attention_mask'] = batch['attention_mask'].to(device)
            
            neg_labels = batch['negative_input_ids'].clone()
            pos_labels = batch['positive_input_ids'].clone()
            
            # Calculate the loss
            mask = batch['attention_mask'] * batch['positive_attention_mask'] # mask out the padding
            pos_labels = pos_labels * mask.logical_not() 
            
            pos_labels[pos_labels == 0] = tokenizer.pad_token_id           
            pos_labels[pos_labels == tokenizer.eos_token_id] = -100
            neg_labels[neg_labels == tokenizer.eos_token_id] = -100
            
            outputs_pos, loss_pos = model(batch['positive_input_ids'], pos_labels)
            outputs_neg, _ = model(batch['negative_input_ids'], neg_labels)
            
            # Calulcate per token log probabilities, essential to calculate the ORPO LOG ODDS RATIO 
            pos_prob = compute_logps(
                batch['attention_mask'], 
                batch['positive_input_ids'], 
                batch['positive_attention_mask'], 
                outputs_pos
            )
            neg_prob = compute_logps(
                batch['attention_mask'],
                batch['negative_input_ids'],
                batch['negative_attention_mask'],
                outputs_neg
            )
            
            
            # Calculate the ORPO odds ratio
            log_odds = (pos_prob - neg_prob) - (torch.log(1 - torch.exp(pos_prob)) - torch.log(1 - torch.exp(neg_prob)))
            sig_ratio = F.sigmoid(log_odds) # Sigmoid to get the ratio between 0 and 1
            ratio = torch.log(sig_ratio)
            
            # Calculate the loss
            loss = torch.mean(loss_pos - (alpha * ratio).mean()).to(dtype)
            
            # Logging 
            if i % log_iter == 0:
                print(f"Epoch: [{e}/{epochs}] Iteration: [{i}/{len(train_loader)}] Loss: {loss.item():.3f} Odds Ratio: {log_odds.mean().item():.3f}")
                if wandb_log:
                    wandb.log({"loss": loss.item(),
                               "odds_ratio": log_odds.mean().item(),
                               "lr" : scheduler.get_last_lr()[0],
                               "epoch": e,
                               "iteration": i})
                if torch.isnan(loss):
                    print("Loss is NaN, breaking")
                    if wandb_log:
                        wandb.finish()
                    torch.cuda.empty_cache()
                    sys.exit()
            
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip)
            optimizer.step()
            scheduler.step()
            
        # Save the model
        sd = model.state_dict()
        sd['config'] = config
        torch.save(sd, os.path.join(checkpoint_dir, f'base_model_{e+1}.pt'))
                        
except KeyboardInterrupt:
    print("Training interrupted")
    pass
finally:
    torch.cuda.empty_cache()
    print("Training finished, GPU memory cleaned")
    pass

torch.cuda.empty_cache()

  0%|          | 0/18307 [00:00<?, ?it/s]

Epoch: [0/3] Iteration: [0/18307] Loss: 4.375 Odds Ratio: -0.482


  0%|          | 50/18307 [00:40<4:10:25,  1.22it/s]

Epoch: [0/3] Iteration: [50/18307] Loss: 2.797 Odds Ratio: -0.438


  1%|          | 100/18307 [01:21<4:08:29,  1.22it/s]

Epoch: [0/3] Iteration: [100/18307] Loss: 2.234 Odds Ratio: 0.805


  1%|          | 150/18307 [02:02<4:08:06,  1.22it/s]

Epoch: [0/3] Iteration: [150/18307] Loss: 4.188 Odds Ratio: -1.602


  1%|          | 200/18307 [02:43<4:06:25,  1.22it/s]

Epoch: [0/3] Iteration: [200/18307] Loss: 2.234 Odds Ratio: -0.365


  1%|▏         | 250/18307 [03:24<3:57:06,  1.27it/s]

Epoch: [0/3] Iteration: [250/18307] Loss: 2.672 Odds Ratio: -0.051


  2%|▏         | 300/18307 [04:02<3:45:42,  1.33it/s]

Epoch: [0/3] Iteration: [300/18307] Loss: 2.328 Odds Ratio: 1.031


  2%|▏         | 350/18307 [04:40<3:41:56,  1.35it/s]

Epoch: [0/3] Iteration: [350/18307] Loss: 2.625 Odds Ratio: 0.422


  2%|▏         | 400/18307 [05:20<4:04:50,  1.22it/s]

Epoch: [0/3] Iteration: [400/18307] Loss: 2.391 Odds Ratio: -0.159


  2%|▏         | 450/18307 [06:01<4:01:37,  1.23it/s]

Epoch: [0/3] Iteration: [450/18307] Loss: 2.844 Odds Ratio: -0.609


  3%|▎         | 500/18307 [06:41<4:02:38,  1.22it/s]

Epoch: [0/3] Iteration: [500/18307] Loss: 2.250 Odds Ratio: 0.081


  3%|▎         | 550/18307 [07:22<4:02:16,  1.22it/s]

Epoch: [0/3] Iteration: [550/18307] Loss: 2.656 Odds Ratio: 0.196


  3%|▎         | 600/18307 [08:03<4:00:44,  1.23it/s]

Epoch: [0/3] Iteration: [600/18307] Loss: 1.867 Odds Ratio: 1.953


  4%|▎         | 650/18307 [08:43<3:59:19,  1.23it/s]

Epoch: [0/3] Iteration: [650/18307] Loss: 2.703 Odds Ratio: -0.359


  4%|▍         | 700/18307 [09:24<3:57:00,  1.24it/s]

Epoch: [0/3] Iteration: [700/18307] Loss: 2.953 Odds Ratio: 0.082


  4%|▍         | 750/18307 [10:05<3:57:20,  1.23it/s]

Epoch: [0/3] Iteration: [750/18307] Loss: 1.906 Odds Ratio: 3.703


  4%|▍         | 800/18307 [10:45<3:58:01,  1.23it/s]

Epoch: [0/3] Iteration: [800/18307] Loss: 2.672 Odds Ratio: -0.013


  5%|▍         | 850/18307 [11:27<3:59:00,  1.22it/s]

Epoch: [0/3] Iteration: [850/18307] Loss: 1.773 Odds Ratio: 4.406


  5%|▍         | 900/18307 [12:07<3:57:21,  1.22it/s]

Epoch: [0/3] Iteration: [900/18307] Loss: 1.945 Odds Ratio: 2.266


  5%|▌         | 950/18307 [12:48<3:57:37,  1.22it/s]

Epoch: [0/3] Iteration: [950/18307] Loss: 2.250 Odds Ratio: 0.467


  5%|▌         | 1000/18307 [13:29<3:56:06,  1.22it/s]

Epoch: [0/3] Iteration: [1000/18307] Loss: 2.359 Odds Ratio: 3.250


  6%|▌         | 1050/18307 [14:10<3:55:49,  1.22it/s]

Epoch: [0/3] Iteration: [1050/18307] Loss: 1.961 Odds Ratio: 0.371


  6%|▌         | 1100/18307 [14:51<3:55:23,  1.22it/s]

Epoch: [0/3] Iteration: [1100/18307] Loss: 4.031 Odds Ratio: -2.453


  6%|▋         | 1150/18307 [15:33<4:03:10,  1.18it/s]

Epoch: [0/3] Iteration: [1150/18307] Loss: 2.578 Odds Ratio: 1.859


  7%|▋         | 1200/18307 [16:11<3:43:17,  1.28it/s]

Epoch: [0/3] Iteration: [1200/18307] Loss: 1.914 Odds Ratio: -0.324


  7%|▋         | 1250/18307 [16:52<3:53:30,  1.22it/s]

Epoch: [0/3] Iteration: [1250/18307] Loss: 2.078 Odds Ratio: 1.039


  7%|▋         | 1300/18307 [17:33<3:49:51,  1.23it/s]

Epoch: [0/3] Iteration: [1300/18307] Loss: 2.578 Odds Ratio: -0.402


  7%|▋         | 1350/18307 [18:13<3:49:56,  1.23it/s]

Epoch: [0/3] Iteration: [1350/18307] Loss: 2.281 Odds Ratio: 0.633


  8%|▊         | 1400/18307 [18:53<3:49:18,  1.23it/s]

Epoch: [0/3] Iteration: [1400/18307] Loss: 2.766 Odds Ratio: -0.746


  8%|▊         | 1450/18307 [19:32<3:32:40,  1.32it/s]

Epoch: [0/3] Iteration: [1450/18307] Loss: 2.234 Odds Ratio: 5.500


  8%|▊         | 1500/18307 [20:12<3:54:23,  1.20it/s]

Epoch: [0/3] Iteration: [1500/18307] Loss: 2.812 Odds Ratio: -0.480


  8%|▊         | 1550/18307 [20:51<3:39:10,  1.27it/s]

Epoch: [0/3] Iteration: [1550/18307] Loss: 2.312 Odds Ratio: -0.115


  9%|▊         | 1600/18307 [21:30<3:44:42,  1.24it/s]

Epoch: [0/3] Iteration: [1600/18307] Loss: 2.219 Odds Ratio: 0.111


  9%|▉         | 1650/18307 [22:09<3:31:48,  1.31it/s]

Epoch: [0/3] Iteration: [1650/18307] Loss: 2.328 Odds Ratio: 4.750


  9%|▉         | 1700/18307 [22:47<3:31:40,  1.31it/s]

Epoch: [0/3] Iteration: [1700/18307] Loss: 2.438 Odds Ratio: -0.023


 10%|▉         | 1750/18307 [23:25<3:30:31,  1.31it/s]

Epoch: [0/3] Iteration: [1750/18307] Loss: 1.750 Odds Ratio: 3.469


 10%|▉         | 1800/18307 [24:03<3:28:00,  1.32it/s]

Epoch: [0/3] Iteration: [1800/18307] Loss: 2.656 Odds Ratio: 0.011


 10%|█         | 1850/18307 [24:41<3:30:20,  1.30it/s]

Epoch: [0/3] Iteration: [1850/18307] Loss: 2.719 Odds Ratio: -0.297


 10%|█         | 1900/18307 [25:19<3:28:28,  1.31it/s]

Epoch: [0/3] Iteration: [1900/18307] Loss: 2.344 Odds Ratio: 0.637


 11%|█         | 1950/18307 [25:58<3:28:30,  1.31it/s]

Epoch: [0/3] Iteration: [1950/18307] Loss: 2.109 Odds Ratio: 0.551


 11%|█         | 2000/18307 [26:36<3:27:10,  1.31it/s]

Epoch: [0/3] Iteration: [2000/18307] Loss: 2.484 Odds Ratio: 0.080


 11%|█         | 2050/18307 [27:14<3:26:11,  1.31it/s]

Epoch: [0/3] Iteration: [2050/18307] Loss: 2.844 Odds Ratio: 0.613


 11%|█▏        | 2100/18307 [27:52<3:24:51,  1.32it/s]

Epoch: [0/3] Iteration: [2100/18307] Loss: 1.688 Odds Ratio: 12.250


 12%|█▏        | 2150/18307 [28:33<3:39:19,  1.23it/s]

Epoch: [0/3] Iteration: [2150/18307] Loss: 2.234 Odds Ratio: 3.750


 12%|█▏        | 2200/18307 [29:14<3:37:29,  1.23it/s]

Epoch: [0/3] Iteration: [2200/18307] Loss: 2.312 Odds Ratio: 2.531


 12%|█▏        | 2250/18307 [29:55<3:40:16,  1.21it/s]

Epoch: [0/3] Iteration: [2250/18307] Loss: 2.406 Odds Ratio: -0.223


 13%|█▎        | 2300/18307 [30:37<3:37:28,  1.23it/s]

Epoch: [0/3] Iteration: [2300/18307] Loss: 2.219 Odds Ratio: 0.906


 13%|█▎        | 2350/18307 [31:18<3:39:36,  1.21it/s]

Epoch: [0/3] Iteration: [2350/18307] Loss: 2.406 Odds Ratio: 2.047


 13%|█▎        | 2400/18307 [32:00<3:47:24,  1.17it/s]

Epoch: [0/3] Iteration: [2400/18307] Loss: 3.031 Odds Ratio: -1.219


 13%|█▎        | 2450/18307 [32:41<3:33:48,  1.24it/s]

Epoch: [0/3] Iteration: [2450/18307] Loss: 2.125 Odds Ratio: -0.066


 14%|█▎        | 2500/18307 [33:22<3:33:29,  1.23it/s]

Epoch: [0/3] Iteration: [2500/18307] Loss: 2.875 Odds Ratio: -0.906


 14%|█▍        | 2550/18307 [34:03<3:32:47,  1.23it/s]

Epoch: [0/3] Iteration: [2550/18307] Loss: 2.016 Odds Ratio: 0.641


 14%|█▍        | 2600/18307 [34:44<3:34:42,  1.22it/s]

Epoch: [0/3] Iteration: [2600/18307] Loss: 2.562 Odds Ratio: 0.449


 14%|█▍        | 2650/18307 [35:24<3:32:48,  1.23it/s]

Epoch: [0/3] Iteration: [2650/18307] Loss: 1.812 Odds Ratio: 1.258


 15%|█▍        | 2700/18307 [36:05<3:29:29,  1.24it/s]

Epoch: [0/3] Iteration: [2700/18307] Loss: 2.375 Odds Ratio: 0.711


 15%|█▌        | 2750/18307 [36:46<3:28:54,  1.24it/s]

Epoch: [0/3] Iteration: [2750/18307] Loss: 2.328 Odds Ratio: 3.875


 15%|█▌        | 2800/18307 [37:26<3:28:52,  1.24it/s]

Epoch: [0/3] Iteration: [2800/18307] Loss: 2.188 Odds Ratio: 0.017


 16%|█▌        | 2850/18307 [38:07<3:29:20,  1.23it/s]

Epoch: [0/3] Iteration: [2850/18307] Loss: 2.781 Odds Ratio: -0.201


 16%|█▌        | 2900/18307 [38:47<3:29:16,  1.23it/s]

Epoch: [0/3] Iteration: [2900/18307] Loss: 2.203 Odds Ratio: 1.422


 16%|█▌        | 2950/18307 [39:28<3:27:38,  1.23it/s]

Epoch: [0/3] Iteration: [2950/18307] Loss: 1.938 Odds Ratio: 5.531


 16%|█▋        | 3000/18307 [40:08<3:26:53,  1.23it/s]

Epoch: [0/3] Iteration: [3000/18307] Loss: 2.406 Odds Ratio: 0.150


 17%|█▋        | 3050/18307 [40:48<3:03:47,  1.38it/s]

Epoch: [0/3] Iteration: [3050/18307] Loss: 2.469 Odds Ratio: -0.270


 17%|█▋        | 3100/18307 [41:28<3:25:44,  1.23it/s]

Epoch: [0/3] Iteration: [3100/18307] Loss: 2.312 Odds Ratio: 0.742


 17%|█▋        | 3150/18307 [42:08<3:23:48,  1.24it/s]

Epoch: [0/3] Iteration: [3150/18307] Loss: 2.203 Odds Ratio: 6.188


 17%|█▋        | 3200/18307 [42:48<3:17:33,  1.27it/s]

Epoch: [0/3] Iteration: [3200/18307] Loss: 2.281 Odds Ratio: 1.867


 18%|█▊        | 3250/18307 [43:28<3:10:23,  1.32it/s]

Epoch: [0/3] Iteration: [3250/18307] Loss: 2.312 Odds Ratio: -0.324


 18%|█▊        | 3300/18307 [44:05<3:02:51,  1.37it/s]

Epoch: [0/3] Iteration: [3300/18307] Loss: 2.500 Odds Ratio: -0.104


 18%|█▊        | 3350/18307 [44:42<3:02:02,  1.37it/s]

Epoch: [0/3] Iteration: [3350/18307] Loss: 2.422 Odds Ratio: 0.186


 19%|█▊        | 3400/18307 [45:18<3:01:17,  1.37it/s]

Epoch: [0/3] Iteration: [3400/18307] Loss: 2.219 Odds Ratio: -0.031


 19%|█▉        | 3450/18307 [45:55<3:00:13,  1.37it/s]

Epoch: [0/3] Iteration: [3450/18307] Loss: 2.344 Odds Ratio: 1.328


 19%|█▉        | 3500/18307 [46:33<3:17:20,  1.25it/s]

Epoch: [0/3] Iteration: [3500/18307] Loss: 1.336 Odds Ratio: 5.625


 19%|█▉        | 3550/18307 [47:13<3:09:17,  1.30it/s]

Epoch: [0/3] Iteration: [3550/18307] Loss: 2.188 Odds Ratio: 5.031


 20%|█▉        | 3600/18307 [47:53<3:16:56,  1.24it/s]

Epoch: [0/3] Iteration: [3600/18307] Loss: 1.688 Odds Ratio: 3.875


 20%|█▉        | 3650/18307 [48:32<3:10:53,  1.28it/s]

Epoch: [0/3] Iteration: [3650/18307] Loss: 2.578 Odds Ratio: -0.262


 20%|██        | 3700/18307 [49:10<3:00:57,  1.35it/s]

Epoch: [0/3] Iteration: [3700/18307] Loss: 1.844 Odds Ratio: 0.641


 20%|██        | 3750/18307 [49:47<2:59:38,  1.35it/s]

Epoch: [0/3] Iteration: [3750/18307] Loss: 1.805 Odds Ratio: 2.109


 21%|██        | 3800/18307 [50:24<2:59:31,  1.35it/s]

Epoch: [0/3] Iteration: [3800/18307] Loss: 2.516 Odds Ratio: 0.311


 21%|██        | 3850/18307 [51:01<2:59:39,  1.34it/s]

Epoch: [0/3] Iteration: [3850/18307] Loss: 2.641 Odds Ratio: -0.184


 21%|██▏       | 3900/18307 [51:38<2:57:39,  1.35it/s]

Epoch: [0/3] Iteration: [3900/18307] Loss: 2.109 Odds Ratio: 0.250


 22%|██▏       | 3950/18307 [52:15<2:57:41,  1.35it/s]

Epoch: [0/3] Iteration: [3950/18307] Loss: 1.500 Odds Ratio: 0.906


 22%|██▏       | 4000/18307 [52:54<3:09:16,  1.26it/s]

Epoch: [0/3] Iteration: [4000/18307] Loss: 1.836 Odds Ratio: 6.438


 22%|██▏       | 4050/18307 [53:31<3:03:03,  1.30it/s]

Epoch: [0/3] Iteration: [4050/18307] Loss: 1.906 Odds Ratio: 3.875


 22%|██▏       | 4100/18307 [54:08<2:52:50,  1.37it/s]

Epoch: [0/3] Iteration: [4100/18307] Loss: 3.250 Odds Ratio: 0.395


 23%|██▎       | 4150/18307 [54:45<2:52:42,  1.37it/s]

Epoch: [0/3] Iteration: [4150/18307] Loss: 2.047 Odds Ratio: 1.828


 23%|██▎       | 4200/18307 [55:21<2:51:37,  1.37it/s]

Epoch: [0/3] Iteration: [4200/18307] Loss: 2.438 Odds Ratio: -0.120


 23%|██▎       | 4250/18307 [55:58<2:51:59,  1.36it/s]

Epoch: [0/3] Iteration: [4250/18307] Loss: 1.758 Odds Ratio: 5.812


 23%|██▎       | 4300/18307 [56:35<2:50:26,  1.37it/s]

Epoch: [0/3] Iteration: [4300/18307] Loss: 2.344 Odds Ratio: 1.500


 24%|██▍       | 4350/18307 [57:16<3:14:41,  1.19it/s]

Epoch: [0/3] Iteration: [4350/18307] Loss: 3.297 Odds Ratio: 0.029


 24%|██▍       | 4400/18307 [57:57<3:08:46,  1.23it/s]

Epoch: [0/3] Iteration: [4400/18307] Loss: 1.656 Odds Ratio: 3.031


 24%|██▍       | 4450/18307 [58:37<3:08:01,  1.23it/s]

Epoch: [0/3] Iteration: [4450/18307] Loss: 2.844 Odds Ratio: -0.187


 25%|██▍       | 4500/18307 [59:18<3:06:10,  1.24it/s]

Epoch: [0/3] Iteration: [4500/18307] Loss: 1.812 Odds Ratio: 2.141


 25%|██▍       | 4550/18307 [59:59<3:09:06,  1.21it/s]

Epoch: [0/3] Iteration: [4550/18307] Loss: 1.680 Odds Ratio: 5.531


 25%|██▌       | 4600/18307 [1:00:38<2:54:23,  1.31it/s]

Epoch: [0/3] Iteration: [4600/18307] Loss: 2.188 Odds Ratio: -0.069


 25%|██▌       | 4650/18307 [1:01:16<2:52:28,  1.32it/s]

Epoch: [0/3] Iteration: [4650/18307] Loss: 1.305 Odds Ratio: 4.188


 26%|██▌       | 4700/18307 [1:01:54<2:52:17,  1.32it/s]

Epoch: [0/3] Iteration: [4700/18307] Loss: 3.062 Odds Ratio: -0.812


 26%|██▌       | 4750/18307 [1:02:32<2:52:36,  1.31it/s]

Epoch: [0/3] Iteration: [4750/18307] Loss: 2.734 Odds Ratio: -0.480


 26%|██▌       | 4800/18307 [1:03:11<3:05:17,  1.21it/s]

Epoch: [0/3] Iteration: [4800/18307] Loss: 2.672 Odds Ratio: -0.426


 26%|██▋       | 4850/18307 [1:03:50<2:50:53,  1.31it/s]

Epoch: [0/3] Iteration: [4850/18307] Loss: 1.734 Odds Ratio: 5.531


 27%|██▋       | 4900/18307 [1:04:30<3:01:23,  1.23it/s]

Epoch: [0/3] Iteration: [4900/18307] Loss: 1.836 Odds Ratio: 4.250


 27%|██▋       | 4950/18307 [1:05:11<3:00:27,  1.23it/s]

Epoch: [0/3] Iteration: [4950/18307] Loss: 1.922 Odds Ratio: 2.312


 27%|██▋       | 5000/18307 [1:05:52<2:59:43,  1.23it/s]

Epoch: [0/3] Iteration: [5000/18307] Loss: 1.961 Odds Ratio: 0.949


 28%|██▊       | 5050/18307 [1:06:33<3:00:02,  1.23it/s]

Epoch: [0/3] Iteration: [5050/18307] Loss: 2.359 Odds Ratio: 0.531


 28%|██▊       | 5100/18307 [1:07:14<2:58:13,  1.24it/s]

Epoch: [0/3] Iteration: [5100/18307] Loss: 2.484 Odds Ratio: -0.260


 28%|██▊       | 5150/18307 [1:07:54<2:53:25,  1.26it/s]

Epoch: [0/3] Iteration: [5150/18307] Loss: 3.156 Odds Ratio: -0.934


 28%|██▊       | 5200/18307 [1:08:32<2:47:00,  1.31it/s]

Epoch: [0/3] Iteration: [5200/18307] Loss: 1.945 Odds Ratio: 4.250


 29%|██▊       | 5250/18307 [1:09:09<2:41:20,  1.35it/s]

Epoch: [0/3] Iteration: [5250/18307] Loss: 1.711 Odds Ratio: 0.641


 29%|██▉       | 5300/18307 [1:09:48<2:47:08,  1.30it/s]

Epoch: [0/3] Iteration: [5300/18307] Loss: 0.898 Odds Ratio: 3.188


 29%|██▉       | 5350/18307 [1:10:23<2:30:41,  1.43it/s]

Epoch: [0/3] Iteration: [5350/18307] Loss: 1.977 Odds Ratio: 4.969


 29%|██▉       | 5400/18307 [1:11:00<2:34:30,  1.39it/s]

Epoch: [0/3] Iteration: [5400/18307] Loss: 1.633 Odds Ratio: 2.094


 30%|██▉       | 5450/18307 [1:11:37<2:33:27,  1.40it/s]

Epoch: [0/3] Iteration: [5450/18307] Loss: 1.953 Odds Ratio: 1.250


 30%|███       | 5500/18307 [1:12:14<2:42:29,  1.31it/s]

Epoch: [0/3] Iteration: [5500/18307] Loss: 2.719 Odds Ratio: 0.049


 30%|███       | 5550/18307 [1:12:51<2:42:36,  1.31it/s]

Epoch: [0/3] Iteration: [5550/18307] Loss: 2.797 Odds Ratio: -0.617


 31%|███       | 5600/18307 [1:13:29<2:31:48,  1.40it/s]

Epoch: [0/3] Iteration: [5600/18307] Loss: 2.234 Odds Ratio: 1.094


 31%|███       | 5650/18307 [1:14:06<2:36:53,  1.34it/s]

Epoch: [0/3] Iteration: [5650/18307] Loss: 2.344 Odds Ratio: -0.266


 31%|███       | 5700/18307 [1:14:44<2:39:37,  1.32it/s]

Epoch: [0/3] Iteration: [5700/18307] Loss: 2.312 Odds Ratio: 5.906


 31%|███▏      | 5750/18307 [1:15:21<2:37:54,  1.33it/s]

Epoch: [0/3] Iteration: [5750/18307] Loss: 2.406 Odds Ratio: 0.090


 32%|███▏      | 5800/18307 [1:15:58<2:35:17,  1.34it/s]

Epoch: [0/3] Iteration: [5800/18307] Loss: 1.523 Odds Ratio: 1.445


 32%|███▏      | 5850/18307 [1:16:34<2:34:21,  1.35it/s]

Epoch: [0/3] Iteration: [5850/18307] Loss: 2.344 Odds Ratio: 1.398


 32%|███▏      | 5900/18307 [1:17:16<3:23:06,  1.02it/s]

Epoch: [0/3] Iteration: [5900/18307] Loss: 2.438 Odds Ratio: 0.059


 33%|███▎      | 5950/18307 [1:18:01<2:30:21,  1.37it/s]

Epoch: [0/3] Iteration: [5950/18307] Loss: 2.375 Odds Ratio: 3.734


 33%|███▎      | 6000/18307 [1:18:40<2:47:23,  1.23it/s]

Epoch: [0/3] Iteration: [6000/18307] Loss: 2.125 Odds Ratio: 0.264


 33%|███▎      | 6050/18307 [1:19:23<3:01:49,  1.12it/s]

Epoch: [0/3] Iteration: [6050/18307] Loss: 1.906 Odds Ratio: 4.188


 33%|███▎      | 6100/18307 [1:20:07<2:24:36,  1.41it/s]

Epoch: [0/3] Iteration: [6100/18307] Loss: 2.047 Odds Ratio: 6.656


 34%|███▎      | 6150/18307 [1:20:48<2:47:36,  1.21it/s]

Epoch: [0/3] Iteration: [6150/18307] Loss: 1.602 Odds Ratio: 7.000


 34%|███▍      | 6200/18307 [1:21:30<2:51:15,  1.18it/s]

Epoch: [0/3] Iteration: [6200/18307] Loss: 2.094 Odds Ratio: -0.256


 34%|███▍      | 6250/18307 [1:22:12<2:49:56,  1.18it/s]

Epoch: [0/3] Iteration: [6250/18307] Loss: 1.641 Odds Ratio: 0.895


 34%|███▍      | 6300/18307 [1:22:55<2:48:55,  1.18it/s]

Epoch: [0/3] Iteration: [6300/18307] Loss: 2.297 Odds Ratio: -0.118


 35%|███▍      | 6350/18307 [1:23:37<2:48:06,  1.19it/s]

Epoch: [0/3] Iteration: [6350/18307] Loss: 1.562 Odds Ratio: 2.344


 35%|███▍      | 6400/18307 [1:24:20<2:49:29,  1.17it/s]

Epoch: [0/3] Iteration: [6400/18307] Loss: 1.359 Odds Ratio: 3.812


 35%|███▌      | 6450/18307 [1:25:03<2:48:22,  1.17it/s]

Epoch: [0/3] Iteration: [6450/18307] Loss: 2.594 Odds Ratio: 0.283


 36%|███▌      | 6500/18307 [1:25:46<2:47:08,  1.18it/s]

Epoch: [0/3] Iteration: [6500/18307] Loss: 1.758 Odds Ratio: 1.547


 36%|███▌      | 6550/18307 [1:26:28<2:44:09,  1.19it/s]

Epoch: [0/3] Iteration: [6550/18307] Loss: 1.883 Odds Ratio: 0.289


 36%|███▌      | 6600/18307 [1:27:13<2:56:47,  1.10it/s]

Epoch: [0/3] Iteration: [6600/18307] Loss: 2.281 Odds Ratio: 0.527


 36%|███▋      | 6650/18307 [1:28:00<2:52:17,  1.13it/s]

Epoch: [0/3] Iteration: [6650/18307] Loss: 1.648 Odds Ratio: 7.781


 37%|███▋      | 6700/18307 [1:28:45<2:51:35,  1.13it/s]

Epoch: [0/3] Iteration: [6700/18307] Loss: 1.844 Odds Ratio: 6.344


 37%|███▋      | 6750/18307 [1:29:29<2:54:14,  1.11it/s]

Epoch: [0/3] Iteration: [6750/18307] Loss: 2.281 Odds Ratio: 0.332


 37%|███▋      | 6800/18307 [1:30:14<2:50:41,  1.12it/s]

Epoch: [0/3] Iteration: [6800/18307] Loss: 2.234 Odds Ratio: -0.125


 37%|███▋      | 6850/18307 [1:30:57<2:18:45,  1.38it/s]

Epoch: [0/3] Iteration: [6850/18307] Loss: 3.281 Odds Ratio: 0.191


 38%|███▊      | 6900/18307 [1:31:34<2:17:11,  1.39it/s]

Epoch: [0/3] Iteration: [6900/18307] Loss: 1.711 Odds Ratio: 2.859


 38%|███▊      | 6950/18307 [1:32:12<2:24:26,  1.31it/s]

Epoch: [0/3] Iteration: [6950/18307] Loss: 1.914 Odds Ratio: 1.305


 38%|███▊      | 7000/18307 [1:32:49<2:12:21,  1.42it/s]

Epoch: [0/3] Iteration: [7000/18307] Loss: 2.000 Odds Ratio: 2.125


 39%|███▊      | 7050/18307 [1:33:33<3:05:36,  1.01it/s]

Epoch: [0/3] Iteration: [7050/18307] Loss: 2.562 Odds Ratio: 0.432


 39%|███▉      | 7100/18307 [1:34:12<2:13:18,  1.40it/s]

Epoch: [0/3] Iteration: [7100/18307] Loss: 2.656 Odds Ratio: 0.439


 39%|███▉      | 7150/18307 [1:34:50<2:18:28,  1.34it/s]

Epoch: [0/3] Iteration: [7150/18307] Loss: 1.680 Odds Ratio: 3.391


 39%|███▉      | 7200/18307 [1:35:32<2:25:50,  1.27it/s]

Epoch: [0/3] Iteration: [7200/18307] Loss: 2.500 Odds Ratio: -0.297


 40%|███▉      | 7250/18307 [1:36:11<2:30:28,  1.22it/s]

Epoch: [0/3] Iteration: [7250/18307] Loss: 1.688 Odds Ratio: 4.219


 40%|███▉      | 7300/18307 [1:36:49<2:11:15,  1.40it/s]

Epoch: [0/3] Iteration: [7300/18307] Loss: 2.469 Odds Ratio: -0.408


 40%|████      | 7350/18307 [1:37:30<2:46:17,  1.10it/s]

Epoch: [0/3] Iteration: [7350/18307] Loss: 1.570 Odds Ratio: 4.938


 40%|████      | 7400/18307 [1:39:28<4:30:45,  1.49s/it] 

Epoch: [0/3] Iteration: [7400/18307] Loss: 2.516 Odds Ratio: 0.266


 41%|████      | 7450/18307 [1:40:42<4:29:07,  1.49s/it]

Epoch: [0/3] Iteration: [7450/18307] Loss: 2.703 Odds Ratio: -0.079


 41%|████      | 7500/18307 [1:41:56<4:28:06,  1.49s/it]

Epoch: [0/3] Iteration: [7500/18307] Loss: 2.250 Odds Ratio: -0.019


 41%|████      | 7550/18307 [1:43:09<4:26:34,  1.49s/it]

Epoch: [0/3] Iteration: [7550/18307] Loss: 1.500 Odds Ratio: 1.922


 42%|████▏     | 7600/18307 [1:44:23<4:25:40,  1.49s/it]

Epoch: [0/3] Iteration: [7600/18307] Loss: 2.328 Odds Ratio: 0.578


 42%|████▏     | 7650/18307 [1:45:37<4:24:25,  1.49s/it]

Epoch: [0/3] Iteration: [7650/18307] Loss: 2.188 Odds Ratio: 3.781


 42%|████▏     | 7700/18307 [1:46:51<4:22:53,  1.49s/it]

Epoch: [0/3] Iteration: [7700/18307] Loss: 2.125 Odds Ratio: 0.645


 42%|████▏     | 7750/18307 [1:48:04<4:21:52,  1.49s/it]

Epoch: [0/3] Iteration: [7750/18307] Loss: 2.625 Odds Ratio: -0.352


 43%|████▎     | 7800/18307 [1:49:18<4:20:24,  1.49s/it]

Epoch: [0/3] Iteration: [7800/18307] Loss: 2.094 Odds Ratio: 0.547


 43%|████▎     | 7850/18307 [1:50:32<4:19:09,  1.49s/it]

Epoch: [0/3] Iteration: [7850/18307] Loss: 2.438 Odds Ratio: 0.508


 43%|████▎     | 7900/18307 [1:51:46<4:18:25,  1.49s/it]

Epoch: [0/3] Iteration: [7900/18307] Loss: 2.500 Odds Ratio: -0.766


 43%|████▎     | 7950/18307 [1:52:59<4:17:25,  1.49s/it]

Epoch: [0/3] Iteration: [7950/18307] Loss: 1.383 Odds Ratio: 2.859


 44%|████▎     | 8000/18307 [1:54:13<4:15:09,  1.49s/it]

Epoch: [0/3] Iteration: [8000/18307] Loss: 2.766 Odds Ratio: -0.680


 44%|████▍     | 8050/18307 [1:55:27<4:14:16,  1.49s/it]

Epoch: [0/3] Iteration: [8050/18307] Loss: 2.531 Odds Ratio: 2.297


 44%|████▍     | 8100/18307 [1:56:41<4:12:47,  1.49s/it]

Epoch: [0/3] Iteration: [8100/18307] Loss: 2.219 Odds Ratio: 4.875


 45%|████▍     | 8150/18307 [1:57:54<4:12:08,  1.49s/it]

Epoch: [0/3] Iteration: [8150/18307] Loss: 3.422 Odds Ratio: -0.482


 45%|████▍     | 8200/18307 [1:59:08<4:10:32,  1.49s/it]

Epoch: [0/3] Iteration: [8200/18307] Loss: 2.375 Odds Ratio: -0.068


 45%|████▌     | 8250/18307 [2:00:22<4:09:33,  1.49s/it]

Epoch: [0/3] Iteration: [8250/18307] Loss: 2.500 Odds Ratio: 0.025


 45%|████▌     | 8300/18307 [2:01:36<4:08:24,  1.49s/it]

Epoch: [0/3] Iteration: [8300/18307] Loss: 2.594 Odds Ratio: 0.093


 46%|████▌     | 8350/18307 [2:02:49<4:06:28,  1.49s/it]

Epoch: [0/3] Iteration: [8350/18307] Loss: 2.531 Odds Ratio: 0.043


 46%|████▌     | 8400/18307 [2:04:03<4:05:36,  1.49s/it]

Epoch: [0/3] Iteration: [8400/18307] Loss: 1.453 Odds Ratio: 9.312


 46%|████▌     | 8450/18307 [2:05:17<4:04:20,  1.49s/it]

Epoch: [0/3] Iteration: [8450/18307] Loss: 1.922 Odds Ratio: 0.125


 46%|████▋     | 8500/18307 [2:06:30<4:02:47,  1.49s/it]

Epoch: [0/3] Iteration: [8500/18307] Loss: 1.859 Odds Ratio: 2.281


 47%|████▋     | 8550/18307 [2:07:44<4:02:05,  1.49s/it]

Epoch: [0/3] Iteration: [8550/18307] Loss: 2.625 Odds Ratio: 1.047


 47%|████▋     | 8600/18307 [2:08:58<4:00:35,  1.49s/it]

Epoch: [0/3] Iteration: [8600/18307] Loss: 2.219 Odds Ratio: 2.250


 47%|████▋     | 8650/18307 [2:10:12<3:59:02,  1.49s/it]

Epoch: [0/3] Iteration: [8650/18307] Loss: 2.094 Odds Ratio: 1.281


 48%|████▊     | 8700/18307 [2:11:25<3:58:16,  1.49s/it]

Epoch: [0/3] Iteration: [8700/18307] Loss: 2.094 Odds Ratio: 0.742


 48%|████▊     | 8750/18307 [2:12:39<3:57:20,  1.49s/it]

Epoch: [0/3] Iteration: [8750/18307] Loss: 2.000 Odds Ratio: 0.494


 48%|████▊     | 8800/18307 [2:13:53<3:56:08,  1.49s/it]

Epoch: [0/3] Iteration: [8800/18307] Loss: 1.984 Odds Ratio: 0.539


 48%|████▊     | 8850/18307 [2:15:07<3:54:29,  1.49s/it]

Epoch: [0/3] Iteration: [8850/18307] Loss: 2.391 Odds Ratio: 0.625


 49%|████▊     | 8900/18307 [2:16:20<3:53:15,  1.49s/it]

Epoch: [0/3] Iteration: [8900/18307] Loss: 2.516 Odds Ratio: 0.064


 49%|████▉     | 8950/18307 [2:17:34<3:52:24,  1.49s/it]

Epoch: [0/3] Iteration: [8950/18307] Loss: 2.047 Odds Ratio: 0.914


 49%|████▉     | 9000/18307 [2:18:48<3:50:45,  1.49s/it]

Epoch: [0/3] Iteration: [9000/18307] Loss: 2.203 Odds Ratio: 1.500


 49%|████▉     | 9050/18307 [2:20:02<3:49:33,  1.49s/it]

Epoch: [0/3] Iteration: [9050/18307] Loss: 1.469 Odds Ratio: 2.688


 50%|████▉     | 9100/18307 [2:21:15<3:49:02,  1.49s/it]

Epoch: [0/3] Iteration: [9100/18307] Loss: 1.891 Odds Ratio: 0.357


 50%|████▉     | 9150/18307 [2:22:29<3:47:07,  1.49s/it]

Epoch: [0/3] Iteration: [9150/18307] Loss: 2.391 Odds Ratio: 0.434


 50%|█████     | 9200/18307 [2:23:43<3:45:51,  1.49s/it]

Epoch: [0/3] Iteration: [9200/18307] Loss: 2.812 Odds Ratio: 1.500


 51%|█████     | 9250/18307 [2:24:57<3:44:24,  1.49s/it]

Epoch: [0/3] Iteration: [9250/18307] Loss: 1.508 Odds Ratio: 4.125


 51%|█████     | 9300/18307 [2:26:10<3:43:27,  1.49s/it]

Epoch: [0/3] Iteration: [9300/18307] Loss: 2.516 Odds Ratio: -0.020


 51%|█████     | 9350/18307 [2:27:24<3:42:03,  1.49s/it]

Epoch: [0/3] Iteration: [9350/18307] Loss: 1.406 Odds Ratio: 10.250


 51%|█████▏    | 9400/18307 [2:28:38<3:40:47,  1.49s/it]

Epoch: [0/3] Iteration: [9400/18307] Loss: 2.469 Odds Ratio: 1.984


 52%|█████▏    | 9450/18307 [2:29:52<3:39:36,  1.49s/it]

Epoch: [0/3] Iteration: [9450/18307] Loss: 2.531 Odds Ratio: 0.754


 52%|█████▏    | 9500/18307 [2:31:05<3:38:21,  1.49s/it]

Epoch: [0/3] Iteration: [9500/18307] Loss: 3.312 Odds Ratio: -0.178


 52%|█████▏    | 9550/18307 [2:32:19<3:37:11,  1.49s/it]

Epoch: [0/3] Iteration: [9550/18307] Loss: 1.273 Odds Ratio: 7.375


 52%|█████▏    | 9600/18307 [2:33:33<3:35:39,  1.49s/it]

Epoch: [0/3] Iteration: [9600/18307] Loss: 2.438 Odds Ratio: 4.219


 53%|█████▎    | 9650/18307 [2:34:46<3:34:33,  1.49s/it]

Epoch: [0/3] Iteration: [9650/18307] Loss: 1.938 Odds Ratio: 4.438


 53%|█████▎    | 9700/18307 [2:36:00<3:33:17,  1.49s/it]

Epoch: [0/3] Iteration: [9700/18307] Loss: 2.641 Odds Ratio: 0.125


 53%|█████▎    | 9750/18307 [2:37:14<3:32:30,  1.49s/it]

Epoch: [0/3] Iteration: [9750/18307] Loss: 2.312 Odds Ratio: 0.426


 54%|█████▎    | 9800/18307 [2:38:28<3:30:41,  1.49s/it]

Epoch: [0/3] Iteration: [9800/18307] Loss: 2.234 Odds Ratio: -0.291


 54%|█████▍    | 9850/18307 [2:39:41<3:29:43,  1.49s/it]

Epoch: [0/3] Iteration: [9850/18307] Loss: 1.836 Odds Ratio: 4.188


 54%|█████▍    | 9900/18307 [2:40:55<3:28:12,  1.49s/it]

Epoch: [0/3] Iteration: [9900/18307] Loss: 2.344 Odds Ratio: -0.106


 54%|█████▍    | 9950/18307 [2:42:09<3:27:10,  1.49s/it]

Epoch: [0/3] Iteration: [9950/18307] Loss: 2.062 Odds Ratio: 0.387


 55%|█████▍    | 10000/18307 [2:43:23<3:25:41,  1.49s/it]

Epoch: [0/3] Iteration: [10000/18307] Loss: 1.273 Odds Ratio: 5.594


 55%|█████▍    | 10050/18307 [2:44:36<3:24:21,  1.48s/it]

Epoch: [0/3] Iteration: [10050/18307] Loss: 1.602 Odds Ratio: 4.844


 55%|█████▌    | 10100/18307 [2:45:50<3:23:40,  1.49s/it]

Epoch: [0/3] Iteration: [10100/18307] Loss: 2.188 Odds Ratio: 0.781


 55%|█████▌    | 10150/18307 [2:47:04<3:22:07,  1.49s/it]

Epoch: [0/3] Iteration: [10150/18307] Loss: 2.812 Odds Ratio: 0.039


 56%|█████▌    | 10200/18307 [2:48:18<3:21:17,  1.49s/it]

Epoch: [0/3] Iteration: [10200/18307] Loss: 2.062 Odds Ratio: 0.777


 56%|█████▌    | 10250/18307 [2:49:31<3:20:08,  1.49s/it]

Epoch: [0/3] Iteration: [10250/18307] Loss: 2.844 Odds Ratio: -0.492


 56%|█████▋    | 10300/18307 [2:50:45<3:18:55,  1.49s/it]

Epoch: [0/3] Iteration: [10300/18307] Loss: 1.375 Odds Ratio: 3.234


 57%|█████▋    | 10350/18307 [2:51:59<3:17:13,  1.49s/it]

Epoch: [0/3] Iteration: [10350/18307] Loss: 1.594 Odds Ratio: 1.367


 57%|█████▋    | 10400/18307 [2:53:13<3:15:45,  1.49s/it]

Epoch: [0/3] Iteration: [10400/18307] Loss: 2.391 Odds Ratio: 3.328


 57%|█████▋    | 10450/18307 [2:54:26<3:14:31,  1.49s/it]

Epoch: [0/3] Iteration: [10450/18307] Loss: 3.141 Odds Ratio: 1.078


 57%|█████▋    | 10500/18307 [2:55:40<3:13:33,  1.49s/it]

Epoch: [0/3] Iteration: [10500/18307] Loss: 1.461 Odds Ratio: 7.000


 58%|█████▊    | 10550/18307 [2:56:54<3:12:28,  1.49s/it]

Epoch: [0/3] Iteration: [10550/18307] Loss: 2.531 Odds Ratio: 2.500


 58%|█████▊    | 10600/18307 [2:58:07<3:11:11,  1.49s/it]

Epoch: [0/3] Iteration: [10600/18307] Loss: 1.844 Odds Ratio: 0.594


 58%|█████▊    | 10650/18307 [2:59:21<3:10:09,  1.49s/it]

Epoch: [0/3] Iteration: [10650/18307] Loss: 1.789 Odds Ratio: 5.375


 58%|█████▊    | 10700/18307 [3:00:35<3:08:28,  1.49s/it]

Epoch: [0/3] Iteration: [10700/18307] Loss: 2.188 Odds Ratio: 2.062


 59%|█████▊    | 10750/18307 [3:01:49<3:07:55,  1.49s/it]

Epoch: [0/3] Iteration: [10750/18307] Loss: 2.547 Odds Ratio: 0.562


 59%|█████▉    | 10800/18307 [3:03:02<3:06:07,  1.49s/it]

Epoch: [0/3] Iteration: [10800/18307] Loss: 2.234 Odds Ratio: 1.961


 59%|█████▉    | 10850/18307 [3:04:16<3:04:58,  1.49s/it]

Epoch: [0/3] Iteration: [10850/18307] Loss: 1.883 Odds Ratio: 5.750


 60%|█████▉    | 10900/18307 [3:05:34<3:20:23,  1.62s/it]

Epoch: [0/3] Iteration: [10900/18307] Loss: 2.250 Odds Ratio: 1.094


 60%|█████▉    | 10940/18307 [3:06:39<2:05:41,  1.02s/it]


Training interrupted
Training finished, GPU memory cleaned
