# Text generation with deep learning

In [1]:
import pandas as pd
import os
from datetime import datetime

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import logging
import optuna
from torch.utils.tensorboard import SummaryWriter


from src.logger import setup_logger

setup_logger(level=logging.INFO)



In [2]:
class CharRNN(nn.Module):
    """Character-level recurrent language model: embedding -> GRU/LSTM -> linear decoder.

    Args:
        input_size: number of rows in the embedding table (distinct input tokens).
        hidden_size: width of both the embedding and the recurrent hidden state.
        output_size: number of logits produced per time step.
        model: ``"gru"`` (case-insensitive) selects ``nn.GRU``; any other value
            selects ``nn.LSTM``.
        n_layers: number of stacked recurrent layers.
        dropout: dropout probability applied to the recurrent outputs, and
            between recurrent layers when ``n_layers > 1``.
    """

    def __init__(self, input_size, hidden_size, output_size, model="gru", n_layers=1, dropout=0.2):
        super().__init__()
        self.model_type = model.lower()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.n_layers = n_layers

        self.encoder = nn.Embedding(input_size, hidden_size)
        rnn_class = nn.GRU if self.model_type == "gru" else nn.LSTM
        self.rnn = rnn_class(
            hidden_size, hidden_size, n_layers,
            # Inter-layer dropout is only meaningful with more than one layer.
            dropout=dropout if n_layers > 1 else 0,
            batch_first=True
        )
        self.dropout = nn.Dropout(dropout)
        self.decoder = nn.Linear(hidden_size, output_size)

    def forward(self, input, hidden=None):
        """Map token indices to per-step logits.

        Args:
            input: integer tensor of token indices, batch dimension first
                (the recurrent layer is built with ``batch_first=True``).
            hidden: optional initial recurrent state; ``None`` lets the
                recurrent layer start from zeros.

        Returns:
            ``(logits, hidden)`` where ``logits`` has ``output_size`` entries
            in its last dimension and ``hidden`` is the final recurrent state.
        """
        encoded = self.encoder(input)
        output, hidden = self.rnn(encoded, hidden)
        output = self.dropout(output)
        # nn.Linear acts on the last dimension, so no flatten/unflatten is needed.
        return self.decoder(output), hidden

    def init_hidden(self, batch_size, device):
        """Return an all-zero initial state matching the recurrent layer.

        The layer type, not the ``model`` string, decides the layout, so an
        LSTM built by the fallback branch also gets its ``(h, c)`` tuple.
        """
        shape = (self.n_layers, batch_size, self.hidden_size)
        if isinstance(self.rnn, nn.LSTM):
            return (
                torch.zeros(shape, device=device),
                torch.zeros(shape, device=device),
            )
        return torch.zeros(shape, device=device)

In [3]:
class TextDataset(Dataset):
    """Sliding-window character dataset for next-character prediction.

    The text is cut into overlapping windows of ``chunk_len + 1`` characters
    taken every ``stride`` characters. Each item is a pair of index tensors:
    the window without its last character (input) and the window without its
    first character (target).
    """

    def __init__(self, text, chunk_len=200, stride=50):
        self.text = text
        self.chunk_len = chunk_len
        self.stride = stride
        # Sorted so that the character <-> index mapping is deterministic.
        self.unique_chars = sorted(set(text))
        self.char_to_idx = {}
        self.idx_to_char = {}
        for position, symbol in enumerate(self.unique_chars):
            self.char_to_idx[symbol] = position
            self.idx_to_char[position] = symbol
        self.data = self._process_text()

    @property
    def vocab_size(self):
        """Number of distinct characters in the text."""
        return len(self.unique_chars)

    def _process_text(self):
        """Return the list of raw text windows, each ``chunk_len + 1`` long."""
        window = self.chunk_len + 1
        stop = len(self.text) - self.chunk_len
        return [
            self.text[start:start + window]
            for start in range(0, stop, self.stride)
        ]

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        """Encode window ``idx`` and return ``(input, target)`` shifted by one."""
        encoded = [self.char_to_idx[symbol] for symbol in self.data[idx]]
        return torch.LongTensor(encoded[:-1]), torch.LongTensor(encoded[1:])

In [4]:
def generate_sample(model, dataset, device, prompt="The", max_length=500, temperature=1.0, top_k=10, top_p=0.9):
    """Sample text from ``model`` one character at a time, seeded by ``prompt``.

    All prompt characters except the last are run through the model to build
    the hidden state; the last one becomes the first input of the sampling
    loop, so every prompt character is consumed exactly once.

    Args:
        model: network exposing ``eval()``, ``init_hidden(batch_size, device)``
            and ``model(input, hidden) -> (logits, hidden)``.
        dataset: object providing ``char_to_idx`` and ``idx_to_char`` mappings.
        device: device the input tensors are created on.
        prompt: non-empty seed string.
        max_length: number of characters to generate after the prompt.
        temperature: positive divisor applied to the logits before sampling.
        top_k: keep only the ``top_k`` largest logits; values <= 0 disable it.
        top_p: nucleus threshold; values outside the open interval (0, 1)
            disable it.

    Returns:
        The prompt followed by the generated characters (also printed).

    Raises:
        ValueError: if ``prompt`` is empty or ``temperature`` is not positive.
        KeyError: if ``prompt`` contains a character missing from
            ``dataset.char_to_idx``.

    Note:
        The model is switched to eval mode and left there.
    """
    if not prompt:
        raise ValueError("prompt must contain at least one character")
    if temperature <= 0:
        raise ValueError("temperature must be positive")

    model.eval()
    generated = []
    prompt_ids = torch.LongTensor(
        [dataset.char_to_idx[c] for c in prompt]
    ).unsqueeze(0).to(device)  # (batch=1, seq_len)
    hidden = model.init_hidden(1, device)

    with torch.no_grad():
        # Warm up on everything but the last character; feeding the full prompt
        # here would make the model see the last character twice.
        if prompt_ids.size(1) > 1:
            _, hidden = model(prompt_ids[:, :-1], hidden)

        input_seq = prompt_ids[:, -1:]

        for _ in range(max_length):
            outputs, hidden = model(input_seq, hidden)
            logits = outputs[:, -1, :] / temperature  # take the last output step

            if top_k > 0:
                logits = _top_k_filter(logits, top_k)
            if 0.0 < top_p < 1.0:
                logits = _top_p_filter(logits, top_p)

            probs = F.softmax(logits, dim=-1)
            next_token = torch.multinomial(probs, num_samples=1)
            generated.append(next_token.item())
            input_seq = next_token

    generated_str = prompt + ''.join([dataset.idx_to_char[idx] for idx in generated])
    print("\nGenerated text:")
    print(generated_str)
    return generated_str

def _top_k_filter(logits, k):
    values, _ = torch.topk(logits, k)
    min_values = values[:, -1].unsqueeze(1)
    return torch.where(logits < min_values, torch.ones_like(logits)*-float('inf'), logits)

def _top_p_filter(logits, p):
    sorted_logits, sorted_indices = torch.sort(logits, descending=True)
    cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
    
    sorted_indices_to_remove = cumulative_probs > p
    sorted_indices_to_remove[..., 0] = 0
    indices_to_remove = sorted_indices_to_remove.scatter(
        1, sorted_indices, sorted_indices_to_remove
    )
    return logits.masked_fill(indices_to_remove, -float('inf'))

In [5]:
# Smoke check: build a tiny model and read back its hidden_size attribute.
CharRNN(10, 10, 10).hidden_size

10

In [6]:
from tqdm.auto import tqdm

def evaluate(model, val_loader, device):
    """Return the mean cross-entropy of ``model`` over ``val_loader``.

    Each batch loss is weighted by its batch size and the total is divided by
    the number of samples actually processed, so the result stays correct when
    the loader drops or subsamples part of its dataset.

    Args:
        model: network exposing ``eval()``, ``init_hidden(batch_size, device)``
            and ``model(inputs, hidden) -> (logits, hidden)``.
        val_loader: iterable of ``(inputs, targets)`` tensor pairs.
        device: device the batches are moved to.

    Returns:
        The sample-weighted average loss as a Python float.

    Raises:
        ValueError: if ``val_loader`` yields no samples.

    Note:
        The model is switched to eval mode and left there.
    """
    model.eval()
    criterion = nn.CrossEntropyLoss()
    total_loss = 0.0
    n_samples = 0
    with torch.no_grad():
        for inputs, targets in val_loader:
            inputs = inputs.to(device)
            targets = targets.to(device)
            batch = inputs.size(0)
            hidden = model.init_hidden(batch, device)
            outputs, _ = model(inputs, hidden)
            # CrossEntropyLoss expects the class dimension second.
            loss = criterion(outputs.transpose(1, 2), targets)
            total_loss += loss.item() * batch
            n_samples += batch
    if n_samples == 0:
        raise ValueError("val_loader produced no samples to evaluate")
    return total_loss / n_samples

def train_model(model, dataset, epochs=50, batch_size=32, lr=3e-4, trial=None, checkpoint_path="best_model.pth"):
    """Train ``model`` on ``dataset`` and return the best epoch-average loss.

    Uses AdamW with a one-cycle learning-rate schedule, label-smoothed
    cross-entropy plus an explicit L2-norm penalty, gradient clipping and,
    on CUDA only, float16 mixed precision. Scalars are written to TensorBoard
    and the best epoch is checkpointed.

    Args:
        model: network exposing ``encoder``, ``rnn`` and ``decoder`` submodules,
            ``init_hidden(batch_size, device)`` and the attributes
            ``model_type``, ``hidden_size`` and ``n_layers``.
        dataset: map-style dataset yielding ``(inputs, targets)`` tensors.
        epochs: number of passes over the data.
        batch_size: batch size; incomplete final batches are dropped.
        lr: peak learning rate of the one-cycle schedule.
        trial: optional Optuna trial; when given, the epoch loss is reported
            and the trial may be pruned.
        checkpoint_path: file the best epoch is saved to. Every call starts
            from ``best_loss = inf``, so successive calls with the same path
            overwrite each other.

    Returns:
        The lowest epoch-average training loss. This is the optimised
        objective (smoothed cross-entropy plus the L2 penalty), not plain
        cross-entropy.

    Raises:
        ValueError: if ``dataset`` is too small to form one full batch.
        optuna.exceptions.TrialPruned: if ``trial`` decides to prune.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # Fused AdamW, float16 autocast and loss scaling are CUDA features here;
    # they are switched off on the CPU fallback.
    use_amp = device.type == "cuda"
    model = model.to(device)

    loader = DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=True,
        drop_last=True
    )
    steps_per_epoch = len(loader)
    if steps_per_epoch == 0:
        raise ValueError(
            "dataset is too small to form a single batch of size "
            f"{batch_size} with drop_last=True"
        )

    optimizer = torch.optim.AdamW([
        {'params': model.encoder.parameters(), 'weight_decay': 0.01},
        {'params': model.rnn.parameters()},
        {'params': model.decoder.parameters(), 'weight_decay': 0.01}
    ], lr=lr, fused=use_amp)

    scheduler = torch.optim.lr_scheduler.OneCycleLR(
        optimizer,
        max_lr=lr,
        total_steps=epochs * steps_per_epoch,
        pct_start=0.1
    )

    criterion = nn.CrossEntropyLoss(label_smoothing=0.1)
    scaler = torch.amp.GradScaler(enabled=use_amp)

    writer = SummaryWriter(
        log_dir=f"runs/LR_{lr:.6f}-model_type_{model.model_type}-hidden_size_{model.hidden_size}-n_layers_{model.n_layers}-batch_size_{batch_size}"
        )

    best_loss = float('inf')
    max_grad_norm = 1.0

    # try/finally so the writer is flushed and closed even when the trial is
    # pruned or training raises.
    try:
        for epoch in range(epochs):
            model.train()
            total_loss = 0.0
            progress = tqdm(loader, desc=f"Epoch {epoch+1}", leave=False)

            for batch_idx, (inputs, targets) in enumerate(progress):
                inputs = inputs.to(device, non_blocking=True)
                targets = targets.to(device, non_blocking=True)
                optimizer.zero_grad(set_to_none=True)
                with torch.amp.autocast(device_type='cuda', dtype=torch.float16, enabled=use_amp):
                    hidden = model.init_hidden(inputs.size(0), device)
                    outputs, _ = model(inputs, hidden)
                    # CrossEntropyLoss expects the class dimension second.
                    loss = criterion(outputs.transpose(1, 2), targets)
                    l2_reg = sum(p.norm(2) for p in model.parameters())
                    loss = loss + 0.001 * l2_reg

                scaler.scale(loss).backward()
                # Unscale first so clipping sees the true gradient magnitudes.
                scaler.unscale_(optimizer)
                grad_norm = torch.nn.utils.clip_grad_norm_(
                    model.parameters(),
                    max_norm=max_grad_norm,
                    norm_type=2,
                    error_if_nonfinite=False
                )

                scaler.step(optimizer)
                scaler.update()
                scheduler.step()

                # Read each scalar back once per step.
                loss_value = loss.item()
                grad_value = grad_norm.item()
                current_lr = optimizer.param_groups[0]['lr']

                total_loss += loss_value
                progress.set_postfix({
                    'loss': f"{loss_value:.4f}",
                    'grad': f"{grad_value:.2f}",
                    'lr': f"{current_lr:.2e}"
                })

                if batch_idx % 10 == 0:
                    global_step = epoch * steps_per_epoch + batch_idx
                    writer.add_scalar('Train/Loss', loss_value, global_step)
                    writer.add_scalar('Train/Grad_Norm', grad_value, global_step)
                    writer.add_scalar('LR', current_lr, global_step)

            avg_loss = total_loss / steps_per_epoch
            writer.add_scalar('Epoch/Loss', avg_loss, epoch)

            # The gradient norm and LR shown are those of the last batch.
            logging.info(f"Epoch {epoch+1}/{epochs} - "
                        f"Loss: {avg_loss:.4f} - "
                        f"Grad Norm: {grad_value:.2f} - "
                        f"LR: {current_lr:.2e}")

            # Optuna integration
            if trial is not None:
                trial.report(avg_loss, epoch)
                if trial.should_prune():
                    raise optuna.exceptions.TrialPruned()

            # A NaN average never compares as smaller, so it is never saved.
            if avg_loss < best_loss:
                best_loss = avg_loss
                torch.save({
                    'model': model.state_dict(),
                    'optimizer': optimizer.state_dict(),
                    'epoch': epoch,
                    'loss': avg_loss
                }, checkpoint_path)

            if torch.cuda.is_available():
                torch.cuda.empty_cache()
    finally:
        writer.close()

    return best_loss

In [7]:
# Datasets already built, keyed by (csv_path, chunk_len, stride), so the CSV is
# read and windowed once instead of once per trial.
_DATASET_CACHE = {}


def _load_dataset(csv_path='data/arxiv.csv', chunk_len=250, stride=100):
    """Return the character dataset built from the CSV's ``summary`` column, building it on first use."""
    key = (csv_path, chunk_len, stride)
    if key not in _DATASET_CACHE:
        df = pd.read_csv(csv_path)
        text = ' '.join(df['summary'].dropna().values)
        _DATASET_CACHE[key] = TextDataset(text, chunk_len=chunk_len, stride=stride)
    return _DATASET_CACHE[key]


def objective(trial):
    """Optuna objective: train one sampled configuration and return its best loss.

    Samples the recurrent cell type, hidden size, depth, dropout, learning
    rate and batch size from ``trial``, trains a ``CharRNN`` for 10 epochs via
    ``train_model`` (which may prune the trial) and returns the value it
    reports.

    Args:
        trial: the ``optuna`` trial used to sample hyperparameters.

    Returns:
        The best epoch-average loss returned by ``train_model``.
    """
    params = {
        'model_type': trial.suggest_categorical('model_type', ['lstm', 'gru']),
        'hidden_size': trial.suggest_int('hidden_size', 128, 512),
        'n_layers': trial.suggest_int('n_layers', 1, 4),
        'dropout': trial.suggest_float('dropout', 0.1, 0.5),
        'lr': trial.suggest_float('lr', 1e-4, 1e-2, log=True),
        'batch_size': trial.suggest_categorical('batch_size', [64, 128, 256])
    }

    # The data does not depend on the sampled parameters, so it is shared
    # across trials.
    dataset = _load_dataset()

    # Input and output vocabularies are the same character set.
    model = CharRNN(
        input_size=dataset.vocab_size,
        hidden_size=params['hidden_size'],
        output_size=dataset.vocab_size,
        model=params['model_type'],
        n_layers=params['n_layers'],
        dropout=params['dropout']
    )

    best_loss = train_model(
        model,
        dataset,
        epochs=10,
        batch_size=params['batch_size'],
        lr=params['lr'],
        trial=trial
    )

    return best_loss

In [8]:
# Search for the hyperparameters that minimise the value returned by objective().
study = optuna.create_study(direction='minimize')
# Two stopping limits are set: a budget of 50 trials and a 3600-second timeout.
study.optimize(objective, n_trials=50, timeout=3600)

print("Best trial:")
# NOTE(review): `trial` is not used again in this cell; `best_trial` below
# reads the same property. Confirm no later cell relies on it before removing.
trial = study.best_trial

best_trial = study.best_trial
# Last expression of the cell: a one-row DataFrame of the best hyperparameters.
pd.DataFrame([best_trial.params])

[I 2025-02-16 01:45:51,068] A new study created in memory with name: no-name-6b37bc1c-b4b2-40d2-928f-ce0c65060938


Epoch 1:   0%|          | 0/1133 [00:00<?, ?it/s]

[32m>>> [2025-02-16 | 01:47:22] 693302834:99 - INFO - Epoch 1/10 - Loss: 2.3419 - Grad Norm: 0.11 - LR: 7.48e-03[0m


Epoch 2:   0%|          | 0/1133 [00:00<?, ?it/s]

[32m>>> [2025-02-16 | 01:48:50] 693302834:99 - INFO - Epoch 2/10 - Loss: 2.0114 - Grad Norm: 0.09 - LR: 7.26e-03[0m


Epoch 3:   0%|          | 0/1133 [00:00<?, ?it/s]

[32m>>> [2025-02-16 | 01:50:11] 693302834:99 - INFO - Epoch 3/10 - Loss: 1.9910 - Grad Norm: 0.09 - LR: 6.61e-03[0m


Epoch 4:   0%|          | 0/1133 [00:00<?, ?it/s]

[32m>>> [2025-02-16 | 01:51:37] 693302834:99 - INFO - Epoch 4/10 - Loss: 1.9773 - Grad Norm: 0.08 - LR: 5.61e-03[0m


Epoch 5:   0%|          | 0/1133 [00:00<?, ?it/s]

[32m>>> [2025-02-16 | 01:52:52] 693302834:99 - INFO - Epoch 5/10 - Loss: 1.9635 - Grad Norm: 0.08 - LR: 4.39e-03[0m


Epoch 6:   0%|          | 0/1133 [00:00<?, ?it/s]

[32m>>> [2025-02-16 | 01:54:05] 693302834:99 - INFO - Epoch 6/10 - Loss: 1.9499 - Grad Norm: 0.08 - LR: 3.09e-03[0m


Epoch 7:   0%|          | 0/1133 [00:00<?, ?it/s]

[32m>>> [2025-02-16 | 01:55:21] 693302834:99 - INFO - Epoch 7/10 - Loss: 1.9360 - Grad Norm: 0.08 - LR: 1.87e-03[0m


Epoch 8:   0%|          | 0/1133 [00:00<?, ?it/s]

[32m>>> [2025-02-16 | 01:56:42] 693302834:99 - INFO - Epoch 8/10 - Loss: 1.9236 - Grad Norm: 0.08 - LR: 8.75e-04[0m


Epoch 9:   0%|          | 0/1133 [00:00<?, ?it/s]

[32m>>> [2025-02-16 | 01:58:03] 693302834:99 - INFO - Epoch 9/10 - Loss: 1.9136 - Grad Norm: 0.07 - LR: 2.25e-04[0m


Epoch 10:   0%|          | 0/1133 [00:00<?, ?it/s]

[32m>>> [2025-02-16 | 01:59:24] 693302834:99 - INFO - Epoch 10/10 - Loss: 1.9079 - Grad Norm: 0.07 - LR: 3.01e-08[0m


[I 2025-02-16 01:59:24,937] Trial 0 finished with value: 1.9079196999163515 and parameters: {'model_type': 'gru', 'hidden_size': 441, 'n_layers': 1, 'dropout': 0.4858638630439517, 'lr': 0.007484870584147558, 'batch_size': 256}. Best is trial 0 with value: 1.9079196999163515.


Epoch 1:   0%|          | 0/4533 [00:00<?, ?it/s]

[32m>>> [2025-02-16 | 02:01:42] 693302834:99 - INFO - Epoch 1/10 - Loss: 2.2286 - Grad Norm: 0.09 - LR: 3.44e-03[0m


Epoch 2:   0%|          | 0/4533 [00:00<?, ?it/s]

[32m>>> [2025-02-16 | 02:03:59] 693302834:99 - INFO - Epoch 2/10 - Loss: 1.9794 - Grad Norm: 0.09 - LR: 3.33e-03[0m


Epoch 3:   0%|          | 0/4533 [00:00<?, ?it/s]

[32m>>> [2025-02-16 | 02:06:22] 693302834:99 - INFO - Epoch 3/10 - Loss: 1.9565 - Grad Norm: 0.20 - LR: 3.04e-03[0m


Epoch 4:   0%|          | 0/4533 [00:00<?, ?it/s]

[32m>>> [2025-02-16 | 02:09:13] 693302834:99 - INFO - Epoch 4/10 - Loss: 1.9295 - Grad Norm: 0.20 - LR: 2.58e-03[0m


Epoch 5:   0%|          | 0/4533 [00:00<?, ?it/s]

[32m>>> [2025-02-16 | 02:11:44] 693302834:99 - INFO - Epoch 5/10 - Loss: 1.9144 - Grad Norm: 0.15 - LR: 2.02e-03[0m


Epoch 6:   0%|          | 0/4533 [00:00<?, ?it/s]

[32m>>> [2025-02-16 | 02:13:38] 693302834:99 - INFO - Epoch 6/10 - Loss: 1.9005 - Grad Norm: 0.22 - LR: 1.42e-03[0m


Epoch 7:   0%|          | 0/4533 [00:00<?, ?it/s]

[32m>>> [2025-02-16 | 02:15:56] 693302834:99 - INFO - Epoch 7/10 - Loss: 1.8875 - Grad Norm: 0.26 - LR: 8.59e-04[0m


Epoch 8:   0%|          | 0/4533 [00:00<?, ?it/s]

[32m>>> [2025-02-16 | 02:17:51] 693302834:99 - INFO - Epoch 8/10 - Loss: 1.8755 - Grad Norm: 0.15 - LR: 4.02e-04[0m


Epoch 9:   0%|          | 0/4533 [00:00<?, ?it/s]

[32m>>> [2025-02-16 | 02:19:53] 693302834:99 - INFO - Epoch 9/10 - Loss: 1.8660 - Grad Norm: 0.12 - LR: 1.04e-04[0m


Epoch 10:   0%|          | 0/4533 [00:00<?, ?it/s]

[32m>>> [2025-02-16 | 02:21:27] 693302834:99 - INFO - Epoch 10/10 - Loss: 1.8606 - Grad Norm: 0.13 - LR: 1.38e-08[0m


[I 2025-02-16 02:21:27,387] Trial 1 finished with value: 1.860638885815728 and parameters: {'model_type': 'lstm', 'hidden_size': 474, 'n_layers': 1, 'dropout': 0.3486814559811154, 'lr': 0.0034383992551262907, 'batch_size': 64}. Best is trial 1 with value: 1.860638885815728.


Epoch 1:   0%|          | 0/2266 [00:00<?, ?it/s]

[32m>>> [2025-02-16 | 02:22:24] 693302834:99 - INFO - Epoch 1/10 - Loss: 3.1395 - Grad Norm: 0.11 - LR: 1.80e-04[0m


Epoch 2:   0%|          | 0/2266 [00:00<?, ?it/s]

[32m>>> [2025-02-16 | 02:23:20] 693302834:99 - INFO - Epoch 2/10 - Loss: 2.2835 - Grad Norm: 0.14 - LR: 1.74e-04[0m


Epoch 3:   0%|          | 0/2266 [00:00<?, ?it/s]

[32m>>> [2025-02-16 | 02:24:22] 693302834:99 - INFO - Epoch 3/10 - Loss: 2.1471 - Grad Norm: 0.18 - LR: 1.59e-04[0m


Epoch 4:   0%|          | 0/2266 [00:00<?, ?it/s]

[32m>>> [2025-02-16 | 02:25:22] 693302834:99 - INFO - Epoch 4/10 - Loss: 2.0943 - Grad Norm: 0.19 - LR: 1.35e-04[0m


Epoch 5:   0%|          | 0/2266 [00:00<?, ?it/s]

[32m>>> [2025-02-16 | 02:26:24] 693302834:99 - INFO - Epoch 5/10 - Loss: 2.0670 - Grad Norm: 0.17 - LR: 1.05e-04[0m


Epoch 6:   0%|          | 0/2266 [00:00<?, ?it/s]

[32m>>> [2025-02-16 | 02:27:24] 693302834:99 - INFO - Epoch 6/10 - Loss: 2.0512 - Grad Norm: 0.16 - LR: 7.42e-05[0m


Epoch 7:   0%|          | 0/2266 [00:00<?, ?it/s]

[32m>>> [2025-02-16 | 02:28:24] 693302834:99 - INFO - Epoch 7/10 - Loss: 2.0418 - Grad Norm: 0.15 - LR: 4.49e-05[0m


Epoch 8:   0%|          | 0/2266 [00:00<?, ?it/s]

[32m>>> [2025-02-16 | 02:29:25] 693302834:99 - INFO - Epoch 8/10 - Loss: 2.0366 - Grad Norm: 0.15 - LR: 2.10e-05[0m


Epoch 9:   0%|          | 0/2266 [00:00<?, ?it/s]

[32m>>> [2025-02-16 | 02:30:28] 693302834:99 - INFO - Epoch 9/10 - Loss: 2.0339 - Grad Norm: 0.14 - LR: 5.41e-06[0m


Epoch 10:   0%|          | 0/2266 [00:00<?, ?it/s]

[32m>>> [2025-02-16 | 02:31:29] 693302834:99 - INFO - Epoch 10/10 - Loss: 2.0331 - Grad Norm: 0.14 - LR: 7.19e-10[0m


[I 2025-02-16 02:31:30,019] Trial 2 finished with value: 2.0330622869577435 and parameters: {'model_type': 'lstm', 'hidden_size': 321, 'n_layers': 1, 'dropout': 0.320673475556442, 'lr': 0.00017952978220407063, 'batch_size': 128}. Best is trial 1 with value: 1.860638885815728.


Epoch 1:   0%|          | 0/2266 [00:00<?, ?it/s]

[32m>>> [2025-02-16 | 02:32:44] 693302834:99 - INFO - Epoch 1/10 - Loss: 2.8145 - Grad Norm: 0.13 - LR: 2.28e-04[0m


Epoch 2:   0%|          | 0/2266 [00:00<?, ?it/s]

[32m>>> [2025-02-16 | 02:33:52] 693302834:99 - INFO - Epoch 2/10 - Loss: 2.0844 - Grad Norm: 0.17 - LR: 2.21e-04[0m


Epoch 3:   0%|          | 0/2266 [00:00<?, ?it/s]

[32m>>> [2025-02-16 | 02:35:02] 693302834:99 - INFO - Epoch 3/10 - Loss: 1.9933 - Grad Norm: 0.16 - LR: 2.01e-04[0m


Epoch 4:   0%|          | 0/2266 [00:00<?, ?it/s]

[32m>>> [2025-02-16 | 02:36:14] 693302834:99 - INFO - Epoch 4/10 - Loss: 1.9505 - Grad Norm: 0.18 - LR: 1.71e-04[0m


Epoch 5:   0%|          | 0/2266 [00:00<?, ?it/s]

[32m>>> [2025-02-16 | 02:37:22] 693302834:99 - INFO - Epoch 5/10 - Loss: 1.9261 - Grad Norm: 0.16 - LR: 1.34e-04[0m


Epoch 6:   0%|          | 0/2266 [00:00<?, ?it/s]

[32m>>> [2025-02-16 | 02:38:24] 693302834:99 - INFO - Epoch 6/10 - Loss: 1.9117 - Grad Norm: 0.16 - LR: 9.40e-05[0m


Epoch 7:   0%|          | 0/2266 [00:00<?, ?it/s]

[32m>>> [2025-02-16 | 02:39:26] 693302834:99 - INFO - Epoch 7/10 - Loss: 1.9029 - Grad Norm: 0.15 - LR: 5.69e-05[0m


Epoch 8:   0%|          | 0/2266 [00:00<?, ?it/s]

[32m>>> [2025-02-16 | 02:40:28] 693302834:99 - INFO - Epoch 8/10 - Loss: 1.8976 - Grad Norm: 0.15 - LR: 2.66e-05[0m


Epoch 9:   0%|          | 0/2266 [00:00<?, ?it/s]

[32m>>> [2025-02-16 | 02:41:29] 693302834:99 - INFO - Epoch 9/10 - Loss: 1.8948 - Grad Norm: 0.16 - LR: 6.86e-06[0m


Epoch 10:   0%|          | 0/2266 [00:00<?, ?it/s]

[32m>>> [2025-02-16 | 02:42:30] 693302834:99 - INFO - Epoch 10/10 - Loss: 1.8937 - Grad Norm: 0.15 - LR: 9.12e-10[0m


[I 2025-02-16 02:42:30,692] Trial 3 finished with value: 1.89370472533242 and parameters: {'model_type': 'gru', 'hidden_size': 509, 'n_layers': 1, 'dropout': 0.11791046128873349, 'lr': 0.00022762592761211306, 'batch_size': 128}. Best is trial 1 with value: 1.860638885815728.


Epoch 1:   0%|          | 0/2266 [00:00<?, ?it/s]

[32m>>> [2025-02-16 | 02:43:47] 693302834:99 - INFO - Epoch 1/10 - Loss: 2.8181 - Grad Norm: 0.12 - LR: 3.56e-03[0m


Epoch 2:   0%|          | 0/2266 [00:00<?, ?it/s]

[32m>>> [2025-02-16 | 02:45:03] 693302834:99 - INFO - Epoch 2/10 - Loss: 2.3175 - Grad Norm: 0.11 - LR: 3.46e-03[0m


Epoch 3:   0%|          | 0/2266 [00:00<?, ?it/s]

[32m>>> [2025-02-16 | 02:46:20] 693302834:99 - INFO - Epoch 3/10 - Loss: 2.2899 - Grad Norm: 0.12 - LR: 3.15e-03[0m


Epoch 4:   0%|          | 0/2266 [00:00<?, ?it/s]

[32m>>> [2025-02-16 | 02:47:36] 693302834:99 - INFO - Epoch 4/10 - Loss: 2.2759 - Grad Norm: 0.11 - LR: 2.67e-03[0m


Epoch 5:   0%|          | 0/2266 [00:00<?, ?it/s]

[32m>>> [2025-02-16 | 02:48:53] 693302834:99 - INFO - Epoch 5/10 - Loss: 2.2643 - Grad Norm: 0.11 - LR: 2.09e-03[0m


Epoch 6:   0%|          | 0/2266 [00:00<?, ?it/s]

[32m>>> [2025-02-16 | 02:50:10] 693302834:99 - INFO - Epoch 6/10 - Loss: 2.2542 - Grad Norm: 0.11 - LR: 1.47e-03[0m


Epoch 7:   0%|          | 0/2266 [00:00<?, ?it/s]

[32m>>> [2025-02-16 | 02:51:28] 693302834:99 - INFO - Epoch 7/10 - Loss: 2.2448 - Grad Norm: 0.11 - LR: 8.91e-04[0m


Epoch 8:   0%|          | 0/2266 [00:00<?, ?it/s]

[32m>>> [2025-02-16 | 02:52:44] 693302834:99 - INFO - Epoch 8/10 - Loss: 2.2370 - Grad Norm: 0.11 - LR: 4.17e-04[0m


Epoch 9:   0%|          | 0/2266 [00:00<?, ?it/s]

[32m>>> [2025-02-16 | 02:53:59] 693302834:99 - INFO - Epoch 9/10 - Loss: 2.2314 - Grad Norm: 0.11 - LR: 1.07e-04[0m


Epoch 10:   0%|          | 0/2266 [00:00<?, ?it/s]

[32m>>> [2025-02-16 | 02:55:16] 693302834:99 - INFO - Epoch 10/10 - Loss: 2.2285 - Grad Norm: 0.11 - LR: 1.43e-08[0m


[I 2025-02-16 02:55:16,484] Trial 4 finished with value: 2.2285121556203764 and parameters: {'model_type': 'gru', 'hidden_size': 134, 'n_layers': 4, 'dropout': 0.44910169818021606, 'lr': 0.0035647419550270004, 'batch_size': 128}. Best is trial 1 with value: 1.860638885815728.


Best trial:


Unnamed: 0,model_type,hidden_size,n_layers,dropout,lr,batch_size
0,lstm,474,1,0.348681,0.003438,64
