# Seq2Seq Builder


In this notebook we will see how to build a Seq2Seq model: we assemble the source code of the seq2seq model and drop anything that is not needed.

For that, we will use the following imports:
- from src.seq2seq.dataset import SeqDataset, pad_batch
- from seq2seq.embeddings import OneHotEmbedding
- from seq2seq.parser import parser

In [3]:
import shutil

# Start from a clean slate: delete the output directory left by a previous run.
# ignore_errors=True keeps a fresh checkout (no build_test/ yet) from reporting
# an error, and shutil works on platforms that have no `rm` command.
shutil.rmtree("build_test", ignore_errors=True)

In [2]:
from functools import partial

from torch.utils.data import DataLoader
from src.seq2seq.dataset import SeqDataset, pad_batch

# Single source of truth for the run: the same dict is forwarded to the
# dataset below and to the model in a later cell.
config = {
    'device': 'cpu',
    'quiet': False,
    'batch_size': 4,
    'max_len': 128,
    'max_epochs': 1,
    'valid_file' : None,
    'nworkers': 2
}
train_file='data/ArchiveII.csv'
batch_size = config.get("batch_size", 4)

train_loader = DataLoader(
    # max_len reaches SeqDataset through **config. Passing it explicitly as
    # well raises "TypeError: got multiple values for keyword argument 'max_len'".
    SeqDataset(train_file, training=True, **config),
    batch_size=batch_size,
    shuffle=True,
    num_workers=config["nworkers"],
    # partial instead of a lambda: with num_workers > 0 and the "spawn" start
    # method, collate_fn has to be pickled, and lambdas cannot be pickled.
    collate_fn=partial(pad_batch, fixed_length=config["max_len"]),
)

From 3864 sequences, filtering 0 < len < 128 we have 2326 sequences


In [3]:
from src.seq2seq.model import Seq2Seq

# Build the model from the shared config. train_len is len(train_loader),
# i.e. the number of batches the DataLoader yields in one pass.
net = Seq2Seq(train_len=len(train_loader), **config)

In [4]:
# Train on the batches from train_loader and keep whatever fit() returns
# (presumably training metrics, going by the name — confirm against Seq2Seq.fit).
train_metrics = net.fit(train_loader)

100%|██████████| 582/582 [01:20<00:00,  7.23it/s]


In [4]:
from src.seq2seq import train

# Same settings as the DataLoader/Seq2Seq cells, repeated here so this cell
# can be run on its own.
config = {
    'device': 'cpu',
    'batch_size': 4,
    'quiet': False,
    'max_len': 128,
    'max_epochs': 1,
    'valid_file' : None,
    'nworkers': 2
}
train_file='data/ArchiveII.csv'
out_path='./build_test/build_AE_seq2seq.csv'

# Call train with the arguments. out_path, valid_file and nworkers are taken
# from the variables defined above instead of being repeated as literals, so
# the values cannot drift apart.
# NOTE(review): the recorded run of this cell stopped with KeyError: 'embedding'
# right after the second progress bar started; the cause is inside train()
# and is not visible from this notebook.
train(
    train_file=train_file,
    config=config,
    out_path=out_path,
    valid_file=config['valid_file'],
    nworkers=config['nworkers'],
)

Working on ./build_test/build_AE_seq2seq.csv
From 3478 sequences, filtering 0 < len < 128 we have 2086 sequences
From 386 sequences, filtering 0 < len < 128 we have 240 sequences
No weights provided, using random initialization
Start training...


100%|██████████| 522/522 [01:18<00:00,  6.63it/s]
  0%|          | 0/60 [00:00<?, ?it/s]


KeyError: 'embedding'

# Tests

In [84]:
# Pull a single batch out of the training loader for manual inspection.
# train_loader was built with shuffle=True, so a fresh iterator yields a
# different batch each time this cell is run.
train_iter = iter(train_loader)
batch = next(train_iter)

In [98]:
# Summarise the first sample of the batch: available keys and tensor shapes.
print(batch.keys())
print(f'length: {batch["length"]}')
print(f'contact: {batch["contact"][0].shape}')
print(f'embedding: {batch["embedding"][0].shape}')
print(f'sequence: {len(batch["sequence"][0])}')

# First sample's embedding, laid out as [embedding_dim, seq_len].
embedding = batch["embedding"][0]
seq_len = embedding.shape[1]

# Positions whose column is zero in every channel (presumably the padded
# tail of the sequence — confirm against pad_batch). Reducing over dim 0
# works for any embedding_dim, not only 4, and avoids a Python-level loop
# over individual tensor elements.
all_zero_positions = (embedding == 0).all(dim=0).nonzero(as_tuple=True)[0].tolist()
l = all_zero_positions  # original name, kept in case another cell reads it

# Display the first channel of the embedding as the cell output.
embedding[0]

dict_keys(['contact', 'embedding', 'length', 'canonical_mask', 'interaction_prior', 'sequence', 'id'])
length: [76, 77, 83, 78]
contact: torch.Size([83, 83])
embedding: torch.Size([4, 83])
sequence: 76


tensor([0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 1., 0., 1., 0., 0.,
        0., 0., 0., 1., 0., 0., 1., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 1., 0., 0., 0., 1., 0., 0., 0., 0., 1., 0., 1., 0., 0., 0., 1., 0.,
        0., 0., 0., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        1., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.])

In [None]:
import matplotlib.pyplot as plt

# len_seq is not defined in any cell visible above, so on a fresh kernel this
# cell would fail with NameError. When it is missing, collect the per-sequence
# lengths from the batches of train_loader (each batch carries a "length" list).
# NOTE(review): train_loader already filters to 0 < len < 128, so this fallback
# only reproduces the filtered view plotted below — confirm that is the intent.
if "len_seq" not in globals():
    len_seq = [length for loader_batch in train_loader for length in loader_batch["length"]]

# 128 matches the max_len used for the dataset in the cells above.
fig, ax = plt.subplots()
ax.hist([i for i in len_seq if i < 128], bins=100)
ax.set(
    title="Sequence length distribution (len < 128)",
    xlabel="Sequence length",
    ylabel="Number of sequences",
);