# Testing notebook

In [2]:
from src.seq2seq.dataset import SeqDataset, pad_batch
from functools import partial

# Collate function with fixed_length bound to 128.
pad_batch_with_fixed_length = partial(pad_batch, fixed_length=128)

# Fold-0 test split.
# NOTE(review): absolute, machine-specific path; the notebook will not run
# elsewhere until this is made configurable.
dataset_path = '/home/gkulemeyer/Documents/Repos/AEseq2seq/data/ArchiveII-KFold/common/fold_0_test.csv'
data = SeqDataset(
    dataset_path,
    min_len=0,
    max_len=512,
    verbose=False,
    cache_path=None,
    for_prediction=False,
    training=False,
)

In [22]:
# Lengths of every item in the dataset, in ascending order.
L = sorted(data[i]['length'] for i in range(len(data)))
print(L)

[32, 33, 33, 33, 35, 36, 38, 51, 58, 58, 62, 64, 65, 65, 66, 70, 71, 71, 72, 72, 72, 72, 72, 72, 73, 73, 73, 73, 74, 74, 74, 74, 74, 74, 74, 74, 75, 75, 75, 75, 75, 75, 75, 75, 75, 75, 75, 75, 75, 76, 76, 76, 76, 76, 76, 76, 76, 76, 76, 76, 76, 76, 76, 76, 76, 76, 76, 76, 76, 76, 76, 76, 76, 76, 76, 76, 76, 76, 76, 76, 76, 76, 76, 76, 76, 76, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 78, 78, 78, 78, 78, 78, 79, 79, 80, 81, 82, 82, 82, 84, 85, 85, 85, 85, 85, 86, 86, 86, 87, 87, 87, 88, 88, 88, 88, 89, 89, 90, 90, 90, 90, 91, 91, 91, 91, 92, 92, 92, 92, 93, 94, 96, 96, 96, 98, 98, 98, 98, 98, 98, 98, 98, 98, 99, 99, 99, 100, 100, 100, 101, 101, 101, 101, 101, 101, 101, 101, 102, 102, 102, 103, 103, 103, 103, 103, 103, 103, 103, 103, 103, 104, 104, 104, 105, 105, 105, 105, 106, 106, 106, 106, 107, 107, 107, 108, 108, 110, 110, 111, 111, 111, 111, 111, 112, 113, 113, 113, 114, 114, 114, 114, 115, 115, 115, 1

In [3]:
# For the first three items show: index, stored length, embedding shape
# (one value per line).
for n in range(3):
    print(n, data[n]['length'], data[n]['embedding'].shape, sep="\n")

0
119
torch.Size([4, 119])
1
120
torch.Size([4, 120])
2
120
torch.Size([4, 120])


In [4]:
from torch.utils.data import DataLoader

# One item per batch, in dataset order, collated with the fixed-length padder.
loader = DataLoader(
    data,
    batch_size=1,
    shuffle=False,
    collate_fn=pad_batch_with_fixed_length,
)

# Shape of the padded embedding in the first batch.
next(iter(loader))['embedding'].shape

torch.Size([1, 4, 128])

In [5]:
import torch as tr
# Toy batch of shape (1, 4, 8): the 4x4 identity tiled twice along the last
# axis, i.e.
#   1 0 0 0 1 0 0 0
#   0 1 0 0 0 1 0 0
#   0 0 1 0 0 0 1 0
#   0 0 0 1 0 0 0 1
x = tr.eye(4, dtype=tr.int64).repeat(1, 2).unsqueeze(0)
x.shape

torch.Size([1, 4, 8])

In [8]:
import pandas as pd
from torch.utils.data import Dataset
import torch as tr
import os
import json
import pickle
import random
from src.seq2seq.embeddings import OneHotEmbedding


class SeqDataset2(Dataset):
    """Sequences read from a CSV file, served as clean/noisy embedding pairs.

    The CSV must provide ``id`` and ``sequence`` columns. Rows whose sequence
    length falls outside ``[min_len, max_len]`` (both bounds inclusive) are
    dropped. Every item is a dict with the keys ``id``, ``length``,
    ``sequence``, ``embedding`` and ``embedding_with_noise``.
    """

    def __init__(
        self,
        dataset_path,
        min_len=0,
        max_len=512,
        verbose=False,
        cache_path=None,
        for_prediction=False,
        training=False,
        n_swaps=0,
        **kargs,
    ):
        """
        Args:
            dataset_path: path of the CSV file to load.
            min_len: shortest sequence length kept (inclusive).
            max_len: longest sequence length kept (inclusive). ``None`` means
                the length of the longest sequence in the file.
            verbose: stored on the instance; not read by this class.
            cache_path: directory holding one pickle per item, or ``None`` to
                disable caching. The directory is created when missing.
            for_prediction: accepted but not used by this class.
            training: stored on the instance; not read by this class.
            n_swaps: passed to ``add_noise`` as ``N`` for every item.
            **kargs: accepted and ignored.

        Raises:
            AssertionError: if the CSV lacks the ``id`` or ``sequence`` column.
        """
        self.verbose = verbose
        self.training = training
        self.n_swaps = n_swaps

        if cache_path is not None:
            # makedirs also handles nested directories and is a no-op when
            # the directory already exists.
            os.makedirs(cache_path, exist_ok=True)
        self.cache = cache_path

        # Loading dataset
        data = pd.read_csv(dataset_path)

        assert (
            "sequence" in data.columns and "id" in data.columns
        ), "Dataset should contain 'id' and 'sequence' columns"

        data["len"] = data.sequence.str.len()

        if max_len is None:
            max_len = max(data["len"])
        self.max_len = max_len

        datalen = len(data)
        data = data[(data["len"] >= min_len) & (data["len"] <= max_len)]

        if len(data) < datalen:
            print(
                f"From {datalen} sequences, filtering {min_len} <= len <= {max_len} we have {len(data)} sequences"
            )

        self.sequences = data.sequence.tolist()
        self.ids = data.id.tolist()
        self.embedding = OneHotEmbedding()
        self.embedding_size = self.embedding.emb_size

    def __len__(self):
        """Number of sequences that survived the length filter."""
        return len(self.sequences)

    def __getitem__(self, idx):
        """Return the item dict for position ``idx``.

        With caching enabled, an existing ``<cache>/<id>.pk`` file is loaded
        and returned as is; otherwise the item is built and written there.
        NOTE: a cached item keeps the noise drawn when it was first built, so
        ``embedding_with_noise`` is no longer re-sampled on later accesses.
        """
        seqid = self.ids[idx]

        cache_file = None
        if self.cache is not None:
            cache_file = os.path.join(self.cache, f"{seqid}.pk")
            if os.path.isfile(cache_file):
                # NOTE(security): pickle can execute arbitrary code on load;
                # only point cache_path at directories you trust.
                with open(cache_file, "rb") as fin:
                    return pickle.load(fin)

        sequence = self.sequences[idx]
        seq_emb = self.embedding.seq2emb(sequence)
        # Hand add_noise a private copy: it may write into its argument, and
        # "embedding" must remain the clean reference.
        embedding_with_noise = add_noise(seq_emb.clone(), self.n_swaps)

        item = {
            "id": seqid,
            "length": len(sequence),
            "sequence": sequence,
            "embedding": seq_emb,
            "embedding_with_noise": embedding_with_noise,
        }

        if cache_file is not None:
            with open(cache_file, "wb") as fout:
                pickle.dump(item, fout)

        return item


def pad_batch(batch, fixed_length=0):
    """Collate a list of dataset items into a single zero-padded batch.

    Args:
        batch: non-empty list of item dicts, each holding ``id``, ``length``,
            ``sequence``, ``embedding`` and ``embedding_with_noise``. The
            embeddings are indexed as (channels, length).
        fixed_length: size of the padded last axis. ``0`` (default) pads to
            the longest item of the batch.

    Returns:
        dict with the list-valued keys ``id``, ``length`` and ``sequence``,
        the float tensors ``embedding`` and ``embedding_with_noise`` of shape
        (len(batch), channels, fixed_length), and a boolean ``mask`` of shape
        (len(batch), fixed_length) that is True on real (non-padded) positions.

    Raises:
        ValueError: if ``batch`` is empty, or if ``fixed_length`` is non-zero
            and shorter than the longest item (the item would not fit).
    """
    if len(batch) == 0:
        raise ValueError("pad_batch received an empty batch")

    L = [b["length"] for b in batch]
    longest = max(L)
    if fixed_length == 0:
        fixed_length = longest
    elif fixed_length < longest:
        # Fail early with a readable message instead of a tensor shape error
        # in the copy loop below.
        raise ValueError(
            f"fixed_length={fixed_length} is shorter than the longest sequence in the batch ({longest})"
        )

    n_items = len(batch)
    embedding_pad = tr.zeros((n_items, batch[0]["embedding"].shape[0], fixed_length))
    embedding_pad_w_noise = tr.zeros(
        (n_items, batch[0]["embedding_with_noise"].shape[0], fixed_length)
    )
    mask = tr.zeros((n_items, fixed_length), dtype=tr.bool)

    for k, (item, length) in enumerate(zip(batch, L)):
        embedding_pad[k, :, :length] = item["embedding"]
        embedding_pad_w_noise[k, :, :length] = item["embedding_with_noise"]
        mask[k, :length] = True

    return {
        "id": [b["id"] for b in batch],
        "length": L,
        "sequence": [b["sequence"] for b in batch],
        "embedding": embedding_pad,
        "embedding_with_noise": embedding_pad_w_noise,
        "mask": mask,
    }


def add_noise(x, N=0):
    """Return a copy of ``x`` with ``N`` distinct columns re-drawn at random.

    Each selected column (a position along the last axis) is overwritten with
    a one-hot vector whose hot row is picked at random. The drawn row may be
    the one that was already set, so fewer than ``N`` columns may actually
    differ from the input.

    Args:
        x: 2-D tensor indexed as ``x[:, pos]`` (rows, positions).
        N: number of positions to overwrite.

    Returns:
        A new tensor with the shape and dtype of ``x``. The input is never
        modified, so callers can keep it as the clean reference.

    Raises:
        AssertionError: if ``N`` is not lower than ``x.shape[-1]``.
    """
    assert N < x.shape[-1], "N should be lower than the shape of x (starting on 0)"

    # Work on a copy: writing into ``x`` would also corrupt the caller's
    # clean tensor.
    noisy = x.clone()
    if N == 0:
        return noisy

    # Shuffled positions are consumed from the end, so no position is
    # overwritten twice.
    positions = list(range(x.shape[-1]))
    random.shuffle(positions)
    rows = list(range(x.shape[0]))

    for _ in range(N):
        pos = positions.pop()
        random.shuffle(rows)
        column = tr.zeros(len(rows), dtype=x.dtype)
        column[rows[0]] = 1
        noisy[:, pos] = column
    return noisy


In [42]:
# Rebind the collate helper to the notebook-local pad_batch, with
# fixed_length bound to 128.
pad_batch_with_fixed_length = partial(pad_batch, fixed_length=128)

# Fold-0 training split, with 5 noise swaps requested per item.
# NOTE(review): absolute, machine-specific path.
dataset_path = '/home/gkulemeyer/Documents/Repos/AEseq2seq/data/ArchiveII-KFold/common/fold_0_train.csv'
data2 = SeqDataset2(
    dataset_path,
    min_len=0,
    max_len=512,
    verbose=False,
    cache_path=None,
    for_prediction=False,
    training=False,
    n_swaps=5,
)

In [43]:
# For each embedding variant print the raw item shape, then the shape of the
# same variant in the first padded batch.
for key in ('embedding_with_noise', 'embedding'):
    print(data2[0][key].shape)

    loader = DataLoader(
        data2,
        batch_size=1,
        shuffle=False,
        collate_fn=pad_batch_with_fixed_length,
    )
    print(next(iter(loader))[key].shape)

torch.Size([4, 119])
torch.Size([1, 4, 128])
torch.Size([4, 119])
torch.Size([1, 4, 128])


In [46]:
# Fetch ONE batch and compare its noisy and clean embeddings with each other.
# Every next(iter(loader)) call rebuilds the item and draws fresh random
# noise, so reading the two tensors from two separate calls would compare
# unrelated draws.
# Assuming one-hot columns, each position whose value really changed adds 2
# to the sum, so the result should be an even number no larger than
# 2 * n_swaps.
batch = next(iter(loader))
(batch['embedding_with_noise'] - batch['embedding']).abs().sum()


tensor(10.)