<a href="https://colab.research.google.com/github/Ljupka/Neural_Network_Bio/blob/main/04_DNA_enhancers.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Setup

In [1]:
!pip install -q genomic-benchmarks
!pip install torchmetrics -q

  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m51.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for genomic-benchmarks (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m983.2/983.2 kB[0m [31m27.0 MB/s[0m eta [36m0:00:00[0m
[?25h

## Text preprocessing

In [2]:
import torch
example_seq = 'ACCCTGCCAACACGGGACTTTAC'
vocab = {'A':0,'C':1,'T':2,'G':3}

In [3]:
numericalized = [vocab[c] for c in example_seq]
numericalized

[0, 1, 1, 1, 2, 3, 1, 1, 0, 0, 1, 0, 1, 3, 3, 3, 0, 1, 2, 2, 2, 0, 1]

In [4]:
numericalized_tensor = torch.tensor(numericalized)
ohe_seq = torch.nn.functional.one_hot(numericalized_tensor, num_classes=5)
ohe_seq

tensor([[1, 0, 0, 0, 0],
        [0, 1, 0, 0, 0],
        [0, 1, 0, 0, 0],
        [0, 1, 0, 0, 0],
        [0, 0, 1, 0, 0],
        [0, 0, 0, 1, 0],
        [0, 1, 0, 0, 0],
        [0, 1, 0, 0, 0],
        [1, 0, 0, 0, 0],
        [1, 0, 0, 0, 0],
        [0, 1, 0, 0, 0],
        [1, 0, 0, 0, 0],
        [0, 1, 0, 0, 0],
        [0, 0, 0, 1, 0],
        [0, 0, 0, 1, 0],
        [0, 0, 0, 1, 0],
        [1, 0, 0, 0, 0],
        [0, 1, 0, 0, 0],
        [0, 0, 1, 0, 0],
        [0, 0, 1, 0, 0],
        [0, 0, 1, 0, 0],
        [1, 0, 0, 0, 0],
        [0, 1, 0, 0, 0]])

## Promoters Project

Your task is to

1.   **Create model for DNA sequence classification based on if it contains an promoter (label 1) or not (label 0)**
2.   **Show that your model is generalizing on new unseen data**

Tips
*   Use the pytorch documentation
*   You can use nn.Conv1d layer to perform convolution over 1D data.
*   Feel free to use any other improvements you can think of or find on the internet (e.g. more metrics, different architecture...)
*   Use GPU for training






In [5]:
from genomic_benchmarks.dataset_getters.pytorch_datasets import HumanNontataPromoters
import pandas as pd

train_df = pd.DataFrame(data=[{'x':x,'y':y} for x,y in HumanNontataPromoters('train')])
test_df = pd.DataFrame(data=[{'x':x,'y':y} for x,y in HumanNontataPromoters('test')])

  from tqdm.autonotebook import tqdm


In [6]:
train_df

Unnamed: 0,x,y
0,GAAGGAAGAGAGACAGGCGGGAGACCCCAGATTCTTCTAAACAACC...,0
1,AAAGTTTATAAATTTAATCTGATGAGGGTAAAAATTAATGTTCTTT...,0
2,GGCGGGCCCGGCGTCTCGCGGCCCCGGACTGACAAGGCGGCGCGGG...,0
3,ATCCCGAAAGGAGGTTGGCATTGCCCGGGTCATCGAGAGAGGGAGG...,0
4,CTTCCAGCCTGCTTATCCTCTGCCCCACTAGCCCCCACCCCCCAGC...,0
...,...,...
27092,CACAACCATCTGGGCTCGCTGAGACCTGGGCAGGCACAGGCCCAGG...,1
27093,ACATGGCAAGAAGGTGCTGACTTCCTTGGGAGATGCCATAAAGCAC...,1
27094,GGGCTGGGTCCTCTGCCCCTGCAGGTGGTCTATGTTGCCCGAAACC...,1
27095,AGCAAGACCCCATCTATATAAAACATTAAAAAGGGCCAGGCGCGGT...,1


In [7]:
from itertools import product

letters = list(vocab.keys())
triplets = [''.join(p) for p in product(letters, repeat=3)]
vocab_components = {triplet: i for i, triplet in enumerate(triplets)}

In [8]:
vocab_components

{'AAA': 0,
 'AAC': 1,
 'AAT': 2,
 'AAG': 3,
 'ACA': 4,
 'ACC': 5,
 'ACT': 6,
 'ACG': 7,
 'ATA': 8,
 'ATC': 9,
 'ATT': 10,
 'ATG': 11,
 'AGA': 12,
 'AGC': 13,
 'AGT': 14,
 'AGG': 15,
 'CAA': 16,
 'CAC': 17,
 'CAT': 18,
 'CAG': 19,
 'CCA': 20,
 'CCC': 21,
 'CCT': 22,
 'CCG': 23,
 'CTA': 24,
 'CTC': 25,
 'CTT': 26,
 'CTG': 27,
 'CGA': 28,
 'CGC': 29,
 'CGT': 30,
 'CGG': 31,
 'TAA': 32,
 'TAC': 33,
 'TAT': 34,
 'TAG': 35,
 'TCA': 36,
 'TCC': 37,
 'TCT': 38,
 'TCG': 39,
 'TTA': 40,
 'TTC': 41,
 'TTT': 42,
 'TTG': 43,
 'TGA': 44,
 'TGC': 45,
 'TGT': 46,
 'TGG': 47,
 'GAA': 48,
 'GAC': 49,
 'GAT': 50,
 'GAG': 51,
 'GCA': 52,
 'GCC': 53,
 'GCT': 54,
 'GCG': 55,
 'GTA': 56,
 'GTC': 57,
 'GTT': 58,
 'GTG': 59,
 'GGA': 60,
 'GGC': 61,
 'GGT': 62,
 'GGG': 63}

In [9]:
def get_components(seq, component_length=3):
    return [seq[i:i+component_length] for i in range(len(seq) - component_length+1)]

In [10]:
def bag_of_components(components, vocab_components):
    bow_vector = torch.zeros(len(vocab_components) + 1) # Add 1 for unknown components
    for comp in components:
        if comp in vocab_components:
            bow_vector[vocab_components[comp]] += 1
        else:
            bow_vector[len(vocab_components)] += 1 # Increment count for unknown components
    return bow_vector

In [11]:
# both of the options yield same output
#vocab_components.get('CCC', len(vocab_components))
#vocab_components.get('CCC')

In [12]:
import torch

def one_hot_encode_sequence_components(components, vocab_components):
    #print("components")
    #print(components)

    numericalized_components = [vocab_components.get(comp, len(vocab_components)) for comp in components]
    #print("numericalized components: ")
    #print(numericalized_components)

    numericalized_tensor = torch.tensor(numericalized_components)
    #print("numericalized tensor: ")
    #print(numericalized_tensor)
    # Add 1 to num_classes to account for components not found in vocab_components
    ohe_seq = torch.nn.functional.one_hot(numericalized_tensor, num_classes=len(vocab_components) + 1)

    #print("ohe seq: ")
    #print(ohe_seq)
    return ohe_seq


In [13]:
from torch.utils.data import Dataset, DataLoader

class PromotersDataset(Dataset):
    def __init__(self, train_df):
       self.train_df = train_df.copy()

       train_df['components'] = train_df['x'].apply(get_components)
       #train_df['bow_components'] = train_df['components'].apply(lambda x: bag_of_components(x, vocab_components))
       train_df['ohe_components'] = train_df['components'].apply(lambda x: one_hot_encode_sequence_components(x, vocab_components))


    def __len__(self):
       return len(self.train_df)

    def __getitem__(self, idx):
        if idx < 0 or idx >= len(self):
            raise IndexError(f"Index {idx} out of range")
        row = self.train_df.iloc[idx]
        # return a dict; convert to tensors here if desired
        return {
            "x": row["x"],
            "ohe_components": one_hot_encode_sequence_components(row["x"], vocab_components),
            "y": row["y"]
        }

In [14]:
dataset = PromotersDataset(train_df)
test_dataset = PromotersDataset(test_df)

In [15]:
print(len(dataset))
print(dataset[0])

27097
{'x': 'GAAGGAAGAGAGACAGGCGGGAGACCCCAGATTCTTCTAAACAACCAGATCTTGCGTGAACTAACCGAGCGAGAACTCACTTATCACCAAGAGGATGGTGCTGAGCCATTCATGAGGGAACTGCTGGCGTGATCCACTCACTTCCCATCAGGCCTCACTTCCAACATTGGGAATCACATTTCAGCATGAGATTTGGAGGGGAGAAACATCCACACCATACCACAAATAAAAAGTCCCAGAGTTGTCTACTT', 'ohe_components': tensor([[0, 0, 0,  ..., 0, 0, 1],
        [0, 0, 0,  ..., 0, 0, 1],
        [0, 0, 0,  ..., 0, 0, 1],
        ...,
        [0, 0, 0,  ..., 0, 0, 1],
        [0, 0, 0,  ..., 0, 0, 1],
        [0, 0, 0,  ..., 0, 0, 1]]), 'y': np.int64(0)}


In [16]:
# nr of 3-grams for the 0-th element of the train_df
first_elem = train_df.iloc[0]
len(first_elem['components'])

249

In [17]:
# Model
import torch.nn as nn
import math


class CNN(nn.Module):
    def __init__(self,num_classes=2):
        super().__init__()

        self.conv_layers = nn.Sequential(
            # in_channels:  due to the representation of the input as
            # out_channels: arbitrary value, nr of features to learn
            nn.Conv1d(in_channels=65, out_channels=16, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool1d(kernel_size=4, stride=4),
        )
        # Flatten layer
        self.flatten = nn.Flatten()

        # Fully connected layers
        self.fc1 = nn.Linear(16 * 62, 128) # Calculate input size based on conv output and pooling
        self.relu3 = nn.ReLU()
        self.fc2 = nn.Linear(128, num_classes)


    def forward(self, x):
        x = self.conv_layers(x)
        x = self.flatten(x)
        x = self.fc1(x)
        x = self.relu3(x)
        x = self.fc2(x)
        return x

model = CNN(num_classes=2)

In [18]:
#device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

#model(torch.rand(1, 65, 251).to(device))

In [19]:
import numpy as np
from torchmetrics import Accuracy

train_loader = torch.utils.data.DataLoader(dataset)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

def train (model, dataset, gpu=False):

  accuracy_function = Accuracy(task='multiclass', num_classes=2)
  criterion = nn.CrossEntropyLoss()


  model.to(device)
  accuracy_function.to(device)

  num_epochs = 2

  for epoch in range(num_epochs):
    for batch_idx, inputs in enumerate(train_loader):

      sequence = inputs['x']
      ohe_components = inputs['ohe_components'].float()
      labels = inputs['y']

      print("Ohe component: ")
      print(ohe_components)
      print("Labels: ")
      print(labels)

      print("shape: ")
      print(ohe_components.shape)
      input_reshaped = ohe_components.permute(0, 2, 1)
      print("reshaped: ")
      print(input_reshaped.shape)


      inputs = input_reshaped.to(device)
      labels = labels.to(device)

      optimizer.zero_grad()
      outputs = model(inputs)
      loss = criterion(outputs, labels)
      loss.backward()
      optimizer.step()

      if (batch_idx) % 10 == 0:
        print('Epoch [%d/%d], Step [%d/%d], Loss: %.4f, Accuracy: %.4f'
          %(epoch+1, num_epochs, batch_idx, len(train_loader.dataset)//inputs.size()[0], loss.item(), accuracy_function(outputs,labels)))




In [20]:
train_df.head(5)

Unnamed: 0,x,y,components,ohe_components
0,GAAGGAAGAGAGACAGGCGGGAGACCCCAGATTCTTCTAAACAACC...,0,"[GAA, AAG, AGG, GGA, GAA, AAG, AGA, GAG, AGA, ...","[[tensor(0), tensor(0), tensor(0), tensor(0), ..."
1,AAAGTTTATAAATTTAATCTGATGAGGGTAAAAATTAATGTTCTTT...,0,"[AAA, AAG, AGT, GTT, TTT, TTA, TAT, ATA, TAA, ...","[[tensor(1), tensor(0), tensor(0), tensor(0), ..."
2,GGCGGGCCCGGCGTCTCGCGGCCCCGGACTGACAAGGCGGCGCGGG...,0,"[GGC, GCG, CGG, GGG, GGC, GCC, CCC, CCG, CGG, ...","[[tensor(0), tensor(0), tensor(0), tensor(0), ..."
3,ATCCCGAAAGGAGGTTGGCATTGCCCGGGTCATCGAGAGAGGGAGG...,0,"[ATC, TCC, CCC, CCG, CGA, GAA, AAA, AAG, AGG, ...","[[tensor(0), tensor(0), tensor(0), tensor(0), ..."
4,CTTCCAGCCTGCTTATCCTCTGCCCCACTAGCCCCCACCCCCCAGC...,0,"[CTT, TTC, TCC, CCA, CAG, AGC, GCC, CCT, CTG, ...","[[tensor(0), tensor(0), tensor(0), tensor(0), ..."


In [21]:
train(model, dataset)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
         [0., 0., 0.,  ..., 0., 0., 1.],
         [0., 0., 0.,  ..., 0., 0., 1.],
         [0., 0., 0.,  ..., 0., 0., 1.]]])
Labels: 
tensor([1])
shape: 
torch.Size([1, 251, 65])
reshaped: 
torch.Size([1, 65, 251])
Ohe component: 
tensor([[[0., 0., 0.,  ..., 0., 0., 1.],
         [0., 0., 0.,  ..., 0., 0., 1.],
         [0., 0., 0.,  ..., 0., 0., 1.],
         ...,
         [0., 0., 0.,  ..., 0., 0., 1.],
         [0., 0., 0.,  ..., 0., 0., 1.],
         [0., 0., 0.,  ..., 0., 0., 1.]]])
Labels: 
tensor([1])
shape: 
torch.Size([1, 251, 65])
reshaped: 
torch.Size([1, 65, 251])
Ohe component: 
tensor([[[0., 0., 0.,  ..., 0., 0., 1.],
         [0., 0., 0.,  ..., 0., 0., 1.],
         [0., 0., 0.,  ..., 0., 0., 1.],
         ...,
         [0., 0., 0.,  ..., 0., 0., 1.],
         [0., 0., 0.,  ..., 0., 0., 1.],
         [0., 0., 0.,  ..., 0., 0., 1.]]])
Labels: 
tensor([1])
shape: 
torch.Size([1, 251, 65])
reshaped: 
torch.Siz

## Testing

In [32]:
from tqdm import tqdm
from torchmetrics import Accuracy

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

def evaluate(model, dataset, gpu=True):
  accuracy_function = Accuracy(task='binary')

  if(gpu):
    model.to(device)
    accuracy_function.to(device)

    loader = DataLoader(test_dataset, batch_size=32, shuffle=False)
    model.eval() #Turn off training-only layers
    all_predictions = []
    all_labels = []
    with torch.no_grad(): #Dont track gradients
      for batch in tqdm(loader):
        batch_x = batch['ohe_components'].float()
        batch_y = batch['y']

        if(gpu):
          batch_x, batch_y = batch_x.to(device), batch_y.to(device)

        # Reshape batch_x to be (batch_size, channels, length) for Conv1d
        batch_x = batch_x.permute(0, 2, 1)

        print("batch x: ")
        print(batch_x.shape)

        print("batch y: ")
        print(batch_y.shape)

        output = model(batch_x)

        print("output: ")
        print(output.shape)
        print(output)
        # Get the predicted class index
        predicted_classes = torch.argmax(output, dim=1)

        all_predictions.append(predicted_classes)
        all_labels.append(batch_y)

    print('Accuracy:', accuracy_function(torch.cat(all_predictions), torch.cat(all_labels)).item())

In [33]:
evaluate(model, test_dataset)

  0%|          | 1/283 [00:00<00:51,  5.52it/s]

batch x: 
torch.Size([32, 65, 251])
batch y: 
torch.Size([32])
output: 
torch.Size([32, 2])
tensor([[-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670]], device='cud

  8%|▊         | 23/283 [00:00<00:03, 70.12it/s]

batch x: 
torch.Size([32, 65, 251])
batch y: 
torch.Size([32])
output: 
torch.Size([32, 2])
tensor([[-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670]], device='cud

 16%|█▌        | 44/283 [00:00<00:02, 87.98it/s]

batch x: 
torch.Size([32, 65, 251])
batch y: 
torch.Size([32])
output: 
torch.Size([32, 2])
tensor([[-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670]], device='cud

 23%|██▎       | 65/283 [00:00<00:02, 94.40it/s]

batch x: 
torch.Size([32, 65, 251])
batch y: 
torch.Size([32])
output: 
torch.Size([32, 2])
tensor([[-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670]], device='cud

 30%|███       | 85/283 [00:01<00:02, 92.68it/s]

batch x: 
torch.Size([32, 65, 251])
batch y: 
torch.Size([32])
output: 
torch.Size([32, 2])
tensor([[-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670]], device='cud

 34%|███▍      | 96/283 [00:01<00:01, 95.44it/s]

batch x: 
torch.Size([32, 65, 251])
batch y: 
torch.Size([32])
output: 
torch.Size([32, 2])
tensor([[-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670]], device='cud

 42%|████▏     | 118/283 [00:01<00:01, 99.44it/s]

batch x: 
torch.Size([32, 65, 251])
batch y: 
torch.Size([32])
output: 
torch.Size([32, 2])
tensor([[-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670]], device='cud

 49%|████▉     | 140/283 [00:01<00:01, 100.79it/s]

tensor([[-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670]], device='cuda:0')
batch x: 
torch.Size([32, 65, 251])
batch y: 
torch.Size([32])
output: 
torch.Size([32

 57%|█████▋    | 161/283 [00:01<00:01, 97.31it/s]

batch x: 
torch.Size([32, 65, 251])
batch y: 
torch.Size([32])
output: 
torch.Size([32, 2])
tensor([[-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670]], device='cud

 64%|██████▍   | 181/283 [00:02<00:01, 94.71it/s]

batch x: 
torch.Size([32, 65, 251])
batch y: 
torch.Size([32])
output: 
torch.Size([32, 2])
tensor([[-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670]], device='cud

 71%|███████▏  | 202/283 [00:02<00:00, 97.09it/s]

batch x: 
torch.Size([32, 65, 251])
batch y: 
torch.Size([32])
output: 
torch.Size([32, 2])
tensor([[-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670]], device='cud

 79%|███████▉  | 223/283 [00:02<00:00, 98.45it/s]

batch x: 
torch.Size([32, 65, 251])
batch y: 
torch.Size([32])
output: 
torch.Size([32, 2])
tensor([[-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670]], device='cud

 86%|████████▌ | 244/283 [00:02<00:00, 99.02it/s]

tensor([[-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670]], device='cuda:0')
batch x: 
torch.Size([32, 65, 251])
batch y: 
torch.Size([32])
output: 
torch.Size([32

 94%|█████████▎| 265/283 [00:02<00:00, 97.82it/s]

output: 
torch.Size([32, 2])
tensor([[-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670]], device='cuda:0')
batch x: 
torch.Size([32, 65, 251])
batch y: 
torch.Size(

100%|██████████| 283/283 [00:03<00:00, 92.36it/s]

batch x: 
torch.Size([32, 65, 251])
batch y: 
torch.Size([32])
output: 
torch.Size([32, 2])
tensor([[-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670],
        [-6.4888,  5.9670]], device='cud




In [None]:
from itertools import product

# Bases
bases = ['A', 'C', 'G', 'T']

# Generate all 3-mer combinations
all_3mers = [''.join(p) for p in product(bases, repeat=3)]

print("Total 3-mers:", len(all_3mers))
print(all_3mers)

In [None]:
encoded_codons = [[vocab[base] for base in all_3mer] for all_3mer in all_3mers]
print(encoded_codons)