In [1]:
import os
from config import *
import torch
from dataloaders import CNNDataset
from torch.utils.data import random_split, DataLoader
import pandas as pd



In [2]:
# Build the full dataset from the training CSV, the targets CSV and the matrix directory.
dataset = CNNDataset(
    data_csv=P_TRAIN_CSV,
    target_csv=P_TARGETS_CSV,
    matrix_dir=ETERNA_PKG_BPP,
)

In [3]:
# Seeded generator so the 70/30 train/validation split is the same on every run.
generator1 = torch.Generator().manual_seed(42)
train_dataset, val_dataset = random_split(
    dataset,
    lengths=[0.7, 0.3],
    generator=generator1,
)

In [4]:
# Show the `path` entries recorded for one specific sequence id.
is_target_seq = dataset.matrix_df['sequence_id'] == '00257e85caac'
dataset.matrix_df.loc[is_target_seq, 'path']

143378    DATA/stanford-ribonanza-rna-folding/Ribonanza_...
Name: path, dtype: object

In [5]:
# Both loaders share the same settings; only the training loader shuffles.
loader_kwargs = dict(batch_size=64, num_workers=4, persistent_workers=True)
train_dataloader = DataLoader(train_dataset, shuffle=True, **loader_kwargs)
val_dataloader = DataLoader(val_dataset, shuffle=False, **loader_kwargs)

In [6]:
# Pull at most one batch from the training loader and show the shape of its inputs.
first_batch = next(iter(train_dataloader), None)
if first_batch is not None:
    x, y = first_batch
    print(x.shape)

torch.Size([64, 1, 224, 224])


In [6]:
# Inspect the first sample through normal subscripting.
dataset[0]

8
(207,)
Shape of reactivity:  torch.Size([223])
after function shape: torch.Size([224, 224])
Shape of matrix:  torch.Size([224, 224])


(tensor([[0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         ...,
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]]),
 tensor([ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  0.0000e+00,  1.0000e+00,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
         -3.6000e-02,  6.8300e-01,  1.8900e-01, -5.0000e-03, -3.9000e-02,
         -1.7000e-02,  2.7000e-02, -7.0000e-03,  1.4800e-01,  2.8000e-02,
         -1.4400e-01, -6.1000e-02, -7.3000e-02,  9.0000e-03,  0.0000e+00

In [10]:
## credits to https://github.com/GitYCC/crnn-pytorch/blob/master/src/model.py
import torch.nn as nn
import torch.nn.functional as F
import torch


class CRNN(nn.Module):
    """Convolutional-recurrent network that emits one scalar per output time step.

    A CNN backbone turns the input image into a feature map; each column of
    that map (all channels and rows at one width position) becomes one time
    step, which is projected by a linear layer, passed through two
    bidirectional LSTMs, and reduced to a single value by a final linear layer.

    Args:
        img_channel: number of channels of the input images.
        img_height: input image height; must be a multiple of 16 and >= 32.
        img_width: input image width; must be a multiple of 4 and >= 8.
        map_to_seq_hidden: size of the per-time-step projection fed to the LSTMs.
        rnn_hidden: hidden size of each LSTM direction.
        leaky_relu: use ``LeakyReLU(0.2)`` instead of ``ReLU`` in the backbone.
    """

    def __init__(self, img_channel, img_height, img_width,
                 map_to_seq_hidden=64, rnn_hidden=256, leaky_relu=False):
        super().__init__()

        self.cnn, (output_channel, output_height, output_width) = \
            self._cnn_backbone(img_channel, img_height, img_width, leaky_relu)

        # Every width position of the feature map is flattened over
        # (channel, height) before being projected to `map_to_seq_hidden`.
        self.map_to_seq = nn.Linear(output_channel * output_height, map_to_seq_hidden)

        self.rnn1 = nn.LSTM(map_to_seq_hidden, rnn_hidden, bidirectional=True)
        self.rnn2 = nn.LSTM(2 * rnn_hidden, rnn_hidden, bidirectional=True)

        # Bidirectional LSTM output is 2 * rnn_hidden wide; regress to one value.
        self.dense = nn.Linear(2 * rnn_hidden, 1)

    def _cnn_backbone(self, img_channel, img_height, img_width, leaky_relu):
        """Build the convolutional backbone.

        Returns:
            ``(cnn, (channels, height, width))`` where the tuple is the shape of
            the feature map the backbone produces for one
            ``(img_channel, img_height, img_width)`` image.

        Raises:
            AssertionError: if the image size is incompatible with the pooling
                schedule (height not a multiple of 16, width not a multiple of
                4, or either too small to leave a non-empty feature map).
        """
        assert img_height % 16 == 0, "img_height must be a multiple of 16"
        assert img_width % 4 == 0, "img_width must be a multiple of 4"
        # The final 2x2 unpadded convolution removes one row and one column,
        # so anything smaller would leave an empty feature map.
        assert img_height >= 32, "img_height must be at least 32"
        assert img_width >= 8, "img_width must be at least 8"

        channels = [img_channel, 64, 128, 256, 256, 512, 512, 512]
        kernel_sizes = [3, 3, 3, 3, 3, 3, 2]
        strides = [1, 1, 1, 1, 1, 1, 1]
        paddings = [1, 1, 1, 1, 1, 1, 0]

        cnn = nn.Sequential()

        def conv_relu(i, batch_norm=False):
            """Append conv layer `i` (optionally with batch norm) plus its activation."""
            # shape of input: (batch, input_channel, height, width)
            input_channel = channels[i]
            output_channel = channels[i + 1]

            cnn.add_module(
                f'conv{i}',
                nn.Conv2d(input_channel, output_channel, kernel_sizes[i], strides[i], paddings[i])
            )

            if batch_norm:
                cnn.add_module(f'batchnorm{i}', nn.BatchNorm2d(output_channel))

            relu = nn.LeakyReLU(0.2, inplace=True) if leaky_relu else nn.ReLU(inplace=True)
            cnn.add_module(f'relu{i}', relu)

        # size of image: (channel, height, width) = (img_channel, img_height, img_width)
        conv_relu(0)
        cnn.add_module('pooling0', nn.MaxPool2d(kernel_size=(1, 2), stride=(1, 2)))
        # (64, img_height, img_width // 2)

        conv_relu(1)
        cnn.add_module('pooling1', nn.MaxPool2d(kernel_size=(1, 2), stride=(1, 2)))
        # (128, img_height, img_width // 4)

        conv_relu(2)
        conv_relu(3)
        # pooling0/pooling1 only shrink the width, so the two remaining pools
        # must shrink the height by 16 in total to match the shape returned
        # below. `kernel_sizes=` is not a MaxPool2d argument; stride defaults
        # to kernel_size, giving non-overlapping 4-row windows.
        # NOTE(review): confirm that a 4x4 split of the height reduction is the
        # intended pooling geometry.
        cnn.add_module('pooling2', nn.MaxPool2d(kernel_size=(4, 1)))
        # (256, img_height // 4, img_width // 4)

        conv_relu(4, batch_norm=True)
        conv_relu(5, batch_norm=True)
        cnn.add_module('pooling3', nn.MaxPool2d(kernel_size=(4, 1)))
        # (512, img_height // 16, img_width // 4)

        conv_relu(6)  # (512, img_height // 16 - 1, img_width // 4 - 1)

        output_channel, output_height, output_width = \
            channels[-1], img_height // 16 - 1, img_width // 4 - 1
        return cnn, (output_channel, output_height, output_width)

    def forward(self, images):
        """Compute per-time-step predictions.

        Args:
            images: tensor of shape (batch, channel, height, width).

        Returns:
            Tensor of shape (seq_len, batch, 1), where seq_len is the width of
            the backbone feature map (``img_width // 4 - 1``).
        """
        conv = self.cnn(images)
        batch, channel, height, width = conv.size()

        # Collapse (channel, height) into one feature axis, then put width
        # first so it acts as the sequence axis for the LSTMs.
        conv = conv.view(batch, channel * height, width)
        conv = conv.permute(2, 0, 1)  # (width, batch, feature)
        seq = self.map_to_seq(conv)

        recurrent, _ = self.rnn1(seq)
        recurrent, _ = self.rnn2(recurrent)

        # Kept as an attribute because the original exposed the last output this way.
        self.output = self.dense(recurrent)
        return self.output  # shape: (seq_len, batch, 1)
    


In [11]:
import torch
import torch.nn.functional as F

from crnn import CRNN
import numpy as np

import os
from tqdm import tqdm
import torch
from torch.utils.data import DataLoader, random_split
import torch.optim as optim
from torch.nn import CTCLoss

from dataloaders import CNNDataset
from config import *


# 📉 Define loss functions for training and evaluation
def loss_fn(output, target):
    """Mean squared error between `output` and `target` limited to [0, 1].

    Args:
        output: predicted values.
        target: reference values; anything below 0 or above 1 is clamped
            to the nearest bound before the error is computed.

    Returns:
        Scalar tensor holding the mean of the squared differences.
    """
    bounded_target = target.clamp(min=0, max=1)
    return F.mse_loss(output, bounded_target)

def mae_fn(output, target):
    """Mean absolute error between `output` and `target` limited to [0, 1].

    Args:
        output: predicted values.
        target: reference values; anything below 0 or above 1 is clamped
            to the nearest bound before the error is computed.

    Returns:
        Scalar tensor holding the mean of the absolute differences.
    """
    bounded_target = target.clamp(min=0, max=1)
    return F.l1_loss(output, bounded_target)

def _to_batch_first(outputs):
    """Rearrange a (seq_len, batch, 1) model output into (batch, seq_len)."""
    return outputs.squeeze(-1).transpose(0, 1)


def _forward_loss(model, data, criterion, device):
    """Run one forward pass and score it.

    Args:
        model: network called as ``model(images)``.
            NOTE(review): assumed to return (seq_len, batch, 1) — confirm
            against the CRNN actually in scope.
        data: an ``(images, targets)`` pair as yielded by the dataloader.
            NOTE(review): targets are assumed to be (batch, seq_len) — confirm
            against CNNDataset.
        criterion: callable ``(predictions, targets) -> scalar tensor``.
        device: device the tensors are moved to before the forward pass.

    Returns:
        ``(loss, predictions, targets)`` with predictions laid out batch-first.
    """
    images, targets = (d.to(device) for d in data)
    predictions = _to_batch_first(model(images))
    return criterion(predictions, targets), predictions, targets


def _train_step(model, data, optimizer, criterion, device):
    """Do one optimisation step; return ``(loss_value, predictions, targets)``.

    The returned predictions are detached so metrics can be computed on them
    without keeping the autograd graph alive.
    """
    model.train()
    loss, predictions, targets = _forward_loss(model, data, criterion, device)

    optimizer.zero_grad()
    loss.backward()
    torch.nn.utils.clip_grad_norm_(model.parameters(), 5)  # gradient clipping with 5
    optimizer.step()
    return loss.item(), predictions.detach(), targets


def train_batch(crnn, data, optimizer, criterion, device):
    """Train `crnn` on a single batch and return the loss as a Python float.

    Args:
        crnn: the model to train.
        data: an ``(images, targets)`` pair.
        optimizer: optimizer already bound to ``crnn.parameters()``.
        criterion: callable ``(predictions, targets) -> scalar tensor``; it
            receives predictions laid out as (batch, seq_len).
        device: device on which the step is executed.

    Returns:
        The batch loss as a float.
    """
    loss_value, _, _ = _train_step(crnn, data, optimizer, criterion, device)
    return loss_value


def main(train_loader=None, val_loader=None, epochs=1):
    """Train a CRNN and report per-epoch training and validation loss / MAE.

    Args:
        train_loader: iterable of ``(images, targets)`` training batches.
            Defaults to the notebook-level ``train_dataloader``.
        val_loader: iterable of ``(images, targets)`` validation batches.
            Defaults to the notebook-level ``val_dataloader``.
        epochs: number of passes over the training data.
    """
    # Fall back to the loaders built in an earlier notebook cell so that a
    # bare `main()` keeps working.
    if train_loader is None:
        train_loader = train_dataloader
    if val_loader is None:
        val_loader = val_dataloader

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f'device: {device}')

    # NOTE(review): `CRNN` resolves to whichever binding ran last. This cell's
    # `from crnn import CRNN` replaces the class defined in the notebook —
    # confirm which implementation is meant to be trained.
    model = CRNN(img_channel=1, img_width=224, img_height=224)
    print(model)
    model.to(device)

    # Optimizer with learning rate and weight decay
    optimizer = torch.optim.Adam(model.parameters(), lr=0.0003, weight_decay=5e-4)

    for epoch in range(epochs):
        train_losses = []
        train_maes = []

        # Training pass: one optimisation step per batch.
        for batch in (pbar := tqdm(train_loader, position=0, leave=True)):
            loss, predictions, targets = _train_step(model, batch, optimizer, loss_fn, device)
            train_losses.append(loss)
            train_maes.append(mae_fn(predictions, targets).item())
            pbar.set_description(f"Train loss {loss:.4f}")

        # Average training loss and MAE for the epoch
        print(f"Epoch {epoch} train loss: ", np.mean(train_losses))
        print(f"Epoch {epoch} train mae: ", np.mean(train_maes))

        val_losses = []
        val_maes = []
        model.eval()

        # Validation pass: no gradients are needed, so skip building the graph.
        with torch.no_grad():
            for batch in (pbar := tqdm(val_loader, position=0, leave=True)):
                loss, predictions, targets = _forward_loss(model, batch, loss_fn, device)
                loss_value = loss.item()
                val_losses.append(loss_value)
                val_maes.append(mae_fn(predictions, targets).item())
                pbar.set_description(f"Validation loss {loss_value:.4f}")

        # Average validation loss and MAE for the epoch
        print(f"Epoch {epoch} val loss: ", np.mean(val_losses))
        print(f"Epoch {epoch} val mae: ", np.mean(val_maes))


In [12]:
# Run the training / validation loop defined in the previous cell.
main()

device: cpu


  0%|          | 0/2159 [00:01<?, ?it/s]


RuntimeError: max_pool1d: Expected 2D or 3D (batch mode) tensor with optional 0 dim batch size for input, but got:[64, 64, 224, 224]

In [8]:
# Load the targets table from the CSV at P_TARGETS_CSV.
targets = pd.read_csv(P_TARGETS_CSV)

In [11]:
# Shape of the first targets row as float32, leaving out the last column.
first_target_row = targets.iloc[0, :-1]
first_target_row.values.astype('float32').shape

(207,)

In [13]:
import numpy as np

test = 'DATA/Ribonanza_bpp_files/extra_data/9/8/2/ff4dcd9bf671.txt'

# Load the whitespace-separated (row, col, value) triplets.
# ndmin=2 keeps a file containing a single triplet as a (1, 3) array, so the
# column slicing below does not fail on one-line files.
data = np.loadtxt(test, ndmin=2)

# Row / column numbers in the file are 1-based; cast them to integers once.
rows = data[:, 0].astype(int)
cols = data[:, 1].astype(int)

# The dense array is sized by the largest row and column index that occur.
# NOTE(review): this is not necessarily square (the recorded output below is
# (191, 207)); if a square matrix is needed, size both axes by
# max(max_row, max_col) — confirm the intended shape.
max_row = int(rows.max())
max_col = int(cols.max())

# Start from zeros and write each listed value at its 0-based position.
filled_array = np.zeros((max_row, max_col))
for row, col, value in zip(rows, cols, data[:, 2]):
    filled_array[row - 1, col - 1] = value

In [15]:
# Dimensions of the dense matrix built above (`.s` is not an ndarray attribute).
filled_array.shape

(191, 207)

In [40]:
# Load the training CSV from P_TRAIN_CSV.
# NOTE(review): the name `dara` looks like a typo (possibly for `data`, which an
# earlier cell already binds to the loaded triplet array) — confirm before renaming.
dara = pd.read_csv(P_TRAIN_CSV)

In [30]:

# Walk the directory tree under ETERNA_PKG_BPP once and record, for every
# '.txt' file, its joined path (file_list) and its name without the
# four-character extension (file_paths). Both lists stay index-aligned.
file_list = []
file_paths = []
for dirpath, _subdirs, filenames in os.walk(ETERNA_PKG_BPP):
    txt_names = [name for name in filenames if name.endswith('.txt')]
    file_list.extend(os.path.join(dirpath, name) for name in txt_names)
    file_paths.extend(name[:-4] for name in txt_names)


In [31]:
# One row per '.txt' file found by the directory walk.
# `file_paths` holds the extension-less file names (used as sequence ids) and
# `file_list` holds the joined paths, so `path` must come from `file_list`;
# filling it from `file_paths` would just duplicate the id column.
matrix_df = pd.DataFrame({'sequence_id': file_paths, 'path': file_list})

In [39]:
# Count the sequence ids that have a matrix file but no row in the targets table.
ids_with_matrix = set(matrix_df['sequence_id'])
ids_with_target = set(targets['sequence_id'])
len(ids_with_matrix - ids_with_target)

1915372

In [None]:
# Keep only the matrix rows whose sequence id also appears in the targets table.
# An element-wise `!=` between the two columns cannot express this (they have
# different lengths), and `drop` expects labels rather than a boolean mask, so
# membership is tested with `isin`. The result is displayed, not stored.
matrix_df[matrix_df['sequence_id'].isin(targets['sequence_id'])]

In [None]:
# Take the first matching path by position: the filtered Series keeps its
# original row label (143378 in the lookup output earlier in this notebook),
# so a label lookup with `[0]` would fail.
seq_mask = dataset.matrix_df['sequence_id'] == '00257e85caac'
matrix_path = dataset.matrix_df.loc[seq_mask, 'path'].iloc[0]


In [20]:
# Display the value bound to `matrix_path` in the previous cell.
matrix_path

str