In [2]:
import torch
from torch import nn
import pytorch_lightning as pl
import sklearn
from sklearn.model_selection import GroupKFold
from torch.utils.data import Dataset, DataLoader
import numpy as np
from xgboost import XGBClassifier as XGBC
from pytorch_lightning.loggers import WandbLogger
from torch.utils.data import TensorDataset, random_split
import torchmetrics
import wandb



In [3]:
class AveragedDNADataset(Dataset): 
    """
    Tensor DNA embedding data. Has shape (NxMxD), where N is the number of sequences, 
    M is the sequence length (variable) and D is the embedding dim (fixed).
    Averaging the M dimension: resulting shape is Nx1xD.
    """
    def __init__(self, data, labels, take_average=True): 
        """
        Parameters
        ----------
        data: list[torch.Tensor]
            A list of sequence embeddings. Dimensions of embeddings must match
        labels: torch.Tensor
            A vector of labels. 
        take_average: boolean
            Average the embeddings along the embedding dimension: (MxD) -> (1xD) 
        """
        self.take_average = take_average
        self.data = data
        self.labels = labels

        if self.take_average: 
            self.data = [torch.mean(seq, 0) for seq in self.data]
            self.data = torch.vstack(self.data)

    def __len__(self): 
        return len(self.data)

    def __getitem__(self, index): 
        return self.data[index], self.labels[index]

In [4]:
class AverageSequenceDataModule(pl.LightningDataModule): 
    def __init__(self, 
                 data, 
                 labels, 
                 train_batch_size=32, 
                 val_batch_size=32, 
                 test_batch_size=9999, 
                 split_ratio=(0.7, 0.2, 0.1)
                ): 
        super().__init__()
        self.train_batch_size = train_batch_size
        self.val_batch_size = val_batch_size
        self.test_batch_size = test_batch_size
        self.split_ratio = split_ratio
        self.data = data
        self.labels = labels

    def setup(self, stage: str): 
        self.data = [torch.mean(seq, 0) for seq in self.data]
        self.data = torch.utils.data.TensorDataset(torch.vstack(self.data), self.labels)
        
        self.train_data, self.val_data, self.test_data = torch.utils.data.random_split(self.data, self.split_ratio)

    def train_dataloader(self): 
        print("getting train dataloader")
        print("batch_size: ", self.train_batch_size)
        print("train_data: ")
        print(self.train_data)
        return DataLoader(self.train_data, batch_size=self.train_batch_size, shuffle=True)

    def val_dataloader(self): 
        return DataLoader(self.val_data, batch_size=self.val_batch_size, shuffle=True)
        
    def test_dataloader(self): 
        return DataLoader(self.test_data, batch_size=self.test_batch_size)

In [5]:
class LogisticRegression(pl.LightningModule): 
    def __init__(self, input_dim): 
        super().__init__()
        self.linear = torch.nn.Linear(input_dim, 1)
        self.criterion = nn.BCELoss()
        self.val_accuracy = torchmetrics.Accuracy(task="binary")
        self.train_accuracy = torchmetrics.Accuracy(task="binary")

        self.save_hyperparameters()

    def forward(self, x): 
        outputs = torch.sigmoid(self.linear(x))
        return outputs

    def training_step(self, batch, batch_idx): 
        x, y = batch
        y_hat = self.forward(x)
        loss = self.criterion(y_hat, y)
        acc = self.train_accuracy(y_hat, y)
        self.log("train_loss", loss)
        self.log("train_acc", acc, on_epoch=True)
        return loss


    def validation_step(self, batch, batch_idx): 
        x, y = batch
        y_hat = self.forward(x)
        loss = self.criterion(y_hat, y)
        accuracy = self.val_accuracy(y_hat, y)
        self.log("val_loss", loss)
        self.log("val_acc", accuracy, on_epoch=True)

    def test_step(self, batch, batch_idx): 
        x, y = batch
        y_hat = self.forward(x)
        loss = self.criterion(y_hat, y)
        return loss

    def configure_optimizers(self): 
        optimizer = torch.optim.Adam(self.parameters(), lr=1e-3)



In [None]:
class XGBoostClassifier: 
    def __init__(self): 
        pass

    

In [6]:
N_pos = 1000
N_neg = 1000
embedding_dim = 50
sequence_length_range = (10, 50)
embedding_data_pos = [torch.Tensor(np.random.normal(loc=100, scale=1, size=(np.random.randint(*sequence_length_range), embedding_dim))) for i in range(N_pos)]
embedding_data_neg = [torch.Tensor(np.random.normal(loc=0, scale=1, size=(np.random.randint(*sequence_length_range), embedding_dim))) for i in range(N_neg)]
labels_pos = torch.Tensor(np.ones((N_pos, 1)))
labels_neg = torch.Tensor(np.zeros((N_neg, 1)))
embedding_data = embedding_data_pos + embedding_data_neg
labels = torch.vstack((labels_pos, labels_neg))
embedding_dataset = AverageSequenceDataModule(embedding_data,labels, train_batch_size=32)


In [7]:
wandb_logger = WandbLogger(project="DNA_seq_Tests", log_model="all")
lr_classifier = LogisticRegression(embedding_dim)


In [8]:

trainer = pl.Trainer(logger=wandb_logger, max_epochs=200)
trainer.fit(model=lr_classifier, datamodule=embedding_dataset)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
[34m[1mwandb[0m: Currently logged in as: [33mjokpro[0m ([33mcmm-t1[0m). Use [1m`wandb login --relogin`[0m to force relogin


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
/home/klumpi/miniconda3/envs/dnareg/lib/python3.11/site-packages/pytorch_lightning/core/optimizer.py:180: `LightningModule.configure_optimizers` returned `None`, this fit will run with no optimizer

  | Name           | Type           | Params
--------------------------------------------------
0 | linear         | Linear         | 51    
1 | criterion      | BCELoss        | 0     
2 | val_accuracy   | BinaryAccuracy | 0     
3 | train_accuracy | BinaryAccuracy | 0     
--------------------------------------------------
51        Trainable params
0         Non-trainable params
51        Total params
0.000     Total estimated model params size (MB)


Sanity Checking DataLoader 0:   0%|                       | 0/2 [00:00<?, ?it/s]

/home/klumpi/miniconda3/envs/dnareg/lib/python3.11/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:492: Your `val_dataloader`'s sampler has shuffling enabled, it is strongly recommended that you turn shuffling off for val/test dataloaders.
/home/klumpi/miniconda3/envs/dnareg/lib/python3.11/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=7` in the `DataLoader` to improve performance.


                                                                                

/home/klumpi/miniconda3/envs/dnareg/lib/python3.11/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=7` in the `DataLoader` to improve performance.
/home/klumpi/miniconda3/envs/dnareg/lib/python3.11/site-packages/pytorch_lightning/loops/fit_loop.py:293: The number of training batches (5) is smaller than the logging interval Trainer(log_every_n_steps=50). Set a lower value for log_every_n_steps if you want to see logs for the training epoch.


getting train dataloader
batch_size:  32
train_data: 
<torch.utils.data.dataset.Subset object at 0x7f3c548c2850>
Epoch 0: 100%|███████████████████████| 5/5 [00:00<00:00, 126.98it/s, v_num=valf]
Validation: |                                             | 0/? [00:00<?, ?it/s][A
Validation:   0%|                                         | 0/2 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|                            | 0/2 [00:00<?, ?it/s][A
Validation DataLoader 0:  50%|█████████▌         | 1/2 [00:00<00:00, 168.55it/s][A
Validation DataLoader 0: 100%|███████████████████| 2/2 [00:00<00:00, 186.07it/s][A
Epoch 1: 100%|███████████████████████| 5/5 [00:00<00:00, 175.22it/s, v_num=valf][A
Validation: |                                             | 0/? [00:00<?, ?it/s][A
Validation:   0%|                                         | 0/2 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|                            | 0/2 [00:00<?, ?it/s][A
Validation DataLoader 0:  50%|█████████▌         |

`Trainer.fit` stopped: `max_epochs=200` reached.


Epoch 199: 100%|██████████████████████| 5/5 [00:00<00:00, 20.16it/s, v_num=valf]


In [None]:
xgb_dataloader = DataLoader(embedding_dataset, batch_size=99999)
X, y = next(iter(xgb_dataloader))
xgb_classifier.fit(X, y)

In [None]:
print(xgb_classifier)

In [None]:
mean_data = [torch.mean(seq, 0) for seq in embedding_data]

In [None]:
torch.vstack(mean_data).shape

In [None]:
seqs = [torch.Tensor(np.random.randint(0, 10, (10, 1))), 
       torch.Tensor(np.random.randint(0, 10, (12, 1))), 
       torch.Tensor(np.random.randint(0, 10,(8, 1))), 
       torch.Tensor(np.random.randint(0, 10, (5, 1)))]
seq_lengths = [len(x) for x in seqs]

In [None]:
padded_seqs = pad_sequence(seqs, batch_first=True, padding_value=-999)

In [None]:
pack_padded_sequence(padded_seqs, lengths=seq_lengths, batch_first=True, enforce_sorted=False)

In [None]:
pack_padded_sequence(seqs, lengths=[len(x) for x in seqs])

In [None]:
test_tensor = torch.Tensor(np.arange(0, 360).reshape(3, 10, 12))

In [None]:
for i in test_tensor: 
    print(i)

In [None]:
mean_tensor = torch.mean(test_tensor, 1)
mean_tensor

In [None]:
mean_tensor.view(mean_tensor.size(0), -1)