In [1]:
import os
import pandas as pd
import numpy as np
import torch

from eugene.dataload._io import read_numpy, read

# Autoreload extension
if 'autoreload' not in get_ipython().extension_manager.loaded:
    %load_ext autoreload
%autoreload 2

# Basic import
import eugene as eu

eu.__version__

Global seed set to 13


GPU is available: True
Number of GPUs: 1
Current GPU: 0
GPUs: NVIDIA GeForce RTX 2070


'0.0.6'

In [2]:
import gzip
import shutil
from eugene.datasets._utils import try_download_urls
def killoran17(dataset="chr1", return_sdata=True, **kwargs):
    urls_list = [
        "https://hgdownload.soe.ucsc.edu/goldenPath/hg38/chromosomes/chr1.fa.gz"
    ]
    if dataset == "chr1":
        dataset = [0]
    paths = try_download_urls(dataset, urls_list, "killoran17")
    if dataset == [0]:
        paths = paths[0]
        print("Unzipping...")
        with gzip.open(paths, 'rb') as f_in:
            with open(paths[:-3], 'wb') as f_out:
                shutil.copyfileobj(f_in, f_out)
                paths = paths[:-3]
    if return_sdata:
        return eu.dl.read_fasta(paths, **kwargs)
    else:
        return paths

In [3]:
sdata = killoran17()

Dataset killoran17 chr1.fa.gz has already been downloaded.
Unzipping...


In [4]:
def downsample_sdata(sdata, n=None, frac=None, copy=False):
    sdata = sdata.copy() if copy else sdata
    if n is None and frac is None:
        raise ValueError("Must specify either n or frac")
    if n is not None and frac is not None:
        raise ValueError("Must specify either n or frac, not both")
    num_seqs = sdata.n_obs
    if n is not None:
        if n > num_seqs:
            raise ValueError("n must be less than or equal to the number of sequences")
        rand_idx = np.random.choice(num_seqs, n, replace=False)
        sdata = sdata[rand_idx]
    elif frac is not None:
        if frac > 1:
            raise ValueError("frac must be less than or equal to 1")
        rand_idx = np.random.choice(num_seqs, int(num_seqs * frac), replace=False)
        sdata = sdata[rand_idx]
    return sdata
      
def remove_only_N_seqs(seqs):
    return [seq for seq in seqs if not all([x == "N" for x in seq])]  

def remove_only_N_seqs_sdata(sdata, copy=False):
    sdata = sdata.copy() if copy else sdata
    N_only_mask = np.array([all([x == "N" for x in seq]) for seq in sdata.seqs])
    sdata = sdata[~N_only_mask]
    return sdata

def seq_len_sdata(sdata, copy=False):
    sdata = sdata.copy() if copy else sdata
    sdata.seqs_annot["seq_len"] = [len(seq) for seq in sdata.seqs]
    return sdata

In [5]:
sdata_downsampled = downsample_sdata(sdata, frac=0.1, copy=True)

In [6]:
remove_only_N_seqs_sdata(sdata_downsampled, copy=False)
eu.pp.sanitize_seqs_sdata(sdata_downsampled, copy=False)

SeqData object modified:
	seqs: ['ATTAGCATACTATATACTAATAGAATTAGCATACTATATACTAATAGAAT'
 'tttcactggcctagagagctcccctctggaggaccctacaactgcagggt'
 'ccagttccagaacagttaagctgaaacctgaaaagatgactaggattagc' ...
 'GTAAAGAAGTATCCCCTCCCAGAAACATTTACTTCAAGTGAGTTAGTCAA'
 'CACTCTGCTCAGAGTCAGATACAGATAGAGCTGTTTTTGTTTTTATTTTT'
 'ggttctctgagtatatacgtaaattttagtatccagaccttttattttga'] -> 248956 seqs added


In [7]:
seq_len_sdata(sdata_downsampled, copy=False)

SeqData object with = 248956 seqs
seqs = (248956,)
names = (248956,)
rev_seqs = None
ohe_seqs = None
ohe_rev_seqs = None
seqs_annot: 'seq_len'
pos_annot: None
seqsm: None
uns: None

In [8]:
eu.pp.ohe_seqs_sdata(sdata_downsampled, copy=False)
eu.pp.train_test_split_sdata(sdata_downsampled)

One-hot encoding sequences:   0%|          | 0/248956 [00:00<?, ?it/s]

SeqData object modified:
	ohe_seqs: None -> 248956 ohe_seqs added
SeqData object modified:
    seqs_annot:
        + train_val


In [9]:
seq_len = 50
latent_dim = 128

In [10]:
import torch.nn as nn
import eugene.models.base as base

In [11]:
class Generator(nn.Module):
    def __init__(self, latent_dim, seq_len, density):
        super(Generator, self).__init__()
        self.linear = nn.Linear(latent_dim, seq_len*density)
        self.elu = nn.ELU()
        self.view = base.View((density, seq_len))
        self.res_blocks = nn.Sequential(*[base.ResidualBlock(density, density, 5, 1, 2)]*5)
        self.conv = nn.Conv1d(density, 4, 1)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        out = self.linear(x) 
        out = self.elu(out) 
        out = self.view(out) 
        out = self.res_blocks(out) 
        out = self.conv(out)
        out = self.softmax(out)
        return out

class Discriminator(nn.Module):
    def __init__(self, seq_len, density):
        super(Discriminator, self).__init__()
        self.conv = nn.Conv1d(4, density, 1)
        self.res_blocks = nn.Sequential(*[base.ResidualBlock(density, density, 5, 1, 2)]*5)
        self.view = base.View((density*seq_len,))
        self.linear = nn.Linear(density*seq_len, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        out = self.conv(x)
        out = self.res_blocks(out)
        out = self.view(out)
        out = self.linear(out)
        out = self.sigmoid(out)
        return out

In [12]:
gen = Generator(latent_dim, seq_len, density=128)
disc = Discriminator(seq_len, density=128)

In [13]:
model = eu.models.GAN(
    latent_dim=latent_dim, 
    generator=gen, 
    discriminator=disc,
    mode="wgangp",
    lambda_gp=10,
    gen_lr=1e-4,
    disc_lr=1e-4,
    n_critic=10,
)

In [14]:
eu.settings.batch_size = 128
eu.settings.dl_num_workers = 0
eu.settings.dl_pin_memory_gpu_training = True
eu.train.fit(
    model=model,
    sdata=sdata_downsampled,
    epochs=20,
    gpus=1,
    model_checkpoint_monitor=None,
    early_stopping_metric=None,
    name="test"
)

Global seed set to 13


No transforms given, assuming just need to tensorize.
No transforms given, assuming just need to tensorize.


GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Missing logger folder: C:\Users\Lab\Documents\EUGENe\tests\notebooks\models\eugene_logs\test

  | Name          | Type          | Params
------------------------------------------------
0 | generator     | Generator     | 908 K 
1 | discriminator | Discriminator | 89.1 K
------------------------------------------------
997 K     Trainable params
0         Non-trainable params
997 K     Total params
3.989     Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

  rank_zero_warn(
Global seed set to 13
  rank_zero_warn(


Training: 0it [00:00, ?it/s]

  rank_zero_deprecation(


Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

In [39]:
%tensorboard

UsageError: Line magic function `%tensorboard` not found.


In [17]:
def generate_seqs(model, num_seqs):
    z = torch.Tensor(np.random.normal(0, 1, (num_seqs, model.latent_dim)))
    fake = model(z)
    fake_tokens = np.argmax(fake.detach().numpy(), axis=1).reshape(num_seqs, 50)
    return np.array([eu.pp.decode_seq(eu.pp._utils._token2one_hot(tokens)) for tokens in fake_tokens])

In [25]:
gen_seqs = generate_seqs(model, 1000)

In [26]:
with open("gen_seqs.txt", "w") as txt_file:
    for line in gen_seqs:
        txt_file.write("".join(line) + "\n")

---