In [1]:
import os
import pandas as pd
import numpy as np
import torch

from eugene.dataload._io import read_numpy, read

# Autoreload extension
if 'autoreload' not in get_ipython().extension_manager.loaded:
    %load_ext autoreload
%autoreload 2

# Basic import
import eugene as eu

eu.__version__

Global seed set to 13


GPU is available: True
Number of GPUs: 1
Current GPU: 0
GPUs: NVIDIA GeForce RTX 2070


'0.0.6'

In [2]:
import gzip
import shutil
from eugene.datasets._utils import try_download_urls
def killoran17(dataset="chr1", return_sdata=True, **kwargs):
    urls_list = [
        "https://hgdownload.soe.ucsc.edu/goldenPath/hg38/chromosomes/chr1.fa.gz"
    ]
    if dataset == "chr1":
        dataset = [0]
    paths = try_download_urls(dataset, urls_list, "killoran17")
    if dataset == [0]:
        paths = paths[0]
        print("Unzipping...")
        with gzip.open(paths, 'rb') as f_in:
            with open(paths[:-3], 'wb') as f_out:
                shutil.copyfileobj(f_in, f_out)
                paths = paths[:-3]
    if return_sdata:
        return eu.dl.read_fasta(paths, **kwargs)
    else:
        return paths

In [3]:
sdata = killoran17()

Dataset killoran17 chr1.fa.gz has already been downloaded.
Unzipping...


In [4]:
def downsample_sdata(sdata, n=None, frac=None, copy=False):
    sdata = sdata.copy() if copy else sdata
    if n is None and frac is None:
        raise ValueError("Must specify either n or frac")
    if n is not None and frac is not None:
        raise ValueError("Must specify either n or frac, not both")
    num_seqs = sdata.n_obs
    if n is not None:
        if n > num_seqs:
            raise ValueError("n must be less than or equal to the number of sequences")
        rand_idx = np.random.choice(num_seqs, n, replace=False)
        sdata = sdata[rand_idx]
    elif frac is not None:
        if frac > 1:
            raise ValueError("frac must be less than or equal to 1")
        rand_idx = np.random.choice(num_seqs, int(num_seqs * frac), replace=False)
        sdata = sdata[rand_idx]
    return sdata
      
def remove_N_seqs(seqs, tolerance=False):
    if tolerance:
        return [seq for seq in seqs if not all([x == "N" for x in seq])]
    else:
        return [seq for seq in seqs if not any([x == "N" for x in seq])]  

def remove_N_seqs_sdata(sdata, tolerance=False, copy=False, repeat=1):
    sdata = sdata.copy() if copy else sdata
    # Sometimes doesn't remove every sequence its supposed to on the first time?
    for i in range(repeat):
        if tolerance:
            N_only_mask = np.array([all([x == "N" for x in seq]) for seq in sdata.seqs])
        else:
            N_only_mask = np.array([any([x == "N" for x in seq]) for seq in sdata.seqs])
        sdata = sdata[~N_only_mask]
    return sdata

def seq_len_sdata(sdata, copy=False):
    sdata = sdata.copy() if copy else sdata
    sdata.seqs_annot["seq_len"] = [len(seq) for seq in sdata.seqs]
    return sdata

In [5]:
sdata = downsample_sdata(sdata, frac=0.1, copy=True)

In [7]:
sdata = remove_N_seqs_sdata(sdata, tolerance=False, copy=False, repeat=5)
eu.pp.sanitize_seqs_sdata(sdata, copy=False)

In [8]:
eu.interpret.count_kmers_sdata(sdata, 1)

{'A': 3351326, 'T': 3364969, 'G': 2407086, 'C': 2403919}

In [9]:
seq_len_sdata(sdata, copy=False)

SeqData object with = 230546 seqs
seqs = (230546,)
names = (230546,)
rev_seqs = None
ohe_seqs = None
ohe_rev_seqs = None
seqs_annot: 'seq_len'
pos_annot: None
seqsm: None
uns: None

In [10]:
eu.pp.ohe_seqs_sdata(sdata, copy=False)
eu.pp.train_test_split_sdata(sdata)

One-hot encoding sequences:   0%|          | 0/230546 [00:00<?, ?it/s]

SeqData object modified:
	ohe_seqs: None -> 230546 ohe_seqs added
SeqData object modified:
    seqs_annot:
        + train_val


In [25]:
seq_len = 50
latent_dim = 128

import torch.nn as nn
import eugene.models.base as base

class Generator(nn.Module):
    def __init__(self, latent_dim, seq_len, density):
        super(Generator, self).__init__()
        self.linear = nn.Linear(latent_dim, seq_len*density)
        self.elu = nn.ELU()
        self.view = base.View((density, seq_len))
        self.res_blocks = nn.Sequential(*[base.ResidualBlock(density, density, 5, 1, 2)]*5)
        self.conv = nn.Conv1d(density, 4, 1)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        out = self.linear(x) 
        out = self.elu(out) 
        out = self.view(out) 
        out = self.res_blocks(out) 
        out = self.conv(out)
        out = self.softmax(out)
        return out

class Discriminator(nn.Module):
    def __init__(self, seq_len, density):
        super(Discriminator, self).__init__()
        self.conv = nn.Conv1d(4, density, 1)
        self.res_blocks = nn.Sequential(*[base.ResidualBlock(density, density, 5, 1, 2)]*5)
        self.view = base.View((density*seq_len,))
        self.linear = nn.Linear(density*seq_len, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        out = self.conv(x)
        out = self.res_blocks(out)
        out = self.view(out)
        out = self.linear(out)
        out = self.sigmoid(out)
        return out

In [10]:
gen = Generator(latent_dim, seq_len, density=128)
disc = Discriminator(seq_len, density=128)

In [11]:
model = eu.models.GAN(
    latent_dim=latent_dim, 
    generator=gen, 
    discriminator=disc,
    mode="wgangp",
    lambda_gp=10,
    gen_lr=1e-4,
    disc_lr=1e-4,
    n_critic=10,
    log_seqs_epoch=10,
)

In [13]:
eu.settings.batch_size = 128
eu.settings.dl_num_workers = 0
eu.settings.dl_pin_memory_gpu_training = True

In [16]:
eu.train.fit(
    model=model,
    sdata=sdata,
    epochs=50,
    gpus=1,
    model_checkpoint_monitor=None,
    early_stopping_metric=None,
    name="wgangp"
)

Global seed set to 13


No transforms given, assuming just need to tensorize.
No transforms given, assuming just need to tensorize.


GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name          | Type          | Params
------------------------------------------------
0 | generator     | Generator     | 908 K 
1 | discriminator | Discriminator | 89.1 K
------------------------------------------------
997 K     Trainable params
0         Non-trainable params
997 K     Total params
3.989     Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

  rank_zero_warn(
Global seed set to 13
  rank_zero_warn(


Training: 0it [00:00, ?it/s]

  rank_zero_deprecation(


Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

In [18]:
%tensorboard

UsageError: Line magic function `%tensorboard` not found.


# Analysis

In [1]:
import os
import pandas as pd
import numpy as np
import torch

# Autoreload extension
if 'autoreload' not in get_ipython().extension_manager.loaded:
    %load_ext autoreload
%autoreload 2

# Basic import
import eugene as eu

eu.__version__

Global seed set to 13


GPU is available: True
Number of GPUs: 1
Current GPU: 0
GPUs: NVIDIA GeForce RTX 2070


'0.0.6'

In [16]:
model = eu.models.GAN.load_from_checkpoint(checkpoint_path="eugene_logs/wgangp/version_0/checkpoints/epoch=49-step=72049.ckpt", hparams_file="eugene_logs/wgangp/version_0/hparams.yaml")

In [17]:
model

GAN(
  (generator): Generator(
    (linear): Linear(in_features=128, out_features=6400, bias=True)
    (elu): ELU(alpha=1.0)
    (view): View(128, 50)
    (res_blocks): Sequential(
      (0): ResidualBlock(
        (relu): ReLU()
        (conv): Conv1d(128, 128, kernel_size=(5,), stride=(1,), padding=(2,))
      )
      (1): ResidualBlock(
        (relu): ReLU()
        (conv): Conv1d(128, 128, kernel_size=(5,), stride=(1,), padding=(2,))
      )
      (2): ResidualBlock(
        (relu): ReLU()
        (conv): Conv1d(128, 128, kernel_size=(5,), stride=(1,), padding=(2,))
      )
      (3): ResidualBlock(
        (relu): ReLU()
        (conv): Conv1d(128, 128, kernel_size=(5,), stride=(1,), padding=(2,))
      )
      (4): ResidualBlock(
        (relu): ReLU()
        (conv): Conv1d(128, 128, kernel_size=(5,), stride=(1,), padding=(2,))
      )
    )
    (conv): Conv1d(128, 4, kernel_size=(1,), stride=(1,))
    (softmax): Softmax(dim=1)
  )
  (discriminator): Discriminator(
    (conv): 

In [33]:
gen_seqs = eu.interpret.generate_seqs_from_model(model, 1000)

In [20]:
with open("output/gen_seqs.txt", "w") as txt_file:
    for line in gen_seqs:
        txt_file.write("".join(line) + "\n")

### k-mer frequencies

In [22]:
gen_sdata = eu.dl.read_csv("output/gen_seqs.txt", "seqs")

In [23]:
k = 2
real_freqs = eu.interpret.count_kmers_sdata(sdata, k, True)
gen_freqs = eu.interpret.count_kmers_sdata(gen_sdata, k, True)

NameError: name 'sdata' is not defined

In [23]:
ratio = {}
for kmer in real_freqs:
    ratio[kmer] = gen_freqs[kmer] / real_freqs[kmer]

In [24]:
ratio

{'AT': 0.7687944042409162,
 'TT': 3.0548026685393257,
 'TA': 0.7991582280432729,
 'AG': 0.43275136497374334,
 'GC': 0.660240267155274,
 'CA': 0.4757463351646344,
 'AC': 0.37366231997296956,
 'CT': 1.432990130074833,
 'AA': 0.19625258268898338,
 'GA': 0.3527844537449675,
 'TC': 1.526787946188235,
 'TG': 1.3442401609534893,
 'GG': 0.6853700409638239,
 'CC': 0.6939692085205791,
 'GT': 1.577839802602694,
 'CG': 0.02773318898111392}

### Edit distance

In [25]:
n = 1000
sdata_train = downsample_sdata(sdata[np.where(sdata.seqs_annot["train_val"] == True)[0]], n)
sdata_test = downsample_sdata(sdata[np.where(sdata.seqs_annot["train_val"] != True)[0]], n)
gen_sdata = eu.dl.read_csv("gen_seqs.txt", "seqs")

In [30]:
base_distances = eu.interpret.edit_distance_sdata(sdata_train, sdata_test, True, True)
gen_distances = eu.interpret.edit_distance_sdata(gen_sdata, sdata_test, True, True)

In [31]:
print(base_distances, gen_distances)

35.438 35.197


### Latent interpolation

In [32]:
samples = 10
seqs = eu.interpret.latent_interpolation(latent_dim, samples, model=gen, normal=True, inclusive=True)

In [33]:
seqs

[array(['TCTTTTTTTCTTTTCTGCTTTCCACATTTTGATTTCCTTTTATTTCTTTC'], dtype='<U50'),
 array(['TCTTTTTTTCTTTTCTGCTTTCCACATTTTGATTGCCTTTTGTTTCTTTA'], dtype='<U50'),
 array(['TCTTTTTTTCTTTTCTGCTGTACACATTTTGATTGTCTTTTGCTTCTTTA'], dtype='<U50'),
 array(['TCTTTTTTTCTTCCCTGCTGTACACACTTTGATTGTCTTTTGCTTCTGTA'], dtype='<U50'),
 array(['TCTTTTTTTCTTCCCTGCAGTACACACTTAGATTGTCTTTTGCTTCTGTA'], dtype='<U50'),
 array(['TCTTTTCTTTTTCCTTGCAGCATACACTTAGAATGTCTTTTTCTTCTGTA'], dtype='<U50'),
 array(['TCTTTTCTTTTTCCTTGCAGCATACACTTAGAATGTCTTTTTCTTCTGTA'], dtype='<U50'),
 array(['TTTTTTCTTTTTCCTTGAAACATACACTGAGGATGTCTTTTTCTTCTGTA'], dtype='<U50'),
 array(['TTTTTTGTTTTTCCCTGAAACATACACTCAGGATGTCTTTGTGTTCTGTA'], dtype='<U50'),
 array(['TTTTTTGTTTTCCACTGACACATTCACTAAGGATGTCTTTGTGTTCAATA'], dtype='<U50'),
 array(['GTTTTTGTTTTCCACAGACAGCTTCACTAAGTATGTCTTTGTGTTCAAGA'], dtype='<U50'),
 array(['ATTTGTGTTTTCCACAGACAGCTGCCCTAAGTATGTCTTTGTATTCAAGA'], dtype='<U50')]

### Negative latent space

In [124]:
num_seqs = 1000
z = torch.normal(0, 1, (num_seqs, latent_dim))

In [125]:
def complement_edit_distances(z, num_seqs):
    z_neg = -z

    seqs = eu.interpret.seqs_from_tensor(model(z), num_seqs)
    seqs_neg = eu.interpret.seqs_from_tensor(model(z_neg), num_seqs)

    bases = {"A" : "T", "T" : "A", "C" : "G", "G" : "C"}
    for idx, seq in enumerate(seqs_neg):
        seq = list(seq)
        for _idx, x in enumerate(seq):
            seq[_idx] = bases[x]
        seq = ''.join(seq)
        seqs_neg[idx] = seq

    distances = []
    for x in range(len(seqs)):
        distances.append(eu.interpret.edit_distance(seqs[x], seqs_neg[x]))

    return sum(distances) / len(distances)


In [126]:
complement_edit_distances(z, num_seqs)

34.894