In [1]:
import pandas as pd
import numpy as np
import torch

# Autoreload extension
if 'autoreload' not in get_ipython().extension_manager.loaded:
    %load_ext autoreload
%autoreload 2

# Basic import
import eugene as eu
eu.__version__

Global seed set to 13


'0.0.6'

# Test data

In [2]:
sdata = eu.datasets.random1000()

In [3]:
eu.pp.ohe_seqs_sdata(sdata)

One-hot encoding sequences:   0%|          | 0/1000 [00:00<?, ?it/s]

SeqData object modified:
	ohe_seqs: None -> 1000 ohe_seqs added


In [4]:
eu.pp.reverse_complement_seqs_sdata(sdata)

SeqData object modified:
	ohe_rev_seqs: None -> 1000 ohe_rev_seqs added


# Clean up dinuc shuffle

In [5]:
# dinuc_shuffle
from eugene.preprocess._utils import _string_to_char_array, _char_array_to_string, _tokens_to_one_hot, _one_hot_to_tokens

# concise versions
from eugene.preprocess._utils import _one_hot2token, _token2one_hot

In [6]:
# Seqs to use
seqs = sdata.seqs
seq = seqs[0]
ohe_seqs = sdata.ohe_seqs
ohe_seq = ohe_seqs[0]
ohe_seq_T = ohe_seq.T

In [7]:
# Keep these from dincu_shuffle.py
char_array = _string_to_char_array(seq)
re_seq = _char_array_to_string(char_array)
seq, char_array, re_seq

('TGCGAGGCCATGGCTCATGAGTTCTAAGGATGCGAATAACACAAAAAGCCGCGATCTTAAACGTTCTACACTTCTAAGGTCTGCATGAGCGAACCGAAAC',
 array([84, 71, 67, 71, 65, 71, 71, 67, 67, 65, 84, 71, 71, 67, 84, 67, 65,
        84, 71, 65, 71, 84, 84, 67, 84, 65, 65, 71, 71, 65, 84, 71, 67, 71,
        65, 65, 84, 65, 65, 67, 65, 67, 65, 65, 65, 65, 65, 71, 67, 67, 71,
        67, 71, 65, 84, 67, 84, 84, 65, 65, 65, 67, 71, 84, 84, 67, 84, 65,
        67, 65, 67, 84, 84, 67, 84, 65, 65, 71, 71, 84, 67, 84, 71, 67, 65,
        84, 71, 65, 71, 67, 71, 65, 65, 67, 67, 71, 65, 65, 65, 67],
       dtype=int8),
 'TGCGAGGCCATGGCTCATGAGTTCTAAGGATGCGAATAACACAAAAAGCCGCGATCTTAAACGTTCTACACTTCTAAGGTCTGCATGAGCGAACCGAAAC')

In [8]:
# Match these with concise versions
tokens = _one_hot_to_tokens(ohe_seq_T)
re_ohe_seq = _tokens_to_one_hot(tokens, one_hot_dim=4)
ohe_seq_T[:5], tokens, re_ohe_seq[:5]

(array([[0, 0, 0, 1],
        [0, 0, 1, 0],
        [0, 1, 0, 0],
        [0, 0, 1, 0],
        [1, 0, 0, 0]], dtype=int8),
 array([3, 2, 1, 2, 0, 2, 2, 1, 1, 0, 3, 2, 2, 1, 3, 1, 0, 3, 2, 0, 2, 3,
        3, 1, 3, 0, 0, 2, 2, 0, 3, 2, 1, 2, 0, 0, 3, 0, 0, 1, 0, 1, 0, 0,
        0, 0, 0, 2, 1, 1, 2, 1, 2, 0, 3, 1, 3, 3, 0, 0, 0, 1, 2, 3, 3, 1,
        3, 0, 1, 0, 1, 3, 3, 1, 3, 0, 0, 2, 2, 3, 1, 3, 2, 1, 0, 3, 2, 0,
        2, 1, 2, 0, 0, 1, 1, 2, 0, 0, 0, 1]),
 array([[0., 0., 0., 1.],
        [0., 0., 1., 0.],
        [0., 1., 0., 0.],
        [0., 0., 1., 0.],
        [1., 0., 0., 0.]]))

In [9]:
# Concise versions
tokens = _one_hot2token(ohe_seq)
re_ohe_seq = _token2one_hot(tokens, vocab="DNA")
ohe_seq[:5], tokens, re_ohe_seq[:5]

(array([[0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0,
         0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1,
         1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0,
         0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
         0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0],
        [0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0,
         0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0,
         0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1,
         0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0,
         0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1],
        [0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0,
         0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0,
        

In [10]:
from eugene import settings
# Added concise
def dinuc_shuffle_seq(
    seq, 
    num_shufs=None, 
    rng=None
):
    """
    Creates shuffles of the given sequence, in which dinucleotide frequencies
    are preserved.

    If `seq` is a string, returns a list of N strings of length L, each one
    being a shuffled version of `seq`. If `seq` is a 2D np array, then the
    result is an N x L x D np array of shuffled versions of `seq`, also
    one-hot encoded. If `num_shufs` is not specified, then the first dimension
    of N will not be present (i.e. a single string will be returned, or an L x D
    array).

    Parameters
    ----------
    seq : str
        The sequence to shuffle.
    num_shufs : int, optional
        The number of shuffles to create. If None, only one shuffle is created.
    rng : np.random.RandomState, optional
        The random number generator to use. If None, a new one is created.

    Returns
    -------
    list of str or np.array
        The shuffled sequences.

    Note
    ----
    This function comes from DeepLIFT's dinuc_shuffle.py.
    """
    if type(seq) is str or type(seq) is np.str_:
        arr = _string_to_char_array(seq)
    elif type(seq) is np.ndarray and len(seq.shape) == 2:
        seq_len, one_hot_dim = seq.shape
        arr = _one_hot2token(seq)
    else:
        raise ValueError("Expected string or one-hot encoded array")
    if not rng:
        rng = np.random.RandomState(seed=settings.seed)

    # Get the set of all characters, and a mapping of which positions have which
    # characters; use `tokens`, which are integer representations of the
    # original characters
    chars, tokens = np.unique(arr, return_inverse=True)

    # For each token, get a list of indices of all the tokens that come after it
    shuf_next_inds = []
    for t in range(len(chars)):
        mask = tokens[:-1] == t  # Excluding last char
        inds = np.where(mask)[0]
        shuf_next_inds.append(inds + 1)  # Add 1 for next token

    if type(seq) is str or type(seq) is np.str_:
        all_results = []
    else:
        all_results = np.empty(
            (num_shufs if num_shufs else 1, seq_len, one_hot_dim), dtype=seq.dtype
        )

    for i in range(num_shufs if num_shufs else 1):
        # Shuffle the next indices
        for t in range(len(chars)):
            inds = np.arange(len(shuf_next_inds[t]))
            inds[:-1] = rng.permutation(len(inds) - 1)  # Keep last index same
            shuf_next_inds[t] = shuf_next_inds[t][inds]

        counters = [0] * len(chars)

        # Build the resulting array
        ind = 0
        result = np.empty_like(tokens)
        result[0] = tokens[ind]
        for j in range(1, len(tokens)):
            t = tokens[ind]
            ind = shuf_next_inds[t][counters[t]]
            counters[t] += 1
            result[j] = tokens[ind]

        if type(seq) is str or type(seq) is np.str_:
            all_results.append(_char_array_to_string(chars[result]))
        else:
            all_results[i] = _token2one_hot(chars[result])
    return all_results if num_shufs else all_results[0]


def dinuc_shuffle_seqs(seqs, num_shufs=None, rng=None):
    """
    Shuffle the sequences in `seqs` in the same way as `dinuc_shuffle_seq`.
    If `num_shufs` is not specified, then the first dimension of N will not be
    present (i.e. a single string will be returned, or an L x D array).

    Parameters
    ----------
    seqs : np.ndarray
        Array of sequences to shuffle
    num_shufs : int, optional
        Number of shuffles to create, by default None
    rng : np.random.RandomState, optional
        Random state to use for shuffling, by default None

    Returns
    -------
    np.ndarray
        Array of shuffled sequences

    Note
    -------
    This is taken from DeepLIFT
    """
    if not rng:
        rng = np.random.RandomState(seed=settings.seed)

    if type(seqs) is str or type(seqs) is np.str_:
        seqs = [seqs]

    all_results = []
    for i in range(len(seqs)):
        all_results.append(dinuc_shuffle_seq(seqs[i], num_shufs=num_shufs, rng=rng))
    return np.array(all_results)

In [11]:
eu.pp.decode_seq(ohe_seqs[0])

'TGCGAGGCCATGGCTCATGAGTTCTAAGGATGCGAATAACACAAAAAGCCGCGATCTTAAACGTTCTACACTTCTAAGGTCTGCATGAGCGAACCGAAAC'

In [12]:
eu.pp.decode_seqs(dinuc_shuffle_seqs(ohe_seqs))

Decoding sequences:   0%|          | 0/1000 [00:00<?, ?it/s]

array(['TTATAAATGAAAGATGGCGCGTAAACAGACGAACTCAACCTCCCAAGCTTGCGCTGGCGCTTGTCTAATCAGGCAGGAACTTCATCGTATGAACGAGAAC',
       'CCAGTTCTGCCGGCCTACAGAACTCGCCTGTGCGTGGAGCGGAAGCTGAAGGTTATTGGCGTGTTCTGCTATCGCAGCTAGCGTCTGTTCAAAATAGACA',
       'TGTCTTCCGCCTGCAACGACCTCTACGCGTGACCTAGGTCCCGCGGACTCTGTCTCACCCTTTTATGTGGAGGGAGTTGTTTGTTTGCTGATGCCTTGGC',
       'ATAGAGTCTGAACGCATAGATTACAATCCGTCGTAGCCCGTTACAGTGATTGCTTCGCAGGTTGTCCTATTCTTCACTCCGACACGTTCATCCGTGAGGG',
       'CCCTGACCGCTCCTCCATTCGTTTTCGAAACAAGTGTTGCGCTCCGCAGACGAGGTTACAGTTTACATGTCGCGATTCACATGCACACAGTAAGTGCCAG',
       'GGGCTGTGCACGTATTTAACTACTTTGTTGTGCCACTTTACTGGTGACGTAGAAGCACGCGTATGATGAGGTAGCCCGTAACCCCCGTCCGTCCAATCCC',
       'GTCCACGCTCAATAAAACACTGTCCACGAGCCCACTCCCAGCAATATCGCAGCATCGAAACACCTCTTGTTCTCTAGGGTTCATTCGGACCCCCTAGGTT',
       'CAGGGAAAATTCACCAGAATAAGAGATCCGAAGAGGTTAGGCCATGAGTCACACTGATGCCGAAAATCTATAATCTGGGGCCTTCAATTGTAATAGCCAT',
       'TCGGGAATGAAAAGGCCGGATTGCGCGATAATTGTAGCGCGTTAGAGCTGTTCCCGCAGTTCTACTTAGACAATCGGGTTATTACCCAGTGGTTCGCTTA',
 

---