# Testing `eugene.dataset` Module 

**Authorship:**
Adam Klie, *06/23/2022*
***
**Description:**
Notebook to test the `dataset` module of the `eugene` package
***

In [1]:
import numpy as np
import pandas as pd

# Autoreload extension
if 'autoreload' not in get_ipython().extension_manager.loaded:
    %load_ext autoreload
%autoreload 2

# Basic import
import eugene as eu
eu.__version__

Global seed set to 13


'0.1.0'

In [51]:
eu.settings.datasetdir = "/cellar/users/aklie/projects/EUGENE/tests/_data/datasets/"

In [59]:
eu.utils.generate_random_data(num_seqs=1000, seq_len=66, num_outputs=10, dataset_name="random1000_10")

In [7]:
eu.utils.generate_random_data(num_seqs=1000, seq_len=66, out_dir="../_data/datasets/")

In [18]:
num_seqs = 1000
num_outputs = 10

In [30]:
n_digits = len(str(num_seqs-1))
ids = np.array(["seq{num:0{width}}".format(num=i, width=n_digits) for i in range(num_seqs)])

In [19]:
labels = np.random.randint(0,2,size=(num_seqs, num_outputs))
activities = np.random.rand(num_seqs, num_outputs)

In [20]:
label_cols = ["LABEL_{}".format(i) for i in range(num_outputs)]
activity_cols = ["ACTIVITY_{}".format(i) for i in range(num_outputs)]

In [45]:
test_dict = dict(dict(NAME=ids), **dict(zip(label_cols, labels.T)), **dict(zip(activity_cols, activities.T)))

In [46]:
test_dict.keys()

dict_keys(['NAME', 'LABEL_0', 'LABEL_1', 'LABEL_2', 'LABEL_3', 'LABEL_4', 'LABEL_5', 'LABEL_6', 'LABEL_7', 'LABEL_8', 'LABEL_9', 'ACTIVITY_0', 'ACTIVITY_1', 'ACTIVITY_2', 'ACTIVITY_3', 'ACTIVITY_4', 'ACTIVITY_5', 'ACTIVITY_6', 'ACTIVITY_7', 'ACTIVITY_8', 'ACTIVITY_9'])

In [47]:
pd.DataFrame(test_dict)

Unnamed: 0,NAME,LABEL_0,LABEL_1,LABEL_2,LABEL_3,LABEL_4,LABEL_5,LABEL_6,LABEL_7,LABEL_8,...,ACTIVITY_0,ACTIVITY_1,ACTIVITY_2,ACTIVITY_3,ACTIVITY_4,ACTIVITY_5,ACTIVITY_6,ACTIVITY_7,ACTIVITY_8,ACTIVITY_9
0,seq000,0,1,0,0,1,0,1,1,1,...,0.136091,0.393341,0.856728,0.371197,0.039131,0.062918,0.171280,0.726156,0.044187,0.288193
1,seq001,1,0,1,1,0,0,0,1,1,...,0.759417,0.321257,0.235287,0.259573,0.202452,0.309004,0.895309,0.759696,0.893332,0.580223
2,seq002,0,0,0,1,0,1,0,0,0,...,0.355752,0.744815,0.159422,0.120629,0.773472,0.617851,0.929572,0.190477,0.293232,0.547462
3,seq003,0,0,1,1,0,0,0,1,1,...,0.737131,0.166221,0.137907,0.420561,0.513999,0.805506,0.059836,0.971746,0.513194,0.622146
4,seq004,0,0,0,0,1,1,1,1,0,...,0.003090,0.514462,0.924040,0.358115,0.285167,0.609111,0.050351,0.631610,0.922408,0.479885
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,seq995,0,1,1,0,0,1,1,1,0,...,0.856105,0.058945,0.791553,0.975106,0.775342,0.050280,0.659470,0.742035,0.643288,0.581103
996,seq996,0,0,1,1,1,1,1,0,0,...,0.089650,0.500784,0.498494,0.041080,0.630354,0.653317,0.224752,0.262105,0.153604,0.857420
997,seq997,0,1,0,0,0,0,0,1,0,...,0.275583,0.101739,0.886046,0.727605,0.327248,0.678102,0.255925,0.333556,0.676320,0.026159
998,seq998,0,1,1,1,0,1,1,0,0,...,0.428197,0.598415,0.535759,0.440924,0.270214,0.497965,0.159741,0.895670,0.512714,0.933207


# `get_dataset_info`

In [2]:
eu.datasets.get_dataset_info()

Unnamed: 0_level_0,n_seqs,url,description
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
random1000,1000,,A randomly generated set of 1000 sequences
ols,460800,TBD,This library is a collection of sequences from...
Khouiery,10,https://www.sciencedirect.com/science/article/...,This data comes from genomic sequences of Cion...


In [5]:
sdata = eu.datasets.random1000()

Kept 1000 sequences with targets, dropped 0 sequences with no targets


In [6]:
eu.pp.prepare_data(sdata)

  0%|          | 0/3 [00:00<?, ?it/s]

SeqData object modified:
	rev_seqs: None -> 1000 rev_seqs added
	ohe_seqs: None -> 1000 ohe_seqs added
	ohe_rev_seqs: None -> 1000 ohe_rev_seqs added
    seqs_annot:
        + TRAIN


In [4]:
eu.datasets.ols()

AttributeError: module 'eugene.datasets' has no attribute 'ols'

In [12]:
eu.datasets.Khoueiry10()

Kept 20 sequences with targets, dropped 0 sequences with no targets


SeqData object with = 20 seqs
seqs = (20,)
names = (20,)
rev_seqs = None
ohe_seqs = None
ohe_rev_seqs = None
seqs_annot: 'TARGETS'
pos_annot: None

In [3]:
eu.datasets.deBoer20(0, rev_comp=True)

Dataset deBoer20 GSE104878_20160503_average_promoter_ELs_per_seq_atLeast100Counts.csv.gz has already been dowloaded.
Kept 9982 sequences with targets, dropped 0 sequences with no targets


SeqData object with = 9982 seqs
seqs = (9982,)
names = (9982,)
rev_seqs = (9982,)
ohe_seqs = None
ohe_rev_seqs = None
seqs_annot: 'TARGETS'
pos_annot: None

---