# Jores et al 2021 Extract-Transform-Load
**Authorship:**
Adam Klie, *08/11/2022*
***
**Description:**
Notebook to extract, transform, and load (ETL) the data from the Jores et al. (2021) dataset.
***

In [1]:
# Load the autoreload extension only if it is not already loaded, so re-running this cell is harmless
if 'autoreload' not in get_ipython().extension_manager.loaded:
    %load_ext autoreload
# Mode 2 of autoreload: picks up edits to imported modules without restarting the kernel
%autoreload 2

import os
import eugene as eu
# Base directory for datasets; the write_h5sd calls later in this notebook build their paths from it
eu.settings.dataset_dir = "../../_data/datasets"

Global seed set to 13
2022-08-12 00:14:07.583999: W tensorflow/stream_executor/platform/default/dso_loader.cc:60] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-08-12 00:14:07.584046: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


# Download the leaf and proto datasets and load them into raw `SeqData` objects

In [2]:
# Fetch the two jores21 datasets ("leaf" first, then "proto"); per the cell output,
# files that are already present on disk are not downloaded again.
sdata_leaf_raw, sdata_proto_raw = (
    eu.datasets.jores21(dataset=dataset_name) for dataset_name in ("leaf", "proto")
)

Dataset jores21 CNN_test_leaf.tsv has already been dowloaded.
Dataset jores21 CNN_train_leaf.tsv has already been dowloaded.
Dataset jores21 CNN_train_proto.tsv has already been dowloaded.
Dataset jores21 CNN_test_proto.tsv has already been dowloaded.


In [3]:
# Build one combined SeqData object from the leaf and proto objects,
# passing "leaf" / "proto" as the keys for the respective inputs.
sdata_combined_raw = eu.dl.concat(
    [sdata_leaf_raw, sdata_proto_raw],
    keys=["leaf", "proto"],
)

In [4]:
# Persist the leaf, proto and combined objects before any transformation is applied.
# All three files go to <dataset_dir>/jores21.
raw_out_dir = os.path.join(eu.settings.dataset_dir, "jores21")
raw_outputs = [
    (sdata_leaf_raw, "leaf_raw.h5sd"),
    (sdata_proto_raw, "proto_raw.h5sd"),
    (sdata_combined_raw, "combined_raw.h5sd"),
]
for raw_sdata, raw_filename in raw_outputs:
    raw_sdata.write_h5sd(os.path.join(raw_out_dir, raw_filename))

# Transform the input data in the combined SeqData object

In [5]:
# Apply the preprocessing steps to the combined object, in this order:
#   1. add reverse-complement sequences
#   2. one-hot encode (the cell output shows both ohe_seqs and ohe_rev_seqs being
#      added in this step, so it is run after the reverse complements exist)
# Both calls modify `sdata_combined_raw` itself rather than producing a new variable.
for preprocess_step in (eu.pp.reverse_complement_data, eu.pp.one_hot_encode_data):
    preprocess_step(sdata_combined_raw)

SeqData object modified:
	rev_seqs: None -> 147966 rev_seqs added
SeqData object modified:
	ohe_seqs: None -> 147966 ohe_seqs added
	ohe_rev_seqs: None -> 147966 ohe_rev_seqs added


In [8]:
# Sanity check: display the one-hot encoded sequences that were added to the combined object
sdata_combined_raw.ohe_seqs

array([[[0., 0., 0., 1.],
        [0., 0., 1., 0.],
        [1., 0., 0., 0.],
        ...,
        [0., 1., 0., 0.],
        [0., 0., 0., 1.],
        [0., 0., 0., 1.]],

       [[0., 0., 1., 0.],
        [0., 0., 0., 1.],
        [1., 0., 0., 0.],
        ...,
        [0., 0., 0., 1.],
        [0., 0., 1., 0.],
        [0., 1., 0., 0.]],

       [[0., 1., 0., 0.],
        [0., 0., 0., 1.],
        [1., 0., 0., 0.],
        ...,
        [0., 0., 1., 0.],
        [0., 1., 0., 0.],
        [1., 0., 0., 0.]],

       ...,

       [[1., 0., 0., 0.],
        [0., 1., 0., 0.],
        [0., 1., 0., 0.],
        ...,
        [0., 0., 1., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.]],

       [[0., 0., 0., 1.],
        [0., 1., 0., 0.],
        [0., 1., 0., 0.],
        ...,
        [0., 0., 1., 0.],
        [0., 1., 0., 0.],
        [0., 0., 0., 1.]],

       [[0., 1., 0., 0.],
        [0., 0., 1., 0.],
        [0., 1., 0., 0.],
        ...,
        [0., 0., 1., 0.],
        [1., 0.

In [10]:
# Save the combined object, which at this point carries the reverse-complement
# and one-hot encoded sequences added earlier in the notebook.
combined_processed_path = os.path.join(
    eu.settings.dataset_dir, "jores21", "combined_processed.h5sd"
)
sdata_combined_raw.write_h5sd(combined_processed_path)

In [18]:
def _select_split(sdata, split_name):
    """Return the subset of `sdata` whose "set" value equals `split_name`."""
    return sdata[sdata["set"] == split_name]


# Split each of the three datasets into training and test sets
sdata_leaf_train = _select_split(sdata_leaf_raw, "train")
sdata_leaf_test = _select_split(sdata_leaf_raw, "test")

sdata_proto_train = _select_split(sdata_proto_raw, "train")
sdata_proto_test = _select_split(sdata_proto_raw, "test")

sdata_combined_train = _select_split(sdata_combined_raw, "train")
sdata_combined_test = _select_split(sdata_combined_raw, "test")

In [20]:

# Write every train/test subset to <dataset_dir>/jores21.
# NOTE(review): in this notebook only `sdata_combined_raw` is passed through
# reverse_complement_data / one_hot_encode_data, yet the leaf and proto files are
# also named "*_processed_*" - confirm those subsets contain what the names imply.
split_out_dir = os.path.join(eu.settings.dataset_dir, "jores21")
split_outputs = [
    (sdata_leaf_train, "leaf_processed_train.h5sd"),
    (sdata_proto_train, "proto_processed_train.h5sd"),
    (sdata_combined_train, "combined_processed_train.h5sd"),
    (sdata_leaf_test, "leaf_processed_test.h5sd"),
    (sdata_proto_test, "proto_processed_test.h5sd"),
    (sdata_combined_test, "combined_processed_test.h5sd"),
]
for split_sdata, split_filename in split_outputs:
    split_sdata.write_h5sd(os.path.join(split_out_dir, split_filename))