# Jores et al 2021 Extract-Transform-Load
**Authorship:**
Adam Klie, *05/18/2023*
***
**Description:**
Notebook to extract, transform, and load (ETL) data from the Jores et al. (2021) dataset.
***

In [None]:
# Load IPython's autoreload extension only if it is not already loaded in this kernel.
if 'autoreload' not in get_ipython().extension_manager.loaded:
    %load_ext autoreload
# Mode 2: reload all modules before executing each cell.
%autoreload 2

In [67]:
# General imports
import os
import numpy as np
import pandas as pd
import xarray as xr

# EUGENe imports
from eugene import preprocess as pp
from eugene import settings
# Root directory for downloaded datasets. Set the EUGENE_DATASET_DIR environment variable to run
# this notebook on another machine; otherwise the original cluster location is used.
settings.dataset_dir = os.environ.get("EUGENE_DATASET_DIR", "/cellar/users/aklie/data/eugene/revision/")

# EUGENe packages
import seqdatasets
import seqdata as sd

# Download and load in the dataset to a raw `SeqData` object

In [68]:
# Load the leaf and protoplast datasets (downloaded from the manuscript GitHub repo);
# leaf is loaded first, then proto.
sdata_leaf, sdata_proto = (
    seqdatasets.jores21(tissue, batch_size=10000) for tissue in ("leaf", "proto")
)

Dataset jores21 CNN_test_leaf.tsv has already been downloaded.
Dataset jores21 CNN_train_leaf.tsv has already been downloaded.


0it [00:00, ?it/s]

72158it [00:03, 22440.95it/s]


Dataset jores21 CNN_train_proto.tsv has already been downloaded.
Dataset jores21 CNN_test_proto.tsv has already been downloaded.


75808it [00:02, 28312.88it/s]


In [59]:
# Scratch example: a 2x3 DataArray with labelled dimensions "x" (a, b) and "y" (10, 20, 30).
da = xr.DataArray(
    np.arange(6).reshape(2, 3), [("x", ["a", "b"]), ("y", [10, 20, 30])]
)

In [61]:
# Wrap the toy array in a Dataset, stored as the variable "foo".
ds = da.to_dataset(name="foo")

In [62]:
# Second toy Dataset: variable "bar" along "x", with x labels a, b, c, d.
ds2 = xr.Dataset({"bar": ("x", [1, 2, 3, 4]), "x": list("abcd")})

In [64]:
# Merge the two toy datasets; join="left" aligns on the indexes of the first object (ds).
xr.merge([ds, ds2], join="left")

In [57]:
# Display the leaf dataset (variables, shapes, chunking and dtypes).
sdata_leaf

Unnamed: 0,Array,Chunk
Bytes,563.73 kiB,78.12 kiB
Shape,"(72158,)","(10000,)"
Dask graph,8 chunks in 2 graph layers,8 chunks in 2 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 563.73 kiB 78.12 kiB Shape (72158,) (10000,) Dask graph 8 chunks in 2 graph layers Data type float64 numpy.ndarray",72158  1,

Unnamed: 0,Array,Chunk
Bytes,563.73 kiB,78.12 kiB
Shape,"(72158,)","(10000,)"
Dask graph,8 chunks in 2 graph layers,8 chunks in 2 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,563.73 kiB,78.12 kiB
Shape,"(72158,)","(10000,)"
Dask graph,8 chunks in 2 graph layers,8 chunks in 2 graph layers
Data type,object numpy.ndarray,object numpy.ndarray
"Array Chunk Bytes 563.73 kiB 78.12 kiB Shape (72158,) (10000,) Dask graph 8 chunks in 2 graph layers Data type object numpy.ndarray",72158  1,

Unnamed: 0,Array,Chunk
Bytes,563.73 kiB,78.12 kiB
Shape,"(72158,)","(10000,)"
Dask graph,8 chunks in 2 graph layers,8 chunks in 2 graph layers
Data type,object numpy.ndarray,object numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,563.73 kiB,78.12 kiB
Shape,"(72158,)","(10000,)"
Dask graph,8 chunks in 2 graph layers,8 chunks in 2 graph layers
Data type,object numpy.ndarray,object numpy.ndarray
"Array Chunk Bytes 563.73 kiB 78.12 kiB Shape (72158,) (10000,) Dask graph 8 chunks in 2 graph layers Data type object numpy.ndarray",72158  1,

Unnamed: 0,Array,Chunk
Bytes,563.73 kiB,78.12 kiB
Shape,"(72158,)","(10000,)"
Dask graph,8 chunks in 2 graph layers,8 chunks in 2 graph layers
Data type,object numpy.ndarray,object numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,563.73 kiB,78.12 kiB
Shape,"(72158,)","(10000,)"
Dask graph,8 chunks in 2 graph layers,8 chunks in 2 graph layers
Data type,object numpy.ndarray,object numpy.ndarray
"Array Chunk Bytes 563.73 kiB 78.12 kiB Shape (72158,) (10000,) Dask graph 8 chunks in 2 graph layers Data type object numpy.ndarray",72158  1,

Unnamed: 0,Array,Chunk
Bytes,563.73 kiB,78.12 kiB
Shape,"(72158,)","(10000,)"
Dask graph,8 chunks in 2 graph layers,8 chunks in 2 graph layers
Data type,object numpy.ndarray,object numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,563.73 kiB,78.12 kiB
Shape,"(72158,)","(10000,)"
Dask graph,8 chunks in 2 graph layers,8 chunks in 2 graph layers
Data type,object numpy.ndarray,object numpy.ndarray
"Array Chunk Bytes 563.73 kiB 78.12 kiB Shape (72158,) (10000,) Dask graph 8 chunks in 2 graph layers Data type object numpy.ndarray",72158  1,

Unnamed: 0,Array,Chunk
Bytes,563.73 kiB,78.12 kiB
Shape,"(72158,)","(10000,)"
Dask graph,8 chunks in 2 graph layers,8 chunks in 2 graph layers
Data type,object numpy.ndarray,object numpy.ndarray


In [65]:
# Display the leaf dataset again.
# NOTE(review): the recorded output for this cell is a metadata table rather than a dataset
# summary, so it looks stale — re-run to confirm.
sdata_leaf

Unnamed: 0,barcodes,type,chromosome,start,end,strand,GC,UTR,mutations,sequence
0,36,miRNA,5,9362651,9362820,-,0.200000,True,,GAATATAACGAAAGTAGTACTTAATTTGTTTACATAATTTTATTTT...
1,1,miRNA,1,11230608,11230777,-,0.223529,True,,TTACGTTACTGTTAAAGACTAGTTCATGACTAGTTTAACTCAATTT...
2,3,miRNA,4,11963074,11963243,-,0.223529,True,,CATTAAGTAAACTCTGAAAGCAATATAAATAGATGAAAGCAAACGA...
3,1,miRNA,3,17418611,17418780,+,0.235294,True,,GAGTAGTTCCAAATATTTTTCATTTGACAAAAGTTTTAAAAAAAAA...
4,1,miRNA,3,22677987,22678156,+,0.241176,True,,GTCACTTAAAACAAAATGTCTATACAAATACTATTAACATAAAACA...
...,...,...,...,...,...,...,...,...,...,...
79833,4,protein_coding,5,136449470,136449639,+,0.817647,True,,CGCCGCAGCTAGCTGCCAGACGCGCGCCCGCCGCTACCTGCTGCCG...
79834,1,protein_coding,2,178392288,178392457,-,0.823529,False,,CGACACCACCGCGGGCGAACGCGCTGTGGCCTGCGGCTCCCGGCGA...
79835,7,protein_coding,7,90818879,90819048,+,0.823529,True,,TCCTCGCCCCACCATCAGCGGCCCTGGCGCGCGGGCGCGCAGGCAC...
79836,3,protein_coding,8,5443009,5443178,-,0.835294,True,164T>A,GCTCTCTCCGGCTGCTCCGCCCGCGCCGCCCTGACGTCGCTGGCCC...


In [69]:
def concat_seqdatas(seqdatas, keys):
    """Concatenate datasets along ``_sequence``, tagging each with a ``batch`` label.

    The input datasets are left untouched: each one is copied with an extra
    ``batch`` variable before concatenation.

    Parameters
    ----------
    seqdatas : iterable of xarray.Dataset
        Datasets to concatenate along the ``_sequence`` dimension.
    keys : iterable
        One label per dataset, in the same order as ``seqdatas``.

    Returns
    -------
    xarray.Dataset
        The result of ``xr.concat`` over the labelled datasets.

    Raises
    ------
    ValueError
        If ``seqdatas`` and ``keys`` do not have the same length.
    """
    seqdatas = list(seqdatas)
    keys = list(keys)
    if len(seqdatas) != len(keys):
        raise ValueError(
            f"Expected one key per dataset, got {len(seqdatas)} datasets and {len(keys)} keys"
        )
    # `assign` returns a new Dataset, so the caller's objects do not gain a `batch` variable.
    labelled = [s.assign(batch=key) for s, key in zip(seqdatas, keys)]
    return xr.concat(labelled, dim="_sequence")
sdata_combined = concat_seqdatas([sdata_leaf, sdata_proto], keys=["leaf", "proto"])

In [70]:
# Assign a unique ID to every sequence in the leaf and proto datasets
for _sdata in (sdata_leaf, sdata_proto):
    pp.make_unique_ids_sdata(_sdata)
# (disabled) pp.make_unique_ids_sdata(sdata_combined)

# TODO: Can we merge metadata with XArray dataset?

In [23]:
# Download metadata, promoters to evolve and motifs in MEME format (TODO: add to jores21 datasets module function)
#!wget https://static-content.springer.com/esm/art%3A10.1038%2Fs41477-021-00932-y/MediaObjects/41477_2021_932_MOESM3_ESM.xlsx -O /cellar/users/aklie/data/eugene/revision/jores21/41477_2021_932_MOESM3_ESM.xlsx
#!wget https://raw.githubusercontent.com/tobjores/Synthetic-Promoter-Designs-Enabled-by-a-Comprehensive-Analysis-of-Plant-Core-Promoters/main/analysis/validation_sequences/promoters_for_evolution.tsv -O /cellar/users/aklie/data/eugene/revision/jores21/promoters_for_evolution.tsv
#!wget https://raw.githubusercontent.com/tobjores/Synthetic-Promoter-Designs-Enabled-by-a-Comprehensive-Analysis-of-Plant-Core-Promoters/main/data/misc/CPEs.meme -O /cellar/users/aklie/data/eugene/revision/jores21/CPEs.meme
#!wget https://raw.githubusercontent.com/tobjores/Synthetic-Promoter-Designs-Enabled-by-a-Comprehensive-Analysis-of-Plant-Core-Promoters/main/data/misc/TF-clusters.meme -O /cellar/users/aklie/data/eugene/revision/jores21/TF-clusters.meme

In [55]:
# Read additional per-sequence features from the supplementary Excel table,
# discarding the "species" and "gene" columns.
metadata_path = os.path.join(settings.dataset_dir, "jores21", "41477_2021_932_MOESM3_ESM.xlsx")
smetadata = pd.read_excel(metadata_path, sheet_name=0, skiprows=3).drop(
    columns=["species", "gene"]
)

# Count the metadata rows whose sequence also occurs in the combined dataset.
combined_seqs = sdata_combined["seq"].to_numpy().astype("U")
smetadata["sequence"].isin(combined_seqs).sum()

76712

In [56]:
# Preview the metadata table (the "species" and "gene" columns were dropped when it was loaded).
smetadata

Unnamed: 0,barcodes,type,chromosome,start,end,strand,GC,UTR,mutations,sequence
0,36,miRNA,5,9362651,9362820,-,0.200000,True,,GAATATAACGAAAGTAGTACTTAATTTGTTTACATAATTTTATTTT...
1,1,miRNA,1,11230608,11230777,-,0.223529,True,,TTACGTTACTGTTAAAGACTAGTTCATGACTAGTTTAACTCAATTT...
2,3,miRNA,4,11963074,11963243,-,0.223529,True,,CATTAAGTAAACTCTGAAAGCAATATAAATAGATGAAAGCAAACGA...
3,1,miRNA,3,17418611,17418780,+,0.235294,True,,GAGTAGTTCCAAATATTTTTCATTTGACAAAAGTTTTAAAAAAAAA...
4,1,miRNA,3,22677987,22678156,+,0.241176,True,,GTCACTTAAAACAAAATGTCTATACAAATACTATTAACATAAAACA...
...,...,...,...,...,...,...,...,...,...,...
79833,4,protein_coding,5,136449470,136449639,+,0.817647,True,,CGCCGCAGCTAGCTGCCAGACGCGCGCCCGCCGCTACCTGCTGCCG...
79834,1,protein_coding,2,178392288,178392457,-,0.823529,False,,CGACACCACCGCGGGCGAACGCGCTGTGGCCTGCGGCTCCCGGCGA...
79835,7,protein_coding,7,90818879,90819048,+,0.823529,True,,TCCTCGCCCCACCATCAGCGGCCCTGGCGCGCGGGCGCGCAGGCAC...
79836,3,protein_coding,8,5443009,5443178,-,0.835294,True,164T>A,GCTCTCTCCGGCTGCTCCGCCCGCGCCGCCCTGACGTCGCTGGCCC...


In [None]:

# Add the sequence annotations to the combined SeqData object: left-join the metadata table
# on "sequence", then drop the join key and the "UTR" column.
# NOTE(review): this cell has no execution count, and `sdata_combined` is built with
# `xr.concat` earlier in the notebook; `.seqs_annot` is not shown to exist on that object.
# Confirm this still runs, or port it to the xarray-based API (see the TODO on merging metadata).
sdata_combined.seqs_annot = sdata_combined.seqs_annot.merge(smetadata, on="sequence", how="left")
sdata_combined.seqs_annot.drop(columns=["sequence", "UTR"], inplace=True)

In [71]:
# One-hot encode the sequences of the leaf and proto datasets
for _sdata in (sdata_leaf, sdata_proto):
    pp.ohe_seqs_sdata(_sdata)
# (disabled) pp.ohe_seqs_sdata(sdata_combined)

In [37]:
# Subset back to leaf and proto objects (disabled)
#leaf_inds = np.where(sdata_combined["batch"] == "leaf")[0]
#proto_inds = np.where(sdata_combined["batch"] == "proto")[0]
#sdata_leaf = sdata_combined.isel(_sequence=leaf_inds)
#sdata_proto = sdata_combined.isel(_sequence=proto_inds)

In [77]:
# Split each of the three datasets (leaf, proto, combined) into training and test sets
# according to the value of their "set" variable.
def _indices_where_set(sdata, label):
    """Return the integer positions at which ``sdata["set"]`` equals ``label``."""
    return np.where(sdata["set"] == label)[0]

# Positional indices of the train / test entries in each dataset
leaf_train_inds = _indices_where_set(sdata_leaf, "train")
leaf_test_inds = _indices_where_set(sdata_leaf, "test")
proto_train_inds = _indices_where_set(sdata_proto, "train")
proto_test_inds = _indices_where_set(sdata_proto, "test")
combined_train_inds = _indices_where_set(sdata_combined, "train")
combined_test_inds = _indices_where_set(sdata_combined, "test")

# Select those positions along the `_sequence` dimension
sdata_leaf_train = sdata_leaf.isel(_sequence=leaf_train_inds)
sdata_leaf_test = sdata_leaf.isel(_sequence=leaf_test_inds)
sdata_proto_train = sdata_proto.isel(_sequence=proto_train_inds)
sdata_proto_test = sdata_proto.isel(_sequence=proto_test_inds)
sdata_combined_train = sdata_combined.isel(_sequence=combined_train_inds)
sdata_combined_test = sdata_combined.isel(_sequence=combined_test_inds)

In [78]:
# Run the train/test split helper on each training split with test_size=0.1 (leaf first, then proto)
for _train_split in (sdata_leaf_train, sdata_proto_train):
    pp.train_test_split_sdata(_train_split, test_size=0.1)
# (disabled) pp.train_test_split_sdata(sdata_combined_train, test_size=0.1)

In [81]:
# Spot-check the first "gene" entry of the leaf test split (value and dtype).
sdata_leaf_test["gene"][0].to_numpy()

array('AT5G03425', dtype='<U9')

In [82]:
# For each split, cast every object-dtype coordinate and variable to a unicode string dtype ("U").
# NOTE(review): presumably needed so the datasets can be written to Zarr below — confirm.
for sdata in [sdata_leaf_train, sdata_leaf_test, sdata_proto_train, sdata_proto_test]: # , sdata_combined_train, sdata_combined_test]:
    # Coordinates: iterate over a snapshot of the keys because entries are reassigned in the loop.
    for v in list(sdata.coords.keys()):
        if sdata.coords[v].dtype == object:
            sdata.coords[v] = sdata.coords[v].astype("U")
    # All variables in the dataset, handled the same way.
    for v in list(sdata.variables.keys()):
        if sdata[v].dtype == object:
            sdata[v] = sdata[v].astype("U")

In [86]:
# NOTE(review): the recorded output of this cell is
# "AttributeError: 'Dataset' object has no attribute 'reset_encoding'", so this call fails in
# the environment the notebook was last run in — remove it or guard it before a full re-run.
sdata_leaf_test.reset_encoding()

AttributeError: 'Dataset' object has no attribute 'reset_encoding'

In [85]:
# Spot-check the first "gene" entry of the leaf test split once more (value and dtype).
sdata_leaf_test["gene"][0].to_numpy()

array('AT5G03425', dtype='<U9')

In [56]:
# IDs of the leaf training-split sequences whose "train_val" flag is False
# (entries where the flag is True are dropped).
sdata_leaf_train.where(~sdata_leaf_train["train_val"].compute(), drop=True)["id"].to_numpy()

array(['seq07162', 'seq07167', 'seq07173', ..., 'seq72140', 'seq72145',
       'seq72148'], dtype=object)

In [45]:
# All sequence IDs in the leaf training split.
sdata_leaf_train["id"].to_numpy()

array(['seq07154', 'seq07155', 'seq07156', ..., 'seq72155', 'seq72156',
       'seq72157'], dtype='<U8')

In [50]:
# Sequences of the leaf training split as a NumPy array.
# NOTE(review): the recorded output is a NameError (sdata_leaf_train undefined), so this cell
# was last run before the split was created in that kernel session.
sdata_leaf_train["seq"].to_numpy()

NameError: name 'sdata_leaf_train' is not defined

In [59]:
# Sequences of the leaf test split as a NumPy array.
sdata_leaf_test["seq"].to_numpy()

array(['TGAGTGAAGGCAGAATTGACCCATGCAGCTTCCTTTCTTTCACCACTCACTTGCTAGGAAACTACAAAAATAGAAAAAGAAAACTCACGGCAACCAAAAACGCGAACTCCTAGAGGGTTTCGAACACTTTGAAATTTGTATCAGACATCAAATGAAATCTTTAACTTCTT',
       'GTATATATATAGGCGAGGAGAGTTAAGGCCAAGGGAGCCATGGATAGCTAGGCGAGGAGGGAGAAGAAGAGAAGCAAACAGAGCAAAGAACAATGGCAAGCAGAGCAGAGGATGGTGCCATGAGCCGCATGTCCTTGTGTGAACACAGAAGAGGGCGAGAATCACAGTGC',
       'CTAAGCAGAGGAGCTAGCTAGGAAGAGGTACGGTGCTGGCTAAGCTAGCTAGATCCATCGTCCATGGAACTGAGAGCAGCAGCTACCTATATATCTAGCTGGTTTTCTAACGACGATGACGACGACCGCGGGACTAGCATGATGCAGCTAGCTGAACACAGTTGTAGGCA',
       ...,
       'CGAGGAGGCGCCCGTGAGCAGGAGAGCCCCGGCCGCCGCGGCGCTCTCCAGCCGGCGGATTGGCGGTCGTGGAGGGGGCGCGAGGAGGAGCCTCAGCGTCCGCGTCGGTACAGCTTCAGGAGGCACCGGTCGGGCGCAGCGGAGTGGGCGGCGGTGGTGCTGTTCATGGT',
       'CGCGGGAGTCCGAGATCTGAAGGCGCGGGAGGAAGGGAACGGGCGAGAGGATCTCCAGCAGCCCGCGAGCGGGCGCGGCGGTTAACGCGTCCCAGATCCCGGCGCGGCGCGGCGCCTCCCGCCCGGGCCCTGCCGGCGAGAGCAGCGAGGCGCGCGGGCGGCCCGATGAG',
       'GCGCTTTTTCCCCACCCGTCGCGGTCGGCGGGGGCTCCATGCCCGCGGGGCCACGTGGGAGGCCAGGGCGCCGAA

In [58]:
# Sequences of the leaf training split as a NumPy array (same check as an earlier cell).
sdata_leaf_train["seq"].to_numpy()

array(['CGTTTGGGTATGGACATTTAGACTTGTCGTGTTCCTGATGCCTCCCATTCCTATGGTTCTTAGGTGCTCCTTCCTCTTCCTTTCGCTAGCGCAATTGATTTAGTGATGAACACAATATACATTCCAAAGCACATAGTTAGATGAGAGCCTGATGGCAATTGGCAAGTCAG',
       'GTTAAGTACCATATAGATGATATTTGTTAAGTAGTAAGTCACTCAAAGTTTGAGTTTGGGTTTGAGTTTGAGTTTGAGTTTGAGTTTGAGAGACAAAAGATTACTACAAGAAGATTGTTAAACAAAAATGGAACACTAATTTCCGGAGCCACGGTCGTTGTTGGCTGCTG',
       'TGCAAGCACTTCACGAGGCACTATTTTGAGAGCTTTTGTGAAGTGTTTGGGGGAACTCTTGATGCCACTAACCATTTGCTAGAGGTTGGCCAAGTGTGTGCTACATGAGCCAAGTGTGTGCTACATGAAATGATTTGAACACAAATCACATTTGGAAATTATATATCAAG',
       ...,
       'GCCGCCCCGCGATTTCTTCAGCTGCCGATCGAGTGGCCGGGGAGGCGCATTGCCTCCGCTTCCACCGGGAGGGCCTCGCCTGACCGGCGCTCCCGGCGCCGCCTTGGCTCCCATGACGTGCGCGCGCGCCGAGGACGCCATCTCGCCCGCCGCCGCCGCCGCCGCCGCCG',
       'TGCCGGCTGCCGCGCGCGGGGCGGGACGCGGGAGGCGGACGGGTCAACACCCCACCCGTAGAGCACTCGAGGCCGAGCGGACGGAAGCGGGGGGTCACCCGGCCCGGCTCGGCAAAGCCGCAGCCGGGGGCCGGCGCGCGGGTGCACGAGCGCATGCACGACCGGATGGA',
       'GCGCAGCGCAGAGCCGCAGACCGCGACCGCGACCTCCGTCCGTCCCCGCGCGCAGGCGGGCGCACATGCTCGCTC

In [41]:
# Rechunk every split with a chunk size of -1 along `_sequence`
sdata_leaf_train, sdata_leaf_test, sdata_proto_train, sdata_proto_test = (
    split.chunk({"_sequence": -1})
    for split in (sdata_leaf_train, sdata_leaf_test, sdata_proto_train, sdata_proto_test)
)

In [66]:
# Display the leaf test split.
# NOTE(review): the recorded output is a NameError (sdata_leaf_test undefined), so this cell
# was last run before the split existed in that kernel session.
sdata_leaf_test

NameError: name 'sdata_leaf_test' is not defined

In [42]:
# Write the four splits to Zarr stores under <dataset_dir>/jores21, overwriting existing stores (mode="w")
_zarr_dir = os.path.join(settings.dataset_dir, "jores21")
sdata_leaf_train.to_zarr(os.path.join(_zarr_dir, "jores21_leaf_train.zarr"), mode="w")
sdata_leaf_test.to_zarr(os.path.join(_zarr_dir, "jores21_leaf_test.zarr"), mode="w")
sdata_proto_train.to_zarr(os.path.join(_zarr_dir, "jores21_proto_train.zarr"), mode="w")
sdata_proto_test.to_zarr(os.path.join(_zarr_dir, "jores21_proto_test.zarr"), mode="w")

<xarray.backends.zarr.ZarrStore at 0x1552ec240890>

---