# Jores et al 2021 Extract-Transform-Load
**Authorship:**
Adam Klie, *05/18/2023*
***
**Description:**
Notebook to extract, transform, and load (ETL) data from the Jores et al (2021) dataset.
***

In [None]:
# Load the autoreload extension only if it is not already registered with this kernel,
# so that re-running the cell does not load it a second time.
if 'autoreload' not in get_ipython().extension_manager.loaded:
    %load_ext autoreload
# Autoreload mode 2: per the IPython docs, modules are reloaded before code is executed.
%autoreload 2

In [32]:
# General imports
import os
import numpy as np
import pandas as pd
import xarray as xr

# EUGENe imports
from eugene import preprocess as pp
from eugene import settings
# Root directory under which the jores21 files are read and written by this notebook.
# Set the EUGENE_DATASET_DIR environment variable to run on another machine; when it is
# unset, the original hard-coded location is used, so existing runs are unaffected.
settings.dataset_dir = os.environ.get(
    "EUGENE_DATASET_DIR", "/cellar/users/aklie/data/eugene/revision/"
)

# EUGENe packages
import seqdatasets
import seqdata as sd

# Download and load in the dataset to a raw `SeqData` object

In [33]:
# Load the "leaf" and "proto" jores21 datasets (files already on disk are reused, as the
# cell output reports); both are read with the same batch size.
sdata_leaf, sdata_proto = [
    seqdatasets.jores21(dataset_name, batch_size=10000)
    for dataset_name in ("leaf", "proto")
]

Dataset jores21 CNN_test_leaf.tsv has already been downloaded.
Dataset jores21 CNN_train_leaf.tsv has already been downloaded.


72158it [00:09, 7678.88it/s]


Dataset jores21 CNN_train_proto.tsv has already been downloaded.
Dataset jores21 CNN_test_proto.tsv has already been downloaded.


75808it [00:06, 11461.03it/s]


In [34]:
def concat_seqdatas(seqdatas, keys):
    """Tag each dataset with a batch label and concatenate them along "_sequence".

    Parameters
    ----------
    seqdatas : list
        Datasets to combine. Each one is modified in place: its "batch" entry is
        set to the label at the same position in ``keys``.
    keys : sequence
        One label per dataset, in the same order as ``seqdatas``.

    Returns
    -------
    The result of ``xr.concat`` over ``seqdatas`` with ``dim="_sequence"``.
    """
    position = 0
    for dataset in seqdatas:
        # Side effect: the caller's dataset object gains/overwrites "batch".
        dataset["batch"] = keys[position]
        position += 1
    return xr.concat(seqdatas, dim="_sequence")
# Build the combined dataset. Note that concat_seqdatas also writes a "batch" label
# ("leaf" / "proto") onto sdata_leaf and sdata_proto themselves.
sdata_combined = concat_seqdatas([sdata_leaf, sdata_proto], keys=["leaf", "proto"])

In [35]:
# Assign a unique ID to every sequence in the leaf and proto datasets. The function is
# called for its effect on each dataset; its return value is not used.
for tissue_sdata in (sdata_leaf, sdata_proto):
    pp.make_unique_ids_sdata(tissue_sdata)
# The combined dataset is not processed in this cell (call left disabled):
#pp.make_unique_ids_sdata(sdata_combined)

# TODO: Can we merge metadata with XArray dataset?

In [23]:
# Download metadata, promoters to evolve and motifs in MEME format (TODO: add to jores21 datasets module function)
#!wget https://static-content.springer.com/esm/art%3A10.1038%2Fs41477-021-00932-y/MediaObjects/41477_2021_932_MOESM3_ESM.xlsx -O /cellar/users/aklie/data/eugene/revision/jores21/41477_2021_932_MOESM3_ESM.xlsx
#!wget https://raw.githubusercontent.com/tobjores/Synthetic-Promoter-Designs-Enabled-by-a-Comprehensive-Analysis-of-Plant-Core-Promoters/main/analysis/validation_sequences/promoters_for_evolution.tsv -O /cellar/users/aklie/data/eugene/revision/jores21/promoters_for_evolution.tsv
#!wget https://raw.githubusercontent.com/tobjores/Synthetic-Promoter-Designs-Enabled-by-a-Comprehensive-Analysis-of-Plant-Core-Promoters/main/data/misc/CPEs.meme -O /cellar/users/aklie/data/eugene/revision/jores21/CPEs.meme
#!wget https://raw.githubusercontent.com/tobjores/Synthetic-Promoter-Designs-Enabled-by-a-Comprehensive-Analysis-of-Plant-Core-Promoters/main/data/misc/TF-clusters.meme -O /cellar/users/aklie/data/eugene/revision/jores21/TF-clusters.meme

In [6]:
# Load some other sequence features
# Reads the first sheet of the supplementary Excel file, skipping its first three rows.
smetadata = pd.read_excel(
    os.path.join(settings.dataset_dir, "jores21", "41477_2021_932_MOESM3_ESM.xlsx"),
    sheet_name=0, 
    skiprows=3, 
)
# Remove the "species" and "gene" columns from the metadata table (in place).
smetadata.drop(columns=["species", "gene"], inplace=True)
# Counts the metadata sequences that also occur in sdata_combined["seq"].
# NOTE(review): the count is neither stored nor the cell's last expression, so it is
# computed and then discarded without being displayed.
smetadata["sequence"].isin(sdata_combined["seq"].to_numpy().astype("U")).sum()
# Add the sequence annotations to the combined SeqData object
# NOTE(review): sdata_combined is produced by xr.concat in this notebook, and `seqs_annot`
# looks like an attribute from an older SeqData API — confirm this cell still runs. Its
# execution count (6) is also out of order relative to the surrounding cells.
sdata_combined.seqs_annot = sdata_combined.seqs_annot.merge(smetadata, on="sequence", how="left")
# Drop the merge key and the "UTR" column from the annotations (in place).
sdata_combined.seqs_annot.drop(columns=["sequence", "UTR"], inplace=True)

In [36]:
# One-hot encode the sequences of the leaf and proto datasets. The function is called
# for its effect on each dataset; its return value is not used.
for tissue_sdata in (sdata_leaf, sdata_proto):
    pp.ohe_seqs_sdata(tissue_sdata)
# The combined dataset is not encoded in this cell (call left disabled):
#pp.ohe_seqs_sdata(sdata_combined)

In [37]:
# Subset back to leaf and proto objects
#leaf_inds = np.where(sdata_combined["batch"] == "leaf")[0]
#proto_inds = np.where(sdata_combined["batch"] == "proto")[0]
#sdata_leaf = sdata_combined.isel(_sequence=leaf_inds)
#sdata_proto = sdata_combined.isel(_sequence=proto_inds)

In [38]:
# Split each of the three datasets (leaf, proto, combined) into train and test subsets,
# based on the label stored in each dataset's "set" variable.

# Positional indices of the "train" and "test" entries along "_sequence".
leaf_train_inds, leaf_test_inds = [
    np.where(sdata_leaf["set"] == split_label)[0] for split_label in ("train", "test")
]
proto_train_inds, proto_test_inds = [
    np.where(sdata_proto["set"] == split_label)[0] for split_label in ("train", "test")
]
combined_train_inds, combined_test_inds = [
    np.where(sdata_combined["set"] == split_label)[0] for split_label in ("train", "test")
]

# Select the corresponding sequences from each dataset.
sdata_leaf_train, sdata_leaf_test = [
    sdata_leaf.isel(_sequence=split_inds)
    for split_inds in (leaf_train_inds, leaf_test_inds)
]
sdata_proto_train, sdata_proto_test = [
    sdata_proto.isel(_sequence=split_inds)
    for split_inds in (proto_train_inds, proto_test_inds)
]
sdata_combined_train, sdata_combined_test = [
    sdata_combined.isel(_sequence=split_inds)
    for split_inds in (combined_train_inds, combined_test_inds)
]

In [39]:
# Run EUGENe's train/test split on the leaf and proto training subsets with
# test_size=0.1. The function is called for its effect; its return value is not used.
# NOTE(review): no seed is passed here — if the split is randomized, the assignment
# will differ between runs; confirm whether a seed should be set.
for training_sdata in (sdata_leaf_train, sdata_proto_train):
    pp.train_test_split_sdata(training_sdata, test_size=0.1)
# The combined training subset is not split in this cell (call left disabled):
#pp.train_test_split_sdata(sdata_combined_train, test_size=0.1)

In [40]:
# Replace every object-dtype coordinate and variable with a unicode-typed copy, in place,
# on each of the four leaf/proto splits.
# The combined splits (sdata_combined_train, sdata_combined_test) are not included.
for sdata in (sdata_leaf_train, sdata_leaf_test, sdata_proto_train, sdata_proto_test):
    # Snapshot the names first so that reassigning entries does not disturb iteration.
    coord_names = list(sdata.coords.keys())
    for name in coord_names:
        coord = sdata.coords[name]
        if coord.dtype == object:
            sdata.coords[name] = coord.astype("unicode")
    # Taken after the coordinate pass, so it reflects any coordinates converted above.
    variable_names = list(sdata.variables.keys())
    for name in variable_names:
        variable = sdata[name]
        if variable.dtype == object:
            sdata[name] = variable.astype("unicode")

In [41]:
# Rechunk each split along "_sequence" with chunk size -1 (by the xarray/dask convention,
# a single chunk spanning the whole dimension) and rebind the names to the results.
sdata_leaf_train, sdata_leaf_test, sdata_proto_train, sdata_proto_test = [
    split_sdata.chunk({"_sequence": -1})
    for split_sdata in (
        sdata_leaf_train,
        sdata_leaf_test,
        sdata_proto_train,
        sdata_proto_test,
    )
]

In [42]:
# Write each split to its own Zarr store inside <dataset_dir>/jores21, using mode="w".
jores21_dir = os.path.join(settings.dataset_dir, "jores21")
sdata_leaf_train.to_zarr(os.path.join(jores21_dir, "jores21_leaf_train.zarr"), mode="w")
sdata_leaf_test.to_zarr(os.path.join(jores21_dir, "jores21_leaf_test.zarr"), mode="w")
sdata_proto_train.to_zarr(os.path.join(jores21_dir, "jores21_proto_train.zarr"), mode="w")
# Kept as the final expression so the cell still displays the returned store object.
sdata_proto_test.to_zarr(os.path.join(jores21_dir, "jores21_proto_test.zarr"), mode="w")

<xarray.backends.zarr.ZarrStore at 0x1552ec240890>

---