# Jores et al 2021 Extract-Transform-Load
**Authorship:**
Adam Klie (last updated: *06/08/2023*)
***
**Description:**
Notebook to extract, transform, and load (ETL) data from the Jores et al (2021) dataset.
***

In [None]:
# General imports
import os
import sys
import numpy as np
import pandas as pd

# EUGENe imports
import eugene as eu
from eugene import dataload as dl
from eugene import preprocess as pp
from eugene import settings
# Root data directory for this notebook: the metadata spreadsheet is read from, and the
# Zarr outputs are written to, its "jores21" subdirectory. Change this to your own path.
settings.dataset_dir = "/cellar/users/aklie/data/eugene/revision/"

# EUGENe packages
import seqdatasets
import seqdata as sd

# Report the interpreter and library versions used for this run, one per line
print(
    f"Python version: {sys.version}",
    f"NumPy version: {np.__version__}",
    f"Pandas version: {pd.__version__}",
    f"Eugene version: {eu.__version__}",
    f"SeqDatasets version: {seqdatasets.__version__}",
    f"SeqData version: {sd.__version__}",
    sep="\n",
)

# Download and load the dataset into raw `SeqData` objects

In [None]:
# Load the "leaf" and "proto" datasets (sourced from the manuscript's GitHub repo),
# in that order, using the same batch size for both
sdata_leaf_raw, sdata_proto_raw = (
    seqdatasets.jores21(dataset_name, batch_size=10000)
    for dataset_name in ("leaf", "proto")
)

In [None]:
# Download the supplementary metadata table, the promoters to evolve, and the motifs in MEME format. Uncomment the commands below and replace the output paths with your own.
#!wget https://static-content.springer.com/esm/art%3A10.1038%2Fs41477-021-00932-y/MediaObjects/41477_2021_932_MOESM3_ESM.xlsx -O /cellar/users/aklie/data/eugene/revision/jores21/41477_2021_932_MOESM3_ESM.xlsx
#!wget https://raw.githubusercontent.com/tobjores/Synthetic-Promoter-Designs-Enabled-by-a-Comprehensive-Analysis-of-Plant-Core-Promoters/main/analysis/validation_sequences/promoters_for_evolution.tsv -O /cellar/users/aklie/data/eugene/revision/jores21/promoters_for_evolution.tsv
#!wget https://raw.githubusercontent.com/tobjores/Synthetic-Promoter-Designs-Enabled-by-a-Comprehensive-Analysis-of-Plant-Core-Promoters/main/data/misc/CPEs.meme -O /cellar/users/aklie/data/eugene/revision/jores21/CPEs.meme
#!wget https://raw.githubusercontent.com/tobjores/Synthetic-Promoter-Designs-Enabled-by-a-Comprehensive-Analysis-of-Plant-Core-Promoters/main/data/misc/TF-clusters.meme -O /cellar/users/aklie/data/eugene/revision/jores21/TF-clusters.meme

In [None]:
# Merge the two raw datasets into a single SeqData object, labelling each by its key
# (presumably these keys become the `batch` values used for subsetting later — confirm)
raw_by_key = {"leaf": sdata_leaf_raw, "proto": sdata_proto_raw}
sdata_combined = dl.concat_seqdatas(list(raw_by_key.values()), keys=list(raw_by_key))

# Presumably makes the sequence ids unique across the merged object — confirm
pp.make_unique_ids_sdata(sdata_combined)

In [None]:
# Read extra per-sequence annotations from the first sheet of the supplementary
# spreadsheet, skipping the three rows above the header
supp_table_path = os.path.join(
    settings.dataset_dir, "jores21", "41477_2021_932_MOESM3_ESM.xlsx"
)
smetadata = pd.read_excel(supp_table_path, sheet_name=0, skiprows=3)

# Discard the "species" and "gene" columns before the annotations are attached
smetadata = smetadata.drop(columns=["species", "gene"])

In [None]:
# Add the sequence annotations from `smetadata` to the combined SeqData object.
# NOTE(review): presumably matches the object's "seq" values (left_on) against the
# table's "sequence" column (right_on) — confirm against the `dl.add_obs` documentation.
dl.add_obs(
    sdata=sdata_combined,
    obs=smetadata,
    left_on="seq",
    right_on="sequence",
)

# Transform the input data in the combined SeqData object

In [None]:
# One-hot encode the sequences in the combined SeqData object
# NOTE(review): the return value is not used, so the encoding is presumably stored on
# `sdata_combined` in place — confirm
pp.ohe_seqs_sdata(sdata_combined)

In [None]:
# Load the "set" variable into memory before splitting (otherwise you will get an error
# when trying to select with boolean dask arrays)
sdata_combined["set"] = sdata_combined["set"].load()

In [None]:
# Recover the per-dataset objects by selecting sequences on their `batch` label
batch_labels = sdata_combined["batch"]
sdata_leaf = sdata_combined.sel(_sequence=batch_labels == "leaf")
sdata_proto = sdata_combined.sel(_sequence=batch_labels == "proto")

In [None]:
# Split each of the two datasets into its training and test sets using the "set" labels
leaf_set = sdata_leaf["set"]
proto_set = sdata_proto["set"]

sdata_leaf_train = sdata_leaf.sel(_sequence=leaf_set == "train")
sdata_leaf_test = sdata_leaf.sel(_sequence=leaf_set == "test")

sdata_proto_train = sdata_proto.sel(_sequence=proto_set == "train")
sdata_proto_test = sdata_proto.sel(_sequence=proto_set == "test")

In [None]:
# Flag each training-set sequence as belonging to either the training or the validation
# split, using the same held-out fraction for both datasets
val_fraction = 0.1
pp.train_test_split_sdata(sdata_leaf_train, test_size=val_fraction)
pp.train_test_split_sdata(sdata_proto_train, test_size=val_fraction)

In [None]:
# Write every split to its own Zarr store under the "jores21" data directory,
# overwriting any existing store (mode="w")
splits_to_save = {
    "jores21_leaf_train.zarr": sdata_leaf_train,
    "jores21_leaf_test.zarr": sdata_leaf_test,
    "jores21_proto_train.zarr": sdata_proto_train,
    "jores21_proto_test.zarr": sdata_proto_test,
}
for store_name, split_sdata in splits_to_save.items():
    store_path = os.path.join(settings.dataset_dir, "jores21", store_name)
    sd.to_zarr(split_sdata, store_path, load_first=True, mode="w")

# DONE!

---

# Scratch

## Test to make sure the save was successful

In [None]:
# Sanity check: reopen each saved Zarr store and print a short summary of its contents.
# The directory is kept in a local variable derived from `settings.dataset_dir` instead of
# reassigning the global setting, so re-running earlier cells afterwards still resolves
# their paths against the original root (and this cell follows any change to that root).
jores21_dir = os.path.join(settings.dataset_dir, "jores21")
zarr_names = [
    "jores21_leaf_train.zarr",
    "jores21_proto_train.zarr",
    "jores21_leaf_test.zarr",
    "jores21_proto_test.zarr",
]
for zarr_name in zarr_names:
    print(zarr_name)
    sdata = sd.open_zarr(os.path.join(jores21_dir, zarr_name))

    # Length of the `_sequence` dimension; `sizes` is the name -> length mapping
    # (length lookup through `Dataset.dims[...]` is deprecated in xarray)
    print(sdata.sizes["_sequence"])

    # Counts per value of the "set" labels in this store
    print(np.unique(sdata["set"].values, return_counts=True))

    # "train_val" is only reported when the store actually has that variable
    if "train_val" in sdata.data_vars:
        print(np.unique(sdata["train_val"].values, return_counts=True))
    else:
        print("No train_val column found")

    # Counts of the "sp" annotation, a peek at the first ids, and the batch composition
    print(np.unique(sdata["sp"].values, return_counts=True))
    print(sdata["id"].values[:5])
    print(np.unique(sdata["batch"].values, return_counts=True))