# Jores et al 2021 Extract-Transform-Load
**Authorship:**
Adam Klie, *05/18/2023*
***
**Description:**
Notebook to extract, transform, and load (ETL) the data from the Jores et al. (2021) dataset.
***

In [None]:
# Load the IPython autoreload extension only if it is not already registered
# in this kernel, so re-running the cell does not load it a second time.
if 'autoreload' not in get_ipython().extension_manager.loaded:
    %load_ext autoreload
# Enable autoreload in mode 2 (see the IPython autoreload docs for the exact
# reload semantics of each mode).
%autoreload 2

In [20]:
import os
import numpy as np
import pandas as pd
from eugene import settings
from eugene import datasets
from eugene import preprocess as pp
import seqdata
# Root directory for eugene datasets; the os.path.join(settings.dataset_dir, "jores21", ...)
# reads and writes later in this notebook are all resolved relative to it.
# NOTE(review): absolute, user-specific path — must be changed to run on another machine.
settings.dataset_dir = "/cellar/users/aklie/data/eugene/revision/"

In [10]:
# Force a re-import of the already-imported seqdata package and display the
# resulting module object as the cell output.
# NOTE(review): looks like a development leftover — autoreload is enabled at the
# top of the notebook, so this is presumably redundant; confirm and consider removing.
import importlib
importlib.reload(seqdata)

<module 'seqdata' from '/cellar/users/aklie/projects/ML4GLand/SeqData/seqdata/__init__.py'>

# Download and load in the dataset to a raw `SeqData` object

In [11]:
# Fetch the leaf and protoplast ("proto") datasets published with the manuscript;
# the files are downloaded into settings.dataset_dir when they are not present yet.
sdata_leaf_raw, sdata_proto_raw = [
    datasets.jores21(dataset=assay) for assay in ("leaf", "proto")
]

Path /cellar/users/aklie/data/eugene/revision/jores21 does not exist, creating new folder.
Downloading jores21 CNN_test_leaf.tsv to /cellar/users/aklie/data/eugene/revision/jores21...
Finished downloading CNN_test_leaf.tsv
Downloading jores21 CNN_train_leaf.tsv to /cellar/users/aklie/data/eugene/revision/jores21...
Finished downloading CNN_train_leaf.tsv
Downloading jores21 CNN_train_proto.tsv to /cellar/users/aklie/data/eugene/revision/jores21...
Finished downloading CNN_train_proto.tsv
Downloading jores21 CNN_test_proto.tsv to /cellar/users/aklie/data/eugene/revision/jores21...
Finished downloading CNN_test_proto.tsv


In [13]:
# Download metadata, promoters to evolve and motifs in MEME format (TODO: add to jores21 datasets module function)
!wget https://static-content.springer.com/esm/art%3A10.1038%2Fs41477-021-00932-y/MediaObjects/41477_2021_932_MOESM3_ESM.xlsx -O /cellar/users/aklie/data/eugene/revision/jores21/41477_2021_932_MOESM3_ESM.xlsx
!wget https://raw.githubusercontent.com/tobjores/Synthetic-Promoter-Designs-Enabled-by-a-Comprehensive-Analysis-of-Plant-Core-Promoters/main/analysis/validation_sequences/promoters_for_evolution.tsv -O /cellar/users/aklie/data/eugene/revision/jores21/promoters_for_evolution.tsv
!wget https://raw.githubusercontent.com/tobjores/Synthetic-Promoter-Designs-Enabled-by-a-Comprehensive-Analysis-of-Plant-Core-Promoters/main/data/misc/CPEs.meme -O /cellar/users/aklie/data/eugene/revision/jores21/CPEs.meme
!wget https://raw.githubusercontent.com/tobjores/Synthetic-Promoter-Designs-Enabled-by-a-Comprehensive-Analysis-of-Plant-Core-Promoters/main/data/misc/TF-clusters.meme -O /cellar/users/aklie/data/eugene/revision/jores21/TF-clusters.meme

--2023-05-18 17:19:00--  https://static-content.springer.com/esm/art%3A10.1038%2Fs41477-021-00932-y/MediaObjects/41477_2021_932_MOESM3_ESM.xlsx
Resolving static-content.springer.com (static-content.springer.com)... 146.75.92.95
Connecting to static-content.springer.com (static-content.springer.com)|146.75.92.95|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 19786625 (19M) [application/vnd.openxmlformats-officedocument.spreadsheetml.sheet]
Saving to: ‘/cellar/users/aklie/data/eugene/revision/jores21/41477_2021_932_MOESM3_ESM.xlsx’


2023-05-18 17:19:00 (270 MB/s) - ‘/cellar/users/aklie/data/eugene/revision/jores21/41477_2021_932_MOESM3_ESM.xlsx’ saved [19786625/19786625]

--2023-05-18 17:19:01--  https://raw.githubusercontent.com/tobjores/Synthetic-Promoter-Designs-Enabled-by-a-Comprehensive-Analysis-of-Plant-Core-Promoters/main/analysis/validation_sequences/promoters_for_evolution.tsv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.

In [14]:
# Stack the leaf and proto objects into a single SeqData; the dictionary keys are
# passed to concat as the per-dataset labels.
raw_by_batch = {"leaf": sdata_leaf_raw, "proto": sdata_proto_raw}
sdata_combined_raw = seqdata.concat(
    list(raw_by_batch.values()),
    keys=list(raw_by_batch.keys()),
)
# Both source datasets contribute their own sequence names, so make them unique
# in the combined object.
sdata_combined_raw.make_names_unique()

In [15]:
# Sanity check: show the first rows of the per-sequence annotation table
combined_annot = sdata_combined_raw.seqs_annot
combined_annot.head()

Unnamed: 0,set,sp,gene,enrichment,batch,index
seq000000,test,At,AT5G03425,-0.537451,leaf,seq00000
seq000001,test,Sb,ENSRNA049996419,4.108127,leaf,seq00001
seq000002,test,Zm,ENSRNA049997541,-0.718419,leaf,seq00002
seq000003,test,Sb,ENSRNA049996013,1.340581,leaf,seq00003
seq000004,test,At,AT4G04125,4.227307,leaf,seq00004


In [16]:
# Read additional per-sequence features from the first sheet of the supplementary
# Excel table (the first three rows are skipped before the header).
supp_table_path = os.path.join(settings.dataset_dir, "jores21", "41477_2021_932_MOESM3_ESM.xlsx")
smetadata = (
    pd.read_excel(supp_table_path, sheet_name=0, skiprows=3)
    .drop(columns=["species", "gene"])
)
# Count how many metadata rows carry a sequence that is present in the combined SeqData
n_matched = smetadata["sequence"].isin(sdata_combined_raw.seqs).sum()
n_matched

76712

In [17]:
# Add the sequence annotations to the combined SeqData object.
# The raw sequence string is used as a temporary join key against the metadata table.
sdata_combined_raw["sequence"] = sdata_combined_raw.seqs
annot_before = sdata_combined_raw.seqs_annot
annot_merged = annot_before.merge(smetadata, on="sequence", how="left")
# A left merge keeps exactly one row per sequence only if the metadata has no
# repeated "sequence" values; fail loudly instead of silently mis-aligning the
# annotation rows with the sequences.
assert len(annot_merged) == len(annot_before), (
    "metadata contains duplicated sequences: the merge changed the number of annotation rows"
)
# DataFrame.merge on a column discards the left frame's index and returns a fresh
# RangeIndex; restore the original index so rows stay labelled by sequence name
# (a left merge preserves the row order of the left frame).
annot_merged.index = annot_before.index
# Drop the temporary join key and the "UTR" column before storing the table back
sdata_combined_raw.seqs_annot = annot_merged.drop(columns=["sequence", "UTR"])

In [18]:
# Persist the leaf, proto and combined objects under the jores21 dataset folder
raw_outputs = {
    "leaf_raw.h5sd": sdata_leaf_raw,
    "proto_raw.h5sd": sdata_proto_raw,
    "combined_raw.h5sd": sdata_combined_raw,
}
for h5sd_name, sdata_obj in raw_outputs.items():
    sdata_obj.write_h5sd(os.path.join(settings.dataset_dir, "jores21", h5sd_name))

# Transform the input data in the combined SeqData object

In [21]:
# Add reverse complement sequences and one-hot encoded sequences (forward and reverse complement).
# Both calls are used for their effect on sdata_combined_raw (return values are ignored);
# from this point on the object holds processed data despite its "_raw" name.
# NOTE(review): the reverse-complement step runs after one-hot encoding — presumably
# the order matters for how ohe_rev_seqs is produced; confirm before reordering.
pp.ohe_seqs_sdata(sdata_combined_raw)
pp.reverse_complement_seqs_sdata(sdata_combined_raw)

HBox(children=(FloatProgress(value=0.0, description='One-hot encoding sequences', max=147966.0, style=Progress…


SeqData object modified:
	ohe_seqs: None -> 147966 ohe_seqs added
SeqData object modified:
	ohe_rev_seqs: None -> 147966 ohe_rev_seqs added


In [22]:
# Write the combined object, now carrying the encoded sequences, to its own file
combined_processed_path = os.path.join(settings.dataset_dir, "jores21", "combined_processed.h5sd")
sdata_combined_raw.write_h5sd(combined_processed_path)

In [23]:
# Recover per-assay objects from the processed combined object by selecting on
# the "batch" annotation
sdata_leaf_processed, sdata_proto_processed = (
    sdata_combined_raw[sdata_combined_raw["batch"] == batch_label]
    for batch_label in ("leaf", "proto")
)

In [24]:
# Split each of the three datasets (leaf, proto, combined) into training and test
# sets using the predefined "set" annotation
sdata_leaf_train, sdata_proto_train, sdata_combined_train = (
    sdata_obj[sdata_obj["set"] == "train"]
    for sdata_obj in (sdata_leaf_processed, sdata_proto_processed, sdata_combined_raw)
)
sdata_leaf_test, sdata_proto_test, sdata_combined_test = (
    sdata_obj[sdata_obj["set"] == "test"]
    for sdata_obj in (sdata_leaf_processed, sdata_proto_processed, sdata_combined_raw)
)

In [25]:
# Flag each training sequence as belonging to the training or the validation
# split; the flag is stored in a new "train_val" annotation column.
# NOTE(review): no seed is set in this notebook — confirm how the split is seeded
# if it needs to be reproducible.
# NOTE(review): sdata_combined_train does not receive a "train_val" column here —
# confirm that this is intended.
for sdata_train in (sdata_leaf_train, sdata_proto_train):
    pp.train_test_split_sdata(sdata_train, train_key="train_val", split=0.9)

SeqData object modified:
    seqs_annot:
        + train_val
SeqData object modified:
    seqs_annot:
        + train_val


In [26]:
# Write every processed object and its train/test subsets to the jores21 folder.
# The mapping is file name -> object; files are written in the order listed.
processed_outputs = {
    "leaf_processed.h5sd": sdata_leaf_processed,
    "proto_processed.h5sd": sdata_proto_processed,
    "leaf_processed_train.h5sd": sdata_leaf_train,
    "proto_processed_train.h5sd": sdata_proto_train,
    "combined_processed_train.h5sd": sdata_combined_train,
    "leaf_processed_test.h5sd": sdata_leaf_test,
    "proto_processed_test.h5sd": sdata_proto_test,
    "combined_processed_test.h5sd": sdata_combined_test,
}
for h5sd_name, sdata_obj in processed_outputs.items():
    sdata_obj.write_h5sd(os.path.join(settings.dataset_dir, "jores21", h5sd_name))

---