# Ray et al 2013 Extract-Transform-Load
**Authorship:**
Adam Klie, *08/11/2022*
***
**Description:**
Notebook to extract, transform, and load (ETL) data from the Ray et al (2013) dataset.
***

In [None]:
# Load the IPython autoreload extension only once, then reload edited modules automatically before each cell runs
if 'autoreload' not in get_ipython().extension_manager.loaded:
    %load_ext autoreload
%autoreload 2

import os
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
import eugene as eu
# NOTE(review): hard-coded, user-specific absolute path -- point this at a directory on your own machine
eu.settings.dataset_dir = "/cellar/users/aklie/data/eugene"

# Download and load in the dataset to a raw `SeqData` object

In [None]:
# Load the Ray et al. (2013) dataset into a raw `SeqData` object
# NOTE(review): presumably downloads into `eu.settings.dataset_dir` when the files are missing -- confirm against the eugene docs
sdata_raw = eu.datasets.ray13()

In [None]:
# Split the raw data by probe set (Set A is used for training, Set B for testing)
probe_set = sdata_raw.seqs_annot["Probe_Set"]
sdata_setA_raw = sdata_raw[probe_set == "SetA"]
sdata_setB_raw = sdata_raw[probe_set == "SetB"]

In [None]:
# Save raw versions of these three objects (full dataset, Set A, Set B).
# `eu.settings.dataset_dir` is joined only once: passing it twice was harmless only while the
# path is absolute (os.path.join discards everything before an absolute component) and
# would produce a doubled prefix for a relative dataset directory.
sdata_raw.write_h5sd(os.path.join(eu.settings.dataset_dir, "ray13", "norm_raw.h5sd"))
sdata_setA_raw.write_h5sd(os.path.join(eu.settings.dataset_dir, "ray13", "norm_setA_raw.h5sd"))
sdata_setB_raw.write_h5sd(os.path.join(eu.settings.dataset_dir, "ray13", "norm_setB_raw.h5sd"))

# Preprocess the training set

## Preprocess the seqs
- Padded positions of the sequences are filled with a value of 0.25 in every base channel of the one-hot encoding (i.e. a uniform distribution over the four bases).

In [None]:
# One-hot encode the sequences of both probe sets with identical settings
# (RNA vocabulary, centre alignment, padding filled with 0.25)
ohe_kwargs = {"vocab": "RNA", "seq_align": "center", "fill_value": 0.25}
eu.pp.ohe_seqs_sdata(sdata_setA_raw, **ohe_kwargs)
eu.pp.ohe_seqs_sdata(sdata_setB_raw, **ohe_kwargs)

## Preprocess the targets
- The values of probe intensities are clamped at the 99.95th percentile per binding protein to eliminate outliers and balance the data.
- The probe intensities are normalized to a mean of 0 and a standard deviation of 1.

In [None]:
# Get a list of only the target columns (those whose name contains "RNCMPT")
target_mask = sdata_setA_raw.seqs_annot.columns.str.contains("RNCMPT")
target_cols = sdata_setA_raw.seqs_annot.columns[target_mask]
# Pick 9 distinct targets to spot-check in the plots and tables of this notebook.
# `replace=False` prevents the same column from being drawn (and then plotted/described) twice.
random_cols = np.random.choice(target_cols, 9, replace=False)
# Display the total number of target columns
len(target_cols)

In [None]:
# Hold out part of Set A for validation; the assignment is keyed by "train_val"
# NOTE(review): `split=0.8` is presumably the training fraction -- confirm against the eugene docs
eu.pp.train_test_split_sdata(
    sdata_setA_raw,
    train_key="train_val",
    split=0.8
)

In [None]:
# Violin plots of the raw (unclamped, unscaled) intensities for the randomly chosen targets
eu.pl.violinplot(sdata_setA_raw, keys=random_cols)

In [None]:
# Clamp every target column of Set A at its 99.95th percentile and store the clamp values
# (they are read back from `uns["clamp_nums"]` when clamping Set B)
eu.pp.clamp_targets_sdata(
    sdata_setA_raw,
    percentile=0.9995,
    target_keys=target_cols,
    train_key="train_val",
    store_clamp_nums=True
)

In [None]:
# Check the distribution of the clamped targets (summary statistics of the randomly chosen columns)
sdata_setA_raw.seqs_annot[random_cols].describe()

In [None]:
# Make sure the maxima in the summary above match the stored clamp values for the same columns
sdata_setA_raw.uns["clamp_nums"][random_cols]

In [None]:
# Violin plots of the clamped intensities for the randomly chosen targets
eu.pl.violinplot(sdata_setA_raw, keys=random_cols)

In [None]:
# Scale the targets to have mean 0 and variance 1, and store the scaler
# (it is read back from `uns["scaler"]` when scaling Set B)
eu.pp.scale_targets_sdata(
    sdata_setA_raw,
    target_keys=target_cols,
    train_key="train_val",
    suffix=False,
    store_scaler=True
)

In [None]:
# Check the distribution of the scaled targets (all target columns): mean and std should be
# close to, but not exactly, 0 and 1
# NOTE(review): presumably because the scaler is fit on the training split only -- confirm
sdata_setA_raw.seqs_annot[target_cols].describe()

In [None]:
# Violin plots of the scaled intensities for the randomly chosen targets
eu.pl.violinplot(sdata_setA_raw, keys=random_cols)

# Preprocess the test set
- We need to apply the clamping numbers from the training set to the test set.
- We need to apply the mean and standard deviation from the training set to the test set.

In [None]:
# Clamp the Set B targets with the clamp values stored from Set A
eu.pp.clamp_targets_sdata(
    sdata_setB_raw,
    target_keys=target_cols,
    clamp_nums=sdata_setA_raw.uns["clamp_nums"]
)

In [None]:
# Check the clamping on Set B (summary statistics of the randomly chosen columns)
sdata_setB_raw.seqs_annot[random_cols].describe()

In [None]:
# Scale the Set B targets with the scaler stored from Set A
eu.pp.scale_targets_sdata(
    sdata_setB_raw,
    target_keys=target_cols,
    scaler=sdata_setA_raw.uns["scaler"],
    suffix=False
)

In [None]:
# Check the scaling on Set B (summary statistics of the randomly chosen columns)
sdata_setB_raw.seqs_annot[random_cols].describe()

In [None]:
# Keep only the first 100 sequences of each set as a small subset, only for tests/use_cases/ray13
first_100 = slice(100)
sdata_setA_sub = sdata_setA_raw[first_100]
sdata_setB_sub = sdata_setB_raw[first_100]

In [None]:
# Save the processed single-task (ST) data: the 100-sequence subsets and the full objects.
# `eu.settings.dataset_dir` is joined only once: passing it twice was harmless only while the
# path is absolute (os.path.join discards everything before an absolute component) and
# would produce a doubled prefix for a relative dataset directory.
sdata_setA_sub.write_h5sd(os.path.join(eu.settings.dataset_dir, "ray13", "norm_setA_sub_ST.h5sd"))
sdata_setB_sub.write_h5sd(os.path.join(eu.settings.dataset_dir, "ray13", "norm_setB_sub_ST.h5sd"))
sdata_setA_raw.write_h5sd(os.path.join(eu.settings.dataset_dir, "ray13", "norm_setA_processed_ST.h5sd"))
sdata_setB_raw.write_h5sd(os.path.join(eu.settings.dataset_dir, "ray13", "norm_setB_processed_ST.h5sd"))

# Generate multitask ready data
 - With single task training, we can just filter out NaNs and train on the remaining data.
 - We can't do this for multitask training, so we need to generate a separate `SeqData` object where there are no NaNs.

In [None]:
# Decide which target columns to drop: those whose fraction of missing values exceeds `nan_cutoff`
# (despite its name, `nan_percents` holds fractions in [0, 1], so 0.01 means 1%)
nan_cutoff = 0.01
n_seqs = sdata_setA_raw.seqs_annot.shape[0]
nan_counts = sdata_setA_raw.seqs_annot[target_cols].isna().sum(axis=0)
nan_percents = nan_counts.sort_values(ascending=False) / n_seqs
too_sparse = nan_percents > nan_cutoff
remove_cols = nan_percents[too_sparse].index
keep_cols = target_cols.drop(remove_cols)

In [None]:
# Work on a copy of the training data and discard the target columns whose
# missing-value fraction exceeds nan_cutoff
sdata_setA_MT = sdata_setA_raw.copy()
sdata_setA_MT.seqs_annot = sdata_setA_MT.seqs_annot.drop(columns=remove_cols)

In [None]:
# Keep only the sequences that have a value for every remaining target column
complete_rows = sdata_setA_MT.seqs_annot[keep_cols].notna().all(axis=1)
keep_rows = np.where(complete_rows)[0]
sdata_setA_MT = sdata_setA_MT[keep_rows]

In [None]:
# Drop the same target columns from a copy of Set B. Rows are kept as they are: sequences with
# missing values can simply be ignored in the evaluation stage
sdata_setB_MT = sdata_setB_raw.copy()
sdata_setB_MT.seqs_annot = sdata_setB_MT.seqs_annot.drop(columns=remove_cols)

In [None]:
# Double check that the shapes make sense (Set A object has 2 extra columns, one for the set and one for the train/val split. Set B object has 1 extra column, just the set)
sdata_setA_MT.seqs_annot.shape, sdata_setB_MT.seqs_annot.shape

In [None]:
# Check that the copies worked: the original objects should still have all of their columns
sdata_setA_raw.seqs_annot.shape, sdata_setB_raw.seqs_annot.shape

In [None]:
# Double check that there are no missing values in the remaining columns (expected result: 0)
sdata_setA_MT.seqs_annot[keep_cols].isna().sum().sum()

In [None]:
# Keep only the first 100 sequences of each multitask object as a small test subset
first_100 = slice(100)
sdata_setA_MT_sub = sdata_setA_MT[first_100]
sdata_setB_MT_sub = sdata_setB_MT[first_100]

In [None]:
# Save the multitask (MT) data: the 100-sequence subsets and the full objects.
# `eu.settings.dataset_dir` is joined only once: passing it twice was harmless only while the
# path is absolute (os.path.join discards everything before an absolute component) and
# would produce a doubled prefix for a relative dataset directory.
sdata_setA_MT_sub.write_h5sd(os.path.join(eu.settings.dataset_dir, "ray13", "norm_setA_sub_MT.h5sd"))
sdata_setB_MT_sub.write_h5sd(os.path.join(eu.settings.dataset_dir, "ray13", "norm_setB_sub_MT.h5sd"))
sdata_setA_MT.write_h5sd(os.path.join(eu.settings.dataset_dir, "ray13", "norm_setA_processed_MT.h5sd"))
sdata_setB_MT.write_h5sd(os.path.join(eu.settings.dataset_dir, "ray13", "norm_setB_processed_MT.h5sd"))

# Generating a presence/absence matrix per probe
- We need to generate a presence/absence matrix per probe to use for evaluation
    - This presence/absence matrix is a binary matrix where the rows are all possible k-mers and the columns are probes.
    - The value of a cell is 1 if the k-mer is present in that probe and 0 otherwise.

> **Note**
> Each one of these matrices takes about 15 minutes to generate!

In [None]:
# Helper functions for k-mer enumeration and search
# NOTE(review): `kmer_in_seqs` is imported but not used in the cells visible here
from eugene.evaluate.utils import generate_all_possible_kmers, kmer_in_seqs

In [None]:
# Enumerate every possible 7-mer over the alphabet "ACGU" and wrap each set's probe sequences in a pandas Series
kmers = generate_all_possible_kmers(n=7, alphabet="ACGU")
a_probes, a_probes_MT, b_probes = (
    pd.Series(sdata.seqs) for sdata in (sdata_setA_raw, sdata_setA_MT, sdata_setB_raw)
)
# Display the number of probes per set and the number of k-mers
len(a_probes), len(a_probes_MT), len(b_probes), len(kmers)

In [None]:
# Generate the Set A (single-task) presence/absence matrix: one row per k-mer, one column per probe.
# `regex=False` does a plain substring search; the k-mers are literal strings, so no regex engine is needed.
a_hits = np.array([
    a_probes.str.contains(kmer, regex=False).astype(int).values
    for kmer in tqdm(kmers, desc="Searching for kmers in probes", total=len(kmers))
])
np.save(os.path.join(eu.settings.dataset_dir, "ray13", "setA_binary_ST"), a_hits)
# Display the matrix shape and a sanity check that every k-mer occurs in at least 155 probes
a_hits.shape, np.all((a_hits == 1).sum(axis=1) >= 155)

In [None]:
# Generate the Set A multitask (MT) presence/absence matrix: one row per k-mer, one column per probe.
# `regex=False` does a plain substring search; the k-mers are literal strings, so no regex engine is needed.
a_hits_MT = np.array([
    a_probes_MT.str.contains(kmer, regex=False).astype(int).values
    for kmer in tqdm(kmers, desc="Searching for kmers in probes", total=len(kmers))
])
# Save the multitask matrix itself (the single-task `a_hits` was previously written to this file by mistake)
np.save(os.path.join(eu.settings.dataset_dir, "ray13", "setA_binary_MT"), a_hits_MT)
# Display the matrix shape and a sanity check that every k-mer occurs in at least 155 probes
a_hits_MT.shape, np.all((a_hits_MT == 1).sum(axis=1) >= 155)

In [None]:
# Generate the Set B presence/absence matrix: one row per k-mer, one column per probe.
# `regex=False` does a plain substring search; the k-mers are literal strings, so no regex engine is needed.
b_hits = np.array([
    b_probes.str.contains(kmer, regex=False).astype(int).values
    for kmer in tqdm(kmers, desc="Searching for kmers in probes", total=len(kmers))
])
np.save(os.path.join(eu.settings.dataset_dir, "ray13", "setB_binary"), b_hits)
# Display the matrix shape and a sanity check that every k-mer occurs in at least 155 probes
b_hits.shape, np.all((b_hits == 1).sum(axis=1) >= 155)

---