# Ray et al 2013 Exploratory Data Analysis
**Authorship:**
Adam Klie (last updated: *06/09/2023*)
***
**Description:**
Notebook to perform a brief exploratory data analysis (EDA) on the Ray et al. (2013) dataset.
***

In [None]:
# General imports
import os
import sys
import xarray as xr
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# EUGENe imports
import eugene as eu
from eugene import plot as pl
from eugene import settings
# Absolute, machine-specific dataset directory; it is also used later in this notebook
# to build the path of the zarr store. Adjust it when running on a different machine.
settings.dataset_dir = "/cellar/users/aklie/data/eugene/revision/"

# EUGENe packages
import seqdatasets
import seqdata as sd
import seqpro as sp

# Report the interpreter and library versions in use, one per line
print(
    f"Python version: {sys.version}",
    f"NumPy version: {np.__version__}",
    f"Pandas version: {pd.__version__}",
    f"Xarray version: {xr.__version__}",
    f"Eugene version: {eu.__version__}",
    f"SeqDatasets version: {seqdatasets.__version__}",
    f"SeqData version: {sd.__version__}",
    f"SeqPro version: {sp.__version__}",
    sep="\n",
)

# Download and load in the dataset to a raw `SeqData` object

In [None]:
# Load the dataset downloaded from the RNAcomplete supplementary website
# (afterwards, double check that the index and names loaded in correctly)
sdata = seqdatasets.ray13(
    batch_size=10_000,
)

In [None]:
# Select the data variables of `sdata` to be used as targets: those whose
# name contains the motif-ID tag "RNCMPT"
column_keys = pd.Index(list(sdata.data_vars.keys()))
target_mask = column_keys.str.contains("RNCMPT")
target_cols = column_keys[target_mask]

# Number of target columns found
len(target_cols)

# Sequence information

In [None]:
# Compute the length of every sequence, store it on the dataset as "seq_len"
# along the "_sequence" dimension, then plot its distribution
seq_lengths = sp.length(sdata["seq"].values)
sdata["seq_len"] = xr.DataArray(seq_lengths, dims="_sequence")

pl.histplot(sdata, vars="seq_len", orient="h")
plt.show()

# Target information

In [None]:
# Let's take a look at the distributions of a few randomly selected RBPs across all probes.
# Sample WITHOUT replacement: `np.random.choice` defaults to `replace=True`, which can draw
# the same RBP more than once and plot it repeatedly. The sample size is capped so that
# `replace=False` cannot fail when fewer than 9 target columns are available.
n_sampled = min(9, len(target_cols))
sampled_targets = list(np.random.choice(target_cols, size=n_sampled, replace=False))
pl.violinplot(
    sdata,
    vars=sampled_targets,
)
plt.show()

In [None]:
# Plot the number of NaN values per row (top; one row per sequence, assuming the
# targets are indexed only by `_sequence`) and per target column (bottom)
# Build the NaN mask once; both histograms are derived from it.
nan_mask = sdata[target_cols].to_dataframe().isna()

fig, ax = plt.subplots(nrows=2, ncols=1)
# axis=1 sums over the target columns -> one NaN count per row
nan_mask.sum(axis=1).plot(kind="hist", ax=ax[0])
ax[0].set_xlabel("NaN count per sequence")
# axis=0 sums over the rows -> one NaN count per target column
nan_mask.sum(axis=0).plot(kind="hist", ax=ax[1])
ax[1].set_xlabel("NaN count per target column")
# Keep the added x-labels from overlapping the neighbouring subplot
fig.tight_layout()
plt.show()

In [None]:
# Determine the number of targets that would be included in the dataset if we dropped those with > 1% NaN values
# Convert to a dataframe once and reuse it below.
target_df = sdata[target_cols].to_dataframe()

# Fraction (not percentage) of missing values for each target, most-missing first.
# `sizes` is used instead of `dims[...]`: looking up a length through `Dataset.dims`
# is deprecated in xarray (assumes `sdata` is an xarray Dataset — confirm).
nan_percents = target_df.isna().sum(axis=0).sort_values(ascending=False) / sdata.sizes["_sequence"]

# Targets whose missing fraction exceeds the 1% threshold
sparse_targets = nan_percents[nan_percents > 0.01].index

cleaned_annot = target_df.drop(sparse_targets, axis=1)
cleaned_cols = target_cols.drop(sparse_targets)
cleaned_annot.head()

In [None]:
# Count the sequences that have no NaN in any of the retained target columns,
# i.e. how many would remain if every sequence with a missing value were dropped
complete_rows = cleaned_annot[cleaned_cols].notna().all(axis=1)
int(complete_rows.sum())

# DONE!

---

# Scratch

In [None]:
# Open the "ray13_norm.zarr" store from the configured dataset directory and display it
norm_zarr_path = os.path.join(settings.dataset_dir, "ray13", "ray13_norm.zarr")
sdata = sd.open_zarr(norm_zarr_path)
sdata