# Ray et al 2013 Exploratory Data Analysis
**Authorship:**
Adam Klie, *08/27/2022*
***
**Description:**
Notebook that performs a brief exploratory data analysis (EDA) of the Ray et al. (2013) dataset.
***

In [None]:
# Load IPython's autoreload extension only if it is not already loaded (safe to re-run this cell)
if 'autoreload' not in get_ipython().extension_manager.loaded:
    %load_ext autoreload
# Mode 2: reload all modules before executing each cell, so edits to imported code are picked up
%autoreload 2

import os
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import eugene as eu
# NOTE(review): absolute, user-specific path; presumably where eugene stores/looks up datasets.
# Point this at a directory that exists on your machine before running the notebook.
eu.settings.dataset_dir = "/cellar/users/aklie/data/eugene"

# Download the dataset and load it into a raw `SeqData` object

In [None]:
# Load in the downloaded dataset from the RNAcompete supplementary website and double check that the index and names loaded in correctly
sdata = eu.datasets.ray13()
# Sanity check (cell output): True only if every entry of sdata.names equals the matching seqs_annot index label
np.all(sdata.names == sdata.seqs_annot.index)

In [None]:
# Targets are the seqs_annot columns whose name contains a motif ID tag ("RNCMPT")
target_mask = sdata.seqs_annot.columns.str.contains("RNCMPT")  # boolean mask over all annotation columns
target_cols = sdata.seqs_annot.columns[target_mask]            # the matching column labels
# Cell output: how many target columns were found
target_cols.size

# Sequence information

In [None]:
# Store each sequence's length under the "seq_len" key of sdata, then plot the distribution of lengths
sdata["seq_len"] = list(map(len, sdata.seqs))
eu.pl.histplot(sdata, keys="seq_len", orient="h")
plt.show()

# Target information

In [None]:
# Let's take a look at the distributions of a few randomly selected RBPs across all probes.
# np.random.choice samples WITH replacement by default, which could pick the same RBP
# more than once; sample without replacement and cap the sample size at the number of
# available targets so the call cannot fail when fewer than 9 targets exist.
eu.pl.violinplot(
    sdata,
    keys=np.random.choice(target_cols, min(9, len(target_cols)), replace=False)
)
plt.show()

In [None]:
# Plot the number of NaN values per sequence (top) and per target column (bottom).
# Only the target columns are inspected, so non-target annotation columns do not
# contribute to either count.
target_nans = sdata.seqs_annot[target_cols].isna()  # computed once, reused for both histograms
fig, ax = plt.subplots(nrows=2, ncols=1)
target_nans.sum(axis=1).plot(kind="hist", ax=ax[0])
ax[0].set_xlabel("NaN targets per sequence")
target_nans.sum(axis=0).plot(kind="hist", ax=ax[1])
ax[1].set_xlabel("NaN sequences per target")
fig.tight_layout()  # keep the added axis labels from overlapping the neighbouring subplot
plt.show()

In [None]:
# Work out which targets survive if every target with more than 1% missing values is dropped.
# Despite its name, nan_percents holds fractions (0-1) of missing values per target, highest first.
nan_counts = sdata.seqs_annot[target_cols].isna().sum(axis=0)
nan_percents = nan_counts.sort_values(ascending=False) / sdata.seqs_annot.shape[0]
# Labels of the targets above the 1% threshold
sparse_targets = nan_percents[nan_percents > 0.01].index
cleaned_annot = sdata.seqs_annot.drop(columns=sparse_targets)
cleaned_cols = target_cols.drop(sparse_targets)
# Cell output: preview of the annotations with the sparse targets removed
cleaned_annot.head()

In [None]:
# Count the sequences that have a value for every remaining target,
# i.e. the rows that survive dropping any row containing a NaN
len(cleaned_annot[cleaned_cols].dropna())

---