# Kopp et al. (2021) Exploratory Data Analysis
**Authorship:**
Adam Klie (last updated: *06/10/2023*)
***
**Description:**
This notebook is a quick exploratory data analysis of the Kopp et al. (2021) dataset (`kopp21`). The goal is to get a feel for the data and to identify any issues that need to be addressed before downstream analysis begins.
***

In [None]:
# General imports
import os
import sys
import xarray as xr
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# EUGENe imports
import eugene as eu
from eugene import plot as pl
from eugene import settings
# The dataset location can be overridden by setting the EUGENE_DATASET_DIR
# environment variable, so the notebook is runnable outside the original
# author's filesystem. When the variable is unset, the original path is used.
settings.dataset_dir = os.environ.get(
    "EUGENE_DATASET_DIR",
    "/cellar/users/aklie/data/eugene/revision/kopp21",
)

# EUGENe packages
import seqdatasets
import seqdata as sd
import seqpro as sp

# Report the interpreter and package versions used for this run
print(f"Python version: {sys.version}")
for pkg_label, pkg_module in (
    ("NumPy", np),
    ("Pandas", pd),
    ("Xarray", xr),
    ("Eugene", eu),
    ("SeqDatasets", seqdatasets),
    ("SeqData", sd),
    ("SeqPro", sp),
):
    # __version__ is read inside the loop so lookups happen in print order
    print(f"{pkg_label} version: {pkg_module.__version__}")

# Data Extraction

In [None]:
# Open the kopp21 zarr store located under the configured dataset directory
zarr_path = os.path.join(settings.dataset_dir, 'kopp21.zarr')
sdata = sd.open_zarr(zarr_path)
sdata

In [None]:
# Attach a per-sequence length variable along the "_sequence" dimension
seq_values = sdata['seq'].values
seq_lengths = sp.length(seq_values)
sdata["seq_len"] = xr.DataArray(seq_lengths, dims=["_sequence"])

# Data Visualization

In [None]:
# Sanity check: histogram of "seq_len" to confirm all sequences share one length
pl.histplot(sdata, vars="seq_len", bins=10, orient="h", figsize=(4,4))

In [None]:
def Set_Chr_Nr_(Chr):
    """Return an integer sort key for a chromosome name.

    The first three characters of the name (the "chr" prefix) are dropped and
    the remainder is mapped to a rank so that names sort in karyotype order
    rather than lexicographically.

    Parameters
    ----------
    Chr : str or bytes or None
        Chromosome name such as "chr1", "chrX" or "chrM".

    Returns
    -------
    int
        0 for an empty/None name, the chromosome number for numeric names,
        23/24/25 for X/Y/M (MT is treated like M), and a large constant for
        any other contig (e.g. "chrUn_gl000220") so that it sorts last
        instead of raising ValueError.
    """
    if not Chr:
        return 0
    if isinstance(Chr, bytes):
        # Comparing bytes against 'X'/'Y'/'M' would never match; normalise first.
        Chr = Chr.decode()

    label = Chr[3:]
    special_ranks = {'X': 23, 'Y': 24, 'M': 25, 'MT': 25}
    if label in special_ranks:
        return special_ranks[label]

    try:
        return int(label)
    except ValueError:
        # Unplaced/alternate contigs have no numeric rank; push them to the end.
        # sorted() is stable, so such contigs keep their incoming relative order.
        return 10**9
# Unique chromosome names ordered by their Set_Chr_Nr_ sort key.
# NOTE: the name `ord` shadows the Python builtin; it is kept because the
# countplot cell below reads it.
unique_chroms = np.unique(sdata["chrom"].values)
ord = sorted(unique_chroms, key=Set_Chr_Nr_)

In [None]:
# Count sequences per chromosome, with bars arranged according to `ord`
pl.countplot(sdata, vars="chrom", order=ord, xtick_rot=90)

# DONE!

---

# Scratch