In [1]:
import os
import numpy as np
import pandas as pd
import eugene as eu

Global seed set to 13


In [2]:
# Make the project-local `utils` module for the king20 use case importable from this notebook.
import sys
sys.path.append("/cellar/users/aklie/projects/use_cases/king20/")
# NOTE(review): wildcard import hides where names come from; `king20` and `seq_len_sdata`
# used below are presumably provided here — confirm and consider importing them explicitly.
from utils import *

In [3]:
# Root directory that holds the eugene datasets. It defaults to the original cluster
# location, but can be redirected without editing the notebook by exporting
# EUGENE_DATASET_DIR before starting the kernel.
dataset_dir = os.environ.get("EUGENE_DATASET_DIR", "/cellar/users/aklie/data/eugene/")

# Synthetic sequences

In [111]:
# Load the king20 "SYN" (synthetic) dataset from `dataset_dir`; with return_sdata=True the
# result is bound to `sdata`, which is displayed further down as a SeqData object.
sdata = king20("SYN", dataset_dir=dataset_dir, return_sdata=True)

Design spreadsheet already exists at /cellar/users/aklie/data/eugene/king20/design.xlsx
Model spreadsheet already exists at /cellar/users/aklie/data/eugene/king20/models.xlsx


In [112]:
# Drop entries whose sequence is missing.
# pd.isna is used instead of `seq is np.nan`: the identity test only matches the np.nan
# singleton and silently misses other NaN float objects or None. dtype=bool keeps the
# mask invertible even when there are no sequences at all.
# TODO (from original note): move this sanitization into the dataset loader.
missing_seq_mask = np.array([pd.isna(seq) for seq in sdata.seqs], dtype=bool)
sdata = sdata[~missing_seq_mask]

In [8]:
# NOTE(review): helper presumably comes from the wildcard `utils` import and presumably adds
# the `seq_len` annotation that the one-hot-encoding step reads — confirm in utils.py.
seq_len_sdata(sdata)

In [115]:
# One-hot encode every sequence, passing the longest observed sequence length as `maxlen`.
# NOTE(review): the original author flagged this approach as unsatisfactory ("no bueno")
# without saying why — revisit.
longest_seq_len = sdata["seq_len"].max()
eu.pp.ohe_seqs_sdata(sdata, maxlen=longest_seq_len)

One-hot encoding sequences:   0%|          | 0/636 [00:00<?, ?it/s]

SeqData object modified:
	ohe_seqs: None -> 636 ohe_seqs added


In [116]:
# Store the log2 of the normalized CRE activity under a new key.
# TODO (from original note): add a function registry for normalization.
# NOTE(review): np.log2 yields -inf for 0 and NaN for negative inputs; the later np.isnan
# filter removes only the NaNs — confirm there are no zero activities.
raw_activity = sdata["CRE_normalized_mean"]
sdata["log_CRE_normalized_mean"] = np.log2(raw_activity)

In [118]:
# Clean up features to use: recode True/False entries of the annotation table as 1/0.
# The replacement is applied to the whole `seqs_annot` table, not to selected columns.
sdata.seqs_annot = sdata.seqs_annot.replace({True: 1, False: 0})

In [119]:
# Train and test split; per the logged output this adds a `train_val` column to seqs_annot.
# NOTE(review): `split=0.5` is presumably the training fraction — confirm in the eugene docs.
# The split is made before the NaN rows are dropped in the next cell, so the realized
# proportions may differ slightly from the requested value.
eu.pp.train_test_split_sdata(sdata, split=0.5)

SeqData object modified:
    seqs_annot:
        + train_val


In [120]:
# Discard every sequence whose log2 activity is NaN (only NaN is filtered; infinities stay).
log_activity = sdata["log_CRE_normalized_mean"]
nan_mask = np.isnan(log_activity)
sdata = sdata[np.logical_not(nan_mask)]

In [121]:
# Display the SeqData summary to check shapes and annotation columns after filtering.
sdata

SeqData object with = 636 seqs
seqs = (624,)
names = (624,)
rev_seqs = None
ohe_seqs = (624, 4, 80)
ohe_rev_seqs = None
seqs_annot: 'Element_id_simplified', 'Class', 'CRE_normalized_mean', 'O_presence', 'S_presence', 'K_presence', 'E_presence', 'Position.4_O', 'Position.4_S', 'Position.4_K', 'Position.4_E', 'Position.3_O', 'Position.3_S', 'Position.3_K', 'Position.3_E', 'Position.2_O', 'Position.2_S', 'Position.2_K', 'Position.2_E', 'Position.1_O', 'Position.1_S', 'Position.1_K', 'Position.1_E', 'seq_len', 'log_CRE_normalized_mean', 'train_val'
pos_annot: None
seqsm: None
uns: None

In [125]:
# Persist the processed synthetic dataset under <dataset_dir>/king20/.
syn_h5sd_path = os.path.join(dataset_dir, "king20", "SYN.h5sd")
sdata.write_h5sd(syn_h5sd_path)

# Genomic sequences

In [88]:
# Load the king20 "GEN" (genomic) dataset; this rebinds `sdata`, replacing the synthetic
# data loaded above. Per the logged output, a FASTA file is downloaded into `dataset_dir`.
sdata = king20("GEN", dataset_dir=dataset_dir, return_sdata=True)

Design spreadsheet already exists at /cellar/users/aklie/data/eugene/king20/design.xlsx
Model spreadsheet already exists at /cellar/users/aklie/data/eugene/king20/models.xlsx
Downloading king20 gkmsvm fasta to /cellar/users/aklie/data/eugene/king20/GEN.fasta
Finished downloading king20 gkmsvm fasta to /cellar/users/aklie/data/eugene/king20/GEN.fasta


In [89]:
# Derive the Class label from the sequence name: everything after the first underscore
# (e.g. "..._All_Mutated" -> "All_Mutated", "..._Genomic" -> "Genomic").
# `n` is passed by keyword because positional use is deprecated and rejected by pandas >= 2.0.
sdata.seqs_annot["Class"] = sdata.seqs_annot.index.str.split("_", n=1).str[-1]

In [90]:
# Inspect how many sequences carry each Class label.
sdata["Class"].value_counts()

All_Mutated    407
Genomic        403
Name: Class, dtype: int64

In [91]:
# Keep only the sequences labelled "Genomic"; all other classes are discarded.
is_genomic = sdata["Class"].eq("Genomic")
sdata = sdata[is_genomic]

In [92]:
# NOTE(review): helper presumably comes from the wildcard `utils` import and presumably adds
# the `seq_len` annotation that the one-hot-encoding step reads — confirm in utils.py.
seq_len_sdata(sdata)

In [93]:
# One-hot encode every sequence, passing the longest observed sequence length as `maxlen`.
# NOTE(review): the original author flagged this approach as unsatisfactory ("no bueno")
# without saying why — revisit.
longest_seq_len = sdata["seq_len"].max()
eu.pp.ohe_seqs_sdata(sdata, maxlen=longest_seq_len)

One-hot encoding sequences:   0%|          | 0/403 [00:00<?, ?it/s]

SeqData object modified:
	ohe_seqs: None -> 403 ohe_seqs added


In [94]:
# Store the log2 of the wild-type CRE expression under a new key.
# TODO (from original note): add a function registry for normalization.
# NOTE(review): np.log2 yields -inf for 0 and NaN for negative inputs; the later np.isnan
# filter removes only the NaNs — confirm there are no zero expression values.
wt_expression = sdata["CRE_norm_expression_WT_all"]
sdata["log_CRE_norm_expression_WT_all"] = np.log2(wt_expression)

In [95]:
# Discard every sequence whose log2 expression is NaN (only NaN is filtered; infinities stay).
log_expression = sdata["log_CRE_norm_expression_WT_all"]
nan_mask = np.isnan(log_expression)
sdata = sdata[np.logical_not(nan_mask)]

In [57]:
# 75th percentile of the wild-type CRE expression, used as the cut-off for the top 25%.
# (Only the upper threshold is computed here; no bottom-25% cut-off is derived in this cell.)
top_25 = sdata["CRE_norm_expression_WT_all"].quantile(0.75)

In [58]:
# Boolean masks: strictly above the 75th-percentile threshold vs. at or below it.
wt_expression = sdata["CRE_norm_expression_WT_all"]
top_25_mask = wt_expression.gt(top_25)
bottom_75_mask = wt_expression.le(top_25)

In [59]:
# Sanity check: number of sequences in the top-25% group and in the remaining group.
top_25_mask.sum(), bottom_75_mask.sum()

(203, 607)

In [60]:
# Annotate if in top 25%: 1 when expression is strictly above the threshold, 0 otherwise.
sdata["top_25"] = top_25_mask.astype(int)

In [61]:
# Check the class balance of the binary `top_25` label.
sdata["top_25"].value_counts()

0    607
1    203
Name: top_25, dtype: int64

In [62]:
# Train/test split; the next cell reads the resulting `train_val` annotation.
# NOTE(review): `split=0.7` is presumably the training fraction — confirm in the eugene docs.
eu.pp.train_test_split_sdata(sdata, split=0.7)

In [63]:
# Cross-tabulate split membership against the `top_25` label to check that both classes
# are represented on each side of the split.
pd.crosstab(sdata["train_val"], sdata["top_25"])

top_25,0,1
train_val,Unnamed: 1_level_1,Unnamed: 2_level_1
False,182,61
True,425,142


In [64]:
# Persist the processed genomic dataset under <dataset_dir>/king20/.
gen_h5sd_path = os.path.join(dataset_dir, "king20", "GEN.h5sd")
sdata.write_h5sd(gen_h5sd_path)

## gkm-SVM sequence prep

In [103]:
# Display the SeqData summary for the data used in the gkm-SVM preparation.
sdata

SeqData object with = 202 seqs
seqs = (202,)
names = (202,)
rev_seqs = None
ohe_seqs = (202, 4, 82)
ohe_rev_seqs = None
seqs_annot: 'CRE_norm_expression_WT_all', 'range', 'Class', 'seq_len', 'log_CRE_norm_expression_WT_all', 'top_25'
pos_annot: None
seqsm: None
uns: None

In [97]:
# Expression cut-offs: 75th percentile (top 25%) and 25th percentile (bottom 25%).
wt_expression = sdata["CRE_norm_expression_WT_all"]
top_25 = wt_expression.quantile(0.75)
bottom_25 = wt_expression.quantile(0.25)

In [98]:
# Boolean masks: strictly above the upper cut-off, strictly below the lower cut-off.
wt_expression = sdata["CRE_norm_expression_WT_all"]
top_25_mask = wt_expression.gt(top_25)
bottom_25_mask = wt_expression.lt(bottom_25)

In [99]:
# Keep only top 25% and bottom 25% (strictly above the 75th / strictly below the 25th
# percentile); sequences in between are discarded.
sdata = sdata[top_25_mask | bottom_25_mask]

# Annotate if in top 25%. The label is recomputed from the filtered object so that it has
# exactly one entry per remaining sequence; assigning the pre-filter `top_25_mask` (which
# is longer than the filtered `sdata`) only worked through implicit index alignment.
sdata["top_25"] = (sdata["CRE_norm_expression_WT_all"] > top_25).astype(int)

In [100]:
# Check the class balance of the binary `top_25` label after keeping only the extremes.
sdata["top_25"].value_counts()

1    101
0    101
Name: top_25, dtype: int64

In [104]:
# Train/test split; per the logged output this adds a `train_val` column to seqs_annot,
# which the FASTA export below uses as `train_key`.
# NOTE(review): `split=0.8` is presumably the training fraction — confirm in the eugene docs.
eu.pp.train_test_split_sdata(sdata, split=0.8)

SeqData object modified:
    seqs_annot:
        + train_val


In [108]:
# Export FASTA input for gkm-SVM using the binary `top_25` label as target and `train_val`
# as the split key.
# NOTE(review): the regression export uses the same out_dir and file_name — confirm that
# the two calls do not overwrite each other's files.
fasta_dir = os.path.join(dataset_dir, "king20", "fasta")
eu.external.gkm_svm.to_fasta(
    sdata,
    target_key="top_25",
    train_key="train_val",
    out_dir=fasta_dir,
    file_name="GEN",
)

In [107]:
# Export FASTA input for gkm-SVM in regression mode, using the log2 expression as target
# and `train_val` as the split key.
# NOTE(review): the classification export uses the same out_dir and file_name — confirm
# that the two calls do not overwrite each other's files.
fasta_dir = os.path.join(dataset_dir, "king20", "fasta")
eu.external.gkm_svm.to_fasta(
    sdata,
    target_key="log_CRE_norm_expression_WT_all",
    train_key="train_val",
    out_dir=fasta_dir,
    task="regression",
    file_name="GEN",
)

In [109]:
# Persist the top/bottom-quartile subset used for gkm-SVM under <dataset_dir>/king20/.
gkm_h5sd_path = os.path.join(dataset_dir, "king20", "GEN_gkm_svm.h5sd")
sdata.write_h5sd(gkm_h5sd_path)

# DONE!

---