# 2021 OLS Exact Syntax Match Sequences Preprocessing Notebook

**Authorship:**
Adam Klie *11/17/2021*
***

**Description:**
This data comes from genomic sequences of *Ciona intestinalis* that closely match the OLS library sequences. These sequences were electroporated into *Ciona* embryos and their activity was assayed under the microscope. A sequence is classified as **functional** if it was scored as anything other than non-functional under the microscope.

***
**TODOs:**
 - <font color='green'> Load in and preprocess excel sheet </font>
 - <font color='green'> Preprocess into FASTA, labels and seq ids</font>
 - <font color='green'> Preprocess into ohe </font>
 - <font color='green'> Clean up notebook </font>
***

# Set-up

In [122]:
# Classics
import os
import numpy as np
import pandas as pd
import tqdm
import pickle

# Autoreload extension
if 'autoreload' not in get_ipython().extension_manager.loaded:
    %load_ext autoreload

%autoreload 2

import sys
sys.path.append("/cellar/users/aklie/projects/EUGENE/bin/")
import project_utils
import otx_enhancer_utils

In [124]:
# Tags identifying which training-set statistics are used to z-score the mixed encodings below
PREPROCESS = "0.09-0.4"  # Preprocessing tag; part of the stats-pickle names that are loaded and of the array names that are saved
SPLIT = 0.9  # Train/test split tag; used together with PREPROCESS in those same file names

# Load data to preprocess

In [125]:
# Read the first sheet of the spreadsheet holding the sequences and their functional annotation
dataset = pd.read_excel("data4adam.xlsx", sheet_name=0)

# Standardize the column names, then add the tile tag and the length of every sequence
dataset.columns = ["NAME", "SEQ", "FXN_LABEL", "FXN_DESCRIPTION"]
dataset = dataset.assign(TILE="Full", SEQ_LEN=dataset["SEQ"].map(len))
dataset.head()

Unnamed: 0,NAME,SEQ,FXN_LABEL,FXN_DESCRIPTION,TILE,SEQ_LEN
0,EM.Ci1,CCTATCTTAGATATTGAACAATTTCCTTTTCGAAAACTTCCGTTAG...,1,WT Control,Full,53
1,EM.Dr1,TTTTCCAGTTGCGATATTTTTGGTATTTATCTTCAGGAAATCACTA...,1,b-lineage only,Full,61
2,EM.Ggd1,CAGATATGGTACCTTGCGATATTATAGTCAGGAAGCATGGAAAGAA...,0,No expr,Full,53
3,EM.Mm6,AATATCTGTTTATCACCAAAGGAAGTCAAGACAGGAACTATGCAGA...,0,No expr,Full,58
4,EM.Hs9,CATATCTACAAAGGAAGTAACCATACGATAGTATAAAATGATAGAA...,0,No expr,Full,65


In [126]:
# Save the full (non-tiled) dataframe as a tab-separated file, without the row index
dataset.to_csv("2021_OLS_Exact_Syntax_Match.tsv", index=False, sep="\t")

# Double check valid labeling: count how many sequences carry each functional label
dataset["FXN_LABEL"].value_counts()

1    7
0    6
Name: FXN_LABEL, dtype: int64

# Save labels, IDs, and Seqs

### **Binary Labels**
0 (non-functional) and 1 (functional). These are all the same

In [127]:
# Save the labels
if not os.path.isdir("binary"):
    os.makedirs("binary")

y = dataset["FXN_LABEL"].values
np.savetxt("binary/y_binary.txt", X=y, fmt="%d")

!wc -l binary/*

13 binary/y_binary.txt


### **Identifiers**
Name of the sequence to identify it

In [128]:
# Save the ids
if not os.path.isdir("id"):
    os.makedirs("id")
    
ID = dataset["NAME"].values
np.savetxt("id/id.txt", X=ID, fmt="%s")

!wc -l id/*

13 id/id.txt


### **Sequences**
ACGT...

In [129]:
# Save the seqs
if not os.path.isdir("seqs"):
    os.makedirs("seqs")
    
X_seqs = dataset["SEQ"].values
np.savetxt("seqs/seqs.txt", X=X_seqs, fmt="%s")

!wc -l seqs/*

13 seqs/seqs.txt


# Preprocess and save different feature sets

## **<u>Sequence feature idea 1 </u>**: One hot encoding block features
 - linker_{1-5} could be a one hot encoded vector of length 6 that can be any of L1-L5 --> e.g., [0, 1, 0, 0, 0] encodes S2
 - TFBS_{1-5} could be a one hot encoded vector of length 10 that can be G{1-3}R, G{1-3}F, E{1,2}F, E{1,2}R
 - **Note**: L6 is only S6

In [130]:
#TODO: Would this also need to include the closest linker without being redundant?

## **<u>Sequence feature idea 2 </u>**: Mixed encodings 

In [142]:
# Build the three mixed encodings from the dataset; valid_idxs is used below to select
# the IDs that are saved next to each encoding
X_mixed1s, X_mixed2s, X_mixed3s, valid_idxs = otx_enhancer_utils.mixed_encode(dataset)
# Sanity check: shape of each encoding and the number of valid sequences
X_mixed1s.shape, X_mixed2s.shape, X_mixed3s.shape, len(valid_idxs)

13it [00:00, 106.34it/s]


((13, 21), (13, 26), (13, 21), 13)

In [143]:
# Show the first row of each encoding (before any scaling is applied)
X_mixed1s[0], X_mixed2s[0], X_mixed3s[0]

(array([ 0.        ,  0.        ,  0.        ,  0.44555489, -1.        ,
         0.        ,  1.        ,  0.32103457,  5.        ,  1.        ,
         0.        ,  0.39163576,  7.        ,  1.        ,  0.        ,
         0.58195404,  2.        ,  0.        ,  1.        ,  0.84677173,
         0.        ]),
 array([ 0.        ,  0.        ,  0.        ,  0.44555489, -1.        ,
        -1.        ,  0.        ,  0.        ,  0.32103457,  1.        ,
         5.        ,  0.39163576, -1.        ,  0.        ,  0.        ,
         7.        ,  0.58195404, -1.        ,  0.        ,  0.        ,
         2.        ,  0.        ,  0.        ,  0.84677173,  1.        ,
         0.        ]),
 array([ 0.        ,  0.        ,  0.44555489,  0.        , -1.        ,
         0.        ,  0.32103457,  1.        ,  5.        ,  0.39163576,
         0.        ,  0.        ,  7.        ,  0.58195404,  0.        ,
         0.        ,  2.        ,  0.        ,  0.84677173,  1.        ,
     

### *Mixed 1.0*
 - Replace binding sites using dictionary
 - Separate based on these binding sites and create "dummy variables"
 - Get lengths of linkers around binding sites

In [144]:
# Load in training stats
with open("../2021_OLS_Library/mixed_1.0/{}_X-train-{}_stats.pickle".format(PREPROCESS, SPLIT), 'rb') as handle:
    train_stats = pickle.load(handle)
scale_indeces = train_stats["indeces"]

# Full sequences
X_mixed1s[:, scale_indeces] -= train_stats["means"]
X_mixed1s[:, scale_indeces] /= train_stats["stds"]

# Save the vals
if not os.path.isdir("mixed_1.0"):
    os.makedirs("mixed_1.0")
    
np.save("mixed_1.0/{}_{}-split_X-test_mixed-1.0".format(PREPROCESS, SPLIT), X_mixed1s)
np.savetxt("mixed_1.0/valid_id.txt", X=ID[valid_idxs], fmt="%s")

!ls -l mixed_1.0/*

-rw-r--r-- 1 aklie carter-users 2312 Nov 27 17:36 mixed_1.0/0.09-0.4_0.9-split_X-test_mixed-1.0.npy
-rw-r--r-- 1 aklie carter-users 2312 Nov 26 13:50 mixed_1.0/0.18-0.4_0.9-split_X-test_mixed-1.0.npy
-rw-r--r-- 1 aklie carter-users   92 Nov 26 23:02 mixed_1.0/id-valid.txt
-rw-r--r-- 1 aklie carter-users   92 Nov 27 17:36 mixed_1.0/valid_id.txt


### *Mixed 2.0*
 - Replace binding sites using dictionary
 - 4 bit vector for each binding site [ets_affinity ets_orientation gata_affinity gata_orientation] - ties together the identity to affinity
 - Get lengths of linkers around binding sites

In [145]:
# Load in training stats
with open("../2021_OLS_Library/mixed_2.0/{}_X-train-{}_stats.pickle".format(PREPROCESS, SPLIT), 'rb') as handle:
    train_stats = pickle.load(handle)
scale_indeces = train_stats["indeces"]

# Full sequences
X_mixed2s[:, scale_indeces] -= train_stats["means"]
X_mixed2s[:, scale_indeces] /= train_stats["stds"]

# Save the vals
if not os.path.isdir("mixed_2.0"):
    os.makedirs("mixed_2.0")
    
np.save("mixed_2.0/{}_{}-split_X-test_mixed-2.0".format(PREPROCESS, SPLIT), X_mixed2s)
np.savetxt("mixed_2.0/valid_id.txt", X=ID[valid_idxs], fmt="%s")

!ls -l mixed_2.0/*

-rw-r--r-- 1 aklie carter-users 2832 Nov 27 17:36 mixed_2.0/0.09-0.4_0.9-split_X-test_mixed-2.0.npy
-rw-r--r-- 1 aklie carter-users 2832 Nov 26 13:51 mixed_2.0/0.18-0.4_0.9-split_X-test_mixed-2.0.npy
-rw-r--r-- 1 aklie carter-users   92 Nov 26 23:02 mixed_2.0/id-valid.txt
-rw-r--r-- 1 aklie carter-users   92 Nov 27 17:36 mixed_2.0/valid_id.txt


### *Mixed 3.0*
 - Replace binding sites using dictionary
 - 3 bit vector for each binding site [ets_affinity gata_affinity orientation] - ties together the identity to affinity while removing redundant info from mixed-2.0
 - Get lengths of linkers around binding sites

In [146]:
# Load in training stats
with open("../2021_OLS_Library/mixed_3.0/{}_X-train-{}_stats.pickle".format(PREPROCESS, SPLIT), 'rb') as handle:
    train_stats = pickle.load(handle)
scale_indeces = train_stats["indeces"]

# Full sequences
X_mixed3s[:, scale_indeces] -= train_stats["means"]
X_mixed3s[:, scale_indeces] /= train_stats["stds"]
# Save the vals
if not os.path.isdir("mixed_3.0"):
    os.makedirs("mixed_3.0")
    
np.save("mixed_3.0/{}_{}-split_X-test_mixed-3.0".format(PREPROCESS, SPLIT), X_mixed3s)
np.savetxt("mixed_3.0/valid_id.txt", X=ID[valid_idxs], fmt="%s")

!ls -l mixed_3.0/*

-rw-r--r-- 1 aklie carter-users 2312 Nov 27 17:36 mixed_3.0/0.09-0.4_0.9-split_X-test_mixed-3.0.npy
-rw-r--r-- 1 aklie carter-users 2312 Nov 26 13:52 mixed_3.0/0.18-0.4_0.9-split_X-test_mixed-3.0.npy
-rw-r--r-- 1 aklie carter-users   92 Nov 26 23:02 mixed_3.0/id-valid.txt
-rw-r--r-- 1 aklie carter-users   92 Nov 27 17:36 mixed_3.0/valid_id.txt


## **<u>Sequence feature idea 3 </u>**: Use the actual sequence (one-hot encoded)
 - One hot encoded sequence: each position is encoded as a 1-D vector of size 4 e.g., AT is [[1,0,0,0], [0,0,0,1]]
 - Generally, we will get inputs of size (len(seq) X 4). The above example would be of size 2x4
 - Can also save the string seqs in case those are also useful down the line

**Q**: Are all sequences the same length?

In [135]:
# Check the lengths of sequences to see whether they are all the same:
# counts how many sequences there are of each length
dataset["SEQ"].apply(len).value_counts()

53    4
58    3
65    2
63    2
54    1
61    1
Name: SEQ, dtype: int64

**Answer**: Nope

### *Forward Encoding*

**Full sequences**

In [136]:
# Normalize the raw sequences (upper case, no surrounding whitespace) and one-hot encode them
X_seqs = [raw_seq.upper().strip() for raw_seq in dataset["SEQ"].values]
X_ohe_seq = project_utils.ohe_seqs(X_seqs)

# Store the encoding in NumPy's binary format, creating the output directory if needed
os.makedirs("ohe_seq", exist_ok=True)
np.save("ohe_seq/X_ohe-seq", X_ohe_seq)

# Quick check: first five bases of the first sequence next to their encoding
X_seqs[0][:5], X_ohe_seq[0][:5]

100%|██████████| 13/13 [00:00<00:00, 548.72it/s]

Encoded 13 seqs
Checking all 13 seqs for proper encoding
Sequence encoding was great success



  if l < 1000:


('CCTAT',
 array([[0., 1., 0., 0.],
        [0., 1., 0., 0.],
        [0., 0., 0., 1.],
        [1., 0., 0., 0.],
        [0., 0., 0., 1.]]))

### *Reverse encoding*

**Full sequences**

In [137]:
# Reverse-complement every sequence: reverse the string, then swap A<->T and C<->G
# (any other character is left unchanged)
complement = str.maketrans("ACGT", "TGCA")
X_rev_seqs = np.array([seq[::-1].translate(complement) for seq in X_seqs])
X_ohe_rev_seq = project_utils.ohe_seqs(X_rev_seqs)

# Store the encoding in NumPy's binary format, creating the output directory if needed
os.makedirs("ohe_seq", exist_ok=True)
np.save("ohe_seq/X_ohe-seq-rev", X_ohe_rev_seq)

# Quick check: first five bases of the first reversed sequence next to their encoding
X_rev_seqs[0][:5], X_ohe_rev_seq[0][:5]

100%|██████████| 13/13 [00:00<00:00, 641.72it/s]

Encoded 13 seqs
Checking all 13 seqs for proper encoding
Sequence encoding was great success



  if l < 1000:


('GTTAT',
 array([[0., 0., 1., 0.],
        [0., 0., 0., 1.],
        [0., 0., 0., 1.],
        [1., 0., 0., 0.],
        [0., 0., 0., 1.]]))

## **<u>Sequence feature idea 4 </u>**: Use the actual sequence (as fasta)

In [138]:
# Save full sequences in fasta format
if not os.path.isdir("fasta"):
    os.makedirs("fasta")
    
file = open("fasta/X_fasta.fa", "w")
for i, enh in dataset.iterrows():
    file.write(">" + ID[i] + "\n" + X_seqs[i].upper().strip() + "\n")
file.close()

!wc -l fasta/*

26 fasta/X_fasta.fa


# Final Checks

In [139]:
# List the working directory two levels deep to review the files written above
!tree -L 2

[38;5;33m.[0m
├── 2021_OLS_Exact_Syntax_Match.ipynb
├── 2021_OLS_Exact_Syntax_Match.tsv
├── [38;5;33mbinary[0m
│   └── y_binary.txt
├── data4adam.xlsx
├── [38;5;33mfasta[0m
│   └── X_fasta.fa
├── [38;5;33mid[0m
│   └── id.txt
├── [38;5;33mmixed_1.0[0m
│   ├── 0.09-0.4_0.9-split_X-test_mixed-1.0.npy
│   ├── 0.18-0.4_0.9-split_X-test_mixed-1.0.npy
│   └── valid_id.txt
├── [38;5;33mmixed_2.0[0m
│   ├── 0.09-0.4_0.9-split_X-test_mixed-2.0.npy
│   ├── 0.18-0.4_0.9-split_X-test_mixed-2.0.npy
│   └── valid_id.txt
├── [38;5;33mmixed_3.0[0m
│   ├── 0.09-0.4_0.9-split_X-test_mixed-3.0.npy
│   ├── 0.18-0.4_0.9-split_X-test_mixed-3.0.npy
│   └── valid_id.txt
├── [38;5;33mohe_seq[0m
│   ├── X_ohe-seq.npy
│   └── X_ohe-seq-rev.npy
└── [38;5;33mseqs[0m
    └── seqs.txt

8 directories, 18 files


In [140]:
%%bash
# Peek at the first lines of the FASTA file and the first five IDs
head fasta/X_fasta.fa
head -n 5 id/id.txt

>EM.Ci1
CCTATCTTAGATATTGAACAATTTCCTTTTCGAAAACTTCCGTTAGAGATAAC
>EM.Dr1
TTTTCCAGTTGCGATATTTTTGGTATTTATCTTCAGGAAATCACTAAAGGCATGTTATCTT
>EM.Ggd1
CAGATATGGTACCTTGCGATATTATAGTCAGGAAGCATGGAAAGAAAGATAGC
>EM.Mm6
AATATCTGTTTATCACCAAAGGAAGTCAAGACAGGAACTATGCAGAATATTTGATATG
>EM.Hs9
CATATCTACAAAGGAAGTAACCATACGATAGTATAAAATGATAGAACAAAAGTGTCTTTGGAATT
EM.Ci1
EM.Dr1
EM.Ggd1
EM.Mm6
EM.Hs9


# Scratch

In [71]:
def findEtsAndGataCores(seq, cores={"ETS_FORWARD": ["GGAA","GGAT"], "ETS_REVERSE": ["TTCC", "ATCC"], "GATA_FORWARD": ["GATA"], "GATA_REVERSE": ["TATC"]}):
    """Locate ETS and GATA core 4-mers in a sequence.

    Parameters
    ----------
    seq : str
        Sequence to scan.
    cores : dict
        Maps "ETS_FORWARD", "ETS_REVERSE", "GATA_FORWARD" and "GATA_REVERSE"
        to the collection of 4-mers that count as a core of that kind.

    Returns
    -------
    dict
        Maps the start index of every matching 4-base window to
        ``[factor, orientation]`` with factor "ETS" or "GATA" and
        orientation "F" or "R".
    """
    # Groups are tried in this order at every position; the first one that matches wins
    lookup_order = (
        ("ETS_FORWARD", "ETS", "F"),
        ("ETS_REVERSE", "ETS", "R"),
        ("GATA_FORWARD", "GATA", "F"),
        ("GATA_REVERSE", "GATA", "R"),
    )
    core_pos = {}
    for start in range(len(seq)):
        window = seq[start:start+4]
        for group, factor, orientation in lookup_order:
            if window in cores[group]:
                core_pos[start] = [factor, orientation]
                break
    return core_pos

def findTFBSAffinity(seq, cores, ets_aff_file, gata_aff_file):
    """Append the 8-base site and its affinity to every core entry.

    Parameters
    ----------
    seq : str
        Sequence the core positions refer to.
    cores : dict
        Maps core start index -> list whose first element is "ETS" or "GATA".
        The lists are extended in place.
    ets_aff_file, gata_aff_file : str
        Paths handed to ``otx_enhancer_utils.loadEtsAff`` and
        ``otx_enhancer_utils.loadGata6Aff`` (presumably 8-mer -> affinity
        tables; confirm against those helpers).

    Returns
    -------
    dict
        The same ``cores`` object, each list extended with the window
        ``seq[pos-2:pos+6]`` and, for ETS/GATA entries, the value looked up
        for that window in the matching table.
    """
    ets_table = otx_enhancer_utils.loadEtsAff(ets_aff_file)
    gata_table = otx_enhancer_utils.loadGata6Aff(gata_aff_file)

    for pos, site in cores.items():
        # NOTE(review): for pos < 2 the slice start is negative and is counted from the
        # end of seq, so the window is not the 2 bases before the core - confirm intent
        window = seq[pos-2:pos+6]
        site.append(window)
        if site[0] == "ETS":
            site.append(ets_table[window])
        elif site[0] == "GATA":
            site.append(gata_table[window])
    return cores

def findSpacingBetweenTFBS(cores):
    """Append to every site the number of bases separating it from the previous site.

    Sites are visited in increasing order of their core start index. Each site
    is treated as spanning ``pos-2`` to ``pos+5`` (inclusive). For the first
    site the value is the distance from the start of the sequence; for later
    sites it is the gap to the previous site's last base, which is negative
    when the two sites overlap.

    Parameters
    ----------
    cores : dict
        Maps core start index -> list of site attributes. The lists are
        extended in place.

    Returns
    -------
    dict
        The same ``cores`` object with one spacing value appended per site.
    """
    # Sort the keys of the argument itself (the original read the global `test_cores`)
    previous_end = 0
    for i, pos in enumerate(sorted(cores)):
        site_start = pos - 2
        if i == 0:
            cores[pos].append(site_start - previous_end)
        else:
            cores[pos].append(site_start - previous_end - 1)
        previous_end = pos + 5
    return cores

def defineTFBS(seq):
    """Run the full site-annotation pipeline on one sequence.

    Finds the ETS/GATA cores, attaches the 8-base site and its affinity using
    the tables under ``../auxiliary``, then attaches the spacing to the
    previous site. Returns the resulting dict keyed by core start index.
    """
    ets_aff_file = "../auxiliary/parsed_Ets1_8mers.txt"
    gata_aff_file = "../auxiliary/parsed_Gata6_3769_contig8mers.txt"

    core_sites = findEtsAndGataCores(seq)
    sites_with_affinity = findTFBSAffinity(seq, core_sites, ets_aff_file, gata_aff_file)
    return findSpacingBetweenTFBS(sites_with_affinity)

def hamming_distance(string1, string2):
    """Count the positions at which two equal-length strings differ.

    Returns ``np.inf`` when the lengths differ, otherwise the number of
    mismatching positions as an int.
    """
    if len(string1) != len(string2):
        return np.inf
    return sum(1 for left, right in zip(string1, string2) if left != right)


def findClosestOLSMatch(tfbs_dict, match_dict):
    """Annotate every site with the named reference site closest in Hamming distance.

    Parameters
    ----------
    tfbs_dict : dict
        Maps position -> list of site attributes; element 2 of each list is
        the site sequence that is compared. The lists are extended in place.
    match_dict : dict
        Maps reference site name -> reference sequence.

    Returns
    -------
    dict
        The same ``tfbs_dict`` with the closest name, its reference sequence
        and the distance appended to every list. Ties go to the name that
        comes first in ``match_dict``.
    """
    for pos, site in tfbs_dict.items():
        site_seq = site[2]
        best_name = None
        best_dist = np.inf
        for name, ref_seq in match_dict.items():
            # "G2F" is compared without the site's first base and "G2R" without its
            # last base; every other reference is compared against the full site
            if name == "G2F":
                query = site_seq[1:]
            elif name == "G2R":
                query = site_seq[:-1]
            else:
                query = site_seq
            candidate = hamming_distance(query, ref_seq)
            if candidate < best_dist:
                best_name, best_dist = name, candidate
        tfbs_dict[pos].append(best_name)
        tfbs_dict[pos].append(match_dict[best_name])
        tfbs_dict[pos].append(best_dist)
    return tfbs_dict

In [125]:
# Run the scratch pipeline on the first sequence and annotate every site with its closest OLS site.
# NOTE: OLS_SITE_SEQ is created in the "Load dictionaries" cell further down; run that cell first.
test_seq = dataset["SEQ"].iloc[0]  # the column is named "SEQ" (upper case) when the sheet is loaded
test_tfbs = defineTFBS(test_seq)
findClosestOLSMatch(test_tfbs, OLS_SITE_SEQ)

{2: ['GATA', 'R', 'CCTATCTT', 0.44555488676087596, 0, 'G3R', 'CCTATCTT', 0],
 9: ['GATA', 'F', 'TAGATATT', 0.32103457131865104, -1, 'G2F', 'AGATATT', 0],
 22: ['ETS', 'R', 'ATTTCCTT', 0.39163576347437207, 5, 'E2R', 'ATTTCCTT', 0],
 37: ['ETS', 'R', 'ACTTCCGT', 0.5819540373459362, 7, 'E1R', 'ACTTCCGT', 0],
 47: ['GATA', 'F', 'GAGATAAC', 0.8467717279226579, 2, 'G1F', 'GAGATAAC', 0]}

In [8]:
# NOTE(review): this scratch cell appears to be stale. findEtsAndGataCores as defined in this
# notebook returns a single dict, while the code below unpacks a (positions, orientations)
# pair - TODO update or delete this cell.
test_seq = dataset["seq"].iloc[0]  # NOTE(review): the column is named "SEQ" where the sheet is loaded - confirm
test_cores, test_orients = findEtsAndGataCores(test_seq)
print(test_cores, test_orients)
# Print every core position together with the 4 bases found there
for key in test_cores.keys():
    for value in test_cores[key]:
        print(key + ":" + str(value), test_seq[value:value+4], end="\t")
print()
test_bs, test_bs_affs = v  # NOTE(review): `v` is not defined anywhere in the visible notebook - TODO confirm the intended call
print(test_bs, test_bs_affs)

{'ETS': [22, 37], 'GATA': [2, 9, 47]} {'ETS': ['R', 'R'], 'GATA': ['R', 'F', 'F']}
ETS:22 TTCC	ETS:37 TTCC	GATA:2 TATC	GATA:9 GATA	GATA:47 GATA	
{'ETS': ['ATTTCCTT', 'ACTTCCGT'], 'GATA': ['CCTATCTT', 'TAGATATT', 'GAGATAAC']} {'ETS': [0.39163576347437207, 0.5819540373459362], 'GATA': [0.10379684367660279, 0.107313254961063, 0.10241362293259179]}


In [122]:
# Load dictionaries
ETS_AFF = otx_enhancer_utils.loadEtsAff("../auxiliary/parsed_Ets1_8mers.txt")
GATA_AFF = otx_enhancer_utils.loadGata6Aff("../auxiliary/parsed_Gata6_3769_contig8mers.txt")
OLS_SITE_AFF = otx_enhancer_utils.loadBindingSiteName2affinities("../auxiliary/bindingSiteName2affinities.pkl")
OLS_SITE_SEQ = otx_enhancer_utils.loadBindingSiteName2affinities("../auxiliary/siteName2bindingSiteSequence.pkl")

In [133]:
# Annotate the first sequence with the packaged implementation; the column is named "SEQ" (upper case)
otx_enhancer_utils.defineTFBS(dataset["SEQ"].iloc[0])

{2: ['GATA', 'R', 'CCTATCTT', 0.44555488676087596, 0, 'G3R', 'CCTATCTT', 0],
 9: ['GATA', 'F', 'TAGATATT', 0.32103457131865104, -1, 'G2F', 'AGATATT', 0],
 22: ['ETS', 'R', 'ATTTCCTT', 0.39163576347437207, 5, 'E2R', 'ATTTCCTT', 0],
 37: ['ETS', 'R', 'ACTTCCGT', 0.5819540373459362, 7, 'E1R', 'ACTTCCGT', 0],
 47: ['GATA', 'F', 'GAGATAAC', 0.8467717279226579, 2, 'G1F', 'GAGATAAC', 0]}

In [115]:
# Scratch version of the mixed-1.0 encoding: for every site emit
# [spacing, site type, orientation, affinity], then the trailing linker length
mixed1_encoding = []
for i, (row_num, enh_data) in tqdm.tqdm(enumerate(dataset.iterrows())):
    seq = enh_data["SEQ"]
    enh_tfbs = otx_enhancer_utils.defineTFBS(seq)
    enh_encoding = []
    # Per the defineTFBS output shown above: tfbs[1] is the orientation, tfbs[3] the affinity, tfbs[4] the spacing
    for pos, tfbs in enh_tfbs.items():
        if tfbs[0] == "ETS":
            enh_encoding += [tfbs[4], "E", tfbs[1], tfbs[3]]
        elif tfbs[0] == "GATA":
            enh_encoding += [tfbs[4], "G", tfbs[1], tfbs[3]]
    # Bases left after the last site; relies on `pos` still holding the last key of the loop above
    # NOTE(review): if enh_tfbs is empty, `pos` is undefined (first row) or left over from the previous row
    enh_encoding.append(len(seq)-(pos+5)-1)
    mixed1_encoding.append(enh_encoding)

# Replace the string codes with binary indicators (G/R -> 0, E/F -> 1)
X_mixed1 = (
    pd.DataFrame(mixed1_encoding)
    .replace({"G": 0, "E": 1, "R": 0, "F": 1})
)
X_mixed1 = X_mixed1.values

# Load in training stats
with open("../2021_OLS_Library/mixed_1.0/{}_X-train-{}_stats.pickle".format(PREPROCESS, SPLIT), 'rb') as handle:
    train_stats = pickle.load(handle)

# Z-score test set using the training means/stds on the columns listed in the stats
scale_indeces = train_stats["indeces"]
X_mixed1[:, scale_indeces] -= train_stats["means"]
X_mixed1[:, scale_indeces] /= train_stats["stds"]

# Save the vals
if not os.path.isdir("mixed_1.0"):
    os.makedirs("mixed_1.0")
    
np.save("mixed_1.0/{}_{}-split_X-test_mixed-1.0".format(PREPROCESS, SPLIT), X_mixed1)

!ls -l mixed_1.0

# Quick check: shape, the raw encoding of the first sequence and its scaled numeric form
X_mixed1.shape, mixed1_encoding[0], X_mixed1[0]

13it [00:02,  5.05it/s]


total 3
-rw-r--r-- 1 aklie carter-users 2312 Nov 26 11:36 0.18-0.4_0.9-split_X-test_mixed-1.0.npy


((13, 21),
 [0,
  'G',
  'R',
  0.44555488676087596,
  -1,
  'G',
  'F',
  0.32103457131865104,
  5,
  'E',
  'R',
  0.39163576347437207,
  7,
  'E',
  'R',
  0.5819540373459362,
  2,
  'G',
  'F',
  0.8467717279226579,
  0],
 array([-1.22502958,  0.        ,  0.        , -0.4896724 , -1.46094887,
         0.        ,  1.        , -1.06454571, -0.01487938,  1.        ,
         0.        , -0.71835913,  0.46118317,  1.        ,  0.        ,
         0.21853976, -0.73632483,  0.        ,  1.        ,  1.51181242,
        -2.96917909]))

In [116]:
mixed2_encoding = []
for i, (row_num, enh_data) in tqdm.tqdm(enumerate(dataset.iterrows())):
    seq = enh_data["SEQ"]
    enh_tfbs = otx_enhancer_utils.defineTFBS(seq)
    enh_encoding = []
    for pos, tfbs in enh_tfbs.items():
        if tfbs[0] == "ETS":
            enh_encoding += [tfbs[4], tfbs[3], tfbs[1], 0, 0]
        elif tfbs[0] == "GATA":
            enh_encoding += [tfbs[4], 0, 0, tfbs[3], tfbs[1]]
    enh_encoding.append(len(seq)-(pos+5)-1)
    mixed2_encoding.append(enh_encoding)

# Replace strings with one hot encoding
X_mixed2 = (
    pd.DataFrame(mixed2_encoding)
    .replace({"G": 0, "E": 1, "R": 0, "F": 1})
)
X_mixed2 = X_mixed2.values

# Load in training stats
with open("../2021_OLS_Library/mixed_2.0/{}_X-train-{}_stats.pickle".format(PREPROCESS, SPLIT), 'rb') as handle:
    train_stats = pickle.load(handle)

# Z-score test set
scale_indeces = train_stats["indeces"]
X_mixed2[:, scale_indeces] -= train_stats["means"]
X_mixed2[:, scale_indeces] /= train_stats["stds"]

# Save the vals
if not os.path.isdir("mixed_2.0"):
    os.makedirs("mixed_2.0")
    
np.save("mixed_2.0/{}_{}-split_X-test_mixed-2.0".format(PREPROCESS, SPLIT), X_mixed1)

!ls -l mixed_2.0

X_mixed2.shape, mixed2_encoding[0], X_mixed2[0]

13it [00:02,  5.17it/s]


total 3
-rw-r--r-- 1 aklie carter-users 2312 Nov 26 11:36 0.18-0.4_0.9-split_X-test_mixed-2.0.npy


((13, 26),
 [0,
  0,
  0,
  0.44555488676087596,
  'R',
  -1,
  0,
  0,
  0.32103457131865104,
  'F',
  5,
  0.39163576347437207,
  'R',
  0,
  0,
  7,
  0.5819540373459362,
  'R',
  0,
  0,
  2,
  0,
  0,
  0.8467717279226579,
  'F',
  0],
 array([-1.22502958,  0.        ,  0.        ,  0.44555489,  0.        ,
        -1.46094887,  0.        ,  0.        ,  0.32103457,  1.        ,
        -0.01487938,  0.39163576,  0.        ,  0.        ,  0.        ,
         0.46118317,  0.58195404,  0.        ,  0.        ,  0.        ,
        -0.73632483,  0.        ,  0.        ,  0.84677173,  1.        ,
        -2.96917909]))

In [117]:
mixed3_encoding = []
for i, (row_num, enh_data) in tqdm.tqdm(enumerate(dataset.iterrows())):
    seq = enh_data["SEQ"]
    enh_tfbs = otx_enhancer_utils.defineTFBS(seq)
    enh_encoding = []
    for pos, tfbs in enh_tfbs.items():
        if tfbs[0] == "ETS":
            enh_encoding += [tfbs[4], tfbs[3], 0, tfbs[1]]
        elif tfbs[0] == "GATA":
            enh_encoding += [tfbs[4], 0, tfbs[3], tfbs[1]]
    enh_encoding.append(len(seq)-(pos+5)-1)
    mixed3_encoding.append(enh_encoding)

# Replace strings with one hot encoding
X_mixed3 = pd.DataFrame(mixed3_encoding).replace({"R": -1, "F": 1})
X_mixed3 = X_mixed3.values

# Load in training stats
with open("../2021_OLS_Library/mixed_3.0/{}_X-train-{}_stats.pickle".format(PREPROCESS, SPLIT), 'rb') as handle:
    train_stats = pickle.load(handle)

# Z-score test set
scale_indeces = train_stats["indeces"]
X_mixed3[:, scale_indeces] -= train_stats["means"]
X_mixed3[:, scale_indeces] /= train_stats["stds"]

# Save the vals
if not os.path.isdir("mixed_3.0"):
    os.makedirs("mixed_3.0")
    
np.save("mixed_3.0/{}_{}-split_X-test_mixed-3.0".format(PREPROCESS, SPLIT), X_mixed1)

!ls -l mixed_3.0

X_mixed3.shape, mixed3_encoding[0], X_mixed3[0]

13it [00:02,  5.24it/s]


total 3
-rw-r--r-- 1 aklie carter-users 2312 Nov 26 11:36 0.18-0.4_0.9-split_X-test_mixed-3.0.npy


((13, 21),
 [0,
  0,
  0.44555488676087596,
  'R',
  -1,
  0,
  0.32103457131865104,
  'F',
  5,
  0.39163576347437207,
  0,
  'R',
  7,
  0.5819540373459362,
  0,
  'R',
  2,
  0,
  0.8467717279226579,
  'F',
  0],
 array([-1.22502958,  0.        ,  0.44555489, -1.        , -1.46094887,
         0.        ,  0.32103457,  1.        , -0.01487938,  0.39163576,
         0.        , -1.        ,  0.46118317,  0.58195404,  0.        ,
        -1.        , -0.73632483,  0.        ,  0.84677173,  1.        ,
        -2.96917909]))

# References