In [1]:
import eugene as eu
import pandas as pd
import numpy as np
import pickle
from tqdm.auto import tqdm
import pyranges as pr

Global seed set to 13


In [2]:

SiteName2bindingSiteSequence_file =  "../../../_data/siteName2bindingSiteSequence.pkl"
ets_aff_file = "../../../_data/parsed_Ets1_8mers.txt"
gata_aff_file = "../../../_data/parsed_Gata6_3769_contig8mers.txt"
def loadSiteName2bindingSiteSequence(file=SiteName2bindingSiteSequence_file, pickle_obj=True):
    if pickle_obj:
        with open(file, 'rb') as handle:
            b = pickle.load(handle)
        return b
    else:
        print("Only pickles at this time")

# Load Ets1 affinities into a dictionary with keys being all possible 8-mers and values being binding affinities (consensus=1)
def loadEtsAff(file):
    ref = file
    Seq2EtsAff  = {line.split('\t')[0]:float(line.split('\t')[1]) for line in open(ref,'r').readlines()}
    return Seq2EtsAff


# Load Gata6 Badis 2009 affinities into a dictionary with keys being all possible 8-mers and values being binding affinities (consensus=1)
def loadGata6Aff(file):
    ref = file
    Seq2GataAff = {line.split('\t')[0]:float(line.split('\t')[1]) for line in open(ref,'r').readlines()}
    return Seq2GataAff

def merge_dict(dict1, dict2):
    keys = set(dict1.keys()).union(dict2.keys())
    output = {k:max(dict1.get(k,float('-inf')), dict2.get(k, float('-inf'))) for k in keys}
    return output

In [3]:
# Motifs and names
site_dict = loadSiteName2bindingSiteSequence()
for k in list(site_dict.keys()):
    if k.startswith('S'):
        del site_dict[k]

# Affinities
ets_aff = loadEtsAff(ets_aff_file)
gata_aff = loadGata6Aff(gata_aff_file)
aff_dict = merge_dict(ets_aff, gata_aff)

In [4]:
def flatten(l):
    return [item for sublist in l for item in sublist]

def flip(map):
    return {v: k for k, v in map.items()}

def find_motifs_seq(seq, motifs, motif_names=None, starting_pos=0, rev_comp=True):
    """Function to find motifs and annotate the position and orientation of motifs in sequences
    
    Users should be able to specify an exact motif or pass in motif to search for.
    Can make use of the JASPAR database to find motifs.
    
    
    """
    if isinstance(motifs, dict):
        motif_names = list(motifs.keys())
        motifs = list(motifs.values())
    if rev_comp:
        rev_motifs = list(eu.pp.reverse_complement_seqs(motifs, verbose=False))
        all_motifs = motifs + rev_motifs
        orientations = ["F"] * len(motifs) + ["R"] * len(rev_motifs)
        if motif_names is not None:
            all_motif_names = motif_names + motif_names
        else:
            all_motif_names = [f"motif{i}" for i in range(len(motifs))] + [f"motif{i}" for i in range(len(motifs))]
    else:
        all_motifs = motifs
        orientations = ["F"] * len(motifs)
        if motif_names is not None:
            all_motif_names = motif_names
        else:
            all_motif_names = [f"motif{i}" for i in range(len(motifs))]
    longest_motif = max(motifs, key=len)
    shortest_motif = min(motifs, key=len)
    motif_name_dict = dict(zip(all_motifs, all_motif_names))
    motif_orient_dict = dict(zip(all_motifs, orientations))
    motif_hits_dict = {}
    for i in range(starting_pos, len(seq)-len(shortest_motif)+1):
        for j in range(len(longest_motif), len(shortest_motif)-1, -1):
            motif = seq[i:i+j]
            if motif in all_motifs:
                #print(motif_name_dict[motif], motif_orient_dict[motif], i)
                motif_hits_dict.setdefault(i, []).append(motif_name_dict[motif])
                motif_hits_dict.setdefault(i, []).append(motif_orient_dict[motif])
    return motif_hits_dict

def find_affinities_seq(motif_hits, seq, affinity_dict, left_ext=0, right_ext=0):
    """Function to find affinities of sequence motif hits. Must have run find_motifs first.

    Will probably need to have users supply motif-affinity mapping or rely on some database. I can ask about this
    """
    if isinstance(left_ext, int):
        left_ext = [left_ext] * len(motif_hits)
    if isinstance(right_ext, int):
        right_ext = [right_ext] * len(motif_hits)
    for i, pos in enumerate(motif_hits.keys()):
        start = pos - left_ext[i]
        end = pos + right_ext[i]
        if start < 0:
            offset = abs(start)
            start = 0
            end += offset
        if end > len(seq):
            offset = end - len(seq)
            end = len(seq)
            start -= offset
        seqlet = seq[start:end]
        motif_hits[pos].append(seqlet)
        motif_hits[pos].append(affinity_dict[seqlet])
    return motif_hits 

def find_spacings_seq(motif_hits, left_ext=0, right_ext=0):
    """Function to find spacings between motifs. Must have run find_motifs first.
    
    Pretty straightforward. Just need to find the distance between motifs. But this will be a bit tricky
    for overlapping motifs and we need to provide multiple ways to handle this.
    """
    if isinstance(left_ext, int):
        left_ext = [left_ext] * len(motif_hits)
    if isinstance(right_ext, int):
        right_ext = [right_ext] * len(motif_hits)
    sorted_hits = sorted(motif_hits.keys())
    previous_pos = 0
    for i, pos in enumerate(sorted_hits):
        if i == 0:
            spacing = (pos-left_ext[i])-(previous_pos) 
        else:
            spacing = (pos-left_ext[i])-(previous_pos)-1 
        if spacing < 0:
            spacing = 0
        motif_hits[pos].append(spacing)
        previous_pos = pos+right_ext[i]-1
    return motif_hits

def define_seq(seq, motifs, motif_names=None, affinity_dict=None, left_ext=0, right_ext=0, starting_pos=0, rev_comp=True):
    """Functon to define sequences based on motif hits and spacings.

    This wiil most likely return a dictionary in uns or something. This will stop short of full on encodings but is 
    kind of the precursor step to those encodings.
    
    """
    motif_hits = find_motifs_seq(seq, motifs, motif_names, starting_pos, rev_comp)
    if affinity_dict is not None:
        motif_hits = find_affinities_seq(motif_hits, seq, affinity_dict, left_ext, right_ext)
    motif_hits = find_spacings_seq(motif_hits, left_ext, right_ext)
    return motif_hits

def annotate_with_motifs_sdata(sdata, motifs, motif_names=None, affinity_dict=None, left_ext=0, right_ext=0, starting_pos=0, rev_comp=True, copy=False):
    """Convert a list of names and sequences to a dictionary of pyRanges objects"""
    sdata = sdata.copy() if copy else sdata
    d = {"Chromosome": [], "Start": [], "End": [], "Strand": [], "Name": [], "Affinity": [], "Spacing": []}
    seqs = sdata.seqs
    names = sdata.names
    for i, seq in tqdm(enumerate(seqs)):
        feature_def = define_seq(seq, motifs, motif_names, affinity_dict, left_ext, right_ext, starting_pos, rev_comp)
        name = names[i]
        for key in feature_def.keys():
            if key - left_ext < 0:
                start = 0
                offset = abs(key - left_ext)
            else:
                start = key - left_ext
            if (int(key-left_ext) + len(feature_def[key][2])-1) > len(seq):
                end = len(seq)
            else:
                end = int(key-left_ext) + len(feature_def[key][2])-1
            d["Chromosome"].append(name)
            d["Start"].append(int(start))
            d["End"].append(end)
            d["Strand"].append("+" if feature_def[key][1] == "F" else "-")
            d["Name"].append(feature_def[key][0])
            d["Affinity"].append(feature_def[key][3])
            d["Spacing"].append(feature_def[key][4])
    sdata.pos_annot = pr.from_dict(d)
    return sdata if copy else None

In [5]:
sdataframe = eu.dl.read_csv("../../../_data/ols_mini.tsv", seq_col="SEQ", sep="\t", return_dataframe=True)
sdata = eu.dl.SeqData(seqs=sdataframe["SEQ"], seqs_annot=sdataframe[sdataframe.columns.drop("SEQ")])
sdata.names = sdataframe["NAME"]
seq = sdata.seqs[12]
name = sdata.names[12]
motifs = ["GGAA", "GGAT", "GATA"]
motif_names = ["ETS", "ETS", "GATA"]

In [6]:
# Test single seq functions
#motif_hits = find_motifs_seq(seq, motifs, motif_names=motif_names, starting_pos=2)
#motif_hits = find_affinities(motif_hits, seq, aff_dict, left_ext=2, right_ext=6)
#motif_hits = find_spacings(motif_hits, left_ext=2, right_ext=5)
motif_hits = define_seq(seq, motifs, motif_names=motif_names, affinity_dict=aff_dict, left_ext=2, right_ext=6, starting_pos=0)
motif_hits

{1: ['GATA', 'F', 'AGATATTC', 0.22349790163100094, 0],
 21: ['GATA', 'F', 'AAGATAGG', 0.44555488676087596, 12],
 36: ['GATA', 'R', 'GTTATCTC', 0.8467717279226579, 7],
 49: ['ETS', 'F', 'ACGGAAGT', 0.5819540373459362, 5],
 59: ['ETS', 'F', 'AAGGAAAT', 0.39163576347437207, 2]}

In [7]:
annotate_with_motifs_sdata(sdata, motifs=motifs, motif_names=motif_names, affinity_dict=aff_dict, left_ext=2, right_ext=6, starting_pos=0)

0it [00:00, ?it/s]

In [8]:
sdata.pos_annot

Unnamed: 0,Chromosome,Start,End,Strand,Name,Affinity,Spacing
0,S1-E1F-S2-E2F-S3-G3F-S4-G2R-S5-G1R-S6,12,19,+,ETS,0.581954,12
1,S1-E1F-S2-E2F-S3-G3F-S4-G2R-S5-G1R-S6,22,29,+,ETS,0.391636,2
2,S1-E1F-S2-E2F-S3-G3F-S4-G2R-S5-G1R-S6,37,44,+,GATA,0.445555,7
3,S1-E1F-S2-E2F-S3-G3F-S4-G2R-S5-G1R-S6,50,57,-,GATA,0.268038,5
4,S1-E1F-S2-E2F-S3-G3F-S4-G2R-S5-G1R-S6,57,64,-,GATA,0.846772,0
...,...,...,...,...,...,...,...
5060,S5-G3R-S4-G2R-S2-G1F-S3-E1F-S1-E2R-S6,22,29,+,GATA,0.846772,1
5061,S5-G3R-S4-G2R-S2-G1F-S3-E1F-S1-E2R-S6,37,44,+,ETS,0.581954,7
5062,S5-G3R-S4-G2R-S2-G1F-S3-E1F-S1-E2R-S6,0,7,-,GATA,0.445555,0
5063,S5-G3R-S4-G2R-S2-G1F-S3-E1F-S1-E2R-S6,13,20,-,GATA,0.270857,5


In [10]:
def slice_sdata_in_place(sdata, mask):
    """Slice a SeqData object in place"""
    if sdata.names is not None:
        sdata.names = sdata.names[~mask]
    if sdata.seqs is not None:
        sdata.seqs = sdata.seqs[~mask]
        #sdata.n_obs = len(sdata.seqs)
    if sdata.seqs_annot is not None:
        sdata.seqs_annot = sdata.seqs_annot[~mask]
    if sdata.ohe_seqs is not None:
        sdata.ohe_seqs = sdata.ohe_seqs[~mask]
        #sdata.n_obs = len(sdata.ohe_seqs)
    if sdata.ohe_rev_seqs is not None:
        sdata.ohe_rev_seqs = sdata.ohe_rev_seqs[~mask]
    return None

def convert_pos_annot_to_mtx(sdata, seqsm_key="raw_encoding", copy=False):
    """Convert a pyRanges object to a mtx"""
    df = sdata.pos_annot.df
    df = df.set_index("Chromosome").sort_values(by= "Start")
    df = df.loc[sdata.names]
    encodings = df.groupby("Chromosome").apply(lambda x: np.concatenate(x[["Spacing", "Name", "Strand", "Affinity"]].values)).values
    sdata.seqsm[seqsm_key] = encodings

def fix_jagged_array_sdata(sdata, seqsm_key="raw_encoding", strategy="remove", copy=False):
    """Fix jagged array by padding with 0s"""
    arr = sdata.seqsm[seqsm_key]
    if strategy == "remove":
        mismatched_dims = np.array([len(row) != len(arr[0]) for row in arr])
        arr = arr[~mismatched_dims]
        sdata = sdata[~mismatched_dims]
        sdata.seqsm[f"{seqsm_key}_cleaned"] = np.stack(arr)
        return sdata
    elif strategy == "hack":
        min_len = min([len(x) for x in arr])
        arr = [x[:min_len] for x in arr]
    elif strategy == "pad":
        max_len = max([len(x) for x in arr])
        arr = [np.pad(x, (0, max_len-len(x)), "constant") for x in arr]
    sdata = sdata.copy() if copy else sdata
    sdata.seqsm[f"{seqsm_key}_cleaned"] = np.stack(arr)
    return sdata if copy else None

In [11]:
convert_pos_annot_to_mtx(sdata, seqsm_key="raw_encoding")
sdata = fix_jagged_array_sdata(sdata, seqsm_key="raw_encoding", strategy="remove")

In [14]:
sdata.seqsm["raw_encoding_cleaned"][0]

array([12, 'ETS', '+', 0.5819540373459362, 2, 'ETS', '+',
       0.39163576347437207, 7, 'GATA', '+', 0.44555488676087596, 5,
       'GATA', '-', 0.26803787076916263, 0, 'GATA', '-',
       0.8467717279226579], dtype=object)

In [17]:
sdata

SeqData object with = 935 seqs
seqs = (935,)
names = (935,)
rev_seqs = None
ohe_seqs = None
ohe_rev_seqs = None
seqs_annot: 'NAME', 'MPRA_FXN', 'MICROSCOPE_FXN', 'ACTIVITY_SUMRNA_NUMDNA', 'SEQ_LEN', 'linker_1', 'TFBS_1', 'linker_2', 'TFBS_2', 'linker_3', 'TFBS_3', 'linker_4', 'TFBS_4', 'linker_5', 'TFBS_5', 'linker_6'
pos_annot: PyRanges object with 5065 features
seqsm: 'raw_encoding', 'raw_encoding_cleaned'
uns: None

In [151]:
X = sdata.seqsm["raw_encoding_cleaned"]
y = sdata.seqs_annot["MPRA_FXN"].fillna(0).values

In [152]:
X = pd.DataFrame(X).replace({"ETS": 0, "GATA": 1})
X = X.replace({"+": 0, "-": 1})

In [153]:
from sklearn.linear_model import LogisticRegression

In [154]:
clf = LogisticRegression(random_state=13)

In [155]:
clf.fit(X, y)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(random_state=13)

In [156]:
prob_thresh = 0.5
y_tr_probs = clf.predict_proba(X_train)[:, 1]
y_probs = clf.predict_proba(X_test)[:, 1]
y_tr_preds = (y_tr_probs >= prob_thresh).astype(int)
y_preds = (y_probs >= prob_thresh).astype(int)

NameError: name 'X_train' is not defined

In [42]:
data.values.dtype

dtype('float64')

In [160]:
# Function to return a dictionary with the position of the first nucleotide of every GATA and ETS core fond in an input sequence. Also includes orientation
def findEtsAndGataCores(seq, cores={"ETS_FORWARD": ["GGAA", "GGAT"], "ETS_REVERSE": ["TTCC", "ATCC"], "GATA_FORWARD": ["GATA"], "GATA_REVERSE": ["TATC"]}):
    core_pos = {}
    for i in range(2, len(seq)-5):
        if seq[i:i+4] in cores["ETS_FORWARD"]:
            core_pos.setdefault(i, []).append("ETS")
            core_pos[i].append("F")

        elif seq[i:i+4] in cores["ETS_REVERSE"]:
            core_pos.setdefault(i, []).append("ETS")
            core_pos[i].append("R")

        elif seq[i:i+4] in cores["GATA_FORWARD"]:
            core_pos.setdefault(i, []).append("GATA")
            core_pos[i].append("F")

        elif seq[i:i+4] in cores["GATA_REVERSE"]:
            core_pos.setdefault(i, []).append("GATA")
            core_pos[i].append("R")
    return core_pos

# Function to add the affinity and sequence of the binding site cores identified by findEtsAndGataCores()
def findTFBSAffinity(seq, cores, ets_aff_file="../datasets/parsed_Ets1_8mers.txt", gata_aff_file="../datasets/parsed_Gata6_3769_contig8mers.txt"):
    #ets_aff = loadEtsAff(ets_aff_file)
    #gata_aff = loadGata6Aff(gata_aff_file)
    for pos in cores.keys():
        cores[pos].append(seq[pos-2:pos+6])
        if cores[pos][0] == "ETS":
            cores[pos].append(ets_aff[seq[pos-2:pos+6]])
        elif cores[pos][0] == "GATA":
            cores[pos].append(gata_aff[seq[pos-2:pos+6]])
    return cores

# Function to add the spacing between binding sites given a core dictionary. Specifically adds the distance from the start of each binding site to the last binding site
def findSpacingBetweenTFBS(cores):
    sorted_core_pos = sorted(list(cores.keys()))
    previous_pos = 0
    for i, pos in enumerate(sorted_core_pos):
        if i == 0:
            cores[pos].append((pos-2)-(previous_pos))
        else:
            cores[pos].append((pos-2)-(previous_pos)-1)
        previous_pos = pos+5
    return cores

In [None]:
def find_motifs_seqs(seqs, motifs, motif_names=None, starting_pos=0, rev_comp=True):
    """Function to find motifs and annotate the position and orientation of motifs in sequences
    
    Users should be able to specify an exact motif or pass in motif to search for.
    Can make use of the JASPAR database to find motifs.
    
    
    """
    for seq in seqs:
        find_motifs_seq(seq, motifs, motif_names, starting_pos, rev_comp)

In [402]:
cores = findEtsAndGataCores(seq)
cores = findTFBSAffinity(seq, cores)
findSpacingBetweenTFBS(cores)

{21: ['GATA', 'F', 'AAGATAGG', 0.44555488676087596, 19],
 36: ['GATA', 'R', 'GTTATCTC', 0.8467717279226579, 7],
 49: ['ETS', 'F', 'ACGGAAGT', 0.5819540373459362, 5],
 59: ['ETS', 'F', 'AAGGAAAT', 0.39163576347437207, 2]}

In [None]:
def encode_seq(encoding_type = ""):
    """Function to encode a single sequence based on a specified encoding type.

    Ideas for encoding types:
    - kmer frequencies
    - motif "mixed" encodings
    - based on columns of seqs
    """
    pass

def encode_seqs():
    pass

def encode_seqs_sdata():
    pass

---