Some helper functions for breaking a gene down into kmers

## Helpers

In [3]:
import pandas as pd
import requests
import random

In [4]:
def get_gene(gene_id):
    r = requests.get("https://rest.ensembl.org/sequence/id/{}?content-type=text/plain".format(gene_id))
    return r.text

In [8]:
malat1=get_gene('ENSMUSG00000092341')

In [9]:
len(malat1)

6983

In [10]:
get_gene('ENST00000237014')

'ACAGAAGTCCACTCATTCTTGGCAGGATGGCTTCTCATCGTCTGCTCCTCCTCTGCCTTGCTGGACTGGTATTTGTGTCTGAGGCTGGCCCTACGGTGAGTGTTTCTGTGACATCCCATTCCTACATTTAAGATTCACGCTAAATGAAGTAGAAGTGACTCCTTCCAGCTTTGCCAACCAGCTTTTATTACTAGGGCAAGGGTACCCAGCATCTATTTTTAATATAATTAATTCAAACTTCAAAAAGAATGAAGTTCCACTGAGCTTACTGAGCTGGGACTTGAACTCTGAGCATTCTACCTCATTGCTTTGGTGCATTAGGTTTGTAATATCTGGTACCTCTGTTTCCTCAGATAGATGATAGAAATAAAGATATGATATTAAGGAAGCTGTTAATACTGAATTTTCAGAAAAGTATCCCTCCATAAAATGTATTTGGGGGACAAACTGCAGGAGATTATATTCTGGCCCTATAGTTATTCAAAACGTATTTATTGATTAATCTTTAAAAGGCTTAGTGAACAATATTCTAGTCAGATATCTAATTCTTAAATCCTCTAGAAGAATTAACTAATACTATAAAATGGGTCTGGATGTAGTTCTGACATTATTTTATAACAACTGGTAAGAGGGAGTGACTATAGCAACAACTAAAATGATCTCAGGAAAACCTGTTTGGCCCTATGTATGGTACATTACATCTTTTCAGTAATTCCACTCAAATGGAGACTTTTAACAAAGCAACTGTTCTCAGGGGACCTATTTTCTCCCTTAAAATTCATTATACACATCCCTGGTTGATAGCAGTGTGTCTGGAGGCAGAAACCATTCTTGCTTTGGAAACAATTACGTCTGTGTTATACTGAGTAGGGAAGCTCATTAATTGTCGACACTTACGTTCCTGATAATGGGATCAGTGTGTAATTCTTGTTTCGCTCCAGATTTCTAATACCACAAAGAATAAATCCTTTCACTCTGATCAATTTTGTTAACTTCTCA

In [8]:
def revcomp(seq):
    return seq.translate(str.maketrans('ACGTacgt', 'TGCAtgca'))[::-1]

In [9]:
revcomp("AAACCGGTT")

'AACCGGTTT'

In [6]:
def get_oligo_sequences(sequence, oligo_length):
    seqs=[]
    for i in range (0, len(sequence)-oligo_length):
        seqs.append( (i,revcomp(sequence[i:i+(oligo_length)])))
    return seqs    

In [8]:
malat1_seqs=get_oligo_sequences(malat1, 16)

In [9]:
len(malat1_seqs)

6967

In [10]:
len(malat1_seqs[8812][1])

IndexError: list index out of range

In [11]:
malat1_seqs[8812][1]

IndexError: list index out of range

In [12]:
def has_kmer_at_position(oligo_seq, kmer_tuple):
    #print(oligo_seq[kmer_tuple[0]:kmer_tuple[0]+len(kmer_tuple[1])])
    return oligo_seq[kmer_tuple[0]:kmer_tuple[0]+len(kmer_tuple[1])]==kmer_tuple[1]

In [13]:
has_kmer_at_position('CTTGATTGGGGAAAAA', (0, "CTT"))

True

In [14]:
has_kmer_at_position('CTTGATTGGGGAAAAA', (0, "TT"))

False

In [15]:
has_kmer_at_position('CTTGATTGGGGAAAAA', (1, "TT"))

True

In [16]:
has_kmer_at_position('CTTGATTGGGGAAAAA', (11, "AAAAA"))

True

In [17]:
kmer_list=[(4, "AAAG"), (5, "GCGC"), (6, "CGCA"), (7, "TTTA")]

In [18]:
def get_seqs_with_kmers(sequences, kmer_list):
    found=[]
    for seq in sequences:
        kmers_found=[]
        for kmer in kmer_list:
            if(has_kmer_at_position(seq[1], kmer)):
                kmers_found.append(kmer)
        if(len(kmers_found)):       
            found.append((seq, kmers_found))
    return found                 

In [19]:
get_seqs_with_kmers(malat1_seqs, kmer_list)

[((94, 'CAATCTTTTTAATTAA'), [(7, 'TTTA')]),
 ((110, 'AATTCTTTTTACTGCT'), [(7, 'TTTA')]),
 ((128, 'GCTTAAGTTTAGAGTT'), [(7, 'TTTA')]),
 ((216, 'CAAACTGTTTAAATAA'), [(7, 'TTTA')]),
 ((296, 'ACCTCCCTTTACAATC'), [(7, 'TTTA')]),
 ((319, 'AGAACCTTTTAGAACT'), [(7, 'TTTA')]),
 ((354, 'CCTTCGTTTTAATCTA'), [(7, 'TTTA')]),
 ((370, 'GATTCTATTTAGGTAA'), [(7, 'TTTA')]),
 ((392, 'TTACTGTTTTAAATGC'), [(7, 'TTTA')]),
 ((400, 'CTACAACTTTACTGTT'), [(7, 'TTTA')]),
 ((439, 'TCAATCTTTTAAAACT'), [(7, 'TTTA')]),
 ((478, 'AAGGATTTTTATAACG'), [(7, 'TTTA')]),
 ((493, 'TACATGCGCCAGTCGA'), [(5, 'GCGC')]),
 ((564, 'GGCGTGCGCCACCATC'), [(5, 'GCGC')]),
 ((571, 'GATTAAAGGCGTGCGC'), [(4, 'AAAG')]),
 ((814, 'TTCTAAAGATGCTTTG'), [(4, 'AAAG')]),
 ((840, 'CTGTGCCTTTAAGTAC'), [(7, 'TTTA')]),
 ((917, 'ATCCAAAGTTGTCTTA'), [(4, 'AAAG')]),
 ((986, 'TTCTGACTTTATATCT'), [(7, 'TTTA')]),
 ((1003, 'CTTTAAAGATAATTTC'), [(4, 'AAAG')]),
 ((1009, 'TTATGGCTTTAAAGAT'), [(7, 'TTTA')]),
 ((1054, 'GTCTAAAGACTCTTCT'), [(4, 'AAAG')]),
 ((1113,

## Run on Swag's toxic kmers

In [20]:
trimer_df=pd.read_csv("../data/trimer_feature_list_v2.csv", index_col=0)
trimer_df.head(20)

Unnamed: 0,MI,log10_p_value_MI,population_density,frequency_in_class_0.0,enrichment_in_class_0.0,frequency_in_class_1.0,enrichment_in_class_1.0,fraction_dev_logALT
AAA:0,0.00359,-0.68995,0.014286,0.020833,0.458333,0.00495,-0.653465,-0.092451
AAA:1,0.004754,-1.130507,0.016327,0.024306,0.488715,0.00495,-0.696782,-0.148209
AAA:2,0.011057,-5.001729,0.014286,0.024306,0.701389,0.0,-1.0,-0.120435
AAA:3,0.004754,-0.939031,0.016327,0.024306,0.488715,0.00495,-0.696782,-0.168124
AAA:4,0.012654,-8.191739,0.016327,0.027778,0.701389,0.0,-1.0,-0.18947
AAA:5,0.009465,-2.577193,0.012245,0.020833,0.701389,0.0,-1.0,-0.183645
AAA:6,0.000721,-0.448806,0.014286,0.017361,0.215278,0.009901,-0.306931,-0.101804
AAA:7,0.00359,-0.574955,0.014286,0.020833,0.458333,0.00495,-0.653465,-0.109904
AAA:8,0.00359,-0.633215,0.014286,0.020833,0.458333,0.00495,-0.653465,-0.170692
AAA:9,0.012654,-7.305155,0.016327,0.027778,0.701389,0.0,-1.0,-0.200655


In [21]:
trimer_df.shape

(896, 8)

In [22]:
def get_kmers_above_ALT(df, mi_cutoff, logALT_cutoff):
    ind=df.loc[(df.loc[:, "MI"] > mi_cutoff)  & (df.loc[:, "fraction_dev_logALT"] > logALT_cutoff) ].index.values
    splits=map(lambda i:  i.split(":"), ind)
    tuples=map(lambda s: (int(s[1]), s[0]), splits)
    return list(tuples)
    #return df.loc[(df.loc[:, "MI"] > mi_cutoff)  & (df.loc[:, "fraction_dev_logALT"] > logALT_cutoff) ]

In [23]:
toxic_kmers=get_kmers_above_ALT(trimer_df, 0.01, .74)

In [24]:
toxic_kmers

[(1, 'CAT'),
 (3, 'CCA'),
 (0, 'CTT'),
 (0, 'TCC'),
 (2, 'TCC'),
 (2, 'TCT'),
 (3, 'TCT'),
 (9, 'TCT'),
 (3, 'TGC'),
 (3, 'TGT'),
 (6, 'TGT'),
 (0, 'TTG')]

In [25]:
len(toxic_kmers)

12

In [26]:
toxic_oligos=get_seqs_with_kmers(malat1_seqs, toxic_kmers)
len(toxic_oligos)

1526

In [27]:
def kmer_tuple_to_string(tuples):
    out=[]
    for t in tuples:
        out.append("{}:{}".format(t[1], t[0]))
    return out    

In [28]:
kmer_tuple_to_string([(5, "GTG")])

['GTG:5']

In [29]:
toxic_kmers_str=list(map(lambda x: kmer_tuple_to_string([x])[0], toxic_kmers))
toxic_kmers_str

['CAT:1',
 'CCA:3',
 'CTT:0',
 'TCC:0',
 'TCC:2',
 'TCT:2',
 'TCT:3',
 'TCT:9',
 'TGC:3',
 'TGT:3',
 'TGT:6',
 'TTG:0']

In [30]:
toxic_oligos_df=pd.DataFrame(columns=["Oligo Sequence", "Position in MALAT", "Num KMERs", "KMERs found",  "KMER MI", "KMER fraction_dev_logALT"])

In [31]:
toxic_oligos_df.loc[:,"Oligo Sequence"]=list(map(lambda x: x[0][1], toxic_oligos))

In [32]:
toxic_oligos_df.loc[:,"Position in MALAT"]=list(map(lambda x: x[0][0], toxic_oligos))
toxic_oligos_df.loc[:,"Num KMERs"]=list(map(lambda x: len(x[1]), toxic_oligos))
toxic_oligos_df.loc[:,"KMERs found"]=list(map(lambda x: kmer_tuple_to_string(x[1]), toxic_oligos))

  return array(a, dtype, copy=False, order=order)


In [33]:
toxic_oligos_df.loc[:,"KMER MI"]=list(map(lambda kmer: list(map(lambda x: trimer_df.loc[x, "MI"], kmer)), toxic_oligos_df.loc[:,"KMERs found"]))

In [34]:
toxic_oligos_df.loc[:,"KMER fraction_dev_logALT"]=list(map(lambda kmer: list(map(lambda x: trimer_df.loc[x, "fraction_dev_logALT"], kmer)), toxic_oligos_df.loc[:,"KMERs found"]))

In [35]:
toxic_oligos_df.shape

(1526, 6)

In [36]:
toxic_oligos_df.head(20)

Unnamed: 0,Oligo Sequence,Position in MALAT,Num KMERs,KMERs found,KMER MI,KMER fraction_dev_logALT
0,CGCTGCCTGAATGCCT,1,1,[TGC:3],[0.011126493274650632],[0.9182542924874876]
1,GCTCTCGCTGCCTGAA,6,1,[TCT:2],[0.011126493274650632],[0.7407236672648675]
2,TGCTCTCGCTGCCTGA,7,1,[TCT:3],[0.018168716200871116],[0.890582820716125]
3,CTCTGCTCTCGCTGCC,10,1,[TGC:3],[0.011126493274650632],[0.9182542924874876]
4,GCTCTGCTCTCGCTGC,11,1,[TCT:2],[0.011126493274650632],[0.7407236672648675]
5,TGCTCTGCTCTCGCTG,12,1,[TCT:3],[0.018168716200871116],[0.890582820716125]
6,CTGCTCTGCTCTCGCT,13,1,[TCT:9],[0.023835331933572484],[1.0389238068809217]
7,CGCTGCTCTGCTCTCG,15,1,[TGC:3],[0.011126493274650632],[0.9182542924874876]
8,CTACGCTGCTCTGCTC,18,1,[TCT:9],[0.023835331933572484],[1.0389238068809217]
9,GCTCTACGCTGCTCTG,21,1,[TCT:2],[0.011126493274650632],[0.7407236672648675]


In [37]:
toxic_oligos_df.to_csv("../data/toxic_malat_16mers_mi.01_logALT.74.12toxicKmers.csv")

## generate fasta for off targets

In [38]:
toxic_oligos_df.shape

(1526, 6)

## Pick 24 best sequences. 2 for each kmer. Pick these randomly

In [49]:
out_oligo_df=pd.DataFrame(columns=["Oligo Sequence", "Position in MALAT", "Num KMERs", "KMERs found", "KMER", "KMER position", "Position to Modify",  "KMER MI", "KMER fraction_dev_logALT"])

In [50]:
len(out_oligo_df)

0

In [51]:
out_oligo_df=pd.DataFrame(columns=["Oligo Sequence", "Position in MALAT", "Num KMERs",  "KMER", "KMER position", "Position to modify",  "KMER MI", "KMER fraction_dev_logALT"])


In [52]:

while len(out_oligo_df) < 24:
    r = random.randint(0, len(toxic_oligos_df)-1)
    row=toxic_oligos_df.loc[r].copy()
   
    if row["Num KMERs"]==1 and row["KMERs found"][0] in toxic_kmers_str:
        #print (row)
        row["KMER position"]=int(row["KMERs found"][0].split(":")[1])
        row["KMER"]=row["KMERs found"][0].split(":")[0]
        row["KMER MI"]=row["KMER MI"][0]
        row["KMER fraction_dev_logALT"]=row["KMER fraction_dev_logALT"][0]
        row["Position to modify"]=row["KMER position"]+1
        
        if sum((out_oligo_df.loc[:, "KMER"]==row["KMER"]) & (out_oligo_df.loc[:, "KMER position"]==row["KMER position"])) < 2:
            out_oligo_df=out_oligo_df.append(row)

out_oligo_df.sort_values(by=["KMER", "KMER position"],inplace=True)

In [53]:
out_oligo_df

Unnamed: 0,Oligo Sequence,Position in MALAT,Num KMERs,KMER,KMER position,Position to modify,KMER MI,KMER fraction_dev_logALT,KMERs found
797,TCATCAACAAAAGCCC,3273,1,CAT,1,2,0.011126,0.792336,[CAT:1]
777,TCATTTCTCCTACACT,3130,1,CAT,1,2,0.011126,0.792336,[CAT:1]
725,CCCCCAAGCCCTACGC,2867,1,CCA,3,4,0.011126,0.836711,[CCA:3]
810,CTGCCAGGCTGGTTAT,3345,1,CCA,3,4,0.011126,0.836711,[CCA:3]
87,CTTTTAGAACTTCACA,314,1,CTT,0,1,0.015777,1.007685,[CTT:0]
57,CTTCATGTTATCTCTT,198,1,CTT,0,1,0.015777,1.007685,[CTT:0]
1191,TCCCAACCCCCCTCAC,5247,1,TCC,0,1,0.011126,0.790894,[TCC:0]
1277,TCCTTCTACAGTCTGA,5605,1,TCC,0,1,0.011126,0.790894,[TCC:0]
1027,CATCCCCCCAAAATTG,4343,1,TCC,2,3,0.013426,0.95203,[TCC:2]
614,ATTCCGCCATGGCCAG,2238,1,TCC,2,3,0.013426,0.95203,[TCC:2]


In [54]:
sum((out_oligo_df.loc[:, "KMER"]=="TCC") & (out_oligo_df.loc[:, "KMER position"]==0))

2

In [55]:
out_oligo_df.to_csv("../data/MALAT_mouse_24_predicted_toxic_sequences.csv")

In [56]:
def print_fasta(df):
    fasta=""
    for index, row in df.iterrows():
        fasta+=">{}\n{}\n".format(row['Position in MALAT'], row["Oligo Sequence"])
    return fasta

In [57]:
fasta=print_fasta(out_oligo_df)

In [58]:
f = open("../data/mouseMalat_oligo_sequences.fa", "w")
f.write(fasta)
f.close()

We need one more sequence with TCT at pos 3

In [63]:
toxic_kmers_TCT_3_str=['TCT:3']

In [49]:
out_oligo_df=pd.DataFrame(columns=["Oligo Sequence", "Position in MALAT", "Num KMERs", "KMERs found", "KMER", "KMER position", "Position to Modify",  "KMER MI", "KMER fraction_dev_logALT"])

In [50]:
len(out_oligo_df)

0

In [51]:
out_oligo_df=pd.DataFrame(columns=["Oligo Sequence", "Position in MALAT", "Num KMERs",  "KMER", "KMER position", "Position to modify",  "KMER MI", "KMER fraction_dev_logALT"])


In [52]:

while len(out_oligo_df) < 24:
    r = random.randint(0, len(toxic_oligos_df)-1)
    row=toxic_oligos_df.loc[r].copy()
   
    if row["Num KMERs"]==1 and row["KMERs found"][0] in toxic_kmers_str:
        #print (row)
        row["KMER position"]=int(row["KMERs found"][0].split(":")[1])
        row["KMER"]=row["KMERs found"][0].split(":")[0]
        row["KMER MI"]=row["KMER MI"][0]
        row["KMER fraction_dev_logALT"]=row["KMER fraction_dev_logALT"][0]
        row["Position to modify"]=row["KMER position"]+1
        
        if sum((out_oligo_df.loc[:, "KMER"]==row["KMER"]) & (out_oligo_df.loc[:, "KMER position"]==row["KMER position"])) < 2:
            out_oligo_df=out_oligo_df.append(row)

out_oligo_df.sort_values(by=["KMER", "KMER position"],inplace=True)

In [53]:
out_oligo_df

Unnamed: 0,Oligo Sequence,Position in MALAT,Num KMERs,KMER,KMER position,Position to modify,KMER MI,KMER fraction_dev_logALT,KMERs found
797,TCATCAACAAAAGCCC,3273,1,CAT,1,2,0.011126,0.792336,[CAT:1]
777,TCATTTCTCCTACACT,3130,1,CAT,1,2,0.011126,0.792336,[CAT:1]
725,CCCCCAAGCCCTACGC,2867,1,CCA,3,4,0.011126,0.836711,[CCA:3]
810,CTGCCAGGCTGGTTAT,3345,1,CCA,3,4,0.011126,0.836711,[CCA:3]
87,CTTTTAGAACTTCACA,314,1,CTT,0,1,0.015777,1.007685,[CTT:0]
57,CTTCATGTTATCTCTT,198,1,CTT,0,1,0.015777,1.007685,[CTT:0]
1191,TCCCAACCCCCCTCAC,5247,1,TCC,0,1,0.011126,0.790894,[TCC:0]
1277,TCCTTCTACAGTCTGA,5605,1,TCC,0,1,0.011126,0.790894,[TCC:0]
1027,CATCCCCCCAAAATTG,4343,1,TCC,2,3,0.013426,0.95203,[TCC:2]
614,ATTCCGCCATGGCCAG,2238,1,TCC,2,3,0.013426,0.95203,[TCC:2]


In [54]:
sum((out_oligo_df.loc[:, "KMER"]=="TCC") & (out_oligo_df.loc[:, "KMER position"]==0))

2

In [55]:
out_oligo_df.to_csv("../data/MALAT_mouse_24_predicted_toxic_sequences.csv")

In [56]:
def print_fasta(df):
    fasta=""
    for index, row in df.iterrows():
        fasta+=">{}\n{}\n".format(row['Position in MALAT'], row["Oligo Sequence"])
    return fasta

In [57]:
fasta=print_fasta(out_oligo_df)

In [58]:
f = open("../data/mouseMalat_oligo_sequences.fa", "w")
f.write(fasta)
f.close()

We need one more sequence with TCT at pos 3

In [63]:
toxic_kmers_TCT_3_str=['TCT:3']

<span style="color:red">**<<<<<<< local**</span>

In [71]:
out_oligo_TCT_3_df=pd.DataFrame(columns=["Oligo Sequence", "Position in MALAT", "Num KMERs", "KMERs found", "KMER", "KMER position", "Position to Modify",  "KMER MI", "KMER fraction_dev_logALT"])

In [72]:
len(out_oligo_TCT_3_df)

0

In [73]:
out_oligo_TCT_3_df=pd.DataFrame(columns=["Oligo Sequence", "Position in MALAT", "Num KMERs",  "KMER", "KMER position", "Position to modify",  "KMER MI", "KMER fraction_dev_logALT"])


In [80]:

while len(out_oligo_TCT_3_df) < 10:
    r = random.randint(0, len(toxic_oligos_df)-1)
    row=toxic_oligos_df.loc[r].copy()
   
    if row["Num KMERs"]==1 and row["KMERs found"][0] in toxic_kmers_TCT_3_str:
        print (row)
        row["KMER position"]=int(row["KMERs found"][0].split(":")[1])
        row["KMER"]=row["KMERs found"][0].split(":")[0]
        row["KMER MI"]=row["KMER MI"][0]
        row["KMER fraction_dev_logALT"]=row["KMER fraction_dev_logALT"][0]
        row["Position to modify"]=row["KMER position"]+1
        
        #if sum((out_oligo_TCT_3_df[:, "KMER"]==row["KMER"]) & (out_oligo_TCT_3_df[:, "KMER position"]==row["KMER position"])) < 5:
        out_oligo_TCT_3_df=out_oligo_TCT_3_df.append(row)

out_oligo_TCT_3_df.sort_values(by=["KMER", "KMER position"],inplace=True)

Oligo Sequence                    TTTTCTGCCTTTACTT
Position in MALAT                             3409
Num KMERs                                        1
KMERs found                                [TCT:3]
KMER MI                     [0.018168716200871116]
KMER fraction_dev_logALT       [0.890582820716125]
Name: 828, dtype: object
Oligo Sequence                    CATTCTGCCTTAACTT
Position in MALAT                             1122
Num KMERs                                        1
KMERs found                                [TCT:3]
KMER MI                     [0.018168716200871116]
KMER fraction_dev_logALT       [0.890582820716125]
Name: 340, dtype: object
Oligo Sequence                    GATTCTGGAAAAGCTG
Position in MALAT                             6128
Num KMERs                                        1
KMERs found                                [TCT:3]
KMER MI                     [0.018168716200871116]
KMER fraction_dev_logALT       [0.890582820716125]
Name: 1380, dtype: object
Oligo 

In [81]:
out_oligo_TCT_3_df

Unnamed: 0,Oligo Sequence,Position in MALAT,Num KMERs,KMER,KMER position,Position to modify,KMER MI,KMER fraction_dev_logALT,KMERs found
1108,AACTCTTTAAACCCAC,4753,1,TCT,3,4,0.018169,0.890583,[TCT:3]
44,TCTTCTTAGATTATTA,178,1,TCT,3,4,0.018169,0.890583,[TCT:3]
1495,GCGTCTTTTGCTTTTT,6680,1,TCT,3,4,0.018169,0.890583,[TCT:3]
521,CATTCTTCTTTCTGGG,1886,1,TCT,3,4,0.018169,0.890583,[TCT:3]
206,CTATCTTCTCTAACAG,786,1,TCT,3,4,0.018169,0.890583,[TCT:3]
828,TTTTCTGCCTTTACTT,3409,1,TCT,3,4,0.018169,0.890583,[TCT:3]
340,CATTCTGCCTTAACTT,1122,1,TCT,3,4,0.018169,0.890583,[TCT:3]
1380,GATTCTGGAAAAGCTG,6128,1,TCT,3,4,0.018169,0.890583,[TCT:3]
502,ATCTCTTTACACAGAA,1764,1,TCT,3,4,0.018169,0.890583,[TCT:3]
94,GATTCTATTTAGGTAA,370,1,TCT,3,4,0.018169,0.890583,[TCT:3]


In [79]:
out_oligo_TCT_3_df.to_csv("../data/MALAT_mouse_more_TCT_3_predicted_toxic_sequences.csv")

<span style="color:red">**=======**</span>

## Make fasta

In [10]:
malat1_seqs=get_oligo_sequences(malat1, 16)

In [13]:
for idx, seq in enumerate(malat1_seqs):
    print(">{}".format(idx))
    print(seq[1])

>0
CCGGTGGGGCTGCGTC
>1
ACCGGTGGGGCTGCGT
>2
AACCGGTGGGGCTGCG
>3
CAACCGGTGGGGCTGC
>4
GCAACCGGTGGGGCTG
>5
CGCAACCGGTGGGGCT
>6
GCGCAACCGGTGGGGC
>7
TGCGCAACCGGTGGGG
>8
CTGCGCAACCGGTGGG
>9
ACTGCGCAACCGGTGG
>10
GACTGCGCAACCGGTG
>11
GGACTGCGCAACCGGT
>12
GGGACTGCGCAACCGG
>13
AGGGACTGCGCAACCG
>14
GAGGGACTGCGCAACC
>15
GGAGGGACTGCGCAAC
>16
GGGAGGGACTGCGCAA
>17
GGGGAGGGACTGCGCA
>18
CGGGGAGGGACTGCGC
>19
GCGGGGAGGGACTGCG
>20
GGCGGGGAGGGACTGC
>21
GGGCGGGGAGGGACTG
>22
GGGGCGGGGAGGGACT
>23
GGGGGCGGGGAGGGAC
>24
CGGGGGCGGGGAGGGA
>25
GCGGGGGCGGGGAGGG
>26
AGCGGGGGCGGGGAGG
>27
GAGCGGGGGCGGGGAG
>28
AGAGCGGGGGCGGGGA
>29
GAGAGCGGGGGCGGGG
>30
GGAGAGCGGGGGCGGG
>31
GGGAGAGCGGGGGCGG
>32
GGGGAGAGCGGGGGCG
>33
AGGGGAGAGCGGGGGC
>34
GAGGGGAGAGCGGGGG
>35
GGAGGGGAGAGCGGGG
>36
CGGAGGGGAGAGCGGG
>37
GCGGAGGGGAGAGCGG
>38
TGCGGAGGGGAGAGCG
>39
CTGCGGAGGGGAGAGC
>40
GCTGCGGAGGGGAGAG
>41
GGCTGCGGAGGGGAGA
>42
AGGCTGCGGAGGGGAG
>43
CAGGCTGCGGAGGGGA
>44
GCAGGCTGCGGAGGGG
>45
TGCAGGCTGCGGAGGG
>46
CTGCAGGCTGCGGAGG
>47
GCTGCAGGCTGCGGAG
>4

## PCSK9

In [7]:
def get_oligo_site_sequences(sequence, oligo_length):
    seqs=[]
    for i in range (0, len(sequence)-oligo_length):
        seqs.append( (i,sequence[i:i+(oligo_length)]))
    return seqs    

In [8]:
pcsk9_seq=get_gene('ENSMUSG00000044254')

In [9]:
pcsk9_sites_seqs=get_oligo_site_sequences(pcsk9_seq, 16)

In [17]:
pcsk9_sites_seqs

[(0, 'GGAGTGGGGATTAAGA'),
 (1, 'GAGTGGGGATTAAGAG'),
 (2, 'AGTGGGGATTAAGAGG'),
 (3, 'GTGGGGATTAAGAGGG'),
 (4, 'TGGGGATTAAGAGGGG'),
 (5, 'GGGGATTAAGAGGGGG'),
 (6, 'GGGATTAAGAGGGGGG'),
 (7, 'GGATTAAGAGGGGGGA'),
 (8, 'GATTAAGAGGGGGGAA'),
 (9, 'ATTAAGAGGGGGGAAT'),
 (10, 'TTAAGAGGGGGGAATG'),
 (11, 'TAAGAGGGGGGAATGT'),
 (12, 'AAGAGGGGGGAATGTA'),
 (13, 'AGAGGGGGGAATGTAA'),
 (14, 'GAGGGGGGAATGTAAC'),
 (15, 'AGGGGGGAATGTAACA'),
 (16, 'GGGGGGAATGTAACAG'),
 (17, 'GGGGGAATGTAACAGG'),
 (18, 'GGGGAATGTAACAGGT'),
 (19, 'GGGAATGTAACAGGTC'),
 (20, 'GGAATGTAACAGGTCC'),
 (21, 'GAATGTAACAGGTCCC'),
 (22, 'AATGTAACAGGTCCCG'),
 (23, 'ATGTAACAGGTCCCGT'),
 (24, 'TGTAACAGGTCCCGTT'),
 (25, 'GTAACAGGTCCCGTTT'),
 (26, 'TAACAGGTCCCGTTTG'),
 (27, 'AACAGGTCCCGTTTGC'),
 (28, 'ACAGGTCCCGTTTGCA'),
 (29, 'CAGGTCCCGTTTGCAG'),
 (30, 'AGGTCCCGTTTGCAGC'),
 (31, 'GGTCCCGTTTGCAGCC'),
 (32, 'GTCCCGTTTGCAGCCC'),
 (33, 'TCCCGTTTGCAGCCCA'),
 (34, 'CCCGTTTGCAGCCCAA'),
 (35, 'CCGTTTGCAGCCCAAT'),
 (36, 'CGTTTGCAGCCCAATT'),
 (37, 'GTTT

In [14]:
f = open("../data/mouse.Pcsk9.sites.seq.fa", "w")

for idx, seq in enumerate(pcsk9_sites_seqs):
    f.write(">{}\n".format(idx))
    f.write(seq[1]+"\n")
f.close()

In [13]:
malat1_seqs

NameError: name 'malat1_seqs' is not defined

In [19]:
oligos_df=pd.DataFrame(columns=["ID","OligoSequence"])
for idx, seq in enumerate(pcsk9_sites_seqs):
    oligos_df.loc[idx]=[idx, seq[1]]

In [20]:
oligos_df.head()

Unnamed: 0,ID,OligoSequence
0,0,GGAGTGGGGATTAAGA
1,1,GAGTGGGGATTAAGAG
2,2,AGTGGGGATTAAGAGG
3,3,GTGGGGATTAAGAGGG
4,4,TGGGGATTAAGAGGGG


In [21]:
oligos_df.to_csv("../data/OBMSeqs.txt", index=None)

<span style="color:red">**>>>>>>> remote**</span>