## Collection of siRNA Features from the Input Sequence

#### Packages Installation

In [1]:
%%sh
pip install gffutils     # https://pythonhosted.org/pyfaidx/
pip install pyfaidx      # https://pythonhosted.org/gffutils/contents.html
pip install biopython    # http://biopython.org/DIST/docs/tutorial/Tutorial.html
pip install seqfold      # https://pypi.org/project/seqfold/



#### Parsing FASTA file for being BioPython readable and for features extraction

In [2]:
#Parsing process + we are already thresholding the scores specific cutoff
from Bio.Seq import Seq
from Bio import SeqIO
Scores = []
for seqs in SeqIO.parse("data.fasta", "fasta"):
    score = ((float(seqs.id[5:])))
    if score >= 0.666:
        Scores.append(float(1))
    else:
        Scores.append(float(0))
print(Scores)
print(len(Scores))
#the score represent the experimentally validated one that I put in the fasta header in order 
#to have an easy way to extract it
#here I would need to add some line for the exclusion of those sequences that contains toxic motif as described in Lawlor et al., 2012, Krzyzosiak et al., 2012, Ahmed et al., 2011

In [3]:
#Parsing process + we are not thresholding the scores specific cutoff
from Bio.Seq import Seq
from Bio import SeqIO
Scores = []
for seqs in SeqIO.parse("data.fasta", "fasta"):
    score = ((float(seqs.id[5:])))
    Scores.append(score)
    
print(Scores)
print(len(Scores))
#the score represent the experimentally validated one that I put in the fasta header in order 
#to have an easy way to extract it

[0.462, 0.384, 0.514, 0.364, 0.522, 0.442, 0.441, 0.436, 0.591, 0.511, 0.555, 0.547, 0.44, 0.651, 0.759, 0.624, 0.858, 0.849, 0.383, 0.537, 0.832, 0.684, 0.736, 0.83, 0.389, 0.757, 0.552, 0.773, 0.707, 0.518, 0.753, 0.817, 0.794, 0.753, 0.836, 0.76, 0.423, 0.788, 0.871, 0.761, 0.821, 0.72, 0.863, 0.471, 0.868, 0.776, 0.849, 0.87, 0.544, 0.856, 0.835, 0.746, 0.825, 0.849, 0.754, 0.83, 0.928, 0.616, 0.825, 0.808, 0.804, 0.723, 0.597, 0.757, 0.887, 0.688, 0.79, 0.627, 0.69, 0.827, 0.893, 0.372, 0.517, 0.336, 0.268, 0.255, 0.737, 0.59, 0.652, 0.524, 0.534, 0.43, 0.584, 0.349, 0.766, 0.743, 0.565, 0.412, 0.829, 0.364, 0.794, 0.552, 0.459, 0.682, 0.7, 0.232, 0.584, 0.423, 0.813, 0.449, 0.721, 0.722, 0.578, 0.726, 0.393, 0.254, 0.442, 0.251, 0.367, 0.659, 0.449, 0.58, 0.511, 0.298, 0.598, 0.737, 0.417, 0.74, 0.727, 0.643, 0.755, 0.673, 0.443, 0.476, 0.683, 0.924, 0.82, 0.624, 0.862, 0.445, 0.565, 0.958, 0.422, 0.952, 0.945, 0.984, 0.961, 0.236, 0.27, 0.642, 0.306, 0.806, 0.92, 0.897, 0.845, 0

In [4]:
from Bio.Seq import Seq
from Bio.SeqUtils import GC
from Bio import SeqIO
seq_objects = SeqIO.parse("data.fasta", "fasta")
sequences = [seq for seq in seq_objects]
for seq in sequences:
    seq_id = seq.id
    sequence = seq.seq
    sequence = sequence.replace("t", "U")
    sequence = sequence.replace("a", "A")
    sequence = sequence.replace("c", "C")
    sequence = sequence.replace("g", "G")
    sequence = sequence.replace("u", "U")

In [5]:
#global GC content 
from Bio.Seq import Seq
from Bio.SeqUtils import GC
from Bio import SeqIO
seq_objects = SeqIO.parse("data.fasta", "fasta")
sequences = [seq for seq in seq_objects]
for seq in sequences:
    seq_id = seq.id
    sequence = seq.seq
    gc_content = GC(sequence)
    gc = round(gc_content,2)

In [6]:
#Features Group 5 Extraction - Local GC Content (k=5)
GC_I=[]
GC_II=[]
GC_III=[]
GC_IV=[]
GC_V=[]
GC_VI=[]
GC_VII=[]
GC_VIII=[]
GC_IX=[]
GC_X=[]
GC_XI=[]
GC_XII=[]
GC_XIII=[]
GC_XIV=[]
GC_XV=[]
GC_XVI=[]

for seq in sequences:
    seq_id = seq.id
    sub1 = seq.seq[0:6]
    sub2 = seq.seq[1:7]
    sub3 = seq.seq[2:8]
    sub3 = seq.seq[3:9]
    sub4 = seq.seq[4:10]
    sub5 = seq.seq[5:11]
    sub6 = seq.seq[6:12]
    sub7 = seq.seq[7:13]
    sub8 = seq.seq[8:14]
    sub9 = seq.seq[9:15]
    sub10 = seq.seq[10:16]
    sub11 = seq.seq[11:17]
    sub12 = seq.seq[12:18]
    sub13 = seq.seq[13:19]
    sub14 = seq.seq[14:20]
    sub15 = seq.seq[15:21]
    sub16 = seq.seq[16:]
    
    
    gc1 = round(GC(sub1),2)
    gc2 = round(GC(sub2),2)
    gc3 = round(GC(sub3),2)
    gc4 = round(GC(sub4),2)
    gc5 = round(GC(sub5),2)
    gc6 = round(GC(sub6),2)
    gc7 = round(GC(sub7),2)
    gc8 = round(GC(sub8),2)
    gc9 = round(GC(sub9),2)
    gc10 = round(GC(sub10),2)
    gc11 = round(GC(sub11),2)
    gc12 = round(GC(sub12),2)
    gc13 = round(GC(sub13),2)
    gc14 = round(GC(sub14),2)
    gc15 = round(GC(sub15),2)
    gc16 = round(GC(sub16),2)
    
    GC_I.append(gc1)
    GC_II.append(gc2)
    GC_III.append(gc3)
    GC_IV.append(gc4)
    GC_V.append(gc5)
    GC_VI.append(gc6)
    GC_VII.append(gc7)
    GC_VIII.append(gc8)
    GC_IX.append(gc9)
    GC_X.append(gc10)
    GC_XI.append(gc11)
    GC_XII.append(gc12)
    GC_XIII.append(gc13)
    GC_XIV.append(gc14)
    GC_XV.append(gc15)
    GC_XVI.append(gc16)

In [7]:
#Features Group 6 Extraction - siRNA Secondary Structure ΔG
dgs=[]
dgs3=[]
dgs5=[]

from seqfold import dg, Struct
for seq in sequences:
    seq_id = seq.id
    sequence = (seq.seq).replace("t", "U")
    sequence = sequence.replace("a", "A")
    sequence = sequence.replace("g", "G")
    sequence = sequence.replace("c", "C")
    five = sequence[0:7]
    three = sequence[-7:]
    delta_G = dg(sequence, temp = 25.0)
   
    dgs.append(delta_G)

In [8]:
#Features Group 7 Extraction - siRNA Melting Temperature
from Bio.SeqUtils import MeltingTemp as mt
Tms=[]
for seq in sequences:
    seq_id = seq.id
    sequence = (seq.seq).replace("t", "U")
    Tm = mt.Tm_NN(sequence, check=True, strict=False, nn_table=mt.RNA_NN3)
    Tms.append(round(Tm, 2))


In [9]:
#Features Group 9 Extraction - 5' A/U and G/C Richness (important in the antisense strand)
five_prime_AU = []
five_prime_GC = []

for seq in sequences:
    seq_id = seq.id
    sequence = (seq.seq).replace("t", "U")
    sequence = sequence.replace("a", "A")
    sequence = sequence.replace("g", "G")
    sequence = sequence.replace("c", "C")
    five_prime_AU_subseq = sequence[:7]
    five_prime_GC_content = round(GC(five_prime_AU_subseq), 3)
    five_prime_AU_content = round((100 - five_prime_GC_content), 3)
    
    five_prime_AU.append(five_prime_AU_content)
    five_prime_GC.append(five_prime_GC_content)

In [10]:
#Features Group 1 Extraction - Position-Specific Nucleotides + Class Label + Data Point IDs + Global GC Content
seqs_ids=[]
gc_contents=[]
P1s=[]
P2s=[]
P3s=[]
P4s=[]
P5s=[]
P6s=[]
P7s=[]
P8s=[]
P9s=[]
P10s=[]
P11s=[]
P12s=[]
P13s=[]
P14s=[]
P15s=[]
P16s=[]
P17s=[]
P18s=[]
P19s=[]
P20s=[]
P21s=[]



for seq in sequences:
    seq_id = seq.id
    sequence = (seq.seq).replace("t", "U")
    sequence = sequence.replace("a", "A")
    sequence = sequence.replace("g", "G")
    sequence = sequence.replace("c", "C")
    gc_content = GC(sequence)
    gc = round(gc_content,3)
    P1 = sequence[0]
    P2 = sequence[1]
    P3 = sequence[2]
    P4 = sequence[3]
    P5 = sequence[4]
    P6 = sequence[5]
    P7 = sequence[6]
    P8 = sequence[7]
    P9 = sequence[8]
    P10 = sequence[9]
    P11 = sequence[10]
    P12 = sequence[11]
    P13 = sequence[12]
    P14 = sequence[13]
    P15 = sequence[14]
    P16 = sequence[15]
    P17 = sequence[16]
    P18 = sequence[17]
    P19 = sequence[18]
    P20 = sequence[19]
    P21 = sequence[20]
    
    
    seqs_ids.append(seq_id)
    gc_contents.append(gc)
    P1s.append(P1)
    P2s.append(P2)
    P3s.append(P3)
    P4s.append(P4)
    P5s.append(P5)
    P6s.append(P6)
    P7s.append(P7)
    P8s.append(P8)
    P9s.append(P9)
    P10s.append(P10)
    P11s.append(P11)
    P12s.append(P12)
    P13s.append(P13)
    P14s.append(P14)
    P15s.append(P15)
    P16s.append(P16)
    P17s.append(P17)
    P18s.append(P18)
    P19s.append(P19)
    P20s.append(P20)
    P21s.append(P21)

In [11]:
#Features Group 8 Extraction - Dinucleotide frequency
AC=[]
AU=[]
AG=[]
AA=[]
CA=[]
CU=[]
CG=[]
CC=[]
UU=[]
UA=[]
UG=[]
UC=[]
GU=[]
GG=[]
GA=[]
GC=[]

for seq in sequences:
    seq_id = seq.id
    sequence = (seq.seq).replace("t", "U")
    sequence = sequence.replace("a", "A")
    sequence = sequence.replace("g", "G")
    sequence = sequence.replace("c", "C")
    ac=0
    au=0
    ag=0
    aa=0
    ca=0
    cu=0
    cg=0
    cc=0
    uu=0
    ua=0
    ug=0
    uc=0
    gu=0
    gg=0
    ga=0
    gc=0
    for i in range(len(sequence)-1):
        dinuc = sequence[i:i+2]
        if dinuc == "AA":
            aa = aa + 1
        elif dinuc == "AC":
            ac = ac + 1
        elif dinuc == "AG":
            ag = ag +1
        elif dinuc == "AU":
            au = au +1
        elif dinuc == "CA":
            ca = ca + 1
        elif dinuc == "CC":
            cc = cc + 1
        elif dinuc == "CG":
            cg = cg +1
        elif dinuc == "CU":
            cu = cu +1
        elif dinuc == "GA":
            ga = ga + 1
        elif dinuc == "GC":
            gc = gc + 1
        elif dinuc == "GG":
            gg = gg +1
        elif dinuc == "GU":
            gu = gu +1
        elif dinuc == "UA":
            ua = ua + 1
        elif dinuc == "UC":
            uc = uc + 1
        elif dinuc == "UG":
            ug = ug +1
        elif dinuc == "UU":
            uu = uu +1
    AA.append(round(((aa/20)*100), 3))
    AC.append(round(((ac/20)*100), 3))
    AG.append(round(((ag/20)*100), 3))
    AU.append(round(((au/20)*100), 3))
    CA.append(round(((ca/20)*100), 3))
    CC.append(round(((cc/20)*100), 3))
    CG.append(round(((cg/20)*100), 3))
    CU.append(round(((cu/20)*100), 3))
    GA.append(round(((aa/20)*100), 3))
    GC.append(round(((gc/20)*100), 3))
    GG.append(round(((gg/20)*100), 3))
    GU.append(round(((gu/20)*100), 3))
    UA.append(round(((ua/20)*100), 3))
    UC.append(round(((uc/20)*100), 3))
    UG.append(round(((ug/20)*100), 3))
    UU.append(round(((uu/20)*100), 3))


In [12]:
#Features Group 8 Extraction - Trinucleotide frequency
AAA=[]
AAC=[]
AAU=[]
AAG=[]
ACA=[]
ACC=[]
ACU=[]
ACG=[]
AUA=[]
AUC=[]
AUU=[]
AUG=[]
AGA=[]
AGC=[]
AGU=[]
AGG=[]
GGA=[]
GGC=[]
GGU=[]
GGG=[]
GCA=[]
GCC=[]
GCU=[]
GCG=[]
GUA=[]
GUC=[]
GUU=[]
GAG=[]
GAA=[]
GAC=[]
GAU=[]
GUG=[]
CAA=[]
CAC=[]
CAU=[]
CAG=[]
CCA=[]
CCC=[]
CCU=[]
CCG=[]
CGA=[]
CGC=[]
CGU=[]
CGG=[]
CUA=[]
CUC=[]
CUU=[]
CUG=[]
UAA=[]
UAC=[]
UAU=[]
UAG=[]
UCA=[]
UCC=[]
UCU=[]
UCG=[]
UGA=[]
UGC=[]
UGU=[]
UGG=[]
UUA=[]
UUC=[]
UUU=[]
UUG=[]


for seq in sequences:
    seq_id = seq.id
    sequence = (seq.seq).replace("t", "U")
    sequence = sequence.replace("a", "A")
    sequence = sequence.replace("g", "G")
    sequence = sequence.replace("c", "C")
    aaa=0
    aac=0
    aau=0
    aag=0
    aca=0
    acc=0
    acu=0
    acg=0
    aua=0
    auc=0
    auu=0
    aug=0
    aga=0
    agc=0
    agu=0
    agg=0
    gga=0
    ggc=0
    ggu=0
    ggg=0
    gca=0
    gcc=0
    gcu=0
    gcg=0
    gua=0
    guc=0
    guu=0
    gag=0
    gaa=0
    gac=0
    gau=0
    gug=0
    caa=0
    cac=0
    cau=0
    cag=0
    cca=0
    ccc=0
    ccu=0
    ccg=0
    cga=0
    cgc=0
    cgu=0
    cgg=0
    cua=0
    cuc=0
    cuu=0
    cug=0
    uaa=0
    uac=0
    uau=0
    uag=0
    uca=0
    ucc=0
    ucu=0
    ucg=0
    uga=0
    ugc=0
    ugu=0
    ugg=0
    uua=0
    uuc=0
    uuu=0
    uug=0
    for i in range(len(sequence)-2):
        dinuc = sequence[i:i+3]
        if dinuc == "AAA":
            aaa = aa + 1
        elif dinuc == "AAC":
            aac = aac + 1
        elif dinuc == "AAG":
            aag = aag +1
        elif dinuc == "AAU":
            aau = aau +1
        elif dinuc == "ACA":
            aca = aca + 1
        elif dinuc == "ACC":
            acc = acc + 1
        elif dinuc == "ACG":
            acg = acg +1
        elif dinuc == "ACU":
            acu = acu +1
        elif dinuc == "AGA":
            aga = aga + 1
        elif dinuc == "AGC":
            agc = agc + 1
        elif dinuc == "AGG":
            agg = agg +1
        elif dinuc == "AGU":
            agu = agu +1
        elif dinuc == "AUA":
            aua = aua + 1
        elif dinuc == "AUC":
            auc = auc + 1
        elif dinuc == "AUG":
            aug = aug +1
        elif dinuc == "AUU":
            auu = auu +1
        elif dinuc == "CCA":
            cca = cca + 1
        elif dinuc == "CCC":
            ccc = ccc + 1
        elif dinuc == "CCG":
            ccg = ccg +1
        elif dinuc == "CCU":
            ccu = ccu +1
        elif dinuc == "CGA":
            cga = cga + 1
        elif dinuc == "CGC":
            cgc = cgc + 1
        elif dinuc == "CGG":
            cgg = cgg +1
        elif dinuc == "CGU":
            cgu = cgu +1
        elif dinuc == "CUA":
            cua = cua + 1
        elif dinuc == "CUC":
            cuc = cuc + 1
        elif dinuc == "CUG":
            cug = cug +1
        elif dinuc == "CUU":
            cuu = cuu +1
        elif dinuc == "CAU":
            cau = cau +1
        elif dinuc == "CAA":
            caa = caa + 1
        elif dinuc == "CAC":
            cac = cac + 1
        elif dinuc == "CAG":
            cag = cag +1
        elif dinuc == "GCA":
            gca = gca + 1
        elif dinuc == "GCC":
            gcc = gcc + 1
        elif dinuc == "GCG":
            gcg = gcg +1
        elif dinuc == "GCU":
            gcu = gcu +1
        elif dinuc == "GGA":
            gga = gga + 1
        elif dinuc == "GGC":
            ggc = ggc + 1
        elif dinuc == "GGG":
            ggg = ggg +1
        elif dinuc == "GGU":
            ggu = ggu +1
        elif dinuc == "GUA":
            gua = gua + 1
        elif dinuc == "GUC":
            guc = guc + 1
        elif dinuc == "GUG":
            gug = gug +1
        elif dinuc == "GUU":
            guu = guu +1
        elif dinuc == "GAU":
            gau = gau +1
        elif dinuc == "GAA":
            gaa = gaa + 1
        elif dinuc == "GAC":
            gac = gac + 1
        elif dinuc == "GAG":
            gag = gag +1
        elif dinuc == "UCA":
            uca = uca + 1
        elif dinuc == "UCC":
            ucc = ucc + 1
        elif dinuc == "UCG":
            ucg = ucg +1
        elif dinuc == "UCU":
            ucu = ucu +1
        elif dinuc == "UGA":
            uga = uga + 1
        elif dinuc == "UGC":
            ugc = ugc + 1
        elif dinuc == "UGG":
            ugg = ugg +1
        elif dinuc == "UGU":
            ugu = ugu +1
        elif dinuc == "UUA":
            uua = uua + 1
        elif dinuc == "UUC":
            uuc = uuc + 1
        elif dinuc == "UUG":
            uug = uug +1
        elif dinuc == "UUU":
            uuu = uuu +1
        elif dinuc == "UAU":
            uau = uau +1
        elif dinuc == "UAA":
            uaa = uaa + 1
        elif dinuc == "UAC":
            uac = uac + 1
        elif dinuc == "UAG":
            uag = uag +1
        
    AAA.append(round(((aaa/19)*100), 3))
    AAC.append(round(((aac/18)*100), 3))
    AAG.append(round(((aag/19)*100), 3))
    AAU.append(round(((aau/19)*100), 3))
    ACA.append(round(((aca/19)*100), 3))
    ACC.append(round(((acc/19)*100), 3))
    ACG.append(round(((acg/19)*100), 3))
    ACU.append(round(((acu/19)*100), 3))
    AGA.append(round(((aga/19)*100), 3))
    AGC.append(round(((agc/19)*100), 3))
    AGG.append(round(((agg/19)*100), 3))
    AGU.append(round(((agu/19)*100), 3))
    AUA.append(round(((aua/19)*100), 3))
    AUC.append(round(((auc/19)*100), 3))
    AUG.append(round(((aug/19)*100), 3))
    AUU.append(round(((auu/19)*100), 3))
    
    CAA.append(round(((caa/19)*100), 3))
    CAC.append(round(((cac/19)*100), 3))
    CAG.append(round(((cag/19)*100), 3))
    CAU.append(round(((cau/19)*100), 3))
    CCA.append(round(((cca/19)*100), 3))
    CCC.append(round(((ccc/19)*100), 3))
    CCG.append(round(((ccg/19)*100), 3))
    CCU.append(round(((ccu/19)*100), 3))
    CGA.append(round(((cga/19)*100), 3))
    CGC.append(round(((cgc/19)*100), 3))
    CGG.append(round(((cgg/19)*100), 3))
    CGU.append(round(((cgu/19)*100), 3))
    CUA.append(round(((cua/19)*100), 3))
    CUC.append(round(((cuc/19)*100), 3))
    CUG.append(round(((cug/19)*100), 3))
    CUU.append(round(((cuu/19)*100), 3))
    
    GAA.append(round(((gaa/19)*100),3))
    GAC.append(round(((gac/19)*100),3))
    GAG.append(round(((gag/19)*100),3))
    GAU.append(round(((gau/19)*100),3))
    GCA.append(round(((gca/19)*100),3))
    GCC.append(round(((gcc/19)*100),3))
    GCG.append(round(((gcg/19)*100),3))
    GCU.append(round(((gcu/19)*100),3))
    GGA.append(round(((gga/19)*100),3))
    GGC.append(round(((ggc/19)*100),3))
    GGG.append(round(((ggg/19)*100),3))
    GGU.append(round(((ggu/19)*100),3))
    GUA.append(round(((gua/19)*100),3))
    GUC.append(round(((guc/19)*100),3))
    GUG.append(round(((gug/19)*100),3))
    GUU.append(round(((guu/19)*100),3))
    
    UAA.append(round(((uaa/19)*100),3))
    UAC.append(round(((uac/19)*100),3))
    UAG.append(round(((uag/19)*100),3))
    UAU.append(round(((uau/19)*100),3))
    UCA.append(round(((uca/19)*100),3))
    UCC.append(round(((ucc/19)*100),3))
    UCG.append(round(((ucg/19)*100),3))
    UCU.append(round(((ucu/19)*100),3))
    UGA.append(round(((uga/19)*100),3))
    UGC.append(round(((ugc/19)*100),3))
    UGG.append(round(((ugg/19)*100),3))
    UGU.append(round(((ugu/19)*100),3))
    UUA.append(round(((uua/19)*100),3))
    UUC.append(round(((uuc/19)*100),3))
    UUG.append(round(((uug/19)*100),3))
    UUU.append(round(((uuu/19)*100),3))


In [13]:
#Features Group 9 Extraction - 5' and 3' end (7 nt)
five_prime_7nt = []
three_prime_7nt = []

for seq in sequences:
    seq_id = seq.id
    sequence = (seq.seq).replace("t", "U")
    sequence = sequence.replace("a", "A")
    sequence = sequence.replace("g", "G")
    sequence = sequence.replace("c", "C")
    five = sequence[:7]
    three = sequence[-7:]
    five_prime_7nt.append(five)
    three_prime_7nt.append(three)


In [14]:
import pandas as pd
dataframe=pd.DataFrame()
#sequences id
#dataframe['Sequence_ID']=seqs_ids
#position specific nucleotides
dataframe['P1']=P1s
dataframe['P2']=P2s
dataframe['P3']=P3s
dataframe['P4']=P4s
dataframe['P5']=P5s
dataframe['P6']=P6s
dataframe['P7']=P7s
dataframe['P8']=P8s
dataframe['P9']=P9s
dataframe['P10']=P10s
dataframe['P11']=P11s
dataframe['P12']=P12s
dataframe['P13']=P13s
dataframe['P14']=P14s
dataframe['P15']=P15s
dataframe['P16']=P16s
dataframe['P17']=P17s
dataframe['P18']=P18s
dataframe['P19']=P19s
dataframe['P20']=P20s
dataframe['P21']=P21s
#global and local gc content, with k=5 for local
dataframe['GC_Content']=gc_contents
dataframe['GC_1']=GC_I
dataframe['GC_2']=GC_II
dataframe['GC_3']=GC_III
dataframe['GC_4']=GC_IV
dataframe['GC_5']=GC_V
dataframe['GC_6']=GC_VI
dataframe['GC_7']=GC_VII
dataframe['GC_8']=GC_VIII
dataframe['GC_9']=GC_IX
dataframe['GC_10']=GC_X
dataframe['GC_11']=GC_XI
dataframe['GC_12']=GC_XII
dataframe['GC_13']=GC_XIII
dataframe['GC_14']=GC_XIV
dataframe['GC_15']=GC_XV
dataframe['GC_16']=GC_XVI
#dinucleotides content
dataframe['AA%']=AA
dataframe['AC%']=AC
dataframe['AG%']=AG
dataframe['AU%']=AU
dataframe['CA%']=CA
dataframe['CC%']=CC
dataframe['CG%']=CG
dataframe['CU%']=CU
dataframe['GG%']=GG
dataframe['GA%']=GA
dataframe['GC%']=GC
dataframe['GU%']=GU
dataframe['UA%']=UA
dataframe['UC%']=UC
dataframe['UG%']=UG
dataframe['UU%']=UU
#trinucleotides content
dataframe['AAA%']=AAA
dataframe['AAC%']=AAC
dataframe['AAU%']=AAU
dataframe['AAG%']=AAG
dataframe['ACA%']=ACA
dataframe['ACC%']=ACC
dataframe['ACU%']=ACU
dataframe['ACG%']=ACG
dataframe['AUA%']=AUA
dataframe['AUC%']=AUC
dataframe['AUU%']=AUU
dataframe['AUG%']=AUG
dataframe['AGA%']=AGA
dataframe['AGC%']=AGC
dataframe['AGU%']=AGU
dataframe['AGG%']=AGG
dataframe['GGA%']=GGA
dataframe['GGC%']=GGC
dataframe['GGU%']=GGU
dataframe['GGG%']=GGG
dataframe['GCA%']=GCA
dataframe['GCC%']=GCC
dataframe['GCU%']=GCU
dataframe['GCG%']=GCG
dataframe['GUA%']=GUA
dataframe['GUC%']=GUC
dataframe['GUU%']=GUU
dataframe['GAG%']=GAG
dataframe['GAA%']=GAA
dataframe['GAC%']=GAC
dataframe['GAU%']=GAU
dataframe['GUG%']=GUG
dataframe['CAA%']=CAA
dataframe['CAC%']=CAC
dataframe['CAU%']=CAU
dataframe['CAG%']=CAG
dataframe['CCA%']=CCA
dataframe['CCC%']=CCC
dataframe['CCU%']=CCU
dataframe['CCG%']=CCG
dataframe['CGA%']=CGA
dataframe['CGC%']=CGC
dataframe['CGU%']=CGU
dataframe['CGG%']=CGG
dataframe['CUA%']=CUA
dataframe['CUC%']=CUC
dataframe['CUU%']=CUU
dataframe['CUG%']=CUG
dataframe['UAA%']=UAA
dataframe['UAC%']=UAC
dataframe['UAU%']=UAU
dataframe['UAG%']=UAG
dataframe['UCA%']=UCA
dataframe['UCC%']=UCC
dataframe['UCU%']=UCU
dataframe['UCG%']=UCG
dataframe['UGA%']=UGA
dataframe['UGC%']=UGC
dataframe['UGU%']=UGU
dataframe['UGG%']=UGG
dataframe['UUA%']=UUA
dataframe['UUC%']=UUC
dataframe['UUU%']=UUU
dataframe['UUG%']=UUG
#5' GC, AU content
dataframe['five_prime_GC']=five_prime_GC
dataframe['five_prime_AU']=five_prime_AU
#thermodynamics profile
dataframe['Global_Tm']=Tms  #using termodynamics neighbor
dataframe['Global_ΔG']=dgs  #at 25°C
#5' and 3' end sequences with k=7
dataframe['five_prime_seq']=five_prime_7nt
dataframe['three_prime_seq']=three_prime_7nt
#classes' label
dataframe['Label']=Scores

  dataframe['CUU%']=CUU
  dataframe['CUG%']=CUG
  dataframe['UAA%']=UAA
  dataframe['UAC%']=UAC
  dataframe['UAU%']=UAU
  dataframe['UAG%']=UAG
  dataframe['UCA%']=UCA
  dataframe['UCC%']=UCC
  dataframe['UCU%']=UCU
  dataframe['UCG%']=UCG
  dataframe['UGA%']=UGA
  dataframe['UGC%']=UGC
  dataframe['UGU%']=UGU
  dataframe['UGG%']=UGG
  dataframe['UUA%']=UUA
  dataframe['UUC%']=UUC
  dataframe['UUU%']=UUU
  dataframe['UUG%']=UUG
  dataframe['five_prime_GC']=five_prime_GC
  dataframe['five_prime_AU']=five_prime_AU
  dataframe['Global_Tm']=Tms  #using termodynamics neighbor
  dataframe['Global_ΔG']=dgs  #at 25°C
  dataframe['five_prime_seq']=five_prime_7nt
  dataframe['three_prime_seq']=three_prime_7nt
  dataframe['Label']=Scores


In [15]:
print(dataframe)

     P1 P2 P3 P4 P5 P6 P7 P8 P9 P10  ...    UUC%    UUU%   UUG% five_prime_GC  \
0     C  U  A  A  U  A  U  G  U   U  ...   0.000   5.263  5.263        14.286   
1     A  A  U  A  U  G  U  U  A   A  ...   0.000   5.263  5.263        14.286   
2     G  A  U  U  U  A  U  A  C   A  ...  10.526  10.526  0.000        14.286   
3     C  A  A  U  U  C  C  U  U   U  ...  10.526  15.789  0.000        42.857   
4     C  A  G  A  C  C  A  A  A   A  ...   0.000   0.000  0.000        57.143   
...  .. .. .. .. .. .. .. .. ..  ..  ...     ...     ...    ...           ...   
2423  A  A  A  A  C  C  C  A  C   C  ...   0.000   0.000  0.000        42.857   
2424  G  A  U  G  A  U  C  A  G   A  ...   0.000   0.000  0.000        42.857   
2425  A  U  C  A  G  A  A  G  C   U  ...   0.000   0.000  0.000        28.571   
2426  G  A  A  G  C  U  G  A  A   C  ...   0.000   0.000  5.263        57.143   
2427  G  A  U  G  A  U  C  C  C   G  ...   0.000   0.000  0.000        42.857   

     five_prime_AU Global_T

In [16]:
outputfile='/Users/christian/Desktop/siRNA ML/siRNA_features.csv'
dataframe.to_csv(outputfile,index=False)

*Notebook Created By: Christian Mandelli, Oregon State University*