In [18]:
import pandas
import keras

# Load the primer table from the CSV in the working directory. The preview
# shows the columns used later (seq, hairpin) alongside tm and an
# "Unnamed: 0" column that came in with the file.
primers = pandas.read_csv("primers.csv")
# Bare last expression so the notebook renders the first five rows.
primers.head()

Unnamed: 0,seq,tm,hairpin
0,CAGCTTACGGG,31.818141,0.0
1,TCAAGACGAGGTAACCACGTCG,56.489715,-2181.812493
2,ACTTCGGACGCCCGTTTGC,58.110465,-1148.900996
3,TTTTGTTCGTGATGGGGGAAGCGCTCCT,64.080574,-318.446996
4,GTCGCCTTTTC,30.062109,0.0


In [19]:
import numpy

# One-hot encode the primer sequences.
# Channel order is A, T, G, C: each base maps to a length-4 indicator vector.
bp_one_hot = {
    base: [int(slot == channel) for slot in range(4)]
    for channel, base in enumerate("ATGC")
}

primer_seqs = primers.seq.tolist()
max_primer_len = 50

# One (max_primer_len, 4) slab per primer; positions past the end of a primer
# are left as all-zero rows, which acts as right-padding.
seqs = numpy.zeros((len(primer_seqs), max_primer_len, 4), dtype=numpy.int8)

# Each `encoded` is a view into `seqs`, so writing to it fills the big array.
for encoded, primer in zip(seqs, primer_seqs):
    for position, base in enumerate(primer):
        encoded[position] = bp_one_hot[base]

In [20]:
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

# Regression target: the `hairpin` column of the primer table.
y = primers.hairpin

# Hold out 10% of the primers for evaluation. A fixed random_state makes the
# shuffle deterministic, so the same rows land in each split on every
# Restart & Run All and the reported metrics are comparable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    seqs, y, test_size=0.1, random_state=42
)

In [66]:
import numpy
from keras.models import Model
from keras.layers import LSTM, Dense, Dropout, Masking, Embedding, Flatten, Add, Input, Conv1D
from keras.wrappers.scikit_learn import KerasRegressor

# Upper bound on the number of training rows handed to fit(). Slicing past the
# end of the arrays just returns every available row.
n = 200000

# Sequence model: a 1-D convolution over the one-hot encoded bases, two
# per-position dense layers with light dropout, then a flatten into a single
# linear output for regression.
seq_in = Input(shape=(max_primer_len, 4), name='seq_input')
seq_model = Conv1D(256, 16, activation='relu')(seq_in)
seq_model = Dropout(0.1)(seq_model)
seq_model = Dense(128, activation='relu')(seq_model)
seq_model = Dropout(0.1)(seq_model)
seq_model = Dense(80, activation='relu')(seq_model)
# No input_shape here: in the functional API the layer takes its shape from
# the incoming tensor, and the tensor reaching this point is no longer
# (max_primer_len, 4) anyway.
seq_model = Flatten()(seq_model)
seq_out = Dense(1, activation="linear", name='seq_output')(seq_model)

model = Model(inputs=seq_in, outputs=seq_out)
model.compile(loss='mse', optimizer='rmsprop')

# NOTE(review): the held-out split is used both as validation data here and
# for the final error report below; it only monitors training, nothing is
# tuned on it in this cell.
model.fit(x=X_train[:n], y=y_train[:n], validation_data=(X_test, y_test), epochs=5, verbose=1)

# test the output
# predict() returns one column per output, so flatten it to line up with the
# 1-D targets and compute every absolute error in a single vectorised step.
ests = model.predict(X_test)
diffs = numpy.abs(ests.ravel() - numpy.asarray(y_test))
print("standard deviation: ", numpy.std(diffs))
print("median difference: ", numpy.median(diffs))


Train on 180000 samples, validate on 20000 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
standard deviation:  454.5776
median difference:  92.48992


In [68]:
import primer3

seq = "TGAGGGGCTGGGGGCGCTTTG"

# Encode the primer base by base, then right-pad with all-zero rows up to the
# fixed input length the model was built with. The outer list adds the batch
# dimension, giving a single-sample batch.
encoded_bases = [bp_one_hot[bp] for bp in seq]
zero_padding = [[0, 0, 0, 0]] * (max_primer_len - len(seq))
test_seq = numpy.array([encoded_bases + zero_padding])

# Reference value from primer3: the `dg` attribute of its hairpin result.
test_tm = primer3.calcHairpin(seq).dg

# Show the model's estimate next to primer3's value.
predicted = model.predict([test_seq])[0][0]
print(predicted, test_tm)

-559.0588 -1281.9874934343788
