In [1]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
from sklearn.metrics import mean_squared_error
from math import sqrt
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
import matplotlib.pyplot as plt
from tensorflow.keras.optimizers import Adam
from tensorflow import cast,float32
from keras import backend as kb
from statistics import mean, stdev

In [2]:
def generate_kmer_multiple(seqlist,k):
    kmer_list = []
    n = -1
    for seq in seqlist:
        kmer_list.append(generate_kmer_single(seq,k))
    return kmer_list
    
def generate_kmer_single(seq,k):
    kmer = ""
    for i in range(0,len(seq)-k,1):
        kmer += seq[i:i+k]+" "
    return kmer[:-1]

def test_rmse(model,X_test,Y_test):
    test_preds = model.predict(X_test)
    mse = mean_squared_error(Y_test, test_preds)
    rmse = sqrt(mse)
    return rmse

def root_mean_squared_error(y_true, y_pred):
    y_true = cast(y_true,float32)
    return kb.sqrt(kb.mean(kb.square(y_pred - y_true)))

In [3]:
def create_model():
    model=Sequential()
    model.add(Dense(512, activation='relu'))
    model.add(Dense(256, activation='relu'))
    model.add(Dense(128, activation='relu'))
    model.add(Dense(64,activation='relu'))
    model.add(Dense(16,activation='relu'))
    model.add(Dense(8,activation='relu'))
    model.add(Dense(1,activation='linear'))
    model.compile(loss=root_mean_squared_error,optimizer=Adam(0.001))
    return model

In [4]:
da = pd.read_csv("datasets/full_length_reads.csv")
X = da["sequence"]
Y = da["copy_number"]

In [6]:
multiplicand = int(X.shape[0]*0.2)
X_list = []
Y_list = []
for i in range(0,5,1):
    X_list.append(X[i*multiplicand:(i+1)*multiplicand])
    Y_list.append(Y[i*multiplicand:(i+1)*multiplicand])

In [7]:
rmse = []
for i in range(0,5,1):
    X_test = X_list[i]
    Y_test = Y_list[i]
    X_train = []
    Y_train = []
    for j in range(0,5,1):
        if j != i:
            X_train = pd.DataFrame(X_list[j]).append(X_train)
            Y_train = pd.DataFrame(Y_list[j]).append(Y_train)
    vectorizer = CountVectorizer()
    X_train = pd.DataFrame(X_train)
    X_test = pd.DataFrame(X_test)
    X_train = X_train.values.reshape(X_train.shape[0], )
    X_test = X_test.values.reshape(X_test.shape[0], )
    kmer_train = generate_kmer_multiple(X_train.tolist(), 6)
    kmer_test = generate_kmer_multiple(X_test.tolist(), 6)
    X_train = vectorizer.fit_transform(kmer_train).toarray()
    X_test = vectorizer.transform(kmer_test).toarray()
    model = create_model()
    model.fit(X_train,Y_train,validation_split=0.1, batch_size=100,epochs=50,verbose = 0)
    rmse.append(test_rmse(model,X_test,Y_test))
    print(rmse[i])

0.7099190288078919
0.7847049239283396
0.67993143565748
0.7322174995883802
0.7401263775962516


In [8]:
mean(rmse)

0.7293798531156687

In [6]:
rmse.to_csv("performance/MLP_full_length.csv",index=False)