In [1]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from math import sqrt
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
import matplotlib.pyplot as plt
from tensorflow.keras.optimizers import Adam
from tensorflow import cast,float32
from keras import backend as kb
from statistics import mean, stdev

2022-08-24 16:17:05.265208: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0


In [2]:
def generate_kmer_multiple(seqlist,k):
    kmer_list = []
    n = -1
    for seq in seqlist:
        kmer_list.append(generate_kmer_single(seq,k))
    return kmer_list
    
def generate_kmer_single(seq,k):
    kmer = ""
    for i in range(0,len(seq)-k,1):
        kmer += seq[i:i+k]+" "
    return kmer[:-1]

def test_rmse(model,X_test,Y_test):
    test_preds = model.predict(X_test)
    mse = mean_squared_error(Y_test, test_preds)
    rmse = sqrt(mse)
    return rmse

def root_mean_squared_error(y_true, y_pred):
    y_true = cast(y_true,float32)
    return kb.sqrt(kb.mean(kb.square(y_pred - y_true)))

def read_region(region):
    da = pd.read_csv("datasets/full_length_reads.csv")
    file_handle = open("datasets/"+region+".fasta","r")
    seq = []
    seqid = []
    tmp_seq = ""
    for line in file_handle:
        if (line[0] == ">"):
            if tmp_seq != "":
                seq.append(tmp_seq)
            seqid.append(line.split("\n")[0][1:])
            tmp_seq = ""
        else:
            tmp_seq+=line.split("\n")[0]
    seq.append(tmp_seq)
    file_handle.close()
    sub = pd.DataFrame([seq,seqid], index = [region,"accession"])
    sub = sub.transpose()
    da = da[["accession","copy_number"]]
    da = pd.merge(da,sub,on="accession",how="inner")
    return da

In [3]:
def create_model():
    model=Sequential()
    model.add(Dense(512, activation='relu'))
    model.add(Dense(256, activation='relu'))
    model.add(Dense(128, activation='relu'))
    model.add(Dense(64,activation='relu'))
    model.add(Dense(16,activation='relu'))
    model.add(Dense(8,activation='relu'))
    model.add(Dense(1,activation='linear'))
    model.compile(loss=root_mean_squared_error,optimizer=Adam(0.001))
    return model

In [4]:
da = read_region("V1_simu")
da.columns = ["accession","copy_number","sequence"]
for region in ["V2_simu","V3_simu","V4_simu","V5_simu","V6_simu","V7_simu","V8_simu","V9_simu"]:
    tmp = read_region(region)
    tmp.columns = ["accession","copy_number","sequence"]
    da = pd.concat([da,tmp],axis = 0)

In [5]:
da = da.sample(frac = 1, random_state = 42)

In [7]:
multiplicand = int(X.shape[0]*0.2)

In [9]:
rmse = []
for i in range(0,5,1):
    X_test = da["sequence"][i*multiplicand:(i+1)*multiplicand]
    Y_test = da['copy_number'][i*multiplicand:(i+1)*multiplicand]
    X_train = pd.concat([da["sequence"][0:i*multiplicand],da["sequence"][(i+1)*multiplicand:]],axis = 0)
    Y_train = pd.concat([da['copy_number'][0:i*multiplicand],da['copy_number'][(i+1)*multiplicand:]],axis = 0)
    vectorizer = CountVectorizer()
    X_train = pd.DataFrame(X_train)
    X_test = pd.DataFrame(X_test)
    X_train = X_train.values.reshape(X_train.shape[0], )
    X_test = X_test.values.reshape(X_test.shape[0], )
    kmer_train = generate_kmer_multiple(X_train.tolist(), 6)
    kmer_test = generate_kmer_multiple(X_test.tolist(), 6)
    X_train = vectorizer.fit_transform(kmer_train).toarray()
    X_test = vectorizer.transform(kmer_test).toarray()
    model = create_model()
    model.fit(X_train,Y_train, validation_split= 0.1, batch_size=64,epochs=20,verbose = 0)
    rmse.append(test_rmse(model,X_test,Y_test))
    print(rmse[i])

2022-08-24 16:18:37.331435: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:176] None of the MLIR Optimization Passes are enabled (registered 2)
2022-08-24 16:18:37.332864: I tensorflow/core/platform/profile_utils/cpu_utils.cc:114] CPU Frequency: 2794690000 Hz
2022-08-24 16:18:37.949293: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcublas.so.11
2022-08-24 16:18:38.606085: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcublasLt.so.11
2022-08-24 16:18:38.606145: I tensorflow/stream_executor/cuda/cuda_blas.cc:1838] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.


0.8721106004840917
0.8763050660913456
0.9433123327178579
0.9065184060819004
0.8998458572473736


In [10]:
pd.DataFrame(rmse,columns=["method1_self_cv"]).to_csv("performance/MLP_method1_train.csv",index=False)

In [11]:
X = da["sequence"]
Y = da["copy_number"]
vectorizer = CountVectorizer(lowercase=False)
kmer_train = generate_kmer_multiple(X.tolist(), 6)
x = vectorizer.fit_transform(kmer_train).toarray()
model = create_model()
model.fit(x,Y,validation_split=0.1, batch_size=100,epochs=20,verbose = 1)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x7f90af1adf70>

In [12]:
performance = {}
for region in ["V1-V2","V1-V3","V3-V4","V4","V4-V5","V6-V8","V7-V9"]:
    region_da = read_region(region)
    X = region_da[region]
    Y = region_da['copy_number']
    kmer_train = generate_kmer_multiple(X.tolist(), 6)
    x = vectorizer.transform(kmer_train).toarray()
    res = test_rmse(model,x,Y)
    print(res)
    performance[region] = res

2.6283952403496214
3.9841057641193838
2.7045830899275773
1.0342397194012474
3.2559326080977895
4.27168729169467
3.7800950255715953


In [14]:
pd.DataFrame([list(performance.keys()),list(performance.values())],index = ["test","rmse"]).transpose()

Unnamed: 0,test,rmse
0,V1-V2,2.628395
1,V1-V3,3.984106
2,V3-V4,2.704583
3,V4,1.03424
4,V4-V5,3.255933
5,V6-V8,4.271687
6,V7-V9,3.780095


In [15]:
pd.DataFrame([list(performance.keys()),list(performance.values())],index = ["test","rmse"]).transpose().to_csv("performance/MLP_method1_test.csv",index=False)