In [1]:
import numpy as np
import pandas as pd
import random

import matplotlib.pyplot as plt
import collections
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.optimizers import Adam
from math import sqrt
from sklearn.feature_extraction.text import CountVectorizer
import textwrap
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow import cast,float32
from keras import backend as kb
from statistics import mean, stdev
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedKFold, KFold

2022-08-24 14:49:02.265346: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0


In [2]:
def augment_mlp(x, y, width=101, k=6):
    """Split sequences into fixed-width chunks and encode each chunk as k-mers.

    Every sequence in ``x`` is cut by ``textwrap.wrap`` into pieces of at most
    ``width`` characters. Each piece inherits the label of the sequence it came
    from and is then converted to a space-separated k-mer string by
    ``build_kmers_mlp``.

    Args:
        x: Indexable collection of sequence strings.
        y: Indexable collection of labels, aligned with ``x`` by position.
        width: Maximum chunk length handed to ``textwrap.wrap`` (default 101).
        k: k-mer length handed to ``build_kmers_mlp`` (default 6).

    Returns:
        Tuple ``(x, y)`` of NumPy arrays: one k-mer string per chunk, and the
        label of the sequence each chunk was cut from.
    """
    seq_list = []
    count_list = []
    for i in range(len(x)):
        split = textwrap.wrap(x[i], width=width)
        # extend() keeps the flattening linear; sum(list_of_lists, []) copies
        # the accumulated list again on every addition.
        seq_list.extend(split)
        count_list.extend([y[i]] * len(split))
    return np.array(build_kmers_mlp(seq_list, k)), np.array(count_list)

def build_kmers_mlp(x, k):
    """Turn each sequence into a 'sentence' of overlapping k-mers.

    For every sequence, all windows of length ``k`` (stride 1) are emitted in
    order, each followed by a single space, so a non-empty result ends with a
    trailing space. A sequence shorter than ``k`` yields an empty string.

    Args:
        x: Iterable of sequence strings.
        k: Window (k-mer) length.

    Returns:
        List with one k-mer sentence per input sequence, in input order.
    """
    def to_sentence(sequence):
        window_count = len(sequence) - k + 1
        return ''.join(
            sequence[start:start + k] + ' ' for start in range(window_count)
        )

    return [to_sentence(sequence) for sequence in x]

def create_model_mlp():
    """Build and compile the fully connected regression network.

    The network stacks six ReLU ``Dense`` layers (512, 256, 128, 64, 16 and 8
    units) followed by a single linear output unit. It is compiled with the
    ``root_mean_squared_error`` loss and ``Adam(0.001)``.

    Returns:
        The compiled ``Sequential`` model.
    """
    hidden_units = (512, 256, 128, 64, 16, 8)
    network = Sequential()
    for units in hidden_units:
        network.add(Dense(units, activation='relu'))
    # Single linear unit: the model regresses one scalar per input row.
    network.add(Dense(1, activation='linear'))
    network.compile(optimizer=Adam(0.001), loss=root_mean_squared_error)
    return network

def test_rmse(model,X_test,Y_test):
    """Return the root-mean-squared error of ``model`` on a held-out set.

    Args:
        model: Object exposing ``predict(X_test)``.
        X_test: Inputs passed to ``model.predict``.
        Y_test: True targets compared against the predictions.

    Returns:
        Square root of sklearn's ``mean_squared_error(Y_test, predictions)``.
    """
    predictions = model.predict(X_test)
    return sqrt(mean_squared_error(Y_test, predictions))

def root_mean_squared_error(y_true, y_pred):
    """RMSE loss: square root of the mean squared difference.

    ``y_true`` is cast to float32 before the subtraction; the mean is taken
    over all elements, so the result is a single scalar tensor.
    """
    residual = y_pred - cast(y_true, float32)
    return kb.sqrt(kb.mean(kb.square(residual)))

In [3]:
# Load the full-length reads table and pull out the model inputs and targets.
# NOTE(review): this cell reads from "~/autodl-tmp/" while later cells read
# "datasets/full_length_reads.csv" - confirm both point at the same file.
data = pd.read_csv("~/autodl-tmp/full_length_reads.csv")
X, Y = data['sequence'], data['copy_number']

## Self cross-validation (5-fold, on the full-length reads)

In [4]:
# Partition the samples into five contiguous folds for cross-validation.
# NOTE(review): folds follow file order (no shuffling) - confirm the CSV is not
# sorted by copy number or taxonomy before trusting the per-fold scores.
n_samples = X.shape[0]
multiplicand = n_samples // 5  # nominal fold size
X_list = []
Y_list = []
for i in range(5):
    start = i * multiplicand
    # The last fold runs to the end of the data so that the n_samples % 5
    # leftover rows are evaluated instead of being dropped from every fold.
    stop = n_samples if i == 4 else start + multiplicand
    # .iloc makes the positional slicing explicit regardless of the index.
    X_list.append(X.iloc[start:stop])
    Y_list.append(Y.iloc[start:stop])

In [5]:
# Cross-validation: each fold is held out once while a fresh model is trained
# on the remaining folds; the held-out RMSE is collected in `rmse`.
rmse = []
for i in range(len(X_list)):
    X_test = X_list[i]
    Y_test = Y_list[i]
    # Training data = every fold except the held-out one.
    X_train = pd.concat([fold for j, fold in enumerate(X_list) if j != i], axis=0)
    Y_train = pd.concat([fold for j, fold in enumerate(Y_list) if j != i], axis=0)
    x_train, y_train = augment_mlp(X_train.tolist(), Y_train.tolist())
    x_test, y_test = augment_mlp(X_test.tolist(), Y_test.tolist())
    # Request float32 counts: the dense matrices built by .toarray() are then
    # half the size of CountVectorizer's default int64 output. The vocabulary
    # is fitted on the training folds only and reused for the held-out fold.
    vectorizer = CountVectorizer(lowercase=False, dtype=np.float32)
    x_train = vectorizer.fit_transform(x_train).toarray()
    x_test = vectorizer.transform(x_test).toarray()
    model = create_model_mlp()
    model.fit(x_train, y_train, validation_split=0.1, batch_size=100, epochs=20, verbose=0)
    rmse.append(test_rmse(model, x_test, y_test))
    print(rmse[-1])

2022-08-24 14:50:07.549325: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcuda.so.1
2022-08-24 14:50:07.673594: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: 
pciBusID: 0000:41:00.0 name: NVIDIA GeForce RTX 3090 computeCapability: 8.6
coreClock: 1.695GHz coreCount: 82 deviceMemorySize: 23.70GiB deviceMemoryBandwidth: 871.81GiB/s
2022-08-24 14:50:07.673634: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0
2022-08-24 14:50:07.677256: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcublas.so.11
2022-08-24 14:50:07.677295: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcublasLt.so.11
2022-08-24 14:50:07.678024: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcu

0.9545984754140131


2022-08-24 14:53:09.410224: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 7607245544 exceeds 10% of free system memory.


0.9911909684538481


2022-08-24 14:55:59.906060: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 7845519280 exceeds 10% of free system memory.


0.9380719422125862


2022-08-24 14:58:53.818267: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 7799767680 exceeds 10% of free system memory.


0.9805189293141714


2022-08-24 15:02:18.180420: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 7888320896 exceeds 10% of free system memory.


0.9598807748314199


In [6]:
# Display the per-fold test RMSE values collected by the cross-validation loop.
rmse

[0.9545984754140131,
 0.9911909684538481,
 0.9380719422125862,
 0.9805189293141714,
 0.9598807748314199]

In [10]:
# Save the per-fold RMSE values as a one-column CSV (no index column).
pd.DataFrame({"method2_self_cv": rmse}).to_csv(
    "performance/MLP_method2_train.csv", index=False
)

## Tests on subregions (model trained on all full-length reads)

In [17]:
# Train the final model on every full-length read; `vectorizer` and `model`
# are reused by the subregion evaluation further down.
data = pd.read_csv("datasets/full_length_reads.csv")
X, Y = data['sequence'], data['copy_number']
x, y = augment_mlp(X.tolist(), Y.tolist())
# Request float32 counts: the dense matrix built by .toarray() is then half
# the size of CountVectorizer's default int64 output.
vectorizer = CountVectorizer(lowercase=False, dtype=np.float32)
x = vectorizer.fit_transform(x).toarray()
model = create_model_mlp()
model.fit(x, y, validation_split=0.1, batch_size=100, epochs=20, verbose=1)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x7f21d53eee50>

In [12]:
def _parse_fasta(lines):
    """Collect parallel lists of FASTA headers and their joined sequences."""
    headers = []
    chunks = []
    for line in lines:
        text = line.rstrip("\r\n")
        if text.startswith(">"):
            # Open a new record; header and sequence slot are created together
            # so the two lists can never drift out of alignment.
            headers.append(text[1:])
            chunks.append([])
        elif chunks:
            chunks[-1].append(text)
        # Lines before the first header belong to no record and are ignored.
    return headers, ["".join(parts) for parts in chunks]


def read_region(region):
    """Load one subregion FASTA and attach copy numbers by accession.

    Reads ``datasets/<region>.fasta``, treating the full text after ``>`` on
    each header line as the accession, and inner-joins the sequences with the
    ``accession`` / ``copy_number`` columns of
    ``datasets/full_length_reads.csv``.

    A record whose sequence is empty is kept with an empty string, so every
    sequence stays paired with its own header.

    Args:
        region: Subregion name; used both as the FASTA file stem and as the
            name of the sequence column in the result.

    Returns:
        DataFrame with columns ``accession``, ``copy_number`` and ``region``,
        containing only accessions present in both files.
    """
    da = pd.read_csv("datasets/full_length_reads.csv")
    # The context manager closes the file even if parsing raises.
    with open("datasets/" + region + ".fasta", "r") as file_handle:
        seqid, seq = _parse_fasta(file_handle)
    sub = pd.DataFrame({region: seq, "accession": seqid})
    da = da[["accession", "copy_number"]]
    return pd.merge(da, sub, on="accession", how="inner")

In [29]:
performance = {}
for region in ["V1-V2","V1-V3","V3-V4","V4","V4-V5","V6-V8","V7-V9"]:
    da = read_region(region)
    X = da[region]
    Y = da['copy_number']
    x,y = augment_mlp(X.tolist(),Y.tolist())
    x = vectorizer.transform(x).toarray()
    res = test_rmse(model,x,y)
    print(res)
    performance[region] = res

1.4043640492140774
1.2573387532066091
1.912700622972369
1.281319887533199
1.8633085289178444
1.3818691236873073
2.238917277959003


In [30]:
# Tabulate RMSE per subregion. Building the frame from (region, rmse) pairs
# keeps the `rmse` column numeric; transposing a frame whose rows mix strings
# and floats would leave both columns with object dtype.
pd.DataFrame(list(performance.items()), columns=["test", "rmse"])

Unnamed: 0,test,rmse
0,V1-V2,1.404364
1,V1-V3,1.257339
2,V3-V4,1.912701
3,V4,1.28132
4,V4-V5,1.863309
5,V6-V8,1.381869
6,V7-V9,2.238917


In [31]:
# Write the per-subregion RMSE table to disk (columns: test, rmse; no index).
subregion_scores = pd.DataFrame(
    [list(performance.keys()), list(performance.values())],
    index=["test", "rmse"],
).transpose()
subregion_scores.to_csv("performance/MLP_method2_test.csv", index=False)