In [1]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from math import sqrt
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
import matplotlib.pyplot as plt
from tensorflow.keras.optimizers import Adam
from tensorflow import cast,float32
from keras import backend as kb
from statistics import mean, stdev

In [2]:
def generate_kmer_multiple(seqlist,k):
    """Convert every sequence in ``seqlist`` into its k-mer "sentence".

    Parameters
    ----------
    seqlist : iterable of str
        Sequences to tokenise.
    k : int
        K-mer length handed through to ``generate_kmer_single``.

    Returns
    -------
    list of str
        One space-separated k-mer string per input sequence, in input order.
    """
    return [generate_kmer_single(seq, k) for seq in seqlist]
    
def generate_kmer_single(seq,k):
    """Return all overlapping k-mers of ``seq`` joined by single spaces.

    Parameters
    ----------
    seq : str
        Sequence to slide the window over.
    k : int
        Window (k-mer) length.

    Returns
    -------
    str
        The k-mers in left-to-right order separated by one space, e.g.
        ``generate_kmer_single("ACGT", 2) == "AC CG GT"``. A sequence shorter
        than ``k`` has no full window and yields ``""``.
    """
    # The last valid window starts at len(seq) - k, so the exclusive stop must be
    # len(seq) - k + 1; stopping at len(seq) - k silently drops the final k-mer.
    # NOTE(review): any inference code that tokenises sequences for the saved
    # models must build k-mers the same way -- confirm it matches this function.
    return " ".join(seq[i:i+k] for i in range(len(seq) - k + 1))

def test_rmse(model,X_test,Y_test):
    """Return the root-mean-squared error of ``model``'s predictions.

    ``model.predict(X_test)`` is compared against ``Y_test`` with
    ``mean_squared_error`` and the square root of that value is returned.
    """
    predictions = model.predict(X_test)
    return sqrt(mean_squared_error(Y_test, predictions))

def root_mean_squared_error(y_true, y_pred):
    """RMSE loss built from Keras backend ops (used as ``loss=`` when compiling).

    ``y_true`` is cast to float32 before the subtraction; the result is the
    square root of the mean of the squared differences.
    """
    residual = y_pred - cast(y_true, float32)
    mean_square = kb.mean(kb.square(residual))
    return kb.sqrt(mean_square)

In [3]:
def read_region(region):
    """Load one region's FASTA sequences and join them with copy numbers.

    Reads ``datasets/full_length_reads.csv`` and ``datasets/<region>.fasta``.
    Each FASTA header line (starting with ``>``) supplies an accession (the
    header text without the ``>``); the following non-header lines are
    concatenated into that record's sequence.

    Parameters
    ----------
    region : str
        Region name; used both as the FASTA file stem and as the name of the
        sequence column in the result.

    Returns
    -------
    pandas.DataFrame
        Columns ``accession``, ``copy_number`` and ``<region>``, containing only
        accessions present in both files (inner merge).
    """
    da = pd.read_csv("datasets/full_length_reads.csv")
    seq = []
    seqid = []
    # `with` guarantees the handle is closed even if parsing raises.
    with open("datasets/"+region+".fasta","r") as file_handle:
        for line in file_handle:
            line = line.rstrip("\n")
            if line.startswith(">"):
                # Start a new record: id and sequence are appended together so the
                # two lists can never drift apart, even when a header has no
                # sequence lines (that record simply keeps the empty string).
                seqid.append(line[1:])
                seq.append("")
            elif seqid:
                seq[-1] += line
            # Lines before the first header belong to no record and are ignored.
    sub = pd.DataFrame({region: seq, "accession": seqid})
    da = da[["accession","copy_number"]]
    da = pd.merge(da,sub,on="accession",how="inner")
    return da

def write_vocal(vectorizer,region):
    """Write the vectorizer's vocabulary to ``deployment/vector_vocal_<region>.csv``.

    One feature name is written per line, in the order the vectorizer reports
    them. The ``deployment`` directory must already exist.

    Parameters
    ----------
    vectorizer : object
        Fitted vectorizer exposing ``get_feature_names_out()`` or, on older
        scikit-learn releases, ``get_feature_names()``.
    region : str
        Region name embedded in the output file name.
    """
    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
    # prefer its replacement and fall back only when it is unavailable.
    if hasattr(vectorizer, "get_feature_names_out"):
        names = vectorizer.get_feature_names_out()
    else:
        names = vectorizer.get_feature_names()
    # `with` closes the file even if a write fails part-way through.
    with open('deployment/vector_vocal_'+region+'.csv','w') as file_handle:
        file_handle.writelines(f"{name}\n" for name in names)

In [4]:

def create_model(region):
    """Build and compile the dense regression network used for ``region``.

    Every network is a stack of ReLU ``Dense`` layers followed by a single
    linear output unit, compiled with ``root_mean_squared_error`` as the loss
    and an Adam optimizer. Only the hidden-layer widths and the Adam argument
    vary by region; any region not named below gets the default stack.
    """
    # Hidden-layer widths, first layer to last.
    if region == "full-length":
        hidden_widths = (512, 256, 128, 64, 16, 8)
    elif region in ("V1-V2", "V3-V4"):
        hidden_widths = (1024, 512, 128, 64, 16)
    elif region in ("V4-V5",):
        hidden_widths = (1024, 128, 64, 16)
    elif region in ("V6-V8", "V7-V9"):
        hidden_widths = (512, 128, 64, 16)
    else:
        hidden_widths = (512, 256, 128, 64, 16)

    # V7-V9 is the only region compiled with a different Adam argument.
    adam_rate = 0.002 if region in ("V7-V9",) else 0.001

    network = Sequential()
    for width in hidden_widths:
        network.add(Dense(width, activation='relu'))
    network.add(Dense(1, activation='linear'))
    network.compile(loss=root_mean_squared_error, optimizer=Adam(adam_rate))
    return network

def fit_model(model,X_train,Y_train,region):
    """Fit ``model`` with the batch size and epoch count assigned to ``region``.

    Every listed region trains with ``validation_split=0.1`` and ``verbose=0``.
    A region that appears in no entry of the plan is not fitted at all; the
    model is returned exactly as it was passed in.

    Returns
    -------
    The same ``model`` object that was passed in.
    """
    # (regions, batch_size, epochs)
    training_plan = (
        (("full-length", "V4"), 100, None),
        (("V1-V2", "V1-V3"), 128, 50),
        (("V3-V4", "V4-V5", "V6-V8", "V7-V9"), 64, 50),
    )
    # The two batch-size-100 regions are the only pair that differ in epochs.
    epochs_for_100 = {"full-length": 50, "V4": 30}

    for regions, batch_size, epochs in training_plan:
        if region in regions:
            if epochs is None:
                epochs = epochs_for_100[region]
            model.fit(
                X_train,
                Y_train,
                validation_split=0.1,
                batch_size=batch_size,
                epochs=epochs,
                verbose=0,
            )
            break
    return model

In [5]:
# Train and save one model per input type. Each pass also writes the fitted
# vocabulary via write_vocal, next to the saved model under deployment/.

# --- Full-length model: sequences come straight from the CSV's "sequence" column.
da = pd.read_csv("datasets/full_length_reads.csv")
X = da["sequence"]
Y = da['copy_number']
# Each sequence becomes one space-separated string of 6-mers for CountVectorizer.
kmer = generate_kmer_multiple(X.tolist(), 6)
vectorizer = CountVectorizer()
# X is rebound here: from the raw sequence Series to a dense k-mer count array.
X = vectorizer.fit_transform(kmer).toarray()
write_vocal(vectorizer,"full-length")
model = create_model("full-length")
# Fitted on the whole data set; fit_model passes validation_split=0.1 to model.fit.
model = fit_model(model,X,Y,"full-length")
model.save("deployment/mlp_full-length.h5")

# --- Per-region models: same pipeline, but sequences come from datasets/<region>.fasta
# (joined to copy numbers by read_region). A fresh vectorizer is fitted per region,
# so each region gets its own vocabulary file. The module-level names da, X, Y,
# kmer, vectorizer and model are overwritten on every iteration.
for region in ["V1-V2","V1-V3","V3-V4","V4","V4-V5","V6-V8","V7-V9"]:
    da = read_region(region)
    vectorizer = CountVectorizer()
    X = da[region]
    Y = da['copy_number']
    kmer = generate_kmer_multiple(X.tolist(), 6)
    X = vectorizer.fit_transform(kmer).toarray()
    write_vocal(vectorizer,region)
    model = create_model(region)
    model = fit_model(model,X,Y,region)
    model.save("deployment/mlp_"+region+".h5")