In [1]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
from sklearn.metrics import mean_squared_error
from math import sqrt
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Concatenate
import matplotlib.pyplot as plt
from tensorflow.keras.optimizers import Adam
from tensorflow import cast,float32
from keras import backend as kb
from statistics import mean, stdev
from sklearn.ensemble import RandomForestRegressor
from sklearn.decomposition import PCA
from tensorflow.keras import Input, Model
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import Ridge, Lasso, LinearRegression
from sklearn.svm import SVR

  from pandas.core.computation.check import NUMEXPR_INSTALLED


In [2]:
# Silence every Python warning for the rest of the session.
# NOTE(review): this is a blanket filter, so deprecation and numerical
# warnings raised by any library used below are hidden as well.
import warnings
warnings.filterwarnings("ignore")

In [3]:
class Preprocessing():
    """Turn nucleotide sequences into fixed-width k-mer count vectors.

    The vocabulary is every string of length ``k_size`` over the lower-case
    alphabet ``a, t, c, g`` (in that order), so the count matrix always has
    ``4 ** k_size`` columns. Sequences are split into overlapping k-mers,
    joined with spaces, and counted by a ``CountVectorizer`` restricted to
    that vocabulary; k-mers containing any other character are not counted.
    """

    def __init__(self,k_size=6):
        """Build the k-mer vocabulary and the vectorizer that counts it.

        Parameters
        ----------
        k_size : int, default 6
            Length of the k-mers to count (must be >= 1).
        """
        self.k_size = k_size
        kmers = self.generate_kmers("",self.k_size)
        # lowercase=True (the library default, made explicit) is what lets
        # upper-case input match the lower-case vocabulary. The token pattern
        # is relaxed from the default "two or more word characters" so that
        # k_size=1 tokens are not silently dropped; for k_size >= 2 it
        # tokenizes exactly like the default.
        self.vectorizer = CountVectorizer(vocabulary=kmers,
                                          lowercase=True,
                                          token_pattern=r"(?u)\b\w+\b")
        self.seqs = []

    def generate_kmers(self,current_kmer,current_depth):
        """Return every extension of ``current_kmer`` by ``current_depth`` bases.

        The result is ordered with the alphabet ``a, t, c, g`` and the first
        appended base varying slowest, e.g. depth 2 gives
        ``aa, at, ac, ag, ta, ...``. This order defines the column order of
        the count matrix.

        Raises
        ------
        ValueError
            If ``current_depth`` is smaller than 1.
        """
        from itertools import product

        if current_depth < 1:
            raise ValueError("current_depth must be a positive integer.")
        return [current_kmer + "".join(suffix)
                for suffix in product("atcg", repeat=current_depth)]

    def generate_kmer_multiple(self,seqlist,k):
        """Apply :meth:`generate_kmer_single` to every item of ``seqlist``.

        Each item is converted with ``str()`` first. Returns a list of
        space-separated k-mer strings, one per input sequence.
        """
        return [self.generate_kmer_single(str(seq),k) for seq in seqlist]

    def generate_kmer_single(self,seq,k):
        """Return all overlapping k-mers of ``seq`` joined by single spaces.

        A sequence of length ``n`` yields ``n - k + 1`` k-mers (the final
        k-mer is included). Sequences shorter than ``k`` yield ``""``.

        Raises
        ------
        ValueError
            If ``k`` is smaller than 1.
        """
        if k < 1:
            raise ValueError("k must be a positive integer.")
        # "+ 1" so that the window ending on the last character is emitted too.
        return " ".join(seq[i:i+k] for i in range(len(seq)-k+1))

    def CountKmers(self,seqs):
        """Count vocabulary k-mers in each sequence.

        Parameters
        ----------
        seqs : list or pandas.Series
            The sequences to encode.

        Returns
        -------
        numpy.ndarray
            Dense array with one row per sequence and one column per
            vocabulary k-mer.

        Raises
        ------
        ValueError
            If ``seqs`` is neither a list nor a pandas Series.
        """
        if not isinstance(seqs, (list, pd.Series)):
            raise ValueError("""Invalid 'seqs' format.
            Expected formats are 'list' or 'pandas.core.series.Series'.""")
        kmer = self.generate_kmer_multiple(seqs, self.k_size)
        return self.vectorizer.transform(kmer).toarray()

    def ReadFASTA(self,filename,as_pd=True):
        """Read a FASTA file into sequences and their header lines.

        Parameters
        ----------
        filename : str or path-like
            Path whose extension is ``fasta``, ``fna`` or ``fa``
            (compared case-insensitively).
        as_pd : bool, default True
            If True return a pandas Series named ``"sequence"`` indexed by
            header text; otherwise return ``(seqs, seqid)`` as two lists.

        Notes
        -----
        * The id is the whole header line without the leading ``>``.
        * Multi-line records are concatenated; a header with no sequence
          lines yields an empty string, so ids and sequences stay aligned.
        * Records with duplicate ids are all kept in the Series.

        Raises
        ------
        ValueError
            If the extension is not accepted, or if non-blank text appears
            before the first header.
        """
        extension = str(filename).split(".")[-1].lower()
        if extension not in ["fasta","fna","fa"]:
            raise ValueError('Invalid file format. Expected formats are ["fasta","fna","fa"].')
        seqs = []
        seqid = []
        chunks = None  # stays None until the first header has been seen
        # "with" guarantees the handle is closed even if parsing raises.
        with open(filename,"r") as file_handle:
            for line in file_handle:
                text = line.rstrip("\n")
                if text.startswith(">"):
                    if chunks is not None:
                        seqs.append("".join(chunks))
                    seqid.append(text[1:])
                    chunks = []
                elif chunks is not None:
                    chunks.append(text)
                elif text.strip():
                    raise ValueError("Sequence data found before the first FASTA header.")
        if chunks is not None:
            seqs.append("".join(chunks))
        if as_pd:
            return pd.Series(seqs, index=seqid, name="sequence", dtype=object)
        return seqs, seqid

In [10]:
def split_test_train(da,multiplicand):
    """Write five contiguous train/test folds of k-mer count features to disk.

    Fold ``i`` holds out rows ``[i*multiplicand, (i+1)*multiplicand)`` by
    position as the test set and trains on every other row. Rows past
    ``5*multiplicand`` are never held out; they are part of every training set.

    Slicing is positional (``.iloc``, end-exclusive). Label-based ``.loc``
    slicing is end-inclusive, which would place the boundary rows in both the
    training and the test set of a fold.

    Parameters
    ----------
    da : pandas.DataFrame
        Must provide the columns ``"sequence"`` and ``"copy_number"``.
    multiplicand : int
        Number of rows in each test fold.

    Returns
    -------
    None
        Results are pickled to ``datasets/splits/{X,Y}_{train,test}_{i}.gz``.
    """
    pp = Preprocessing()
    sequences = da["sequence"]
    targets = da["copy_number"]
    for i in range(5):
        start = i*multiplicand
        stop = (i+1)*multiplicand
        X_test = sequences.iloc[start:stop]
        Y_test = targets.iloc[start:stop]
        X_train = pd.concat([sequences.iloc[:start], sequences.iloc[stop:]],axis = 0)
        Y_train = pd.concat([targets.iloc[:start], targets.iloc[stop:]],axis = 0)
        # Only the sequences are vectorized; targets are stored as they are.
        X_train = pp.CountKmers(seqs=X_train)
        X_test = pp.CountKmers(seqs=X_test)
        pd.DataFrame(X_train).to_pickle(f"datasets/splits/X_train_{i}.gz")
        pd.DataFrame(Y_train).to_pickle(f"datasets/splits/Y_train_{i}.gz")
        pd.DataFrame(X_test).to_pickle(f"datasets/splits/X_test_{i}.gz")
        pd.DataFrame(Y_test).to_pickle(f"datasets/splits/Y_test_{i}.gz")
    return

In [11]:
# Load the read table and write the five k-mer train/test folds to disk.
da = pd.read_csv("datasets/full_length_reads.csv")
# Fold size: 20% of the rows, truncated to an integer.
multiplicand = int(da.shape[0]*0.2)
split_test_train(da,multiplicand)

In [8]:
# Build k-mer count features for the final train (da) and test (da1) tables.
# `da` is not defined in this cell; it must already exist in the kernel.
pp = Preprocessing()
da1 = pd.read_csv("4.final_test/datasets/full_length_testdata.filtered.csv")
X0 = da["sequence"]
Y0 = da['copy_number']
X1 = da1["sequence"]
Y1 = da1['copy_number']
# X0/X1 are rebound from raw sequences to count matrices. CountKmers accepts
# only a list or Series, so re-running just these two lines raises ValueError.
X0 = pp.CountKmers(X0)
X1 = pp.CountKmers(X1)

In [12]:
# Persist the final k-mer feature matrices (X0, X1) and targets (Y0, Y1) as pickles.
pd.DataFrame(X0).to_pickle("4.final_test/datasets/X_final_train.gz")
pd.DataFrame(Y0).to_pickle("4.final_test/datasets/Y_final_train.gz")
pd.DataFrame(X1).to_pickle("4.final_test/datasets/X_final_test.gz")
pd.DataFrame(Y1).to_pickle("4.final_test/datasets/Y_final_test.gz")

In [3]:
import pandas as pd
import numpy as np

  from pandas.core.computation.check import NUMEXPR_INSTALLED


In [27]:
# vocal = {
#     'A':[1,0,0,0],
#     'T':[0,1,0,0],
#     'C':[0,0,1,0],
#     'G':[0,0,0,1],
    
#     'R':[0.5,0,0,0.5],
#     'K':[0,0.5,0,0.5],
#     'S':[0,0,0.5,0.5],
#     'Y':[0,0.5,0.5,0],
#     'M':[0.5,0,0.5,0],
#     'W':[0.5,0.5,0,0],
#     'B':[0,0.33,0.33,0.33],
#     'H':[0.33,0.33,0.33,0],
#     'N':[0.25,0.25,0.25,0.25],
#     'D':[0.33,0.33,0,0.33],
#     "V":[0.33,0,0.33,0.33]
# }

# Integer code for each unambiguous base, assigned in the order A, T, C, G
# starting at 1. Codes 0 and 5 are deliberately left unassigned here:
# OneHotEncode uses 0 for padding and 5 for any character missing from this map.
vocal = dict(zip("ATCG", range(1, 5)))

In [5]:
# Reload both tables for the integer-encoding pipeline below.
# NOTE(review): these are the same files and variable names used by the
# k-mer cells earlier in the notebook.
da = pd.read_csv("datasets/full_length_reads.csv")
# Fold size: 20% of the rows, truncated to an integer.
multiplicand = int(da.shape[0]*0.2)
da1 = pd.read_csv("4.final_test/datasets/full_length_testdata.filtered.csv")

In [9]:
# Longest sequence across both tables; used as the common padded width so
# that the two encoded matrices have the same number of columns.
maxlen = max(da["sequence"].apply(len).max(), da1["sequence"].apply(len).max())

In [28]:
def OneHotEncode(X, maxlen, vocab=None):
    """Encode sequences as zero-padded rows of integer base codes.

    Despite the name, this is an integer (ordinal) encoding: each character
    becomes one integer, not a one-hot vector.

    Parameters
    ----------
    X : pandas.Series, numpy.ndarray or other iterable of str
        The sequences to encode.
    maxlen : int
        Width of the output; shorter sequences are right-padded with 0.
    vocab : dict, optional
        Character -> integer code. Defaults to the module-level ``vocal``.

    Returns
    -------
    numpy.ndarray
        Integer array of shape ``(len(X), maxlen)``. Characters are looked up
        as given and then upper-cased (so ``"a"`` encodes like ``"A"``);
        anything still unknown is encoded as 5.

    Raises
    ------
    ValueError
        If a sequence is longer than ``maxlen`` (it could not be stored in a
        rectangular array without silently losing data).
    """
    lookup = vocal if vocab is None else vocab
    sequences = X.tolist() if hasattr(X, "tolist") else list(X)
    # Pre-filled with 0, which doubles as the padding value.
    EncodedX = np.zeros((len(sequences), maxlen), dtype=int)
    for row, seq in enumerate(sequences):
        # Exact match first so existing vocabularies behave as before; the
        # upper-case fallback makes lower-case bases map to the same codes.
        EncodedLine = [lookup.get(char, lookup.get(char.upper(), 5)) for char in seq]
        if len(EncodedLine) > maxlen:
            raise ValueError(
                f"Sequence at position {row} has length {len(EncodedLine)}, "
                f"which exceeds maxlen={maxlen}."
            )
        EncodedX[row, :len(EncodedLine)] = EncodedLine
    return EncodedX

In [25]:
def split_test_train_onehot(X,Y,multiplicand):
    """Pickle five contiguous train/test folds of already-encoded data.

    For fold ``i`` the positional slice ``[i*multiplicand, (i+1)*multiplicand)``
    of ``X`` and ``Y`` is the test set, and the remaining rows, concatenated
    in their original order, form the training set. Each of the four pieces is
    wrapped in a DataFrame and written under ``datasets/onehot/``.

    Parameters
    ----------
    X, Y : sliceable array-likes of equal length
        Features and targets.
    multiplicand : int
        Number of rows in each test fold.

    Returns
    -------
    None
    """
    for i in range(5):
        lower = i*multiplicand
        upper = lower + multiplicand
        held_out_X, held_out_Y = X[lower:upper], Y[lower:upper]
        kept_X = np.concatenate((X[:lower], X[upper:]), axis=0)
        kept_Y = np.concatenate((Y[:lower], Y[upper:]), axis=0)
        # Written in the order: train features, train targets, test features, test targets.
        outputs = (
            (f"datasets/onehot/X_train_{i}.gz", kept_X),
            (f"datasets/onehot/Y_train_{i}.gz", kept_Y),
            (f"datasets/onehot/X_test_{i}.gz", held_out_X),
            (f"datasets/onehot/Y_test_{i}.gz", held_out_Y),
        )
        for destination, payload in outputs:
            pd.DataFrame(payload).to_pickle(destination)
    return

In [29]:
# Encode both tables to the same padded width (maxlen); the targets are kept
# as the original pandas Series.
X0 = OneHotEncode(da["sequence"], maxlen)
X1 = OneHotEncode(da1["sequence"], maxlen)
Y0 = da['copy_number']
Y1 = da1['copy_number']

In [30]:
# Write the five train/test folds of the encoded training table to disk.
split_test_train_onehot(X0, Y0, multiplicand)

In [32]:
# Display (rows, columns) of the encoded training matrix; one row per
# sequence, and the column count is the padded width maxlen.
X0.shape

(19520, 2402)

In [31]:
# Persist the final integer-encoded matrices (X0, X1) and targets (Y0, Y1) as pickles.
pd.DataFrame(X0).to_pickle("4.final_test/datasets/X_final_train_onehot.gz")
pd.DataFrame(Y0).to_pickle("4.final_test/datasets/Y_final_train_onehot.gz")
pd.DataFrame(X1).to_pickle("4.final_test/datasets/X_final_test_onehot.gz")
pd.DataFrame(Y1).to_pickle("4.final_test/datasets/Y_final_test_onehot.gz")