In [1]:
# Fetch the ReproduceANNA16 repository into the current working directory.
# The cross-validation cell reads its inputs from
# ReproduceANNA16/data/cv/datasets/kmer_splits inside this checkout.
!git clone https://github.com/Merlinaphist/ReproduceANNA16.git

Cloning into 'ReproduceANNA16'...
remote: Enumerating objects: 298, done.[K
remote: Counting objects: 100% (117/117), done.[K
remote: Compressing objects: 100% (90/90), done.[K
remote: Total 298 (delta 33), reused 107 (delta 24), pack-reused 181[K
Receiving objects: 100% (298/298), 291.47 MiB | 14.76 MiB/s, done.
Resolving deltas: 100% (42/42), done.
Updating files: 100% (180/180), done.


In [2]:
import pickle, os, shutil
import tempfile
import warnings
from math import sqrt
from zipfile import ZipFile

import numpy as np
import pandas as pd
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from tensorflow import cast,float32
from keras import backend as kb
from sklearn.svm import SVR
from sklearn.decomposition import PCA
from sklearn.linear_model import Ridge, Lasso
from sklearn.metrics import mean_squared_error

warnings.filterwarnings("ignore")

In [3]:
class CopyNumberPredictor():
    """Stacked regression ensemble for a single region.

    Three base components are trained on the same samples:

    * a PCA projection of the input features (100 components),
    * an MLP trained on the raw input features,
    * an RBF-kernel SVR trained on the PCA projection.

    A Ridge meta-model is then fitted on the column-wise concatenation of
    ``[PCA components, MLP prediction, SVR prediction]`` and produces the
    final prediction.

    Parameters
    ----------
    region : str
        Label of the region this model belongs to. It is only used as the
        file-name prefix of the artefacts inside a saved archive.
    """

    def __init__(self,region):
        self.region = region
        # Hyper-parameter vector consumed by create_mlp() and fit_mlp():
        #   [0] epochs, [1] Adam learning rate, [2] validation_split,
        #   [3] batch_size, then (n_neurons, activation_index) pairs, one per
        #   hidden layer. A pair whose n_neurons is 0 is skipped.
        self.state = [59, 0.00016770313599, 0.11, 100, 489, 1, 926, 0, 645, 0, 929, 3, 582, 1, 82, 4]
        # Maps the activation_index entries of `state` to Keras activation names.
        self.activation_indices={0:"relu",1:"gelu",2:"selu",3:"elu",4:"linear"}
        self.mlp = self.create_mlp()
        self.ridge = Ridge(alpha = 49)
        self.pca = PCA(n_components=100)
        self.svr = SVR(kernel='rbf',C=11,gamma='auto')

    def save(self,filename):
        """Write the fitted components to a zip archive.

        The archive contains ``<region>_pca.pkl``, ``<region>_ridge.pkl``,
        ``<region>_svr.pkl`` and ``<region>_mlp.h5`` at its root.

        Parameters
        ----------
        filename : str
            Destination path; must end with ``.zip``.

        Raises
        ------
        ValueError
            If ``filename`` does not end with ``.zip``.
        """
        if not filename.endswith(".zip"):
            raise ValueError('Invalid filename. Expect a zip file.')

        archive_base = filename[:-4]
        pickled_parts = (
            ('_pca.pkl', self.pca),
            ('_ridge.pkl', self.ridge),
            ('_svr.pkl', self.svr),
        )

        # Stage the files in a private temporary directory rather than in a
        # directory named after the archive: an existing directory with that
        # name must not be overwritten or deleted, and the staging area has to
        # be cleaned up even if serialisation fails half-way.
        with tempfile.TemporaryDirectory() as staging_dir:
            prefix = os.path.join(staging_dir, self.region)
            for suffix, estimator in pickled_parts:
                with open(prefix + suffix, 'wb') as file:
                    pickle.dump(estimator, file)
            self.mlp.save(prefix + "_mlp.h5")

            shutil.make_archive(archive_base, 'zip', staging_dir)

    def load(self,filename):
        """Restore the components from an archive written by :meth:`save`.

        Parameters
        ----------
        filename : str
            Path of the archive; must end with ``.zip``.

        Raises
        ------
        ValueError
            If ``filename`` does not end with ``.zip``.
        """
        if not filename.endswith(".zip"):
            raise ValueError('Invalid input file. Expect a zip file.')

        # Extract into a private temporary directory so that a pre-existing
        # directory sharing the archive's base name is never touched, and so
        # the extracted files are removed even when loading raises.
        with tempfile.TemporaryDirectory() as staging_dir:
            with ZipFile(filename,'r') as zObject:
                zObject.extractall(path=staging_dir)

            prefix = os.path.join(staging_dir, self.region)
            # SECURITY: pickle.load can execute arbitrary code embedded in the
            # file. Only load archives that come from a trusted source.
            with open(prefix+'_pca.pkl', 'rb') as file:
                self.pca = pickle.load(file)
            with open(prefix+'_ridge.pkl', 'rb') as file:
                self.ridge = pickle.load(file)
            with open(prefix+'_svr.pkl', 'rb') as file:
                self.svr = pickle.load(file)
            # The MLP was compiled with a custom loss, so Keras needs it
            # supplied by name to rebuild the model.
            self.mlp = load_model(prefix + "_mlp.h5",
                                  custom_objects={"root_mean_squared_error": self.root_mean_squared_error})

    def fit(self,X_train,Y_train,verbose=True):
        """Train the base models and then the Ridge meta-model.

        Parameters
        ----------
        X_train
            Training features, passed unchanged to PCA and to the MLP.
        Y_train
            Training targets. Must expose ``.values`` and ``.shape`` (e.g. a
            single-column pandas DataFrame); it is flattened to 1-D for the
            SVR and the Ridge model.
        verbose : bool, default True
            Print a progress line after each stage.

        Raises
        ------
        ValueError
            If ``verbose`` is not a boolean value.
        """
        self.echo(text = "------Training Starts------", verbose = verbose)
        X_train_pca = self.pca.fit_transform(X_train)
        self.fit_mlp(X_train,Y_train)
        mlp_pred = self.mlp.predict(X_train,verbose=0)
        self.echo(text = "Model 1: MLP done.", verbose = verbose)

        reshaped_Y_train = Y_train.values.reshape(Y_train.shape[0])

        self.svr.fit(X_train_pca,reshaped_Y_train)
        svr_pred = self.svr.predict(X_train_pca)
        self.echo(text = "Model 2: SVR done.", verbose = verbose)

        # The meta-model is trained on the base models' in-sample predictions.
        new_X_train = self._stack_features(X_train_pca, mlp_pred, svr_pred)
        self.ridge.fit(new_X_train,reshaped_Y_train)
        self.echo(text = "Meta-Model: Ridge done.", verbose = verbose)

    def echo(self,text,verbose):
        """Print ``text`` when ``verbose`` is true.

        Raises
        ------
        ValueError
            If ``verbose`` does not compare equal to ``True`` or ``False``.
        """
        if verbose not in [True,False]:
            raise ValueError('verbose must be True or False')
        if verbose:
            print(text)

    def predict(self, X_test):
        """Predict with the full ensemble.

        Parameters
        ----------
        X_test
            Features in the same layout that was used for :meth:`fit`.

        Returns
        -------
        tuple
            ``(final_pred, mlp_pred, svr_pred)``: the Ridge meta-model output
            followed by the raw outputs of the MLP and the SVR.
        """
        X_test_pca = self.pca.transform(X_test)
        mlp_pred = self.mlp.predict(X_test,verbose=0)
        svr_pred = self.svr.predict(X_test_pca)

        new_X_test = self._stack_features(X_test_pca, mlp_pred, svr_pred)
        final_pred = self.ridge.predict(new_X_test)
        return final_pred, mlp_pred, svr_pred

    @staticmethod
    def _stack_features(pca_features, mlp_pred, svr_pred):
        """Build the meta-model input: PCA columns, then MLP, then SVR output."""
        # A single helper keeps the column order identical in fit() and predict().
        return pd.concat(
            [pd.DataFrame(pca_features), pd.DataFrame(mlp_pred), pd.DataFrame(svr_pred)],
            axis = 1,
        )

    def create_mlp(self):
        """Build and compile the MLP described by ``self.state``.

        Hidden layers come from the ``(n_neurons, activation_index)`` pairs
        starting at ``state[4]``; pairs with zero neurons are skipped. A single
        linear output unit is appended, and the model is compiled with the RMSE
        loss and an Adam optimizer using ``state[1]`` as learning rate.

        Returns
        -------
        The compiled Keras ``Sequential`` model.
        """
        learning_rate = self.state[1]
        model = Sequential()
        layer_specs = zip(self.state[4::2], self.state[5::2])
        for n_neurons, activation_index in layer_specs:
            if n_neurons != 0:
                model.add(Dense(n_neurons,activation=self.activation_indices[activation_index]))
        model.add(Dense(1, activation="linear"))
        model.compile(loss=self.root_mean_squared_error,optimizer=Adam(learning_rate=learning_rate))
        return model

    def root_mean_squared_error(self, y_true, y_pred):
        """Keras loss: square root of the mean squared difference.

        ``y_true`` is cast to float32 before the subtraction.
        """
        y_true = cast(y_true,float32)
        return kb.sqrt(kb.mean(kb.square(y_pred - y_true)))

    def fit_mlp(self,X_train,Y_train):
        """Train the MLP with the epochs, validation split and batch size
        stored in ``self.state[0]``, ``[2]`` and ``[3]`` respectively."""
        epochs = self.state[0]
        validation_split = self.state[2]
        batch_size = self.state[3]
        self.mlp.fit(X_train,Y_train,validation_split=validation_split,
                     batch_size=batch_size,epochs=epochs,verbose=0)

In [4]:
# Five-fold cross-validation of the ensemble on every region.
# `performance` maps region -> {fold index -> [final RMSE, MLP RMSE, SVR RMSE]}.
performance = {}
path = "ReproduceANNA16/data/cv/datasets/kmer_splits"
for region in ["full_length","V1-V2","V1-V3","V3-V4","V4","V4-V5","V6-V8","V7-V9"]:
    rmse = {}
    for i in range(5):
        # Each fold ships as four pickled, gzip-named files.
        X_train = pd.read_pickle(f"{path}/{region}_X_train_{i}.gz")
        Y_train = pd.read_pickle(f"{path}/{region}_Y_train_{i}.gz")
        X_test = pd.read_pickle(f"{path}/{region}_X_test_{i}.gz")
        Y_test = pd.read_pickle(f"{path}/{region}_Y_test_{i}.gz")

        # A fresh, untrained model per fold so nothing carries over.
        model = CopyNumberPredictor(region)
        model.fit(X_train,Y_train,verbose=False)

        predictions = model.predict(X_test)
        final_pred, mlp_pred, svr_pred = predictions
        # Score the ensemble and both base models against the held-out targets.
        rmse[i] = [sqrt(mean_squared_error(Y_test,pred)) for pred in predictions]
        print(f"Region: {region}, Fold: {i}, RMSE: {rmse[i]}")
    # Only record a region once all of its folds have completed.
    performance[region] = rmse


Region: full_length, Fold: 0, RMSE: [0.7078293526657407, 0.7301941504268585, 0.7370163553284313]
Region: full_length, Fold: 1, RMSE: [0.7572660090736023, 0.7923202774577532, 0.7661139602050707]
Region: full_length, Fold: 2, RMSE: [0.6499096450688457, 0.687188487156243, 0.6738135330548477]
Region: full_length, Fold: 3, RMSE: [0.7099611911351, 0.752661277762901, 0.7334345856993951]
Region: full_length, Fold: 4, RMSE: [0.721986375554942, 0.7532027294082585, 0.7247132820370134]
Region: V1-V2, Fold: 0, RMSE: [0.76956078742896, 0.7808716679484325, 0.8655991761495786]
Region: V1-V2, Fold: 1, RMSE: [0.852939941578042, 0.8734755196636058, 0.9052716700982707]
Region: V1-V2, Fold: 2, RMSE: [0.7620617150306135, 0.7821621209430291, 0.8360484806765677]
Region: V1-V2, Fold: 3, RMSE: [0.8082899936724249, 0.8324067411516363, 0.8644693592003119]
Region: V1-V2, Fold: 4, RMSE: [0.7995651022105486, 0.8139211391221054, 0.852856942063965]
Region: V1-V3, Fold: 0, RMSE: [0.7558995996747897, 0.7744424743719007,

In [5]:
# Wrapping the nested results dict with np.array yields a 0-d object array,
# which np.save stores via pickle. Reading it back therefore needs
# np.load("cv_performance.npy", allow_pickle=True).item().
performance = np.array(performance)
np.save(file="cv_performance.npy", arr=performance)