In [1]:
# Fetch the ReproduceANNA16 repository: later cells read their datasets from
# paths under the resulting ReproduceANNA16/ directory.
!git clone https://github.com/Merlinaphist/ReproduceANNA16.git

Cloning into 'ReproduceANNA16'...
remote: Enumerating objects: 110, done.
remote: Counting objects: 100% (31/31), done.
remote: Compressing objects: 100% (29/29), done.
remote: Total 110 (delta 2), reused 28 (delta 2), pack-reused 79
Receiving objects: 100% (110/110), 160.65 MiB | 25.89 MiB/s, done.
Resolving deltas: 100% (25/25), done.
Updating files: 100% (86/86), done.


In [2]:
import os
import pickle
import shutil
import tempfile
from math import sqrt
from statistics import mean, stdev
from zipfile import ZipFile

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from keras import backend as kb
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import Ridge, Lasso, LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from tensorflow import cast,float32
from tensorflow.keras import Input, Model
from tensorflow.keras.layers import Dense, Dropout, Concatenate
from tensorflow.keras.models import Sequential
from tensorflow.keras.models import load_model
from tensorflow.keras.optimizers import Adam

In [3]:
import warnings
# Install a blanket "ignore" filter: no category or module is given, so every
# warning raised after this cell is silenced for the rest of the session.
# NOTE(review): this presumably also hides library deprecation/convergence
# warnings -- confirm that is intended.
warnings.filterwarnings("ignore")

In [None]:
class CopyNumberPredictor():
    """Stacked regressor combining an MLP, an SVR and a Ridge meta-model.

    Training flow (see ``fit``):
      1. PCA is fitted on the input features.
      2. The MLP is trained on the raw features.
      3. The SVR is trained on the PCA-transformed features.
      4. Ridge is trained on the PCA features concatenated with the MLP and
         SVR predictions.

    The ``region`` string is used as the filename prefix for every artifact
    stored inside the zip archive written by ``save`` / read by ``load``.
    """

    def __init__(self,region):
        """Create untrained sub-models.

        Parameters
        ----------
        region : str
            Label used as the artifact filename prefix in ``save``/``load``.
        """
        self.region = region
        # Hyper-parameter vector consumed by create_mlp / fit_mlp:
        #   [0] epochs, [1] learning rate, [2] validation split, [3] batch size,
        #   [4:] consecutive (n_neurons, activation index) pairs, one per
        #        hidden layer; a pair whose n_neurons is 0 is skipped.
        self.state = [59, 0.00016770313599, 0.11, 100, 489, 1, 926, 0, 645, 0, 929, 3, 582, 1, 82, 4]
        # Maps the activation index stored in ``state`` to a Keras activation name.
        self.activation_indices={0:"relu",1:"gelu",2:"selu",3:"elu",4:"linear"}
        self.mlp = self.create_mlp()
        self.ridge = Ridge(alpha = 49)
        self.pca = PCA(n_components=100)
        self.svr = SVR(kernel='rbf',C=11,gamma='auto')

    def save(self,filename):
        """Write PCA, Ridge, SVR (pickled) and the MLP (HDF5) into a zip archive.

        Parameters
        ----------
        filename : str or os.PathLike
            Destination path; must end with ``.zip``.

        Raises
        ------
        ValueError
            If ``filename`` does not end with ``.zip``.
        """
        filename = os.fspath(filename)
        if not filename.endswith(".zip"):
            raise ValueError('Invalid filename. Expect a zip file.')

        path = filename[:-4]

        # Stage the artifacts in a private temporary directory rather than in a
        # directory named after the archive stem: an existing directory with
        # that name would otherwise be swept into the archive and then deleted.
        with tempfile.TemporaryDirectory() as staging:
            prefix = os.path.join(staging, self.region)

            pickled_parts = (
                ('_pca.pkl', self.pca),
                ('_ridge.pkl', self.ridge),
                ('_svr.pkl', self.svr),
            )
            for suffix, estimator in pickled_parts:
                with open(prefix + suffix,'wb') as file:
                    pickle.dump(estimator,file)
            self.mlp.save(prefix+"_mlp.h5")

            # make_archive appends ".zip" itself, hence the stem is passed.
            shutil.make_archive(os.path.abspath(path), 'zip', staging)

    def load(self,filename):
        """Restore PCA, Ridge, SVR and the MLP from an archive made by ``save``.

        Parameters
        ----------
        filename : str or os.PathLike
            Path of the archive; must end with ``.zip``.

        Raises
        ------
        ValueError
            If ``filename`` does not end with ``.zip``.
        """
        filename = os.fspath(filename)
        if not filename.endswith(".zip"):
            raise ValueError('Invalid input file. Expect a zip file.')

        # Extract into a private temporary directory so that no directory that
        # happens to share the archive stem is overwritten or removed.
        with tempfile.TemporaryDirectory() as staging:
            with ZipFile(filename,'r') as zObject:
                zObject.extractall(path=staging)

            prefix = os.path.join(staging, self.region)

            # SECURITY: pickle.load can execute arbitrary code contained in the
            # file. Only load archives that come from a trusted source.
            with open(prefix+'_pca.pkl', 'rb') as file:
                pca = pickle.load(file)
            with open(prefix+'_ridge.pkl', 'rb') as file:
                ridge = pickle.load(file)
            with open(prefix+'_svr.pkl', 'rb') as file:
                svr = pickle.load(file)
            mlp = load_model(prefix + "_mlp.h5",
                             custom_objects={"root_mean_squared_error": self.root_mean_squared_error})

        # Commit only after every artifact loaded, so a failure above does not
        # leave the instance with a mix of old and new sub-models.
        self.pca = pca
        self.ridge = ridge
        self.svr = svr
        self.mlp = mlp

    def fit(self,X_train,Y_train,verbose=True):
        """Train PCA, MLP, SVR and the Ridge meta-model.

        Parameters
        ----------
        X_train : feature table passed to PCA and to the MLP.
        Y_train : pandas object exposing ``.values`` and ``.shape``; it is
            flattened to one value per row for the SVR and Ridge fits.
        verbose : bool
            When True, progress messages are printed.

        Raises
        ------
        ValueError
            If ``verbose`` is not True or False (raised by ``echo``).
        """
        self.echo(text = "------Training Starts------", verbose = verbose)
        X_train_pca = self.pca.fit_transform(X_train)
        self.fit_mlp(X_train,Y_train)
        mlp_pred = self.mlp.predict(X_train,verbose=0)
        self.echo(text = "Model 1: MLP done.", verbose = verbose)

        reshaped_Y_train = Y_train.values.reshape(Y_train.shape[0])

        self.svr.fit(X_train_pca,reshaped_Y_train)
        svr_pred = self.svr.predict(X_train_pca)
        self.echo(text = "Model 2: SVR done.", verbose = verbose)

        # The meta-model sees the base models' predictions on the same rows
        # those base models were trained on.
        new_X_train = self._stack_features(X_train_pca, mlp_pred, svr_pred)
        self.ridge.fit(new_X_train,reshaped_Y_train)
        self.echo(text = "Meta-Model: Ridge done.", verbose = verbose)

    def echo(self,text,verbose):
        """Print ``text`` when ``verbose`` is True.

        Raises
        ------
        ValueError
            If ``verbose`` is not equal to True or False.
        """
        if verbose not in [True,False]:
            raise ValueError('verbose must be True or False')
        if verbose:
            print(text)

    def predict(self, X_test):
        """Return the Ridge meta-model's predictions for ``X_test``."""
        X_test_pca = self.pca.transform(X_test)
        mlp_pred = self.mlp.predict(X_test,verbose=0)
        svr_pred = self.svr.predict(X_test_pca)
        new_X_test = self._stack_features(X_test_pca, mlp_pred, svr_pred)
        return self.ridge.predict(new_X_test)

    @staticmethod
    def _stack_features(pca_features, mlp_pred, svr_pred):
        """Column-wise join of PCA features, MLP predictions and SVR predictions."""
        return pd.concat(
            [pd.DataFrame(pca_features), pd.DataFrame(mlp_pred), pd.DataFrame(svr_pred)],
            axis = 1,
        )

    def create_mlp(self):
        """Build and compile the MLP described by ``self.state``."""
        learning_rate = self.state[1]
        model = Sequential()
        # state[4:] holds (n_neurons, activation index) pairs.
        for n_neurons, activation_index in zip(self.state[4::2], self.state[5::2]):
            # Resolve the activation before the zero check so an unknown index
            # still raises KeyError even for a skipped layer.
            activation = self.activation_indices[activation_index]
            if n_neurons != 0:
                model.add(Dense(n_neurons,activation=activation))
        model.add(Dense(1, activation="linear"))
        model.compile(loss=self.root_mean_squared_error,optimizer=Adam(learning_rate))
        return model

    def root_mean_squared_error(self, y_true, y_pred):
        """Loss: square root of the mean squared difference (y_true cast to float32)."""
        y_true = cast(y_true,float32)
        return kb.sqrt(kb.mean(kb.square(y_pred - y_true)))

    def fit_mlp(self,X_train,Y_train):
        """Train the MLP using epochs, validation split and batch size from ``self.state``."""
        epochs = self.state[0]
        validation_split = self.state[2]
        batch_size = self.state[3]
        self.mlp.fit(X_train,Y_train,validation_split=validation_split,
                     batch_size=batch_size,epochs=epochs,verbose=0)

In [None]:
# Evaluate a freshly trained CopyNumberPredictor on each of the five stored
# train/test splits and collect the per-split RMSE.
rmse = []
for i in range(5):
    # Load the stored split number i.
    X_train = pd.read_pickle(f"ReproduceANNA16/datasets/splits/X_train_{i}.gz")
    Y_train = pd.read_pickle(f"ReproduceANNA16/datasets/splits/Y_train_{i}.gz")
    X_test = pd.read_pickle(f"ReproduceANNA16/datasets/splits/X_test_{i}.gz")
    Y_test = pd.read_pickle(f"ReproduceANNA16/datasets/splits/Y_test_{i}.gz")

    # Divide every row by its own total.
    train_row_totals = X_train.sum(axis=1)
    test_row_totals = X_test.sum(axis=1)
    X_train = X_train.div(train_row_totals, axis=0)
    X_test = X_test.div(test_row_totals, axis=0)

    # Train from scratch on this split and score the held-out rows.
    model = CopyNumberPredictor("full_length")
    model.fit(X_train, Y_train, verbose=False)
    pred = model.predict(X_test)

    fold_rmse = sqrt(mean_squared_error(Y_test, pred))
    rmse.append(fold_rmse)
    print(fold_rmse)

0.7817467066696948
0.8615972932230562
0.719343217919398
0.7808213771027683
0.7772356290659969


In [None]:
# Average of the per-split RMSE values collected by the loop above.
mean(rmse)

0.7841488447961829

In [None]:
# Load the final train/test tables shipped in the cloned repository.
X_train = pd.read_pickle(f"ReproduceANNA16/4.final_test/datasets/X_final_train.gz")
X_test = pd.read_pickle(f"ReproduceANNA16/4.final_test/datasets/X_final_test.gz")
Y_train = pd.read_pickle(f"ReproduceANNA16/4.final_test/datasets/Y_final_train.gz")
Y_test = pd.read_pickle(f"ReproduceANNA16/4.final_test/datasets/Y_final_test.gz")

# Divide every feature row by its own total (same scaling as the
# cross-validation cell).
train_row_totals = X_train.sum(axis=1)
test_row_totals = X_test.sum(axis=1)
X_train = X_train.div(train_row_totals, axis=0)
X_test = X_test.div(test_row_totals, axis=0)

In [None]:
# Train on the final training table and report RMSE on the final test table.
model = CopyNumberPredictor("full_length")
model.fit(X_train, Y_train, verbose=False)
pred = model.predict(X_test)

final_mse = mean_squared_error(Y_test, pred)
sqrt(final_mse)

0.7621037025872948