In [1]:
import os
import pickle
import shutil
import tempfile
from math import sqrt
from statistics import mean, stdev
from zipfile import ZipFile

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from keras import backend as kb
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import Ridge, Lasso, LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from tensorflow import cast,float32
from tensorflow.keras import Input, Model
from tensorflow.keras.layers import Dense, Dropout, Concatenate
from tensorflow.keras.models import Sequential
from tensorflow.keras.models import load_model
from tensorflow.keras.optimizers import Adam

2023-04-09 09:09:30.462686: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.


In [2]:
import warnings
warnings.filterwarnings("ignore")

In [3]:
class CopyNumberPredictor():
    """Stacked regression ensemble: MLP + SVR base models, Ridge meta-model.

    The Keras MLP is trained on the raw features, the RBF SVR on the
    PCA-reduced features, and the Ridge meta-model on the PCA components
    concatenated with the predictions of both base models.

    ``self.state`` holds the MLP hyper-parameters:

    * ``state[0]`` - number of training epochs
    * ``state[1]`` - Adam learning rate
    * ``state[2]`` - validation split used while fitting the MLP
    * ``state[3]`` - batch size
    * ``state[4:]`` - ``(n_neurons, activation_index)`` pairs, one per hidden
      layer; a pair with ``n_neurons == 0`` is skipped. Activation indices
      are resolved through ``self.activation_indices``.

    NOTE(review): the concrete values look like the result of a
    hyper-parameter search - confirm their provenance before changing them.
    """

    # (attribute name, file suffix) of every component persisted with pickle.
    _PICKLED_PARTS = (("pca", "_pca.pkl"), ("ridge", "_ridge.pkl"), ("svr", "_svr.pkl"))
    # File suffix of the Keras model inside the archive.
    _MLP_SUFFIX = "_mlp.h5"

    def __init__(self,region):
        """Build the (unfitted) ensemble.

        Parameters
        ----------
        region : str
            Label used as the file-name prefix of every component stored by
            ``save`` and looked up by ``load``.
        """
        self.region = region
        self.state = [59, 0.00016770313599, 0.11, 100, 489, 1, 926, 0, 645, 0, 929, 3, 582, 1, 82, 4]
        self.activation_indices={0:"relu",1:"gelu",2:"selu",3:"elu",4:"linear"}
        self.mlp = self.create_mlp()
        self.ridge = Ridge(alpha = 49)
        self.pca = PCA(n_components=100)
        self.svr = SVR(kernel='rbf',C=11,gamma='auto')

    def save(self,filename):
        """Write all components of the ensemble into the zip archive ``filename``.

        The archive contains ``<region>_pca.pkl``, ``<region>_ridge.pkl``,
        ``<region>_svr.pkl`` and ``<region>_mlp.h5`` at its root.

        Raises
        ------
        ValueError
            If ``filename`` does not end with ``.zip``.
        """
        if filename[-4:] != ".zip":
            raise ValueError('Invalid filename. Expect a zip file.')

        archive_base = filename[:-4]

        # Stage the files in a private temporary directory. Staging in a
        # directory named after the archive would zip up (and afterwards
        # delete) any unrelated directory that already has that name, and
        # would leave debris behind if one of the writes failed.
        with tempfile.TemporaryDirectory() as staging:
            prefix = os.path.join(staging, self.region)
            for attr, suffix in self._PICKLED_PARTS:
                with open(prefix + suffix,'wb') as file:
                    pickle.dump(getattr(self, attr),file)
            self.mlp.save(prefix + self._MLP_SUFFIX)

            # make_archive appends ".zip" itself, hence the stripped base name.
            shutil.make_archive(archive_base, 'zip', staging)

    def load(self,filename):
        """Restore all components of the ensemble from the zip archive ``filename``.

        The archive must have been produced by ``save`` for the same
        ``region``; the fitted PCA, Ridge, SVR and MLP replace the current ones.

        Raises
        ------
        ValueError
            If ``filename`` does not end with ``.zip``.
        """
        if filename[-4:] != ".zip":
            raise ValueError('Invalid input file. Expect a zip file.')

        # Extract into a private temporary directory so that an existing
        # directory named after the archive is never overwritten or removed.
        with tempfile.TemporaryDirectory() as staging:
            with ZipFile(filename,'r') as zObject:
                zObject.extractall(path=staging)

            prefix = os.path.join(staging, self.region)
            for attr, suffix in self._PICKLED_PARTS:
                with open(prefix + suffix, 'rb') as file:
                    # NOTE(review): unpickling can execute arbitrary code -
                    # only load archives that come from a trusted source.
                    setattr(self, attr, pickle.load(file))
            # The custom loss has to be supplied again so Keras can rebuild
            # the compiled model.
            self.mlp = load_model(prefix + self._MLP_SUFFIX,
                                  custom_objects={"root_mean_squared_error": self.root_mean_squared_error})

    def _stack(self, X_pca, base_predictions):
        """Concatenate PCA features and base-model predictions column-wise."""
        frames = [pd.DataFrame(X_pca)]
        frames.extend(pd.DataFrame(pred) for pred in base_predictions)
        return pd.concat(frames, axis = 1)

    def fit(self,X_train,Y_train,verbose=True):
        """Fit the PCA, both base models and the Ridge meta-model.

        Parameters
        ----------
        X_train : feature table; passed unchanged to the PCA and to the MLP.
        Y_train : pandas object with ``.values`` and ``.shape``; it is
            flattened to ``Y_train.shape[0]`` values, so it must hold exactly
            one target per row.
        verbose : bool
            Print a progress line after each stage when True.

        Raises
        ------
        ValueError
            If ``verbose`` is neither True nor False.

        NOTE(review): the meta-model is trained on the base models'
        predictions for the very rows they were fitted on - confirm that this
        in-sample stacking is intended.
        """
        self.echo(text = "------Training Starts------", verbose = verbose)
        X_train_pca = self.pca.fit_transform(X_train)
        self.fit_mlp(X_train,Y_train)
        mlp_pred = self.mlp.predict(X_train,verbose=0)
        self.echo(text = "Model 1: MLP done.", verbose = verbose)

        # scikit-learn estimators expect a 1-D target vector.
        reshaped_Y_train = Y_train.values.reshape(Y_train.shape[0])

        self.svr.fit(X_train_pca,reshaped_Y_train)
        svr_pred = self.svr.predict(X_train_pca)
        self.echo(text = "Model 2: SVR done.", verbose = verbose)

        new_X_train = self._stack(X_train_pca, [mlp_pred, svr_pred])
        self.ridge.fit(new_X_train,reshaped_Y_train)
        self.echo(text = "Meta-Model: Ridge done.", verbose = verbose)

    def echo(self,text,verbose):
        """Print ``text`` when ``verbose`` is True.

        Raises
        ------
        ValueError
            If ``verbose`` is neither True nor False.
        """
        if verbose not in [True,False]:
            raise ValueError('verbose must be True or False')
        if verbose:
            print(text)

    def predict(self, X_test):
        """Return the Ridge meta-model prediction for ``X_test``.

        The features are assembled exactly as in ``fit``: PCA components,
        then the MLP prediction (on the raw features), then the SVR
        prediction (on the PCA components).
        """
        X_test_pca = self.pca.transform(X_test)
        mlp_pred = self.mlp.predict(X_test,verbose=0)
        svr_pred = self.svr.predict(X_test_pca)

        new_X_test = self._stack(X_test_pca, [mlp_pred, svr_pred])
        return self.ridge.predict(new_X_test)

    def create_mlp(self):
        """Build and compile the MLP described by ``self.state``.

        Hidden layers come from the ``(n_neurons, activation_index)`` pairs
        starting at ``state[4]``; the output layer is a single linear unit.
        The model is compiled with the RMSE loss and Adam.
        """
        learning_rate = self.state[1]
        model = Sequential()
        for i in range(4,len(self.state),2):
            n_neurons = self.state[i]
            activation = self.activation_indices[self.state[i+1]]
            # A width of zero means "no layer at this position".
            if n_neurons != 0:
                model.add(Dense(n_neurons,activation=activation))
        model.add(Dense(1, activation="linear"))
        model.compile(loss=self.root_mean_squared_error,optimizer=Adam(learning_rate))
        return model

    def root_mean_squared_error(self, y_true, y_pred):
        """Keras loss: root of the mean squared difference of the two tensors."""
        # Cast the targets to float32 before subtracting them from the predictions.
        y_true = cast(y_true,float32)
        return kb.sqrt(kb.mean(kb.square(y_pred - y_true)))

    def fit_mlp(self,X_train,Y_train):
        """Train the MLP silently with the epochs, split and batch size from ``self.state``."""
        epochs = self.state[0]
        validation_split = self.state[2]
        batch_size = self.state[3]
        self.mlp.fit(X_train,Y_train,validation_split=validation_split,
                     batch_size=batch_size,epochs=epochs,verbose=0)

In [4]:
# Train and evaluate the stacked model on each of the five pre-computed
# train/test splits, collecting the test RMSE and saving the per-sample
# predictions of every split.
rmse = []
for i in range(5):
    X_train = pd.read_csv(f"datasets/splits/X_train_{i}.csv")
    Y_train = pd.read_csv(f"datasets/splits/Y_train_{i}.csv")
    X_test = pd.read_csv(f"datasets/splits/X_test_{i}.csv")
    Y_test = pd.read_csv(f"datasets/splits/Y_test_{i}.csv")

    # `region` is a required constructor argument; it is only used as the
    # file-name prefix by save()/load().
    # NOTE(review): "full_length" mirrors the output file names below - confirm.
    model = CopyNumberPredictor(region="full_length")
    model.fit(X_train,Y_train,verbose=False)
    pred = model.predict(X_test)
    rmse.append(sqrt(mean_squared_error(Y_test,pred)))
    print(rmse[i])

    # Y_test is an (n, 1) DataFrame while pred is an (n,) array. Stacking them
    # with np.array([...]) is a ragged construction that NumPy >= 1.24 rejects,
    # so flatten both and build the two columns explicitly.
    pred_records = pd.DataFrame({"Y_test": np.asarray(Y_test).ravel(),
                                 "Y_pred": np.asarray(pred).ravel()})
    pred_records["group"] = i
    pred_records.to_csv(f"pred_records/full_length_pred_records_{i}.csv",index=False)

2023-04-09 09:09:41.397796: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-04-09 09:09:41.911761: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1532] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 9651 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 2080 Ti, pci bus id: 0000:3d:00.0, compute capability: 7.5


0.6763061242996811
0.7354001494547514
0.6297395471687893
0.6895670932190365
0.6926191879388353


In [5]:
# Average test RMSE over the five splits evaluated above.
mean(rmse)

0.6847264204162187

In [6]:
# Per-split test RMSE values, in split order (0-4).
rmse

[0.6763061242996811,
 0.7354001494547514,
 0.6297395471687893,
 0.6895670932190365,
 0.6926191879388353]

In [7]:
# Persist the per-split RMSE values as a single-column CSV (no index column).
performance = pd.DataFrame({"smac_full_length": rmse})
performance.to_csv("performance/smac_universal_full_length.csv", index=False)