In [1]:
import os
import tensorflow as tf
import random
from scipy import stats
import warnings
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import mean_absolute_error
import matplotlib.pyplot as plt
import numpy as np
from helpers import open_and_prepare_df, X_y_split
import numpy as np
from sklearn.model_selection import StratifiedKFold

# Hide UserWarning-category warnings for the rest of the session.
warnings.filterwarnings("ignore", category=UserWarning)

# Report how many GPUs TensorFlow can see before any training starts.
gpu_devices = tf.config.list_physical_devices('GPU')
print("Num GPUs Available: ", len(gpu_devices))

# Load the dataset and split it into inputs and target.
# NOTE(review): both helpers come from the project-local `helpers` module;
# presumably 'features' names the dataset and 'nlp_all' the feature group /
# target selection - confirm against helpers.py.
features_df = open_and_prepare_df('features')
X, y = X_y_split(features_df, 'nlp_all')

2023-04-24 22:03:25.247138: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-04-24 22:03:25.269957: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-04-24 22:03:26.793713: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2023-04-24 22:03:26.822928: I tensorflow/comp

Num GPUs Available:  1


Hyperparameter grid search

In [3]:
def get_kfold_results(model_class, X, y, params):
    """Score one hyperparameter configuration with 10-fold stratified CV.

    A fresh model is built and trained for every fold; the held-out
    predictions of all folds are pooled and scored once at the end.

    Args:
        model_class: Callable that returns an object exposing ``fit`` and
            ``predict``. It is called with the keyword arguments ``neurons``,
            ``activation_1``, ``activation_2``, ``learning_rate`` and
            ``optimizer``, plus ``activation_3`` when that key is in ``params``.
        X: Inputs, indexable with an integer index array.
        y: Targets, indexable with an integer index array.
        params: Dict holding the model hyperparameters listed above and the
            training settings ``batch_size`` and ``epochs``.

    Returns:
        Tuple ``(corr, mae)``: Pearson correlation and mean absolute error
        between pooled predictions and true targets, each rounded to 3 places.
    """
    kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=8)

    # Keyword arguments for the model factory. 'activation_3' is forwarded
    # whenever the caller supplies it, so three-hidden-layer factories work;
    # factories that do not take it keep working when the key is absent.
    model_kwargs = {
        name: params[name]
        for name in ('neurons', 'activation_1', 'activation_2', 'learning_rate', 'optimizer')
    }
    if 'activation_3' in params:
        model_kwargs['activation_3'] = params['activation_3']

    fold_preds = []
    fold_targets = []

    for train_index, test_index in kfold.split(X, y):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        # A new model per fold, so no weights carry over between folds.
        model = model_class(**model_kwargs)
        model.fit(X_train, y_train, batch_size=params['batch_size'], epochs=params['epochs'], verbose=0)

        fold_preds.append(model.predict(X_test, verbose=0).flatten())
        fold_targets.append(y_test)

    # Concatenate once instead of re-copying the growing arrays every fold.
    # The float64 cast keeps the dtype the metrics have always been computed in.
    preds = np.concatenate(fold_preds).astype(np.float64)
    true_y = np.concatenate(fold_targets).astype(np.float64)

    corr = round(stats.pearsonr(preds, true_y)[0], 3)
    mae = round(mean_absolute_error(true_y, preds), 3)
    print(f'Corr = {corr}, MAE = {mae}')

    return corr, mae

In [4]:
def create_model(neurons, activation_1, activation_2, activation_3, learning_rate, optimizer):
    """Build and compile the three-hidden-layer regression network for the grid search.

    Args:
        neurons: Width of the first hidden layer; the second and third hidden
            layers use one half and one third of it (rounded down, minimum 1).
        activation_1: Activation of the first hidden layer.
        activation_2: Activation of the second hidden layer.
        activation_3: Activation of the third hidden layer.
        learning_rate: Learning rate handed to the optimizer.
        optimizer: Optimizer factory (e.g. a ``tf.keras.optimizers`` class)
            called as ``optimizer(learning_rate=learning_rate)``.

    Returns:
        A compiled ``tf.keras.Sequential`` model with one linear output unit,
        trained against mean absolute error.
    """
    # Re-seed every RNG on each call so each new model starts from the same
    # random state.
    np.random.seed(123)
    random.seed(123)
    tf.random.set_seed(1234)

    # Dense layer widths must be whole numbers; true division would pass
    # floats such as 85.33 for neurons=256, so floor-divide explicitly.
    second_width = max(1, int(neurons // 2))
    third_width = max(1, int(neurons // 3))

    keras_model = tf.keras.Sequential([
                        tf.keras.layers.Dense(neurons, activation=activation_1),
                        tf.keras.layers.Dense(second_width, activation=activation_2),
                        tf.keras.layers.Flatten(),
                        tf.keras.layers.Dense(third_width, activation=activation_3),
                        tf.keras.layers.Dense(units=1)
                        ])

    keras_model.compile(optimizer=optimizer(learning_rate=learning_rate), loss='mean_absolute_error')
    return keras_model

In [14]:
def run_grid(create_model, X, y, activations_1, activations_2, activations_3, learning_rates, neurons_count, epochs, batch_sizes, optimizers):
    """Exhaustively evaluate every hyperparameter combination and keep the best.

    Each combination is scored with ``get_kfold_results``; the one with the
    highest Pearson correlation wins. The MAE is printed by the scorer but is
    not used for selection.

    Args:
        create_model: Model factory forwarded to ``get_kfold_results``.
        X: Inputs forwarded to ``get_kfold_results``.
        y: Targets forwarded to ``get_kfold_results``.
        activations_1: Candidate activations for the first hidden layer.
        activations_2: Candidate activations for the second hidden layer.
        activations_3: Candidate activations for the third hidden layer.
        learning_rates: Candidate learning rates.
        neurons_count: Candidate widths for the first hidden layer.
        epochs: Candidate epoch counts.
        batch_sizes: Candidate batch sizes.
        optimizers: Candidate optimizer factories.

    Returns:
        Tuple ``(best_params, best_corr)``. ``best_params`` is the winning
        parameter dict, or ``None`` (with ``best_corr`` equal to 0) when the
        grid is empty or every correlation is NaN.
    """
    import itertools

    best_corr = 0
    best_params = None

    # Axis order matches the former nested loops (outermost first), so the
    # evaluation order and the key order of each params dict are unchanged.
    grid_axes = (
        ('epochs', epochs),
        ('batch_size', batch_sizes),
        ('activation_1', activations_1),
        ('activation_2', activations_2),
        ('activation_3', activations_3),
        ('learning_rate', learning_rates),
        ('neurons', neurons_count),
        ('optimizer', optimizers),
    )
    axis_names = [name for name, _ in grid_axes]
    axis_values = [values for _, values in grid_axes]

    loops_count = 1
    for values in axis_values:
        loops_count *= len(values)

    for loop, combo in enumerate(itertools.product(*axis_values), start=1):
        params = dict(zip(axis_names, combo))

        corr, mae = get_kfold_results(create_model, X, y, params)
        print(params)

        # Accept the first valid score unconditionally so that a grid whose
        # correlations are all <= 0 still reports its best configuration;
        # NaN scores (e.g. constant predictions) are never selected.
        if not np.isnan(corr) and (best_params is None or corr > best_corr):
            best_corr = corr
            best_params = params

        print(f'{loop}/{loops_count}')

    return best_params, best_corr

In [23]:
# Search space for run_grid: one list of candidate values per hyperparameter.
# Full grid size = 3 * 3 * 1 * 3 * 2 * 1 * 1 * 2 = 108 combinations, and each
# combination is trained once per cross-validation fold.
activations_1 = ['relu', 'tanh', 'sigmoid']
activations_2 = ['relu', 'tanh', 'sigmoid']
activations_3 = ['relu']
learning_rates = [0.001, 0.005, 0.01]
neurons_count = [256, 512]
epochs = [150]
batch_sizes = [30]
# Optimizer classes (not instances); create_model instantiates them with the
# learning rate of the combination being evaluated.
optimizers = [tf.keras.optimizers.SGD, tf.keras.optimizers.Adam]

In [25]:
# Run the full grid search. NOTE: the captured run below ended with a
# KeyboardInterrupt, so best_params / best_corr were not assigned by it.
best_params, best_corr = run_grid(create_model, X, y, activations_1, activations_2, activations_3, learning_rates, neurons_count, epochs, batch_sizes, optimizers)

Corr = 0.436, MAE = 2.516
{'epochs': 150, 'batch_size': 30, 'activation_1': 'relu', 'activation_2': 'relu', 'activation_3': 'relu', 'learning_rate': 0.001, 'neurons': 256, 'optimizer': <class 'keras.optimizers.sgd.SGD'>}
1/108
Corr = 0.387, MAE = 2.614
{'epochs': 150, 'batch_size': 30, 'activation_1': 'relu', 'activation_2': 'relu', 'activation_3': 'relu', 'learning_rate': 0.001, 'neurons': 256, 'optimizer': <class 'keras.optimizers.adam.Adam'>}
2/108
Corr = 0.465, MAE = 2.46
{'epochs': 150, 'batch_size': 30, 'activation_1': 'relu', 'activation_2': 'relu', 'activation_3': 'relu', 'learning_rate': 0.001, 'neurons': 512, 'optimizer': <class 'keras.optimizers.sgd.SGD'>}
3/108
Corr = 0.411, MAE = 2.553
{'epochs': 150, 'batch_size': 30, 'activation_1': 'relu', 'activation_2': 'relu', 'activation_3': 'relu', 'learning_rate': 0.001, 'neurons': 512, 'optimizer': <class 'keras.optimizers.adam.Adam'>}
4/108
Corr = 0.398, MAE = 2.73
{'epochs': 150, 'batch_size': 30, 'activation_1': 'relu', 'activ

KeyboardInterrupt: 

Manual hyperparameter search

In [238]:
def create_model():
    """Build and compile the hand-tuned regression network.

    Architecture: Dense(300, sigmoid) -> Dense(150, LeakyReLU) ->
    Dense(64, LeakyReLU) -> Dense(1, selu), compiled with SGD
    (learning rate 0.001, momentum 0.85) and mean-absolute-error loss.

    NOTE: this redefines ``create_model`` and therefore shadows the
    parameterised factory of the same name defined earlier in the notebook.

    Returns:
        The compiled ``tf.keras.Sequential`` model.
    """
    # Pin all three RNGs so each call starts from the same random state.
    np.random.seed(123)
    random.seed(123)
    tf.random.set_seed(1234)

    dense = tf.keras.layers.Dense
    hidden_spec = ((300, 'sigmoid'), (150, 'LeakyReLU'), (64, 'LeakyReLU'))

    stack = [dense(width, activation=act) for width, act in hidden_spec]
    stack.append(dense(units=1, activation='selu'))

    keras_model = tf.keras.Sequential(stack)
    sgd = tf.keras.optimizers.SGD(learning_rate=0.001, momentum=0.85)
    keras_model.compile(optimizer=sgd, loss='mean_absolute_error')
    return keras_model

In [239]:
# Cross-validate the hand-tuned model: 10 stratified, shuffled folds with a
# fixed random_state so the fold assignment is the same on every run.
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=8)

# Pooled held-out predictions and their true targets across all folds.
keras_predictions = []
y_main_true = []

for train_index, test_index in kfold.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    # Fresh model per fold; create_model() resolves to whichever definition
    # was executed last in the kernel (presumably the manual one - confirm).
    model = create_model()
    model.fit(X_train, y_train, batch_size=70, epochs=150, verbose=0)
    keras_prediction = model.predict(X_test, verbose=0).flatten()

    keras_predictions = np.concatenate((keras_predictions, keras_prediction))
    y_main_true = np.concatenate((y_main_true, y_test))
    # Per-fold Pearson correlation on this fold's held-out samples only.
    print(stats.pearsonr(keras_prediction, y_test))
    
# Overall Pearson correlation on the pooled predictions (displayed as the cell result).
stats.pearsonr(y_main_true, keras_predictions)

PearsonRResult(statistic=0.5600297059856576, pvalue=7.721593043560443e-05)
PearsonRResult(statistic=0.6565855656015634, pvalue=1.2999157760911512e-06)
PearsonRResult(statistic=0.5796692914057802, pvalue=3.7333191059363985e-05)
PearsonRResult(statistic=0.36150157943615624, pvalue=0.01722612179760342)
PearsonRResult(statistic=0.5140335874458264, pvalue=0.00042208035385956873)
PearsonRResult(statistic=0.42258422385221905, pvalue=0.004757784428009698)
PearsonRResult(statistic=0.5992540157800261, pvalue=2.1771931842210808e-05)
PearsonRResult(statistic=0.5162803758902919, pvalue=0.00039419010245248274)
PearsonRResult(statistic=0.5467894733309738, pvalue=0.00014838157875065355)
PearsonRResult(statistic=0.428928356355101, pvalue=0.0041053063960848615)


PearsonRResult(statistic=0.5066628412218657, pvalue=1.2936976058517208e-29)