In [None]:
# Run this cell to set up the environment:
# it installs the packages listed in requirements.txt via the %pip magic.
%pip install -r requirements.txt

In [1]:
import pandas as pd
from processing_utils import *
from rbf import RBF
from rbf_optimization import objective
import optuna
from sklearn.metrics import r2_score, mean_squared_error
import random
import numpy as np  # explicit import: `np` is used below and must not depend on the wildcard import above
%reload_ext autoreload
%autoreload 2

### RBF Network Regression

In [None]:
# Read the raw data from disk.
raw_dataset = pd.read_csv('./dataset/data.csv')

# Step 1: prune the raw table (NaN columns and duplicates are requested to be
# removed, NaN lines are kept -- see the flags below).
pruned_dataset = prune_dataset_lines(
    raw_dataset,
    remove_nan_lines=False,
    remove_nan_cols=True,
    remove_duplicates=True,
)

# Step 2: encode the SMILES column using the 'count_encoding' strategy.
dataset = encode_smiles_column_of(pruned_dataset, strategy='count_encoding')

# Step 3: split into train / validation / test sets, kept as pandas objects
# (as_numpy=False). random_state=None is passed through unchanged.
X_train, y_train, X_val, y_val, X_test, y_test = get_train_data(
    dataset,
    targets_columns=['Energy_(kcal/mol)', 'Energy DG:kcal/mol)'],
    random_state=None,
    as_numpy=False,
)

### Example of hyperparameter optimization with Optuna

In [None]:
# Baseline: an RBF network with hand-picked hyperparameters, scored on the
# validation split. The RBF class is asked to normalize the data itself
# (normalize=True), so no manual scaling is done here.
rbf = RBF(n_clusters=8, sigma=3.14, normalize=True).fit(X_train, y_train)
y_pred = rbf.predict(X_val)
# The score used to be a bare expression in the middle of the cell, so it was
# computed and silently discarded; print it so the baseline is actually visible.
print("Baseline validation R2 =", r2_score(y_val, y_pred))

# Hyperparameter search: Optuna maximizes the validation R2 returned by
# `objective` over 10 trials.
study = optuna.create_study(
    direction='maximize',
    study_name="RBF hyperparameters optimization"
)

study.optimize(
    lambda trial: objective(
        trial, X_train, y_train, X_val, y_val, normalize=True, metric=r2_score),
    n_trials=10
)

In [None]:
# Refit an RBF network with the best hyperparameters found by the study,
# then report its metrics on the test split.
best_params = study.best_params
rbf = RBF(best_params["n_clusters"], best_params["sigma"], normalize=True).fit(X_train, y_train)
y_pred = rbf.predict(X_test)

test_mse = mean_squared_error(y_test, y_pred)
test_r2 = r2_score(y_test, y_pred)

print("Test mse =", test_mse)
print("Test R2 =", test_r2)
print("best parameters:", best_params)

## Performance w.r.t. training dataset size

In [None]:
# Number of independent repetitions ("essais" = trials) of the experiment.
N_ESSAIS = 50

# Training-set fractions to evaluate: 50 evenly spaced values from 0.2 to 1.
percentages = np.linspace(0.2, 1, num=50)

# One (initially empty) list of results per repetition; each list is meant to
# receive one value per entry of `percentages`.
mses = [list() for _ in range(N_ESSAIS)]
r2_scores = [list() for _ in range(N_ESSAIS)]

In [None]:
from pathlib import Path

# Best hyperparameters picked by Optuna for every (repetition, percentage)
# pair, appended in iteration order.
cluster_numbers = []
standard_deviations = []
# NOTE(review): `optimization` is not used anywhere in the visible notebook;
# kept in case a later cell relies on it.
optimization = np.arange(start=0, stop=N_ESSAIS * len(percentages), step=1)

# Re-create the accumulators here: they are turned into numpy arrays at the end
# of this cell, so re-running the cell on the old objects would fail on
# `.append`. One list per repetition, one value per percentage.
mses = [[] for _ in range(N_ESSAIS)]
r2_scores = [[] for _ in range(N_ESSAIS)]

# Create the output directory up front so that a missing folder cannot make
# np.save fail after the (long) computation has finished.
path = Path('./results/rbf')
path.mkdir(parents=True, exist_ok=True)

# this took 15 min on my machine with N_ESSAIS = 5
for k in range(N_ESSAIS):
    # One random stream per repetition, shared by every split and subsample below.
    seed = random.randint(0, 10000)
    random_state = np.random.RandomState(seed)

    for p in percentages:

        X_train, y_train, X_val, y_val, X_test, y_test = get_train_data(
            dataset,
            targets_columns=['Energy_(kcal/mol)', 'Energy DG:kcal/mol)'],
            random_state=random_state,
            as_numpy=False,
        )

        # Keep only a fraction `p` of the training split. Previously `p` was
        # used solely in the study name, so every "percentage" trained on the
        # full split and the experiment did not measure anything.
        # Positional sampling (iloc) is used so duplicate index labels cannot
        # select extra rows; validation and test splits are left untouched.
        n_kept = max(1, int(round(p * len(X_train))))
        kept_rows = random_state.choice(len(X_train), size=n_kept, replace=False)
        X_train_sub = X_train.iloc[kept_rows]
        y_train_sub = y_train.iloc[kept_rows]

        study = optuna.create_study(
            direction='maximize',
            study_name=f"RBF hyperparameters optimization for percentage p={p} Essai={k}"
        )

        # The lambda is consumed immediately by study.optimize, so capturing
        # the loop variables by reference is safe here.
        study.optimize(
            lambda trial: objective(
                trial, X_train_sub, y_train_sub, X_val, y_val, normalize=True, metric=r2_score),
            n_trials=10,
            # n_jobs=-1
        )

        # Refit with the best hyperparameters on the same subsample and score
        # on the test split.
        rbf = RBF(
            study.best_params["n_clusters"],
            study.best_params["sigma"],
            normalize=True
        ).fit(X_train_sub, y_train_sub)
        y_pred = rbf.predict(X_test)
        mses[k].append(mean_squared_error(y_test, y_pred))
        r2_scores[k].append(r2_score(y_test, y_pred))

        cluster_numbers.append(study.best_params["n_clusters"])
        standard_deviations.append(study.best_params["sigma"])


# Shape (N_ESSAIS, len(percentages)); np.save appends the '.npy' extension.
mses = np.array(mses)
r2_scores = np.array(r2_scores)

np.save(path/'rbf_mses_var_percentage', mses)
np.save(path/'rbf_scores_var_percentage', r2_scores)


In [None]:
import matplotlib.pyplot as plt
from pathlib import Path

path = Path('./results/rbf')

# Reload the saved results so this cell can be run without repeating the
# experiment. (The files used to be re-saved right after loading, which only
# rewrote identical data; that redundant write has been removed.)
mses = np.load(path/'rbf_mses_var_percentage.npy')
r2_scores = np.load(path/'rbf_scores_var_percentage.npy')

plt.figure(figsize=(20, 8))

# ---- Left panel: test MSE (log scale) ----
plt.subplot(1, 2, 1)
plt.title("Mean Squares Error")
plt.xlabel("Train Data Percentage")
plt.ylabel("Test MSE")
plt.grid(True)

# Mean and standard deviation across repetitions (axis 0), per percentage.
final_mses = np.mean(mses, axis=0)
mses_error = np.std(mses, axis=0)

# Discard percentages whose mean MSE is 10000 or more in absolute value, then
# skip the first two remaining points.
# NOTE(review): the threshold and the [2:] offset are empirical choices -- confirm.
rows = np.abs(final_mses) < 10000
final_mses_chosen = final_mses[rows][2:]
mses_error_chosen = mses_error[rows][2:]
percentages_chosen = percentages[rows][2:]

# The curves show the RBF network; the legend used to say 'Linear Regression'.
plt.semilogy(percentages_chosen, final_mses_chosen, label='RBF Network')
plt.legend()
plt.fill_between(percentages_chosen, final_mses_chosen - mses_error_chosen, final_mses_chosen + mses_error_chosen, alpha=0.2, edgecolor='#1B2ACC', facecolor='#089FFF')

# ---- Right panel: test R2 (linear scale) ----
plt.subplot(1, 2, 2)
plt.title("R2 score")
plt.xlabel("Train Data Percentage")
plt.ylabel("Test R2 score")
plt.grid(True)
plt.yscale('linear')

final_r2_score = np.mean(r2_scores, axis=0)
r2_score_error = np.std(r2_scores, axis=0)

# Same filtering as for the MSE panel, applied to the mean R2 values.
r2_rows = np.abs(final_r2_score) < 10000
final_r2_score_chosen = final_r2_score[r2_rows][2:]
r2_score_error_chosen = r2_score_error[r2_rows][2:]
r2_percentages_chosen = percentages[r2_rows][2:]

plt.plot(r2_percentages_chosen, final_r2_score_chosen, label='RBF Network')
plt.legend()
plt.fill_between(r2_percentages_chosen, final_r2_score_chosen - r2_score_error_chosen, final_r2_score_chosen + r2_score_error_chosen, alpha=0.2, edgecolor='#1B2ACC', facecolor='#089FFF')

## Cross Validation Score

In [2]:
from sklearn.model_selection import KFold
from sklearn.metrics import r2_score

def cross_validation_of(rbf_network: RBF, X: pd.DataFrame, y: pd.DataFrame, n_splits=5):
    """Compute the K-fold cross-validation R2 scores of an RBF network.

    The network is refitted from scratch on each training fold, so the object
    passed in is modified: after the call it holds the fit of the last fold.

    Parameters
    ----------
    rbf_network : RBF
        Model exposing `fit(X, y)` and `predict(X)`.
    X : pd.DataFrame
        Feature table, one row per sample.
    y : pd.DataFrame
        Single target: either 1-D or with exactly one column.
    n_splits : int, default 5
        Number of folds handed to `KFold` (no shuffling is requested).

    Returns
    -------
    list of float
        One R2 score per fold, in fold order.

    Raises
    ------
    AssertionError
        If `y` holds more than one target column.
    """
    assert y.ndim == 1 or y.shape[1] == 1

    # Convert once instead of on every fold; targets are kept as a column vector.
    X_values = X.to_numpy()
    y_values = y.to_numpy().reshape(-1, 1)

    kf = KFold(n_splits=n_splits)
    cross_validation_scores = []

    for train_index, test_index in kf.split(X_values):
        rbf_network.fit(X_values[train_index], y_values[train_index])
        y_pred = rbf_network.predict(X_values[test_index])
        # r2_score expects (y_true, y_pred) and is not symmetric: the arguments
        # were previously swapped, which yields a different (wrong) score.
        cross_validation_scores.append(r2_score(y_values[test_index], y_pred))

    return cross_validation_scores

#### Computation of Best RBF Network

In [None]:
# Longer hyperparameter search (200 trials) maximizing the validation R2.
study = optuna.create_study(
    direction='maximize',
    study_name="RBF hyperparameters optimization"
)

study.optimize(
    lambda trial: objective(
        trial, X_train, y_train, X_val, y_val, normalize=True, metric=r2_score),
    n_trials=200
)

# Data set used by the cross-validation cell: train and test splits stacked.
# NOTE(review): the validation split is not included here -- confirm this is intended.
X = pd.concat([X_train, X_test])
y = pd.concat([y_train, y_test])

final_rbf = RBF(study.best_params["n_clusters"], study.best_params["sigma"], normalize=True).fit(X_train, y_train)
# Predict with the network fitted just above. This used to call the stale
# `rbf` object left over from an earlier cell, so the reported metrics did not
# belong to `final_rbf`.
y_pred = final_rbf.predict(X_test)

print("Test mse =", mean_squared_error(y_test, y_pred))
print("Test R2 =", r2_score(y_test, y_pred))
print("best parameters:", study.best_params)

#### Computation of its Cross Validation Score

In [None]:
# Per-fold cross-validation scores of the final network on the stacked X / y data.
cv_scores = cross_validation_of(final_rbf, X, y)
print("RBF Cross Validation Score =", cv_scores)