In [None]:
# Notebook configuration: inline matplotlib output, plus the autoreload extension so that
# edits to imported modules are picked up without restarting the kernel.
%matplotlib inline
%reload_ext autoreload
%autoreload 2

In [None]:
from typing import *
import numpy as np
from functools import partial
from fastprogress import progress_bar
import pandas as pd
import h5py

from lumin.plotting.results import plot_roc

import torch
from torch import Tensor, nn
import torch.nn.functional as F
from torch._vmap_internals import _vmap as vmap

from tomopt.volume import *
from tomopt.muon import *
from tomopt.inference import *
from tomopt.optimisation import *
from tomopt.core import *
from tomopt.utils import *
from tomopt.plotting import *

import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Load the table analysed throughout this notebook. Later cells read its 'auc' column and
# treat every column whose name contains 'z' as a detector parameter.
df = pd.read_csv('z_sep_data.csv')

In [None]:
# Display the freshly loaded table for a quick visual check.
df

In [None]:
from skopt.space import Real, Integer, Categorical
from skopt.utils import use_named_args
from skopt import gp_minimize, Optimizer
from skopt.plots import plot_evaluations, plot_objective

In [None]:
# Detector parameters: every column of `df` whose name contains the letter 'z'.
det_pars = list(filter(lambda col: 'z' in col, df.columns))

In [None]:
# One continuous search dimension per detector parameter, bounded by the
# minimum and maximum values observed for that parameter in `df`.
space = [
    Real(df[par].min(), df[par].max(), name=par)
    for par in det_pars
]

In [None]:
# Build a Gaussian-process optimisation result purely from the existing data:
# every row of `df` is supplied as an already-evaluated point (x0) with the negated
# AUC as its objective value (y0), and n_calls=0 requests no further evaluations,
# so the constant lambda is only a placeholder objective.
# The AUC is negated presumably because gp_minimize minimises its objective.
opt = gp_minimize(
    lambda x: 0,
    space,
    n_random_starts=0,
    x0=df[det_pars].values.tolist(),
    y0=-df['auc'],
    n_calls=0,
    verbose=1,
    random_state=0,
);

In [None]:
# Plot the points stored in `opt`, one dimension per detector parameter.
# Assigning to `_` keeps the return value out of the cell output.
_ = plot_evaluations(opt, dimensions=det_pars)

In [None]:
# Plot the objective modelled by `opt` against each detector parameter.
# Note that the values fed to the model were -AUC, not AUC.
# Assigning to `_` keeps the return value out of the cell output.
_ = plot_objective(opt, dimensions=det_pars)

## Extra feats

In [None]:
# Partition the detector parameters by name: those containing 'az' and those containing 'bz'.
above_z = [par for par in det_pars if 'az' in par]
below_z = [par for par in det_pars if 'bz' in par]

In [None]:
# Per-row mean of the 'az' parameters and of the 'bz' parameters.
df['mean_z_above'] = df[above_z].values.mean(axis=-1)
df['mean_z_below'] = df[below_z].values.mean(axis=-1)

In [None]:
# Per-row (population, ddof=0) standard deviation of the 'az' and of the 'bz' parameters.
df['std_z_above'] = df[above_z].values.std(axis=-1)
df['std_z_below'] = df[below_z].values.std(axis=-1)

In [None]:
# Difference between the two group means, and between the az0 and bz3 columns.
df['central_seperation'] = df['mean_z_above'] - df['mean_z_below']
df['max_seperation'] = df['az0'] - df['bz3']

In [None]:
# Extent of each group: first column (index 0) minus last column (index 3).
df['z_width_above'] = df['az0'] - df['az3']
df['z_width_below'] = df['bz0'] - df['bz3']

In [None]:
# Differences between consecutive 'az' columns.
df['az01_sep'] = df['az0'] - df['az1']
df['az12_sep'] = df['az1'] - df['az2']
df['az23_sep'] = df['az2'] - df['az3']

In [None]:
# Differences between consecutive 'bz' columns.
df['bz01_sep'] = df['bz0'] - df['bz1']
df['bz12_sep'] = df['bz1'] - df['bz2']
df['bz23_sep'] = df['bz2'] - df['bz3']

In [None]:
# Within each group: mean of columns 0 and 1 minus mean of columns 2 and 3, per row.
az01_centre = df[['az0', 'az1']].values.mean(axis=-1)
az23_centre = df[['az2', 'az3']].values.mean(axis=-1)
df['az01_az23_centre_sep'] = az01_centre - az23_centre

bz01_centre = df[['bz0', 'bz1']].values.mean(axis=-1)
bz23_centre = df[['bz2', 'bz3']].values.mean(axis=-1)
df['bz01_bz23_centre_sep'] = bz01_centre - bz23_centre

In [None]:
# Display the table again, now including the engineered feature columns added above.
df

In [None]:
# Engineered features: every column that is neither 'auc', 'unc', nor a detector parameter.
non_feature_cols = {'auc', 'unc', *det_pars}
extra_feats = [col for col in df.columns if col not in non_feature_cols]

In [None]:
# One scatter plot per engineered feature: feature value against ROC AUC.
for f in extra_feats:
    fig, ax = plt.subplots()
    ax.scatter(df[f], df['auc'], alpha=0.5)
    ax.set_xlabel(f)
    ax.set_ylabel('ROC AUC')
    plt.show()

In [None]:
# Hand-picked subset of the engineered features.
# NOTE(review): `check_feats` is not referenced by any other cell visible in this notebook;
# the GP fit in the following cell uses `extra_feats` instead. Confirm which list was intended.
check_feats = ['az01_az23_centre_sep', 'z_width_above', 'std_z_above']

In [None]:
# Same construction as for the detector parameters, but over the engineered features:
# all rows of `df` are supplied as already-evaluated points (x0, with -AUC as y0) and
# n_calls=0 requests no further evaluations, so the constant lambda is only a placeholder.
check_opt = gp_minimize(
    lambda x: 0,
    [
        Real(df[feat].min(), df[feat].max(), name=feat)
        for feat in extra_feats
    ],
    n_random_starts=0,
    x0=df[extra_feats].values.tolist(),
    y0=-df['auc'],
    n_calls=0,
    verbose=1,
    random_state=0,
);

In [None]:
# Plot the points stored in `check_opt`, one dimension per engineered feature.
# Assigning to `_` keeps the return value out of the cell output.
_ = plot_evaluations(check_opt, dimensions=extra_feats)

In [None]:
# Plot the objective modelled by `check_opt` against each engineered feature.
# Note that the values fed to the model were -AUC, not AUC.
# Assigning to `_` keeps the return value out of the cell output.
_ = plot_objective(check_opt, dimensions=extra_feats)

## RF on det pars

In [None]:
from sklearn.model_selection import train_test_split

# 80/20 split of the row labels of `df`, with a fixed seed for reproducibility.
trn_idxs, val_idxs = train_test_split(
    df.index,
    test_size=0.2,
    random_state=1111,
)
# Show the number of training and validation rows.
(len(trn_idxs), len(val_idxs))

In [None]:
from lumin.optimisation.hyper_param import get_opt_rf_params
from collections import OrderedDict

# Tune and fit a random-forest regressor predicting AUC from the detector parameters.
# The model is fitted on the training rows and scored on the held-out validation rows
# (trn_idxs / val_idxs from the train/validation split made earlier in the notebook).
# Previously the full `df` was passed as both sets, so the hyper-parameter scan was
# scored on the very rows the forest had been fitted on.
rf_params, rf = get_opt_rf_params(
    x_trn=df.loc[trn_idxs, det_pars], y_trn=df.loc[trn_idxs, 'auc'],
    x_val=df.loc[val_idxs, det_pars], y_val=df.loc[val_idxs, 'auc'],
    objective='regression',
    n_estimators=100, verbose=False,
    # Grid of hyper-parameter values to scan
    params=OrderedDict({'min_samples_leaf': [2,4,8,16,32],
                        'max_features': [0.3,0.5,0.7,0.9]}))

In [None]:
# Display the random forest returned by the hyper-parameter search above.
rf

In [None]:
from lumin.plotting.interpretation import plot_1d_partial_dependence, plot_2d_partial_dependence

In [None]:
# 1D partial-dependence plot of the fitted forest for each detector parameter.
# All plots share a y-axis range spanning the AUC values observed in `df`.
auc_range = (df['auc'].min(), df['auc'].max())
for f in det_pars:
    plot_1d_partial_dependence(
        rf,
        df=df,
        feat=f,
        train_feats=det_pars,
        pdp_isolate_kargs={'percentile_range':(1,99)},
        y_lim=auc_range,
    )

## RF on extra feats

In [None]:
from lumin.optimisation.hyper_param import get_opt_rf_params
from collections import OrderedDict

# Tune and fit a random-forest regressor predicting AUC from the engineered features.
# The model is fitted on the training rows and scored on the held-out validation rows
# (trn_idxs / val_idxs from the train/validation split made earlier in the notebook).
# Previously the full `df` was passed as both sets, so the hyper-parameter scan was
# scored on the very rows the forest had been fitted on.
rf_params, rf = get_opt_rf_params(
    x_trn=df.loc[trn_idxs, extra_feats], y_trn=df.loc[trn_idxs, 'auc'],
    x_val=df.loc[val_idxs, extra_feats], y_val=df.loc[val_idxs, 'auc'],
    objective='regression',
    n_estimators=100, verbose=False,
    # Grid of hyper-parameter values to scan
    params=OrderedDict({'min_samples_leaf': [2,4,8,16,32],
                        'max_features': [0.3,0.5,0.7,0.9]}))

In [None]:
# Display the random forest returned by the hyper-parameter search above.
rf

In [None]:
from lumin.plotting.interpretation import plot_1d_partial_dependence, plot_2d_partial_dependence

In [None]:
# 1D partial-dependence plot of the fitted forest for each engineered feature.
# All plots share a y-axis range spanning the AUC values observed in `df`.
auc_range = (df['auc'].min(), df['auc'].max())
for f in extra_feats:
    plot_1d_partial_dependence(
        rf,
        df=df,
        feat=f,
        train_feats=extra_feats,
        pdp_isolate_kargs={'percentile_range':(1,99)},
        y_lim=auc_range,
    )

## RF on selected feats

In [None]:
# Candidate inputs for feature selection: detector parameters followed by the engineered features.
all_feats = [*det_pars, *extra_feats]

In [None]:
from lumin.optimisation.hyper_param import get_opt_rf_params
from collections import OrderedDict

# Tune and fit a random-forest regressor predicting AUC from all candidate features.
# The model is fitted on the training rows and scored on the held-out validation rows
# (trn_idxs / val_idxs from the train/validation split made earlier in the notebook).
# Previously the full `df` was passed as both sets, so the hyper-parameter scan was
# scored on the very rows the forest had been fitted on.
rf_params, rf = get_opt_rf_params(
    x_trn=df.loc[trn_idxs, all_feats], y_trn=df.loc[trn_idxs, 'auc'],
    x_val=df.loc[val_idxs, all_feats], y_val=df.loc[val_idxs, 'auc'],
    objective='regression',
    n_estimators=100, verbose=False,
    # Grid of hyper-parameter values to scan
    params=OrderedDict({'min_samples_leaf': [2,4,8,16,32],
                        'max_features': [0.3,0.5,0.7,0.9]}))

In [None]:
# Display the random forest returned by the hyper-parameter search above.
rf

In [None]:
from lumin.optimisation.features import auto_filter_on_linear_correlation

# Filter the candidate features on linear correlation (threshold 0.8), reusing the
# already-optimised `rf_params` (optimise_rf=False).
# Training and validation frames are now the disjoint row sets from the earlier
# train/validation split; previously `df` was passed as both, so validation
# was carried out on the training rows.
filtered_feats = auto_filter_on_linear_correlation(
    train_df=df.loc[trn_idxs], val_df=df.loc[val_idxs],
    check_feats=all_feats, n_rfs=5,
    corr_threshold=0.8, rf_params=rf_params, optimise_rf=False,
    objective='regression', targ_name='auc')

In [None]:
from lumin.optimisation.features import rf_rank_features

# Rank the filtered features by random-forest importance (result discarded via `_`).
# Training and validation frames are now the disjoint row sets from the earlier
# train/validation split; previously `df` was passed as both, so importances
# were evaluated on the training rows.
_ = rf_rank_features(
    train_df=df.loc[trn_idxs], val_df=df.loc[val_idxs], objective='regression',
    train_feats=filtered_feats, targ_name='auc',
    importance_cut=0.001,
    # Grid of hyper-parameter values to scan (optimise_rf=True)
    rf_params=OrderedDict({'min_samples_leaf': [2,4,8,16,32],
                           'max_features': [0.3,0.5,0.7,0.9]}),
    n_rfs=5, optimise_rf=True)

In [None]:
from lumin.optimisation.features import repeated_rf_rank_features

# Repeat the importance ranking 10 times; min_frac_import=0.3 is the selection
# threshold passed to the library for keeping a feature.
# Training and validation frames are now the disjoint row sets from the earlier
# train/validation split; previously `df` was passed as both.
# NOTE(review): with resample_val=True the library presumably re-draws the validation
# set from the data it is given; passing disjoint frames avoids handing it every row
# twice. Confirm against the LUMIN implementation.
import_feats, fi = repeated_rf_rank_features(
    train_df=df.loc[trn_idxs], val_df=df.loc[val_idxs], targ_name='auc',
    n_reps=10, min_frac_import=0.3,
    # Grid of hyper-parameter values to scan
    rf_params=OrderedDict({'min_samples_leaf': [2,4,8,16,32],
                           'max_features': [0.3,0.5,0.7,0.9]}),
    objective='regression', resample_val=True,
    train_feats=filtered_feats, importance_cut=0.001,
    n_rfs=5, n_threads=5)

In [None]:
# Re-tune and refit the random-forest regressor on the features selected as important.
# The model is fitted on the training rows and scored on the held-out validation rows
# (trn_idxs / val_idxs from the train/validation split made earlier in the notebook).
# Previously the full `df` was passed as both sets, so the hyper-parameter scan was
# scored on the very rows the forest had been fitted on.
rf_params, rf = get_opt_rf_params(
    x_trn=df.loc[trn_idxs, import_feats], y_trn=df.loc[trn_idxs, 'auc'],
    x_val=df.loc[val_idxs, import_feats], y_val=df.loc[val_idxs, 'auc'],
    objective='regression',
    n_estimators=100, verbose=False,
    # Grid of hyper-parameter values to scan
    params=OrderedDict({'min_samples_leaf': [2,4,8,16,32],
                        'max_features': [0.3,0.5,0.7,0.9]}))

In [None]:
from lumin.optimisation.features import auto_filter_on_mutual_dependence

# Filter the important features on mutual dependence, reusing the already-optimised
# `rf_params` (optimise_rf=False). This overwrites the earlier `filtered_feats`.
# Training and validation frames are now the disjoint row sets from the earlier
# train/validation split; previously `df` was passed as both, so validation
# was carried out on the training rows.
filtered_feats = auto_filter_on_mutual_dependence(
    train_df=df.loc[trn_idxs], val_df=df.loc[val_idxs],
    check_feats=import_feats,
    objective='regression', targ_name='auc',
    rf_params=rf_params,
    optimise_rf=False)

In [None]:
# Final re-tune and refit of the random-forest regressor on the fully filtered feature set.
# The model is fitted on the training rows and scored on the held-out validation rows
# (trn_idxs / val_idxs from the train/validation split made earlier in the notebook).
# Previously the full `df` was passed as both sets, so the hyper-parameter scan was
# scored on the very rows the forest had been fitted on.
rf_params, rf = get_opt_rf_params(
    x_trn=df.loc[trn_idxs, filtered_feats], y_trn=df.loc[trn_idxs, 'auc'],
    x_val=df.loc[val_idxs, filtered_feats], y_val=df.loc[val_idxs, 'auc'],
    objective='regression',
    n_estimators=100, verbose=False,
    # Grid of hyper-parameter values to scan
    params=OrderedDict({'min_samples_leaf': [2,4,8,16,32],
                        'max_features': [0.3,0.5,0.7,0.9]}))

In [None]:
from lumin.plotting.interpretation import plot_1d_partial_dependence, plot_2d_partial_dependence

In [None]:
# 1D partial-dependence plot of the final forest for each selected feature.
# All plots share a y-axis range spanning the AUC values observed in `df`.
auc_range = (df['auc'].min(), df['auc'].max())
for f in filtered_feats:
    plot_1d_partial_dependence(
        rf,
        df=df,
        feat=f,
        train_feats=filtered_feats,
        pdp_isolate_kargs={'percentile_range':(1,99)},
        y_lim=auc_range,
    )