# Hyperoptimized Parameter Selection

Strategy: find the set of parameters with the best median performance over all folds.

In [None]:
import os
import os.path

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from matplotlib.collections import PolyCollection
from matplotlib.lines import Line2D
from matplotlib.patches import Patch

In [None]:
# Both grid-search result tables sit in the same output directory.
_gridsearch_dir = '/Users/jk1/temp/opsum_prediction_output/LSTM_72h_testing/3M_mRS01/2023_01_06_1847'
AUC_results_path = _gridsearch_dir + '/AUC_history_gridsearch.tsv'
CV_results = _gridsearch_dir + '/CV_history_gridsearch.tsv'

In [None]:
# Load the two tab-separated result tables (AUC history first, CV history second).
AUC_results_df, CV_results_df = (
    pd.read_csv(tsv_path, sep='\t') for tsv_path in (AUC_results_path, CV_results)
)

In [None]:
# Display the AUC results table loaded from AUC_results_path
AUC_results_df

In [None]:
# Summary statistics of the validation AUC column
AUC_results_df['auc_val'].describe()

In [None]:
# Plot train and validation AUC for each activation
print(AUC_results_df.activation.unique())
ax = sns.stripplot(x='activation', y='auc_train', data=AUC_results_df, alpha=0.3, size=2.5, color='blue')
sns.stripplot(x='activation', y='auc_val', data=AUC_results_df, ax=ax, alpha=0.3, size=2.5, color='orange')

# Use explicit proxy handles for the legend. ax.legend(['train', 'val']) attaches the
# labels to the first two artists on the axes; stripplot appears to add one collection
# per category, so with several activations both entries would point at train artists.
legend_handles = [
    Line2D([], [], linestyle='', marker='o', color='blue', label='train'),
    Line2D([], [], linestyle='', marker='o', color='orange', label='val'),
]
ax.legend(handles=legend_handles)
ax.set_title('LSTM AUC by activation')
ax.set_ylim(0.4, 1)
ax.set_ylabel('AUC')
plt.show()

In [None]:
# Base RGB triplets; each colour is exposed as an opaque (alpha 1) and a faint (alpha 0.1) RGBA tuple.
_color1_rgb = (0.9647058823529412, 0.06274509803921569, 0.403921568627451)
_color2_rgb = (0.01568627450980392, 0.6078431372549019, 0.6039215686274509)

color1_alpha1 = (*_color1_rgb, 1)
color1_alpha01 = (*_color1_rgb, 0.1)

color2_alpha1 = (*_color2_rgb, 1)
color2_alpha01 = (*_color2_rgb, 0.1)

In [None]:
categorical_variable = 'data'
print(AUC_results_df[categorical_variable].unique())
fig, [ax1, ax2] = plt.subplots(1, 2, figsize=(10, 5))

# One entry per split: (split, violin edge colour, violin face colour, violin inner, strip point colour)
split_styles = [
    ('train', color1_alpha1, color1_alpha01, None, 'magenta'),
    ('val', color2_alpha1, color2_alpha01, 'box', 'green'),
]
# One entry per panel: (axes, metric column prefix, display name, y-axis limits)
panels = [
    (ax1, 'auc', 'AUC', (0.4, 1)),
    (ax2, 'matthews', 'Matthews', (-0.2, 0.8)),
]

for panel_ax, metric, metric_label, y_limits in panels:
    # Violins first so the strip points are drawn on top of them.
    for split, edge_color, face_color, inner, _ in split_styles:
        n_existing = len(panel_ax.collections)
        # No alpha/size here: they are not violinplot parameters (older seaborn ignores
        # them, newer releases forward unknown keywords to matplotlib).
        sns.violinplot(x=categorical_variable, y=f'{metric}_{split}', data=AUC_results_df, ax=panel_ax, inner=inner)
        # Recolour only the violin bodies this call added. Filtering by type instead of
        # hard-coded collection indices skips any non-violin collections (e.g. markers of
        # the inner box) and covers every category.
        for artist in panel_ax.collections[n_existing:]:
            if isinstance(artist, PolyCollection):
                artist.set_edgecolor(edge_color)
                artist.set_facecolor(face_color)

    for split, _, _, _, point_color in split_styles:
        sns.stripplot(x=categorical_variable, y=f'{metric}_{split}', data=AUC_results_df, ax=panel_ax, alpha=0.1, size=2.5, color=point_color)

    # Explicit proxy handles: legend(['train', 'val']) would label the first two artists
    # on the axes, which are not guaranteed to be one train and one val artist.
    panel_ax.legend(handles=[
        Patch(facecolor=face_color, edgecolor=edge_color, label=split)
        for split, edge_color, face_color, _, _ in split_styles
    ])
    panel_ax.set_title(f'LSTM {metric_label} by {categorical_variable}')
    panel_ax.set_ylim(*y_limits)
    panel_ax.set_ylabel(metric_label)

plt.show()

Best Data: balanced

In [None]:
categorical_variable = 'dropout'
print(AUC_results_df[categorical_variable].unique())
fig, [[ax1, ax2], [ax3, ax4]] = plt.subplots(2, 2, figsize=(10, 5))

# One violin panel per metric: AUC on the top row, Matthews on the bottom row,
# train on the left and validation on the right.
# alpha/size are dropped: they are not violinplot parameters (older seaborn ignores them,
# newer releases forward unknown keywords to matplotlib).
panel_metrics = zip((ax1, ax2, ax3, ax4), ('auc_train', 'auc_val', 'matthews_train', 'matthews_val'))
for panel_ax, panel_metric in panel_metrics:
    sns.violinplot(x=categorical_variable, y=panel_metric, data=AUC_results_df, ax=panel_ax, palette='Spectral')

fig.set_tight_layout(True)
fig.suptitle(f'LSTM results by {categorical_variable}')

plt.show()

Best dropout: 0.2-0.4?

In [None]:
categorical_variable = 'layers'
print(AUC_results_df[categorical_variable].unique())
fig, [[ax1, ax2], [ax3, ax4]] = plt.subplots(2, 2, figsize=(10, 5))

# One violin panel per metric: AUC on the top row, Matthews on the bottom row,
# train on the left and validation on the right.
# alpha/size are dropped: they are not violinplot parameters (older seaborn ignores them,
# newer releases forward unknown keywords to matplotlib).
panel_metrics = zip((ax1, ax2, ax3, ax4), ('auc_train', 'auc_val', 'matthews_train', 'matthews_val'))
for panel_ax, panel_metric in panel_metrics:
    sns.violinplot(x=categorical_variable, y=panel_metric, data=AUC_results_df, ax=panel_ax, palette='Spectral')

fig.set_tight_layout(True)
fig.suptitle(f'LSTM results by {categorical_variable}')

plt.show()

Best layers: 1

In [None]:
categorical_variable = 'optimizer'
print(AUC_results_df[categorical_variable].unique())
fig, [[ax1, ax2], [ax3, ax4]] = plt.subplots(2, 2, figsize=(10, 5))

# One violin panel per metric: AUC on the top row, Matthews on the bottom row,
# train on the left and validation on the right.
# alpha/size are dropped: they are not violinplot parameters (older seaborn ignores them,
# newer releases forward unknown keywords to matplotlib).
panel_metrics = zip((ax1, ax2, ax3, ax4), ('auc_train', 'auc_val', 'matthews_train', 'matthews_val'))
for panel_ax, panel_metric in panel_metrics:
    sns.violinplot(x=categorical_variable, y=panel_metric, data=AUC_results_df, ax=panel_ax, palette='Spectral')

fig.set_tight_layout(True)

# set figure title
fig.suptitle(f'LSTM results by {categorical_variable}')

plt.show()

In [None]:
categorical_variable = 'units'
print(AUC_results_df[categorical_variable].unique())
fig, [[ax1, ax2], [ax3, ax4]] = plt.subplots(2, 2, figsize=(10, 10))

# One violin panel per metric: AUC on the top row, Matthews on the bottom row,
# train on the left and validation on the right.
# alpha/size are dropped: they are not violinplot parameters (older seaborn ignores them,
# newer releases forward unknown keywords to matplotlib).
panel_metrics = zip((ax1, ax2, ax3, ax4), ('auc_train', 'auc_val', 'matthews_train', 'matthews_val'))
for panel_ax, panel_metric in panel_metrics:
    sns.violinplot(x=categorical_variable, y=panel_metric, data=AUC_results_df, ax=panel_ax, palette='Spectral')

fig.set_tight_layout(True)

# set figure title
fig.suptitle(f'LSTM results by {categorical_variable}')

plt.show()

In [None]:
# Preview the first rows of the AUC results table
AUC_results_df.head()

## Finding best overall parameters

(best median performance over all folds)

In [None]:
# Median auc_val of each parameter combination over all CV folds (cv_num), best first
parameter_columns = ['activation', 'batch', 'data', 'layers', 'masking', 'units', 'optimizer', 'outcome', 'dropout']
median_auc_val = AUC_results_df.groupby(parameter_columns)['auc_val'].median()
median_auc_val.sort_values(ascending=False)

In [None]:
# Median matthews_val of each parameter combination, best first
parameter_columns = ['activation', 'batch', 'data', 'layers', 'masking', 'units', 'optimizer', 'outcome', 'dropout']
median_matthews_val = AUC_results_df.groupby(parameter_columns)['matthews_val'].median()
median_matthews_val.sort_values(ascending=False)

In [None]:
# Rank parameter combinations by median auc_val and keep the top one as a one-row frame.
parameter_columns = ['activation', 'batch', 'data', 'layers', 'masking', 'units', 'optimizer', 'outcome', 'dropout']
ranked_combinations = (
    AUC_results_df
    .groupby(parameter_columns)['auc_val']
    .median()
    .sort_values(ascending=False)
    .reset_index()
)
# First row as a Series, turned back into a single-row DataFrame (columns = parameters + auc_val)
best_overall_parameters = ranked_combinations.iloc[0].to_frame().T
best_overall_parameters

In [None]:
# Select every row (one per CV fold) that belongs to the best overall parameter combination.
# A combination is identified by all nine columns used when grouping for the median;
# matching only a subset would also pull in rows from other activation / batch / masking /
# outcome settings whenever those vary in the grid.
parameter_columns = ['activation', 'batch', 'data', 'layers', 'masking', 'units', 'optimizer', 'outcome', 'dropout']
column_matches = [
    AUC_results_df[column] == best_overall_parameters[column].iloc[0]
    for column in parameter_columns
]
is_best_combination = column_matches[0]
for column_match in column_matches[1:]:
    is_best_combination = is_best_combination & column_match

overall_best_parameter_result = AUC_results_df[is_best_combination]
overall_best_parameter_result

In [None]:
# CV fold (cv_num) with the highest validation AUC among the rows of the best combination
best_fold = overall_best_parameter_result.sort_values('auc_val', ascending=False)['cv_num'].iloc[0]
best_overall_parameters['best_fold'] = best_fold
# Print the scalar; formatting the DataFrame column would print a whole Series repr
# (index, name and dtype) instead of just the fold number.
print(f"Best CV fold: {best_fold}")

In [None]:
# Write the selected parameters into the directory that holds the AUC results file
dirname = os.path.split(AUC_results_path)[0]
best_parameters_path = os.path.join(dirname, 'selected_best_hyperopt_parameters.csv')
best_overall_parameters.to_csv(best_parameters_path)

## Overall Best fold

In [None]:
# Display the full AUC results table again
AUC_results_df

## Best individual parameters

(parameters of the single run with the highest validation AUC)

In [None]:
# Select every row that shares its parameter combination with the single best run
# (the row with the highest auc_val).
# The frame is sorted once instead of once per compared column, and all nine parameter
# columns are matched so that rows from other activation / batch / masking / outcome
# settings are not mixed in.
parameter_columns = ['activation', 'batch', 'data', 'layers', 'masking', 'units', 'optimizer', 'outcome', 'dropout']
runs_by_auc_val = AUC_results_df.sort_values('auc_val', ascending=False)

column_matches = []
for column in parameter_columns:
    best_value = runs_by_auc_val[column].iloc[0]
    if pd.isna(best_value):
        # NaN never compares equal, so a missing parameter is matched with isna()
        column_matches.append(AUC_results_df[column].isna())
    else:
        column_matches.append(AUC_results_df[column] == best_value)

is_best_run_combination = column_matches[0]
for column_match in column_matches[1:]:
    is_best_run_combination = is_best_run_combination & column_match

individual_best_parameter_result = AUC_results_df[is_best_run_combination]
individual_best_parameter_result

In [None]:
# (median matthews_val, median auc_val) over the rows of the individually best configuration
tuple(
    individual_best_parameter_result[metric_column].median()
    for metric_column in ('matthews_val', 'auc_val')
)

In [None]:
# Rows of the CV history whose data / dropout / layers / optimizer / units equal those of
# the run with the highest auc_val in the AUC results table.
# NOTE(review): only these five columns are compared; whether CV_results_df also carries
# the remaining grid-search columns is not visible here - confirm before extending.
runs_by_auc_val = AUC_results_df.sort_values('auc_val', ascending=False)
cv_column_matches = [
    CV_results_df[column] == runs_by_auc_val[column].iloc[0]
    for column in ('data', 'dropout', 'layers', 'optimizer', 'units')
]
cv_row_is_best = cv_column_matches[0]
for cv_column_match in cv_column_matches[1:]:
    cv_row_is_best = cv_row_is_best & cv_column_match

individual_best_parameter_CV_df = CV_results_df[cv_row_is_best]


In [None]:
# Preview the first rows of the CV history selected for the individually best configuration
individual_best_parameter_CV_df.head()