In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import os
import seaborn as sns

In [None]:
# Directory holding the grid-search result files. Defaults to the original
# location; set the GRIDSEARCH_DIR environment variable to run the notebook
# on another machine without editing this cell.
GRIDSEARCH_DIR = os.environ.get('GRIDSEARCH_DIR', '/Users/jk1/Downloads')

# One JSON-lines file per grid-search version, all in the same directory
gridsearch_path_v1 = os.path.join(GRIDSEARCH_DIR, 'gridsearch_v1.jsonl')
gridsearch_path_v2 = os.path.join(GRIDSEARCH_DIR, 'gridsearch_v2.jsonl')
gridsearch_path_v3 = os.path.join(GRIDSEARCH_DIR, 'gridsearch_v3.jsonl')

In [None]:
# Options shared by all three loads: one JSON record per line, keep the
# 'timestamp' column as a plain object and disable automatic date conversion.
jsonl_read_options = dict(lines=True, dtype={'timestamp': 'object'}, convert_dates=False)

df_v1 = (
    pd.read_json(gridsearch_path_v1, **jsonl_read_options)
    .drop(0)                     # discard the row labelled 0 (presumably an invalid first record -- TODO confirm)
    .dropna(axis=1, how='all')   # then remove columns that contain no values at all
)
df_v2 = pd.read_json(gridsearch_path_v2, **jsonl_read_options)
df_v3 = pd.read_json(gridsearch_path_v3, **jsonl_read_options)

In [None]:
# sanity check: (rows, columns) of the v3 grid-search results
df_v3.shape

In [None]:
# Stack the three grid-search result tables into one frame. Each input frame
# carries its own row labels, so ignore_index=True is used to give the
# combined frame a fresh, unique 0..n-1 index instead of repeated labels.
df = pd.concat([df_v1, df_v2, df_v3], ignore_index=True)

# Rows without a 'feature_aggregation' value are treated as False
# (presumably runs from versions that did not log the option -- TODO confirm).
# The result is assigned back explicitly: calling fillna(..., inplace=True) on
# df['feature_aggregation'] operates on a temporary Series and does not
# reliably update df (it is a no-op under pandas Copy-on-Write).
df['feature_aggregation'] = df['feature_aggregation'].fillna(False)
df.shape

In [None]:
# display the combined grid-search results for visual inspection
df

In [None]:
# rank all runs by median validation score (highest first) and keep the top one
ranked_runs = df.sort_values(by='median_val_scores', ascending=False)
best_df = ranked_runs.iloc[:1]
best_df

In [None]:
# write the best configuration as a one-record JSON-lines file
# All three grid-search files live in the same directory, so any of the
# defined paths identifies the output folder (there is no plain
# `gridsearch_path` variable in this notebook).
output_dir = os.path.dirname(gridsearch_path_v3)
best_timestamp = best_df['timestamp'].iloc[0]
best_config_path = os.path.join(output_dir, f'hyperopt_selected_transformer_{best_timestamp}.json')
best_df.to_json(best_config_path, orient='records', lines=True)

In [None]:
# grid-search hyperparameters grouped as categorical ...
cat_gs_variables = [
    'num_layers',
    'model_dim',
    'batch_size',
    'balanced',
    'num_head',
]
# ... and continuous
cont_gs_variables = [
    'dropout',
    'train_noise',
    'lr',
    'weight_decay',
    'grad_clip_value',
]

In [None]:
# median validation score per `num_layers`, split by `feature_aggregation`
fig, ax = plt.subplots(figsize=(10, 10))
sns.boxplot(data=df, x='num_layers', y='median_val_scores', hue='feature_aggregation', ax=ax)
# zoom the y-axis to the 0.88-0.92 score band
ax.set(title='Number of layers', ylim=(0.88, 0.92))
plt.show()

In [None]:
# median validation score per `model_dim`, split by `feature_aggregation`
fig, ax = plt.subplots(figsize=(10, 10))
sns.boxplot(data=df, x='model_dim', y='median_val_scores', hue='feature_aggregation', ax=ax)
# zoom the y-axis to the 0.88-0.92 score band
ax.set(title='Model dimension', ylim=(0.88, 0.92))
plt.show()

In [None]:
# median validation score with vs. without `feature_aggregation`
fig, ax = plt.subplots(figsize=(10, 10))
sns.boxplot(data=df, x='feature_aggregation', y='median_val_scores', ax=ax)
# zoom the y-axis to the 0.88-0.92 score band
ax.set(title='Feature aggregation', ylim=(0.88, 0.92))
plt.show()

In [None]:
# median validation score per `batch_size`
fig, ax = plt.subplots(figsize=(10, 10))
sns.boxplot(data=df, x='batch_size', y='median_val_scores', ax=ax)
# zoom the y-axis to the 0.88-0.92 score band
ax.set(title='Batch size', ylim=(0.88, 0.92))
# render the figure explicitly so the cell does not echo the (0.88, 0.92)
# tuple returned by the last axis call
plt.show()

In [None]:
# median validation score per `balanced` setting
fig, ax = plt.subplots(figsize=(10, 10))
sns.boxplot(data=df, x='balanced', y='median_val_scores', ax=ax)
# zoom the y-axis to the 0.88-0.92 score band
ax.set(title='Balanced', ylim=(0.88, 0.92))
# render the figure explicitly so the cell does not echo the (0.88, 0.92)
# tuple returned by the last axis call
plt.show()

In [None]:
# distribution of median validation score per `num_head`
fig, ax = plt.subplots(figsize=(10, 10))
sns.violinplot(data=df, x='num_head', y='median_val_scores', ax=ax)
# zoom the y-axis to the 0.88-0.92 score band
ax.set(title='Number of heads', ylim=(0.88, 0.92))
# render the figure explicitly so the cell does not echo the (0.88, 0.92)
# tuple returned by the last axis call
plt.show()

In [None]:
# scatter plus linear regression fit of median validation score against `dropout`
fig, ax = plt.subplots(figsize=(10, 10))
sns.regplot(data=df, x='dropout', y='median_val_scores', ax=ax)
# zoom the y-axis to the 0.88-0.92 score band
ax.set(title='Dropout', ylim=(0.88, 0.92))
# render the figure explicitly so the cell does not echo the (0.88, 0.92)
# tuple returned by the last axis call
plt.show()


In [None]:
# scatter plus regression of median validation score against `train_noise`;
# logx=True fits the regression on log(x), and the x-axis is switched to a
# log scale below so the plot matches the fit
fig, ax = plt.subplots(figsize=(10, 10))
sns.regplot(data=df, x='train_noise', y='median_val_scores', logx=True, ax=ax)
# zoom the y-axis to the 0.88-0.92 score band
ax.set(title='Train noise', ylim=(0.88, 0.92))
ax.set_xscale('log')
# render the figure explicitly so the cell does not echo the return value
# of the last axis call
plt.show()

In [None]:
# median validation score against learning rate (`lr`)
fig, ax = plt.subplots(figsize=(10, 10))
sns.scatterplot(data=df, x='lr', y='median_val_scores', ax=ax)
# zoom the y-axis to the 0.88-0.92 score band
ax.set(title='Learning rate', ylim=(0.88, 0.92))
# render the figure explicitly so the cell does not echo the (0.88, 0.92)
# tuple returned by the last axis call
plt.show()

In [None]:
# median validation score against `weight_decay`
fig, ax = plt.subplots(figsize=(10, 10))
sns.scatterplot(data=df, x='weight_decay', y='median_val_scores', ax=ax)
# zoom to the 0.88-0.92 score band and to weight-decay values in [0, 0.1]
ax.set(title='Weight decay', ylim=(0.88, 0.92))
ax.set_xlim(0, 0.1)
# render the figure explicitly so the cell does not echo the (0, 0.1)
# tuple returned by the last axis call
plt.show()

In [None]:
# median validation score against `grad_clip_value`
fig, ax = plt.subplots(figsize=(10, 10))
sns.scatterplot(data=df, x='grad_clip_value', y='median_val_scores', ax=ax)
# zoom the y-axis to the 0.88-0.92 score band
ax.set(title='Gradient clipping', ylim=(0.88, 0.92))
# render the figure explicitly so the cell does not echo the (0.88, 0.92)
# tuple returned by the last axis call
plt.show()