In [1]:
import wandb
import pandas as pd
import matplotlib.pyplot as plt

In [33]:

# Client for the Weights & Biases public API; used below to fetch sweeps.
api = wandb.Api()

# Task name; it is interpolated into the W&B project path when sweeps are fetched.
task = 'SSC'

# IDs of the sweeps whose runs are aggregated together.
sweep_ids = [
    'wauv85y4',
    'gfwkoa2e',
    'i81h7lqv',
    '7befnpfs',
]

# Hyperparameter columns that together identify one configuration when
# runs are grouped and averaged.
sweep_params = [
    'lr',
    'pdrop',
    'scheduler_patience',
    'scheduler_factor',
    'dt_max',
]

In [34]:
def _test_acc_at_best_valid(history, offset=1):
    """Return the test accuracy paired with a run's best validation accuracy.

    Args:
        history: DataFrame of logged metrics for one run (as returned by
            ``run.history()``).
        offset: How many index positions after the best 'valid acc' row the
            matching 'test acc' value is read from. Defaults to 1.
            NOTE(review): presumably the training script logs 'test acc' in
            the step right after 'valid acc' -- confirm against the logger.

    Returns:
        The 'test acc' value at ``best_valid_index + offset``, or None when
        either column is missing, no validation accuracy was logged, the
        target row does not exist, or the value found there is NaN.
    """
    if 'valid acc' not in history.columns or 'test acc' not in history.columns:
        return None

    # Drop NaNs first so idxmax is never called on an empty / all-NaN column
    # (that raises or warns depending on the pandas version).
    valid_acc = history['valid acc'].dropna()
    if valid_acc.empty:
        return None

    test_idx = valid_acc.idxmax() + offset
    if test_idx not in history.index:
        return None

    test_acc = history['test acc'].loc[test_idx]
    # A NaN here means no test accuracy was logged on that row; treat it as
    # missing instead of recording a run with an unusable value.
    if pd.isna(test_acc):
        return None
    return test_acc


# Maps each output column to the run-config key it is read from.
config_columns = {
    'n_layers': 'nb_layers',
    'n_hiddens': 'nb_hiddens',
    'lr': 'lr',
    'dt_min': 'dt_min',
    'dt_max': 'dt_max',
    'pdrop': 'pdrop',
    'scheduler_patience': 'scheduler_patience',
    'scheduler_factor': 'scheduler_factor',
}

# One record (dict) per run that has a usable test accuracy.
data = []

for sweep_id in sweep_ids:
    # Full sweep path in the form <entity>/<project>/<sweep id>; kept in its
    # own variable so the loop variable is not overwritten.
    sweep_path = 'maximes_crew/S3_' + task + '_runs/' + sweep_id
    sweep = api.sweep(sweep_path)

    # Extract the necessary data from each run in the sweep
    for run in sweep.runs:
        config = run.config
        # NOTE(review): run.history() returns a sampled history by default;
        # for long runs the best epoch (or the row after it) may be missing
        # from the sample -- confirm the runs are short enough.
        history = run.history()

        test_acc = _test_acc_at_best_valid(history)
        if test_acc is None:
            continue

        record = {column: config.get(key) for column, key in config_columns.items()}
        record['test_acc'] = test_acc
        data.append(record)



In [35]:
# Show the first collected record to sanity-check the fields and values.
data[0]

{'n_layers': 3,
 'n_hiddens': 512,
 'lr': 0.01,
 'dt_min': 0.01,
 'dt_max': 25.2,
 'pdrop': 0.55,
 'scheduler_patience': 10,
 'scheduler_factor': 0.7,
 'test_acc': 0.8156608484968355}

In [36]:
# Convert the collected run records to a DataFrame for easier manipulation
df = pd.DataFrame(data)

# One row per hyperparameter configuration (unique combination of
# sweep_params): mean test accuracy and number of runs, in a single pass.
df_grouped = (
    df.groupby(sweep_params)
    .agg(
        test_acc=('test_acc', 'mean'),
        run_count=('test_acc', 'size'),
    )
    .reset_index()
)

# Also annotate every individual run with the size of its configuration
# group, so per-run views of df can still use 'run_count'.
df['run_count'] = df.groupby(sweep_params)['test_acc'].transform('size')

# Best configurations first.
df_grouped_sorted = df_grouped.sort_values(by='test_acc', ascending=False).reset_index(drop=True)

In [37]:
# Ten best configurations by mean test accuracy.
df_grouped_sorted.head(10)

Unnamed: 0,lr,pdrop,scheduler_patience,scheduler_factor,dt_max,test_acc,run_count
0,0.01,0.45,10,0.7,5.4,0.81403,12
1,0.01,0.45,10,0.7,30.0,0.813621,12
2,0.008,0.55,10,0.7,29.0,0.812853,12
3,0.01,0.55,10,0.7,25.2,0.811167,12
