In [None]:
import timm
import gc
import wandb
from fastai.vision.all import *
from fastai.callback.wandb import *
from utils.core import *

# Locate the Oxford-IIIT Pet dataset. `untar_data` returns the local copy in the
# fastai data directory (fetching it first if it is missing), so the notebook no
# longer depends on a hard-coded, user-specific absolute path.
path = untar_data(URLs.PETS)
# Make `Path` objects display relative to the dataset root.
Path.BASE_PATH = path

# Table passed to `ImageDataLoaders.from_df` in `get_dls`; read from the working directory.
df = pd.read_csv('pets_sample.csv')

In [None]:
# Sweep definition whose parameter names match the `wandb.config` fields read
# by `trainer`. Entries with a single 'value' are fixed; 'min'/'max' and
# 'values' entries describe the search space.
sweep_cfg = dict(
    project='my_fastbook',
    metric=dict(name='error_rate', goal='minimize'),
    method='bayes',  # alternative kept from the original cell: 'random'
    parameters=dict(
        epochs=dict(value=15),
        pre_size=dict(value=160),
        post_size=dict(value=112),
        pre_bs=dict(value=256),
        post_bs=dict(value=192),
        pre_lr=dict(min=1e-3, max=1e-2),
        pre_epoch=dict(values=[1, 2, 3]),
        post_lr=dict(min=1e-3, max=1e-2),
        post_div=dict(min=50, max=250),
    ),
)

# Sweep definition for fine-tuning an exported learner: it carries the 'arch'
# (export file name) read by `trainer2` and optimises the 'best_value' metric
# logged by `BestValueCallback`.
sweep_cfg2 = dict(
    project='my_fastbook',
    metric=dict(name='best_value', goal='minimize'),  # alternative metric name: 'error_rate'
    method='bayes',  # alternative kept from the original cell: 'random'
    parameters=dict(
        arch=dict(value='convnext_tiny_in22k_stage0.pth'),
        epochs=dict(value=15),
        pre_size=dict(value=240),
        post_size=dict(value=168),
        pre_bs=dict(value=192),
        post_bs=dict(value=96),
        pre_lr=dict(min=5e-4, max=5e-3),
        pre_epoch=dict(values=[0, 1, 2]),
        post_lr=dict(min=5e-4, max=5e-3),
        post_div=dict(min=50, max=250),
    ),
)

def get_dls(item_tfms=None,
            batch_tfms=None,
            valid_pct=.4,
            bs=64):
    """Build the image `DataLoaders` for the module-level `df`.

    Images are read from `path/'images'`; both `df` and `path` are notebook
    globals.

    Args:
        item_tfms: per-item transform(s). `None` (the default) means a fresh
            `Resize(160)`.
        batch_tfms: batch transform(s). `None` (the default) means a fresh
            `aug_transforms(size=112)`.
        valid_pct: fraction of rows held out for validation.
        bs: batch size.

    Returns:
        The object returned by `ImageDataLoaders.from_df`.
    """
    # Create the default transforms per call. Default-argument expressions are
    # evaluated once, when the function is defined, so the previous
    # `item_tfms=Resize(160)` / `batch_tfms=aug_transforms(size=112)` defaults
    # were single objects shared by every DataLoaders built with them.
    if item_tfms is None: item_tfms = Resize(160)
    if batch_tfms is None: batch_tfms = aug_transforms(size=112)
    return ImageDataLoaders.from_df(df, path/'images',
                                    # NOTE(review): `from_df` appears to build its own random split from
                                    # `valid_pct`/`seed`, so this extra `splitter` is probably ignored
                                    # -- confirm, then drop it.
                                    splitter=RandomSplitter(seed=42),
                                    valid_pct=valid_pct,
                                    item_tfms=item_tfms,
                                    batch_tfms=batch_tfms,
                                    seed=42,
                                    bs=bs)

def get_learn():
    """Return a `vision_learner` for the global `arch` on the default `get_dls()` data, tracking `error_rate`."""
    default_dls = get_dls()
    return vision_learner(default_dls, arch, metrics=error_rate)

# Architecture name handed to `vision_learner` by the training helpers in this notebook.
arch = "convnext_tiny_in22k"

def trainer():
    """Run one sweep trial: build a learner for `arch` and train it in two phases.

    Hyperparameters are read from `wandb.config` (populated by `wandb.init()`).
    Phase 1 trains for `pre_epoch` epochs at `pre_lr` with batch size `pre_bs`;
    after `unfreeze()`, phase 2 trains the remaining `epochs - pre_epoch`
    epochs with `lr_max=(post_lr/post_div, post_lr)` and batch size `post_bs`.

    Returns:
        None. The learner is dropped and CUDA memory is released before
        returning so that consecutive trials in one process do not accumulate
        GPU memory.
    """
    wandb.init()
    set_seed(2022, True)
    hp = wandb.config

    def make_dls(bs):
        # Both phases share the image sizes; only the batch size differs.
        return get_dls(item_tfms=Resize(hp.pre_size),
                       batch_tfms=aug_transforms(size=hp.post_size),
                       bs=bs)

    learn = None
    try:
        # Use the configured DataLoaders for phase 1. Previously they were
        # built and then ignored in favour of a default `get_dls()`, so
        # `pre_bs` and the configured sizes had no effect on this phase.
        learn = vision_learner(make_dls(hp.pre_bs), arch,
                               cbs=[WandbCallback()],
                               metrics=error_rate)
        learn.fit_one_cycle(hp.pre_epoch, lr_max=hp.pre_lr)
        learn.unfreeze()
        learn.dls = make_dls(hp.post_bs)
        learn.fit_one_cycle(hp.epochs - hp.pre_epoch,
                            lr_max=(hp.post_lr/hp.post_div, hp.post_lr))
    finally:
        # Release the model even when a trial fails, so the next trial
        # started by the agent begins with as much free GPU memory as possible.
        learn = None
        gc.collect()
        torch.cuda.empty_cache()
        gc.collect()
    
class BestValueCallback(TrackerCallback):
    """Log the best monitored value seen so far to W&B as 'best_value'.

    After each epoch the parent `TrackerCallback.after_epoch` runs first, then
    `self.best` is logged so a sweep can optimise on it.
    """
    def after_epoch(self):
        super().after_epoch()
        # Log only while a W&B run is active. The previous `self.wandb` check
        # resolved through the learner and raised AttributeError whenever no
        # `WandbCallback` was attached.
        if wandb.run is not None: wandb.log({'best_value':self.best})
        
def trainer2(cfg=None):
    """Fine-tune a previously exported learner, as a sweep function or by hand.

    Args:
        cfg: optional dict of hyperparameters. When it is empty or omitted
            (as when passed as `function=` to `wandb.agent`), values come from
            the active sweep via `wandb.config`. When it is non-empty it is
            used as the run config, and its optional 'project' entry selects
            the W&B project.

    Returns:
        The trained learner when `cfg` is non-empty; otherwise `None`, after
        the learner has been dropped and CUDA memory released.
    """
    manual = bool(cfg)
    # A 'project' key inside `config` is only stored as a hyperparameter, so
    # pass it to `wandb.init` explicitly. In sweep mode this stays None.
    project = cfg.get('project') if isinstance(cfg, dict) else None
    wandb.init(project=project, config=cfg)
    set_seed(2022, True)
    hp = wandb.config

    def make_dls(bs):
        # Both phases share the image sizes; only the batch size differs.
        return get_dls(item_tfms=Resize(hp.pre_size),
                       batch_tfms=aug_transforms(size=hp.post_size),
                       bs=bs)

    learn = None
    try:
        # NOTE(review): `load_learner` unpickles the file; only load exports you created.
        learn = load_learner(hp.arch)
        learn.add_cbs([
            BestValueCallback(monitor='error_rate', comp=np.less),
            # SaveModelCallback(monitor='error_rate', comp=np.less),
            # EarlyStoppingCallback(monitor='error_rate', comp=np.less, patience=5)
        ])
        if hp.pre_epoch > 0:
            # Optional frozen warm-up phase; skipped when `pre_epoch` is 0.
            learn.freeze()
            learn.dls = make_dls(hp.pre_bs)
            learn.fit_one_cycle(hp.pre_epoch, lr_max=hp.pre_lr)
            learn.unfreeze()
        learn.dls = make_dls(hp.post_bs)
        learn.fit_one_cycle(hp.epochs - hp.pre_epoch,
                            lr_max=(hp.post_lr/hp.post_div, hp.post_lr))
        return learn if manual else None
    finally:
        if not manual:
            # Sweep mode: free the model (even after a failed trial) so
            # back-to-back trials do not pile up GPU memory.
            learn = None
            gc.collect()
            torch.cuda.empty_cache()
            gc.collect()

In [None]:
# DataLoaders with items resized to 224px and batches augmented to 196px, batch size 64.
dls = get_dls(item_tfms=Resize(224),
              batch_tfms=aug_transforms(size=196),
              bs=64)
# Train on the DataLoaders configured above; calling `get_dls()` again here
# would silently fall back to the default sizes and leave `dls` unused.
learn = vision_learner(dls, arch,
                       # cbs=[WandbCallback()],
                       metrics=error_rate)


In [None]:

# learn.summary()

# Total params: 28,621,440
# Total trainable params: 824,256
# Total non-trainable params: 27,797,184

In [None]:
# NOTE: this cell held the unfinished statement `from fastai.callback.`, which
# is a syntax error and stops "Restart & Run All". It is disabled until the
# intended module name is filled in.
# from fastai.callback.

In [None]:
# Replace the learner's loss function with `LabelSmoothingCrossEntropyFlat`.
learn.loss_func = LabelSmoothingCrossEntropyFlat()

In [None]:
# Train for a single one-cycle epoch with the learner's current settings.
learn.fit_one_cycle(1)

epoch,train_loss,valid_loss,error_rate,time
0,4.410619,4.115881,0.811512,01:01


In [None]:
# Hyperparameters for a single manual (non-sweep) run from the stage-0 export.
#
# This cell previously assigned `cfg` twice in a row, so the first dict was
# dead code. The superseded values are kept here for reference:
#   commented-out variant (no 'arch'): pre_lr=.009463, post_lr=.005208, post_div=115
#   overwritten variant:               pre_lr=.006337, post_lr=.008101, post_div=202
# All other entries were identical to the dict below.
cfg = {
    'project': 'my_fastbook',
    'arch': 'convnext_tiny_in22k_stage0.pth',
    'epochs': 15,
    'pre_size': 160,
    'post_size': 112,
    'pre_bs': 256,
    'post_bs': 192,
    'pre_lr': 1e-3,
    'pre_epoch': 2,
    'post_lr': 1e-3,
    'post_div': 202
}

def runner(cfg=None):
    """Train a learner for `arch` in two phases and export it as stage 0.

    Args:
        cfg: dict of hyperparameters used as the W&B run config. It must
            provide `pre_size`, `post_size`, `pre_bs`, `post_bs`, `pre_epoch`,
            `pre_lr`, `epochs`, `post_lr` and `post_div`; an optional
            'project' entry selects the W&B project.

    Side effects:
        Starts a W&B run and writes `<learn.arch>_stage0.pth` via
        `learn.export` with `learn.path` set to the current directory.
    """
    # A 'project' key inside `config` is only stored as a hyperparameter, so
    # pass it to `wandb.init` explicitly.
    project = cfg.get('project') if isinstance(cfg, dict) else None
    wandb.init(project=project, config=cfg)
    set_seed(2022, True)
    hp = wandb.config

    def make_dls(bs):
        # Both phases share the image sizes; only the batch size differs.
        return get_dls(item_tfms=Resize(hp.pre_size),
                       batch_tfms=aug_transforms(size=hp.post_size),
                       bs=bs)

    # Use the configured DataLoaders for phase 1. Previously they were built
    # and then ignored in favour of a default `get_dls()`, so `pre_bs` and
    # the configured sizes had no effect on this phase.
    learn = vision_learner(make_dls(hp.pre_bs), arch,
                           cbs=[WandbCallback()],
                           metrics=error_rate)
    learn.fit_one_cycle(hp.pre_epoch, lr_max=hp.pre_lr)
    learn.unfreeze()
    learn.dls = make_dls(hp.post_bs)
    learn.fit_one_cycle(hp.epochs - hp.pre_epoch,
                        lr_max=(hp.post_lr/hp.post_div, hp.post_lr))
    # Export next to the notebook rather than under the dataset directory.
    learn.path = Path('.')
    learn.export(learn.arch + '_stage0.pth')
    
    
# trainer2(cfg)

In [None]:
# Hyperparameters for the stage-1 run started with `runner1(cfg1)`.
cfg1 = dict(
    project='my_fastbook',
    epochs=15,
    pre_size=240,
    post_size=168,
    pre_bs=192,
    post_bs=96,
    pre_lr=.001,
    pre_epoch=1,
    post_lr=.002,
    post_div=100,
)

def runner1(cfg=None):
    """Fine-tune the exported stage-0 learner in two phases and export it as stage 1.

    Args:
        cfg: dict of hyperparameters used as the W&B run config. It must
            provide `pre_size`, `post_size`, `pre_bs`, `post_bs`, `pre_epoch`,
            `pre_lr`, `epochs`, `post_lr` and `post_div`; an optional
            'project' entry selects the W&B project.

    Side effects:
        Starts a W&B run, loads 'convnext_tiny_in22k_stage0.pth' and writes
        `<learn.arch>_stage1.pth` via `learn.export` in the current directory.
    """
    # A 'project' key inside `config` is only stored as a hyperparameter, so
    # pass it to `wandb.init` explicitly.
    project = cfg.get('project') if isinstance(cfg, dict) else None
    wandb.init(project=project, config=cfg)
    set_seed(2022, True)
    hp = wandb.config

    def make_dls(bs):
        # Both phases share the image sizes; only the batch size differs.
        return get_dls(item_tfms=Resize(hp.pre_size),
                       batch_tfms=aug_transforms(size=hp.post_size),
                       bs=bs)

    # NOTE(review): `load_learner` unpickles the file; only load exports you created.
    learn = load_learner('convnext_tiny_in22k_stage0.pth')
    learn.dls = make_dls(hp.pre_bs)
    learn.freeze()
    learn.fit_one_cycle(hp.pre_epoch, lr_max=hp.pre_lr)
    learn.unfreeze()
    learn.dls = make_dls(hp.post_bs)
    learn.fit_one_cycle(hp.epochs - hp.pre_epoch,
                        lr_max=(hp.post_lr/hp.post_div, hp.post_lr))
    # Export next to the notebook rather than under the learner's stored path.
    learn.path = Path('.')
    learn.export(learn.arch + '_stage1.pth')
    
# Run the stage-1 fine-tune with the `cfg1` hyperparameters.
runner1(cfg1)

[34m[1mwandb[0m: Currently logged in as: [33mmark_b2[0m. Use [1m`wandb login --relogin`[0m to force relogin


epoch,train_loss,valid_loss,error_rate,time
0,0.114213,0.279801,0.095937,00:40


epoch,train_loss,valid_loss,error_rate,time
0,0.107344,0.257886,0.085779,01:13
1,0.108769,0.261317,0.08465,01:13
2,0.101444,0.281408,0.082393,01:13
3,0.095706,0.297246,0.082393,01:13
4,0.086904,0.278543,0.073363,01:13
5,0.080473,0.283839,0.073363,01:14
6,0.070256,0.2628,0.071106,01:13
7,0.064559,0.265043,0.069977,01:13
8,0.056021,0.280601,0.073363,01:13
9,0.050752,0.260225,0.071106,01:13


In [None]:
# Hyperparameters for a short manual `trainer2` run from the stage-0 export.
cfg2 = dict(
    project='my_fastbook',
    arch='convnext_tiny_in22k_stage0.pth',
    epochs=2,  # was 15
    pre_size=240,
    post_size=168,
    pre_bs=192,
    post_bs=96,
    pre_lr=.001,
    pre_epoch=1,
    post_lr=.002,
    post_div=100,
)

# Manual `trainer2` run; a non-empty config makes it return the trained learner.
learn = trainer2(cfg2)
# Optional export of the result (left disabled):
# learn.path = Path('.')
# learn.export(learn.arch + '_stage1.pth')

[34m[1mwandb[0m: Currently logged in as: [33mmark_b2[0m. Use [1m`wandb login --relogin`[0m to force relogin


epoch,train_loss,valid_loss,error_rate,time
0,0.114213,0.279801,0.095937,00:41


Better model found at epoch 0 with error_rate value: 0.09593677520751953.


epoch,train_loss,valid_loss,error_rate,time
0,0.107944,0.292404,0.097065,01:11


Better model found at epoch 0 with error_rate value: 0.09706544876098633.


In [None]:
# Sweep definition that fine-tunes the stage-1 export at 320px -> 224px.
sweep_cfg3 = dict(
    project='my_fastbook',
    metric=dict(name='best_value', goal='minimize'),  # alternative metric name: 'error_rate'
    method='bayes',
    parameters=dict(
        arch=dict(value='convnext_tiny_in22k_stage1.pth'),
        epochs=dict(value=15),
        pre_size=dict(value=320),
        post_size=dict(value=224),
        pre_bs=dict(value=64),
        post_bs=dict(value=32),
        pre_lr=dict(min=5e-4, max=2e-3),
        pre_epoch=dict(values=[0, 1, 2]),
        post_lr=dict(min=5e-4, max=2e-3),
        post_div=dict(min=50, max=250),
    ),
)

# A hard-coded sweep id is used instead of creating a new sweep each time:
# sweep_id = wandb.sweep(sweep_cfg3)
sweep_id = 'mybku6li'

In [None]:
# Sweep definition that fine-tunes the stage-0 export at 320px -> 224px for
# 20 epochs with a fixed 2-epoch first phase.
sweep_cfg4 = dict(
    project='my_fastbook',
    metric=dict(name='best_value', goal='minimize'),  # alternative metric name: 'error_rate'
    method='bayes',
    parameters=dict(
        arch=dict(value='convnext_tiny_in22k_stage0.pth'),
        epochs=dict(value=20),
        pre_size=dict(value=320),
        post_size=dict(value=224),
        pre_bs=dict(value=64),
        post_bs=dict(value=32),
        pre_lr=dict(min=5e-4, max=2e-3),
        pre_epoch=dict(value=2),
        post_lr=dict(min=5e-4, max=2e-3),
        post_div=dict(min=50, max=250),
    ),
)

# A hard-coded sweep id is used instead of creating a new sweep each time:
# sweep_id = wandb.sweep(sweep_cfg4)
sweep_id = 'uiujoqnj'

In [None]:
# Number of sweep runs passed to the agent as its `count`.
count = 60

# Start an agent for sweep `sweep_id` that runs `trainer2` for each trial.
wandb.agent(sweep_id, function=trainer2, count=count, project='my_fastbook')

[34m[1mwandb[0m: Agent Starting Run: iehbeh2e with config:
[34m[1mwandb[0m: 	arch: convnext_tiny_in22k_stage0.pth
[34m[1mwandb[0m: 	epochs: 20
[34m[1mwandb[0m: 	post_bs: 32
[34m[1mwandb[0m: 	post_div: 176
[34m[1mwandb[0m: 	post_lr: 0.001627195427941712
[34m[1mwandb[0m: 	post_size: 224
[34m[1mwandb[0m: 	pre_bs: 64
[34m[1mwandb[0m: 	pre_epoch: 2
[34m[1mwandb[0m: 	pre_lr: 0.0015881240598632238
[34m[1mwandb[0m: 	pre_size: 320
[34m[1mwandb[0m: Currently logged in as: [33mmark_b2[0m. Use [1m`wandb login --relogin`[0m to force relogin


epoch,train_loss,valid_loss,error_rate,time
0,0.18913,0.269322,0.088036,01:11
1,0.152907,0.234473,0.06772,01:10


epoch,train_loss,valid_loss,error_rate,time
0,0.105973,0.219803,0.064334,02:07
1,0.105732,0.219498,0.064334,02:08
2,0.090679,0.240473,0.073363,02:08
3,0.094098,0.245685,0.075621,02:08
4,0.084092,0.21453,0.059819,02:08
5,0.08502,0.225323,0.064334,02:08
6,0.066543,0.258406,0.065463,02:08
7,0.081962,0.213072,0.05079,02:08
8,0.065278,0.215813,0.057562,02:08
9,0.052494,0.206653,0.051919,02:07


VBox(children=(Label(value='0.801 MB of 0.801 MB uploaded (0.780 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
best_value,█▄▄▄▄▄▃▃▃▁▁▁▁▁▁▁▁▁▁▁
epoch,▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇███
eps_0,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
eps_1,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
error_rate,█▄▄▄▅▆▃▄▄▁▂▁▃▄▃▂▃▂▂▂
lr_0,█▆▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
lr_1,▆▅▁▂▂▃▄▅▆▇███████▇▇▇▆▆▆▅▅▅▄▄▄▃▃▂▂▂▂▁▁▁▁▁
mom_0,▂▄██▇▆▅▄▃▂▁▁▁▁▁▁▁▂▂▂▃▃▃▄▄▄▅▅▅▆▆▇▇▇▇█████
mom_1,▂▄██▇▆▅▄▃▂▁▁▁▁▁▁▁▂▂▂▃▃▃▄▄▄▅▅▅▆▆▇▇▇▇█████
raw_loss,▅▅█▆▂█▇▂▂▂▃▂▅▂▆▃▁▁▂▁▆▁▁▂▁▁▃▁▁▄▁▁▂▂▁▃▁▁▂▁

0,1
best_value,0.05079
epoch,20.0
eps_0,1e-05
eps_1,1e-05
error_rate,0.05643
lr_0,0.0
lr_1,0.0
mom_0,0.95
mom_1,0.95
raw_loss,0.00129


[34m[1mwandb[0m: Agent Starting Run: maocwvq0 with config:
[34m[1mwandb[0m: 	arch: convnext_tiny_in22k_stage0.pth
[34m[1mwandb[0m: 	epochs: 20
[34m[1mwandb[0m: 	post_bs: 32
[34m[1mwandb[0m: 	post_div: 66
[34m[1mwandb[0m: 	post_lr: 0.0012772479440252417
[34m[1mwandb[0m: 	post_size: 224
[34m[1mwandb[0m: 	pre_bs: 64
[34m[1mwandb[0m: 	pre_epoch: 2
[34m[1mwandb[0m: 	pre_lr: 0.0018069140315712863
[34m[1mwandb[0m: 	pre_size: 320


epoch,train_loss,valid_loss,error_rate,time
0,0.189682,0.280215,0.094808,01:10
1,0.151979,0.236049,0.069977,01:09


epoch,train_loss,valid_loss,error_rate,time
0,0.103024,0.221607,0.064334,02:07
1,0.103737,0.216469,0.062077,02:07
2,0.088264,0.238825,0.072235,02:07
3,0.08624,0.236222,0.073363,02:08
4,0.075979,0.228707,0.06772,02:07
5,0.080227,0.225265,0.063205,02:07
6,0.065874,0.223893,0.057562,02:07
7,0.06924,0.19878,0.059819,02:07
8,0.057916,0.231968,0.059819,02:07
9,0.048642,0.204715,0.05079,02:07


VBox(children=(Label(value='0.801 MB of 0.801 MB uploaded (0.780 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
best_value,█▄▃▃▃▃▃▃▂▂▂▁▁▁▁▁▁▁▁▁
epoch,▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇███
eps_0,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
eps_1,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
error_rate,█▄▃▃▄▅▄▃▂▂▂▁▂▄▃▂▂▃▂▂
lr_0,█▆▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
lr_1,█▆▁▁▂▃▄▅▆▆▇▇▇▇▇▇▇▇▆▆▆▆▅▅▄▄▄▃▃▃▃▂▂▂▂▁▁▁▁▁
mom_0,▂▄██▇▆▅▄▃▂▁▁▁▁▁▁▁▂▂▂▃▃▃▄▄▄▅▅▅▆▆▇▇▇▇█████
mom_1,▂▄██▇▆▅▄▃▂▁▁▁▁▁▁▁▂▂▂▃▃▃▄▄▄▅▅▅▆▆▇▇▇▇█████
raw_loss,▄▄▇▅▂▇▆▂▂▂▂▂▄▂█▂▁▂▁▁▂▁▁▃▁▁▄▁▁▂▁▁▂▁▁▁▁▁▂▁

0,1
best_value,0.05079
epoch,20.0
eps_0,1e-05
eps_1,1e-05
error_rate,0.05869
lr_0,0.0
lr_1,0.0
mom_0,0.95
mom_1,0.95
raw_loss,0.00067


[34m[1mwandb[0m: Agent Starting Run: lvix6lzw with config:
[34m[1mwandb[0m: 	arch: convnext_tiny_in22k_stage0.pth
[34m[1mwandb[0m: 	epochs: 20
[34m[1mwandb[0m: 	post_bs: 32
[34m[1mwandb[0m: 	post_div: 235
[34m[1mwandb[0m: 	post_lr: 0.0017460090397006826
[34m[1mwandb[0m: 	post_size: 224
[34m[1mwandb[0m: 	pre_bs: 64
[34m[1mwandb[0m: 	pre_epoch: 2
[34m[1mwandb[0m: 	pre_lr: 0.0014202792238828437
[34m[1mwandb[0m: 	pre_size: 320


epoch,train_loss,valid_loss,error_rate,time
0,0.188915,0.260907,0.088036,01:09
1,0.152932,0.232693,0.069977,01:09


epoch,train_loss,valid_loss,error_rate,time
0,0.10653,0.218229,0.063205,02:07
1,0.106331,0.220704,0.062077,02:07
2,0.09216,0.244519,0.075621,02:07
3,0.095711,0.24733,0.072235,02:07
4,0.086628,0.223163,0.060948,02:07
5,0.083088,0.237164,0.066591,02:07
6,0.069311,0.268579,0.06772,02:07
7,0.083896,0.208462,0.05079,02:07
8,0.067773,0.225103,0.055305,02:07
9,0.051368,0.208314,0.058691,02:07


## TODO: Check whether `learn.export` saves the optimizer state.

## TODO: Check whether `GradientAccumulation` helps.
## TODO: Check whether finer-grained learning-rate splitting helps.
## TODO: Try label smoothing.

In [None]:
# Inspection only: display `Learner.export` (relates to the export TODO in this notebook).
Learner.export

In [None]:
# Load the recorded results and derive two rankings of the same rows:
# ascending by the 'valid' column and ascending by the 'error' column.
res_df = pd.read_csv('results.csv') #.loc[df.stage==0]
by_valid, by_error = (res_df.sort_values(col) for col in ('valid', 'error'))

In [None]:
# df.drop(df.loc[df['div'] > 0].index, inplace=True)
# df

In [None]:
# df.to_csv('results.csv', index=False)

In [None]:
# First rows of the results sorted by ascending 'error'.
by_error.head()

In [None]:
# First rows of the results sorted by ascending 'valid'.
by_valid.head()

In [None]:
# Manual two-phase run on the default `get_dls` sizes: 3 epochs at bs=256
# before `unfreeze()`, then 10 epochs at bs=192 afterwards.
learn = vision_learner(get_dls(bs=256), arch, metrics=error_rate)
learn.fit_one_cycle(3, lr_max=4e-3)
learn.unfreeze()
# Swap in DataLoaders with a smaller batch size for the second phase.
learn.dls = get_dls(bs=192)
# NOTE(review): a 2-tuple `lr_max` presumably gives one learning rate per parameter group -- confirm the model has exactly two groups.
learn.fit_one_cycle(10, lr_max=(4e-5,4e-3))

In [None]:
# Resume from an exported stage-0 learner and log a short unfrozen fine-tune to W&B.
#
# `sweep_cfg` is a sweep *definition*: its hyperparameters are nested under
# 'parameters' -> name -> 'value'. Passing it directly as the run config
# leaves `wandb.config.post_bs` undefined, so flatten the fixed values first.
wandb.init(project='fastbook',
           config={name: spec['value']
                   for name, spec in sweep_cfg['parameters'].items()
                   if 'value' in spec})
arch = 'convnext_tiny_in22k'
# NOTE(review): `load_learner` unpickles the file; only load exports you created.
learn = load_learner(arch+'_160_112_3_003_stage0.pth')
learn.unfreeze()
learn.dls = get_dls(bs=wandb.config.post_bs)
learn.add_cb(WandbCallback())
learn.fit_one_cycle(3, lr_max=(4e-5,4e-3))

In [None]:
# Manual two-phase run with lower learning rates: 3 epochs at bs=256 before
# `unfreeze()`, then 10 epochs afterwards.
learn = vision_learner(get_dls(bs=256), arch, metrics=error_rate)
learn.fit_one_cycle(3, lr_max=2e-3)
learn.unfreeze()
# NOTE(review): bs=196 here, while the otherwise similar run in this notebook uses 192 -- confirm this is intended.
learn.dls = get_dls(bs=196)
learn.fit_one_cycle(10, lr_max=(1e-5,1e-3))

In [None]:
# Scan five log-spaced peak learning rates (10**-3 .. 10**-1.5), training a
# fresh learner for 6 epochs at each one.
for lr in np.logspace(-3, -1.5, 5).tolist():  # `.tolist()` yields plain Python floats
    learn = vision_learner(get_dls(bs=256), arch, metrics=error_rate)
    print(lr)
    learn.fit_one_cycle(6, lr_max=lr, pct_start=.99)

In [None]:
# Train a fresh learner for 6 epochs and export it for the later fine-tuning cells.
learn = vision_learner(get_dls(bs=256), arch, metrics=error_rate)
learn.fit_one_cycle(6, lr_max=.006)
# Export into the current directory.
learn.path = Path('.')
# NOTE(review): the file name says "convnext_nano" but the model is built from the global `arch` -- confirm the name is intended.
learn.export('convnext_nano_160_112_005_stage0.pth')

In [None]:
# Grid over the learning-rate divisor (five log-spaced values, 10 .. 1000) and
# two peak learning rates. Each combination reloads the exported learner and
# trains 16 epochs with `lr_max=slice(lr/dev, lr)`.
for dev in np.logspace(1, 3, 5).tolist():  # `.tolist()` yields plain Python floats
    for lr in (1e-3, 2e-3):
        learn = load_learner('convnext_nano_160_112_005_stage0.pth')
        learn.dls = get_dls(bs=256)
        print(lr, dev)
        learn.fit_one_cycle(16, lr_max=slice(lr/dev, lr))

In [None]:
# Reload the exported learner and train it for 16 epochs at bs=256.
learn = load_learner('convnext_nano_160_112_005_stage0.pth')
learn.dls = get_dls(bs=256)
# NOTE(review): a 2-tuple `lr_max` presumably gives one learning rate per parameter group -- confirm the model has exactly two groups.
learn.fit_one_cycle(16, lr_max=(1e-5,1e-3))

In [None]:
# Repeat of the divisor x learning-rate grid: five log-spaced divisors
# (10 .. 1000) times two peak learning rates, 16 epochs per combination from
# the same exported learner.
for dev in np.logspace(1, 3, 5).tolist():  # `.tolist()` yields plain Python floats
    for lr in (1e-3, 2e-3):
        learn = load_learner('convnext_nano_160_112_005_stage0.pth')
        learn.dls = get_dls(bs=256)
        print(lr, dev)
        learn.fit_one_cycle(16, lr_max=slice(lr/dev, lr))