In [1]:
%matplotlib inline
%reload_ext autoreload
%autoreload 2

In [2]:
import pdb
import bcolz
import seaborn as sns
import librosa
import librosa.display
import matplotlib.pyplot as plt
import pickle
import datetime
from fastai.conv_learner import *
from fastai.dataset import *
from random import shuffle
from ipywidgets import widgets
from tqdm import tqdm
from scipy.io.wavfile import read, write

In [4]:
PATH = Path('data/freesound')
train_PATH = Path('data/freesound/audio_train')
test_PATH = Path('data/freesound/audio_test')

# Ensembling of results

In [422]:
class ResultsDataset(Dataset):
    def __init__(self, results_l, targets):
        self.results = np.concatenate(results_l, axis=1)
        self.targets = targets
    
    def __len__(self):    
        return len(self.results)
    
    def __getitem__(self, i):
        x = self.results[i]
        y = self.targets[i]
        return x, y
    
    
class EnsembleNet(nn.Module):
    def __init__(self, n, c):
        super().__init__()
        self.features_d = {}
        for i in range(n):
            setattr(self, f'features_{i}', nn.Sequential(nn.Linear(c,c), nn.Tanh()))
            self.features_d[i] = f'features_{i}'
        self.n = n
        self.c = c
                
    def forward(self, results):         
        c = self.c
        n = self.n
        ens_d = {}
        ens_d[0] = results[:,:c]
        m1 = 1; m2 = 2
        for i in range(n-2):
            ens_d[i+1] = results[:,c*m1:c*m2]
            m1 += 1
            m2 += 1
        ens_d[n-1]= results[:,c*(n-1):]
        var_l = []
        for i in range(n):    
            v = getattr(self, self.features_d[i])(V(ens_d[i].contiguous().cuda(async=True)))
            var_l.append(v.unsqueeze(0))
        x = torch.cat(var_l, 0)
        x = torch.mean(x, 0) * n
        return F.softmax(x, dim=-1)
    

class Ensemble():
    def __init__(self, c, classes, ensemble_trn_dl, targets_train, targets_val, PATH):
        self.RES_PATH = PATH / 'results'
        self.TEST_PATH = PATH / 'test_results'
        self.ENS_PATH = PATH / 'ensemble_files'
        os.makedirs(self.RES_PATH, exist_ok=True)
        os.makedirs(self.TEST_PATH, exist_ok=True)
        os.makedirs(self.ENS_PATH, exist_ok=True)
        self.c = c
        self.classes = classes
        self.ensemble_trn_dl = ensemble_trn_dl
        self.targets_trn = targets_train
        self.targets_val = targets_val
         
    def add_model(self, learn, mname, mdescription, own_accuracy):
        mnames = !ls {str(self.RES_PATH)}
        mnames = [f[:len(mname)] for f in mnames]
        assert mname not in mnames, 'This mname exists already, please use another name'
        
        try:
            mdescription_d = pickle.load(open(self.ENS_PATH/"mdescription_d.pickle", "rb"))
            mdescription_d[mname] = [mdescription, own_accuracy]
            pickle.dump(mdescription_d, open(self.ENS_PATH/"mdescription_d.pickle", "wb"))

        except (OSError, IOError) as e:
            mdescription_d = {}
            mdescription_d[mname] = [mdescription, own_accuracy] 
            pickle.dump(mdescription_d, open(self.ENS_PATH/"mdescription_d.pickle", "wb"))
        
        learn.data.trn_dl = self.ensemble_trn_dl
        results_trn = self.run_model(learn, learn.data.trn_dl)
        results_val = self.run_model(learn, learn.data.val_dl)
        results_test = self.run_model(learn, learn.data.test_dl)
        self.save_results(results_trn, results_val, results_test, mname)
            
    def add_model_by_using_results(self, mname, mdescription, own_accuracy, results_trn, results_val, results_test):
        mnames = !ls {str(self.RES_PATH)}
        mnames = [f[:len(mname)] for f in mnames]
        assert mname not in mnames, 'This mname exists already, please use another name'
        
        try:
            mdescription_d = pickle.load(open(self.ENS_PATH/"mdescription_d.pickle", "rb"))
            mdescription_d[mname] = [mdescription, own_accuracy]
            pickle.dump(mdescription_d, open(self.ENS_PATH/"mdescription_d.pickle", "wb"))

        except (OSError, IOError) as e:
            mdescription_d = {}
            mdescription_d[mname] = [mdescription, own_accuracy] 
            pickle.dump(mdescription_d, open(self.ENS_PATH/"mdescription_d.pickle", "wb"))
        
        self.save_results(results_trn, results_val, results_test, mname)
        
    def run_model(self, learn, dl):
        results = torch.zeros(1, self.c).cuda()
        learn.model.eval()
        it = iter(dl)
        _next = True
        while _next:
            try:
                x, y = next(it)
                probs = learn.model(V(x)).float().exp().data
                results = torch.cat((results, probs), dim=0)
            except StopIteration:
                _next = False
        results = results.cpu().numpy()
        return results[1:]
        
    def save_results(self, results_trn, results_val, results_test, mname):
        np.save(self.RES_PATH / f'{mname}_trn.npy', results_trn) 
        np.save(self.RES_PATH / f'{mname}_val.npy', results_val)
        np.save(self.TEST_PATH / f'{mname}_test.npy', results_test)
        
    def ensemble_magic(self, bs, lr, n_epochs, mnames=None, ensemble_ID=None):
        if mnames == None and ensemble_ID == None:
            raise ValueError('please enter either mnames or ensemble_ID')
            
        if mnames != None and ensemble_ID != None:
            raise ValueError('please enter either mnames or ensemble_ID')
                
        if mnames is not None:
            assert isinstance(mnames, list), 'mnames is not a list, please use a list'
            mnames = set(mnames)
            existing_mnames = !ls {str(self.RES_PATH)}
            existing_mnames = set([f[:-8] for f in existing_mnames])
            for mname in mnames:
                assert mname in existing_mnames, f'mname {mname} does not exist, please check it'

            try:
                ensemble_d = pickle.load(open(self.ENS_PATH / "ensemble_d.pickle", "rb"))
                ensemble_ID = self.ensemble_exists(mnames, ensemble_d)
                
                if ensemble_ID is False:
                    ensemble_ID = max(ensemble_d.keys()) + 1
                    ensemble_d[ensemble_ID] = mnames
                    pickle.dump(ensemble_d, open(self.ENS_PATH / "ensemble_d.pickle", "wb"))
                    self.run_experiment(ensemble_ID, bs, lr, n_epochs)      

                else:
                    print(f'This ensemble exists already, it\'s ID is: {ensemble_ID}')
                    print('Would you like to run another experiment on this ensemble?')
                    button = widgets.Button(description='Yes')    
                    display(button)
                    button.on_click(lambda b: self.run_experiment(ensemble_ID, bs, lr, n_epochs))
                    
            except (OSError, IOError) as e:
                ensemble_d = {}
                ensemble_ID = 0
                ensemble_d[ensemble_ID] = mnames
                pickle.dump(ensemble_d, open(self.ENS_PATH / "ensemble_d.pickle", "wb"))
                self.run_experiment(ensemble_ID, bs, lr, n_epochs)

        else:
            self.run_experiment(ensemble_ID, bs, lr, n_epochs)      
              
    def run_experiment(self, ensemble_ID, bs, lr, n_epochs, model_importance=False):                  
        results_l_trn, results_l_val, results_l_test = self.assemble_results(ensemble_ID)
        assert len(results_l_trn) == len(results_l_val) == len(results_l_test)
        n = len(results_l_trn)
        md = self.create_ModelData(results_l_trn, results_l_val, results_l_test, bs)
        ens_learn = self.ensemble_network(md, n, lr, n_epochs)
        if model_importance == False:
            print("Would you like to add the experiment to your report?")    
            button = widgets.Button(description='Yes')    
            display(button)
            button.on_click(lambda b: self.add2report(ens_learn, ensemble_ID))             
        else:
            return ens_learn   
                
    def assemble_results(self, ensemble_ID):
        ensemble_d = pickle.load(open(self.ENS_PATH / "ensemble_d.pickle", "rb"))
        mnames = ensemble_d[ensemble_ID]
        results_l_trn = [np.load(self.RES_PATH / f'{mname}_trn.npy') for mname in mnames]
        results_l_val = [np.load(self.RES_PATH / f'{mname}_val.npy') for mname in mnames]
        results_l_test = [np.load(self.TEST_PATH / f'{mname}_test.npy') for mname in mnames]
        return results_l_trn, results_l_val, results_l_test
        
    def create_ModelData(self, results_l_trn, results_l_val, results_l_test, bs):
        res_trn_ds = ResultsDataset(results_l_trn, self.targets_trn)
        res_val_ds = ResultsDataset(results_l_val, self.targets_val)
        res_test_ds = ResultsDataset(results_l_test, np.zeros(len(results_l_test[0])))
        trn_dl = DataLoader(res_trn_ds, bs)
        val_dl = DataLoader(res_val_ds, bs)
        test_dl = DataLoader(res_test_ds, bs)
        return ModelData(self.ENS_PATH, trn_dl, val_dl, test_dl)
        
    def ensemble_network(self, md, n, lr, n_epochs):
        ens = EnsembleNet(n, self.c).cuda()
        crit = nn.NLLLoss()
        metrics =[accuracy]
        torch.backends.cudnn.benchmark=True
        ens_learn = Learner(md, SingleModel(ens), opt_fn=optim.Adam, metrics=metrics, crit=crit)
        ens_learn.fit(lr, n_epochs)
        return ens_learn
    
    def add2report(self, ens_learn, ensemble_ID):
        results = self.run_model(ens_learn, ens_learn.data.val_dl)
        ensemble_d = pickle.load(open(self.ENS_PATH / "ensemble_d.pickle", "rb"))
        mnames_all = [v for v in ensemble_d.values()]  
        mnames_all = set([item for sublist in mnames_all for item in sublist])
        mnames = ensemble_d[ensemble_ID]
        row_df = pd.DataFrame(index=[0], columns=['accuracy', 'datetime', 'ensemble_ID', 'mapk'])
        row_df['ensemble_ID'] = ensemble_ID
        now = datetime.datetime.now()
        row_df['datetime'] =str(now.strftime("%Y-%m-%d %H:%M"))
        row_df['accuracy'] = self.accuracy(results, self.targets_val)
        row_df['mapk'] = self.mapk_score(results, self.targets_val)

        try:
            ensemble_df = pd.read_feather(self.ENS_PATH / 'ensemble_df')
            ensemble_df = pd.concat([row_df, ensemble_df])
            ensemble_df = ensemble_df.reset_index().drop(['index'], axis=1)
            ensemble_df.to_feather(self.ENS_PATH / 'ensemble_df')
            print(ensemble_df)

        except (OSError, IOError) as e:
            row_df.to_feather(self.ENS_PATH / 'ensemble_df')
            print(row_df)

        print("\n"'''Save ens_learn? Save ens_learn only if you want to run 
ensemble predictions to create a submission file.''')      
        button = widgets.Button(description='Yes')
        display(button)
        button.on_click(lambda b: self.save_ens_learn(ens_learn, ensemble_ID))
            
    def save_ens_learn(self, ens_learn, ensemble_ID):
        name = f'ens_learn_{ensemble_ID}.pkl'
        existing_names = !ls {str(self.ENS_PATH)}
        if name in existing_names:
            print('''An ens_learn object for this ensemble exists already, 
would you like to replace it?''')
            button = widgets.Button(description='Yes')
            display(button)
            button.on_click(lambda b: pickle.dump(ens_learn, open(self.ENS_PATH / name, 'wb')))            
        else:                         
            pickle.dump(ens_learn, open(self.ENS_PATH / name, 'wb'))
        
    def ensemble_exists(self, mnames, ensemble_d):
        ensembles = [(k,v) for k,v in ensemble_d.items()]  
        for k,v in ensembles:
            if v == mnames:
                return k 
        return False
             
    def accuracy(self, results, targets):
        idxs = np.argmax(results, axis=1)
        return np.where(idxs==targets, 1, 0).sum() / len(results)
    
    def apk(self, actual, predicted, k=10):
        if len(predicted)>k:
            predicted = predicted[:k]
        score = 0.0
        num_hits = 0.0
        for i,p in enumerate(predicted):
            if p in actual and p not in predicted[:i]:
                num_hits += 1.0
                score += num_hits / (i+1.0)

        if not actual:
            return 0.0

        return score / min(len(actual), k)

    def mapk(self, actual, predicted, k=10):
        return np.mean([self.apk(a,p,k) for a,p in zip(actual, predicted)])

    def mapk_score(self, results, targets):
        predictions = [list(np.argsort(results[i])[::-1][:3]) for i in range(len(results))]
        actual = [[i] for i in targets]
        return self.mapk(actual, predictions, k=3)
        
    def view_report(self):
        try:
            ensemble_df = pd.read_feather(self.ENS_PATH / 'ensemble_df')
            print(ensemble_df)
        except (OSError, IOError) as e:
            print('Report can be displayed when you have created an ensemble')
    
    def view_model_descriptions(self):
        try:
            mdescription_d = pickle.load(open(self.ENS_PATH / "mdescription_d.pickle", "rb"))
            desc_df = pd.DataFrame.from_dict(mdescription_d, orient='index')
            desc_df.columns = ['Description', 'Accuracy']
            print(desc_df)
        except (OSError, IOError) as e:
            print('No models have been added yet')
    
    def create_submission(self, ensemble_ID, name):
        ens_learn_name = f'ens_learn_{ensemble_ID}.pkl'
        existing_ens_learn_names = !ls {str(self.ENS_PATH)}
        assert ens_learn_name in existing_ens_learn_names, '''An ens_learn object for this 
ensemble does not exist. Please remember to save ens_learn after you have added an 
experiment to the report.'''
        ens_learn = pickle.load(open(self.ENS_PATH/f'ens_learn_{ensemble_ID}.pkl', "rb"))
        predictions = self.run_model(ens_learn, ens_learn.data.test_dl)
        predictions = self.preds_to_labels(predictions, self.classes)
        self.create_submission_file(predictions, name)
        
    def preds_to_labels(self, p, labels):
        predictions = [list(np.argsort(p[i])[::-1][:3]) for i in range(len(p))]
        prediction_labels = []

        for pred in predictions:
            label_list = []
            for output in pred:
                label_list.append(labels[output])
            prediction_labels.append(label_list)
        return prediction_labels

    def create_submission_file(self, predictions, name):
        predictions = ['{} {} {}'.format(x[0], x[1], x[2]) for x in predictions]
        submission = pd.read_csv(PATH/'sample_submission_no_bads.csv')
        submission.label = predictions
        submission.to_csv(f'{self.ENS_PATH}/{name}', index=False)
        print(f'Submission saved to {self.ENS_PATH/name}')    
        
    def model_importance(self, ensemble_ID, bs, lr, n_epochs):
        ens_learn = self.run_experiment(ensemble_ID, bs, lr, n_epochs, model_importance=True)     
        results = self.run_model(ens_learn, ens_learn.data.val_dl)
        accuracy_all = self.accuracy(results, self.targets_val)
        mapk_all = self.mapk_score(results, self.targets_val)
        ensemble_IDs = self.complement_ensemble(ensemble_ID)
        model_importance_d = {}
 
        for i in ensemble_IDs:
            print('')
            print(f'ensemble_ID: {i}')
            ens_learn = self.run_experiment(i, bs, lr, n_epochs, model_importance=True)     
            results = self.run_model(ens_learn, ens_learn.data.val_dl)
            accuracy = self.accuracy(results, self.targets_val)
            mapk = self.mapk_score(results, self.targets_val)
            model_importance_d[i] = [accuracy_all-accuracy, mapk_all-mapk] 
            
        ID_to_removed_model_d = {}
        for i in model_importance_d:
            model = self.removed_model(ensemble_ID, i)
            ID_to_removed_model_d[i] = model 
  
        model_importance_df = pd.DataFrame.from_dict(model_importance_d, orient='index').reset_index(drop=True)
        model_importance_df.columns = ['accuracy', 'mapk']
        model_importance_df['model'] = model_importance_d.keys()
        model_importance_df['model'] = model_importance_df['model'].map(ID_to_removed_model_d)
        model_importance_df.sort_values(by=['accuracy'], ascending=False, inplace=True)
        print(model_importance_df)

    def removed_model(self, ensemble_ID, comparison_ID):
        ensemble_d = pickle.load(open(self.ENS_PATH / "ensemble_d.pickle", "rb"))
        base_s = ensemble_d[ensemble_ID]
        one_removed_s = ensemble_d[comparison_ID]
        return base_s.difference(one_removed_s)
            
    def complement_ensemble(self, ensemble_ID):
        ensemble_d = pickle.load(open(self.ENS_PATH / "ensemble_d.pickle", "rb"))
        model_s = ensemble_d[ensemble_ID]
        one_model_removed_sets = self.create_one_model_removed_sets(model_s)
        existing_sets = [ensemble_d[i] for i in ensemble_d]
        complement_sets = [i for i in one_model_removed_sets if i not in existing_sets] 
        for i in complement_sets:
            ensemble_ID = max(ensemble_d.keys()) + 1
            ensemble_d[ensemble_ID] = i
        
        ensemble_IDs = [i for i in ensemble_d if ensemble_d[i] in one_model_removed_sets]
        pickle.dump(ensemble_d, open(self.ENS_PATH / "ensemble_d.pickle", "wb"))
        return ensemble_IDs

    def create_one_model_removed_sets(self, model_s):
        model_s = list(model_s)
        one_removed_sets = []
        for i in range(len(model_s)):
            one_removed_sets.append(set(model_s[i:] + model_s[:i-1]))
        return one_removed_sets[1:]
    
    def view_models_in_ensemble(self, ensemble_ID):
        ensemble_d = pickle.load(open(self.ENS_PATH / "ensemble_d.pickle", "rb"))
        return ensemble_d[ensemble_ID]
    
    def view_sorted_model_descriptions(self):
        try:
            mdescription_d = pickle.load(open(self.ENS_PATH / "mdescription_d.pickle", "rb"))
            desc_df = pd.DataFrame.from_dict(mdescription_d, orient='index')
            desc_df.columns = ['Description', 'Accuracy']
            desc_df.sort_values(by=['Accuracy'], ascending=False, inplace=True)
            return desc_df
        except (OSError, IOError) as e:
            print('No models have been added yet')
    

#### Create an instance of class Ensemble

In [423]:
c = 41
targets_train = split_df.iloc[ensemble_train_idxs].label.values
targets_val = split_df.iloc[val_split_idxs].label.values
PATH = Path('data/freesound')
freesound_ensemble = Ensemble(c, classes, ensemble_trn_dl, targets_train, targets_val, PATH) 

#### Add a model to your pool of models

In [228]:
mname = 'r31'
mdescription = 'resnet trained on random training set without weight decay'
own_accuracy = '0.64'

How ensembling works:
- 'c' is the number of target categories 
- training and validation dataloaders concatenate the training and validation results of the models in the ensemble to load the data as training and validation minibatches (batch size x (c * number of models)) to a neural network
- the neural network unconcatenates the results in each minibatch and matrix multiplies each bit (batch size x c) with a linear layer of size 'c'
- the neural network adds the resulting activations together and softmaxes the result
- a negative log loss function is used in optimizing the network 

Hence, you must always use the same training and validation sets when adding models to your model pool.

- Add a new fast.ai model to your model pool

In [229]:
freesound_ensemble.add_model(learn, mname, mdescription, own_accuracy)

- Add a new model by using training, validation and test results (numpy arrays)

In [482]:
mname = 'catboost'
mdescription = 'catboost'
own_accuracy = '0.75'
results_trn = np.load(PATH/'catboost_train.npy')
results_val = np.load(PATH/'catboost_val.npy')
results_test = np.load(PATH/'catboost_test.npy')

In [483]:
freesound_ensemble.add_model_by_using_results(mname, mdescription, own_accuracy, results_trn, results_val, results_test)

#### View model descriptions

In [417]:
freesound_ensemble.view_model_descriptions()

                                               Description Accuracy
r_2      resnet trained on random training set with wei...     0.67
r_4      resnet trained on random training set with wei...     0.65
r_5      resnet trained on ensemble training set with w...     0.67
r_6      resnet trained on mix of training sets with we...     0.67
r_7      resnet trained on mix of training sets with we...     0.67
r_10     resnet trained on mix of training sets with we...     0.68
r_15     resnet trained on random training set with wei...     0.68
r_18     resnet trained on random training set with wei...     0.69
r_22     resnet trained on random training set with wei...     0.66
r_26     resnet trained on random training set with wei...     0.65
r_30     resnet trained on random training set with wei...     0.65
r34_3    resnet34 trained on random training set with w...     0.71
r34_5    resnet34 trained on random training set withou...     0.74
r34_6    resnet34 trained on random training set

#### Model descriptions shorted by descending accuracy

In [496]:
desc_df = freesound_ensemble.view_sorted_model_descriptions()
desc_df.index

Index(['lgbm', 'rx_5', 'r34_10', 'rx_9', 'r34_14', 'r_33', 'rx_13', 'rx_7',
       'catboost', 'r34_5', 'r34_11', 'r34_15', 'r34_8', 'rx_2', 'r34_7',
       'rx_11', 'r34_3', 'r34ev_3', 'rx_3', 'r_18', 'alex_16', 'r_15', 'r34_6',
       'r_10', 'r_7', 'r_5', 'alex_6', 'r_6', 'r_2', 'r101_2', 'r_raw_1',
       'r_22', 'r_30', 'r_4', 'r34_12', 'r34b_1', 'r_26', 'r31', 'r34b_2',
       'xbg_1', 'xbg_2', 'xbg_3', 'rb_1', 'rb_2', 'rb_3'],
      dtype='object')

#### Try out an ensemble

In [521]:
bs = 56; lr = 0.003; n_epochs = 3
mnames = ['alex_16', 'catboost', 'xbg_3',
 'alex_6',
 'lgbm',
 'r34_10',
 'r34_15',
 'r34_5',
 'r_10',
 'r_22',
 'r_raw_1',
 'rx_13',
 'rx_5',
 'rx_7',
 'rx_9']

In [529]:
freesound_ensemble.ensemble_magic(bs, lr, n_epochs, mnames)

This ensemble exists already, it's ID is: 569
Would you like to run another experiment on this ensemble?


Button(description='Yes', style=ButtonStyle())

HBox(children=(IntProgress(value=0, description='Epoch', max=3), HTML(value='')))

epoch      trn_loss   val_loss   accuracy                      
    0      -0.810172  -0.703729  0.812015  
    1      -0.969878  -0.744093  0.817193                      
    2      -0.9877    -0.757917  0.819265                      
Would you like to add the experiment to your report?


Button(description='Yes', style=ButtonStyle())

    accuracy          datetime  ensemble_ID      mapk
0   0.819265  2018-07-04 19:42          569  0.860953
1   0.818747  2018-07-04 19:31          569  0.862765
2   0.814604  2018-07-04 15:54          461  0.858795
3   0.812532  2018-07-04 15:52          461  0.855688
4   0.801139  2018-07-04 15:44          558  0.849991
5   0.816157  2018-07-04 15:23          461  0.860608
6   0.811497  2018-07-04 15:22          461  0.855602
7   0.812532  2018-07-04 15:22          461  0.856465
8   0.811497  2018-07-04 15:21          445  0.856033
9   0.810979  2018-07-04 15:16          498  0.856810
10  0.813050  2018-07-04 13:52          467  0.855774
11  0.811497  2018-07-04 13:41          466  0.856119
12  0.812015  2018-07-04 13:41          466  0.856637
13  0.812015  2018-07-04 13:31          465  0.856119
14  0.810979  2018-07-04 13:18          464  0.856810
15  0.810461  2018-07-04 13:11          461  0.856551
16  0.815122  2018-07-04 08:48          461  0.859313
17  0.812532  2018-07-03 14:

Button(description='Yes', style=ButtonStyle())

In [478]:
freesound_ensemble.ensemble_magic(bs, lr, n_epochs, ensemble_ID=461)

HBox(children=(IntProgress(value=0, description='Epoch', max=3), HTML(value='')))

epoch      trn_loss   val_loss   accuracy                      
    0      -0.787297  -0.696785  0.809425  
    1      -0.965889  -0.740519  0.814604                      
    2      -0.985863  -0.754741  0.814604                      
Would you like to add the experiment to your report?


Button(description='Yes', style=ButtonStyle())

    accuracy          datetime  ensemble_ID      mapk
0   0.814604  2018-07-04 15:54          461  0.858795
1   0.812532  2018-07-04 15:52          461  0.855688
2   0.801139  2018-07-04 15:44          558  0.849991
3   0.816157  2018-07-04 15:23          461  0.860608
4   0.811497  2018-07-04 15:22          461  0.855602
5   0.812532  2018-07-04 15:22          461  0.856465
6   0.811497  2018-07-04 15:21          445  0.856033
7   0.810979  2018-07-04 15:16          498  0.856810
8   0.813050  2018-07-04 13:52          467  0.855774
9   0.811497  2018-07-04 13:41          466  0.856119
10  0.812015  2018-07-04 13:41          466  0.856637
11  0.812015  2018-07-04 13:31          465  0.856119
12  0.810979  2018-07-04 13:18          464  0.856810
13  0.810461  2018-07-04 13:11          461  0.856551
14  0.815122  2018-07-04 08:48          461  0.859313
15  0.812532  2018-07-03 14:43          414  0.855688
16  0.806318  2018-07-03 14:36          385  0.851890
17  0.807354  2018-07-03 13:

Button(description='Yes', style=ButtonStyle())

An ens_learn object for this ensemble exists already, 
would you like to replace it?


Button(description='Yes', style=ButtonStyle())

In [None]:
freesound_ensemble.add2report.

#### View report

In [492]:
freesound_ensemble.view_report()

    accuracy          datetime  ensemble_ID      mapk
0   0.818747  2018-07-04 19:31          569  0.862765
1   0.814604  2018-07-04 15:54          461  0.858795
2   0.812532  2018-07-04 15:52          461  0.855688
3   0.801139  2018-07-04 15:44          558  0.849991
4   0.816157  2018-07-04 15:23          461  0.860608
5   0.811497  2018-07-04 15:22          461  0.855602
6   0.812532  2018-07-04 15:22          461  0.856465
7   0.811497  2018-07-04 15:21          445  0.856033
8   0.810979  2018-07-04 15:16          498  0.856810
9   0.813050  2018-07-04 13:52          467  0.855774
10  0.811497  2018-07-04 13:41          466  0.856119
11  0.812015  2018-07-04 13:41          466  0.856637
12  0.812015  2018-07-04 13:31          465  0.856119
13  0.810979  2018-07-04 13:18          464  0.856810
14  0.810461  2018-07-04 13:11          461  0.856551
15  0.815122  2018-07-04 08:48          461  0.859313
16  0.812532  2018-07-03 14:43          414  0.855688
17  0.806318  2018-07-03 14:

#### Model importance report

Model importance is calculated as the difference of accuracy / mapk when a single model is removed from an ensemble. The report calculates model importance for all models included in the ensemble passed in. The report is sorted according to descending accuracy importance.    

In [394]:
ensemble_ID = 545
bs = 56; lr = 0.0041; n_epochs = 3

In [395]:
freesound_ensemble.model_importance(ensemble_ID, bs, lr, n_epochs)

HBox(children=(IntProgress(value=0, description='Epoch', max=3), HTML(value='')))

epoch      trn_loss   val_loss   accuracy                      
    0      -0.789506  -0.708914  0.815122  
    1      -0.959631  -0.75136   0.816675                      
    2      -0.980802  -0.764941  0.817711                      

ensemble_ID: 548


HBox(children=(IntProgress(value=0, description='Epoch', max=3), HTML(value='')))

epoch      trn_loss   val_loss   accuracy                      
    0      -0.757449  -0.690187  0.812532  
    1      -0.955858  -0.740928  0.816157                      
    2      -0.980167  -0.756487  0.819265                     

ensemble_ID: 549


HBox(children=(IntProgress(value=0, description='Epoch', max=3), HTML(value='')))

epoch      trn_loss   val_loss   accuracy                      
    0      -0.765674  -0.693531  0.81564   
    1      -0.956815  -0.742264  0.817193                      
    2      -0.980198  -0.757527  0.8203                       

ensemble_ID: 550


HBox(children=(IntProgress(value=0, description='Epoch', max=3), HTML(value='')))

epoch      trn_loss   val_loss   accuracy                      
    0      -0.752454  -0.691728  0.814086  
    1      -0.950924  -0.74179   0.818229                      
    2      -0.97629   -0.757229  0.819265                      

ensemble_ID: 551


HBox(children=(IntProgress(value=0, description='Epoch', max=3), HTML(value='')))

epoch      trn_loss   val_loss   accuracy                      
    0      -0.760953  -0.703146  0.811497  
    1      -0.956017  -0.749335  0.815122                      
    2      -0.979463  -0.763239  0.81564                      

ensemble_ID: 552


HBox(children=(IntProgress(value=0, description='Epoch', max=3), HTML(value='')))

epoch      trn_loss   val_loss   accuracy                      
    0      -0.75224   -0.699729  0.814604  
    1      -0.952021  -0.747141  0.814604                      
    2      -0.976993  -0.761379  0.816675                      

ensemble_ID: 553


HBox(children=(IntProgress(value=0, description='Epoch', max=3), HTML(value='')))

epoch      trn_loss   val_loss   accuracy                      
    0      -0.753448  -0.692553  0.815122  
    1      -0.95244   -0.74332   0.816157                      
    2      -0.977616  -0.758766  0.816675                      

ensemble_ID: 554


HBox(children=(IntProgress(value=0, description='Epoch', max=3), HTML(value='')))

epoch      trn_loss   val_loss   accuracy                      
    0      -0.745251  -0.69793   0.813568  
    1      -0.950298  -0.748181  0.818747                      
    2      -0.976158  -0.763044  0.820818                      

ensemble_ID: 555


HBox(children=(IntProgress(value=0, description='Epoch', max=3), HTML(value='')))

epoch      trn_loss   val_loss   accuracy                      
    0      -0.764093  -0.695425  0.818229  
    1      -0.957092  -0.744584  0.820818                      
    2      -0.980518  -0.759837  0.822372                      

ensemble_ID: 556


HBox(children=(IntProgress(value=0, description='Epoch', max=3), HTML(value='')))

epoch      trn_loss   val_loss   accuracy                      
    0      -0.746298  -0.688079  0.81564   
    1      -0.949764  -0.740583  0.815122                      
    2      -0.97586   -0.756184  0.814086                      

ensemble_ID: 557


HBox(children=(IntProgress(value=0, description='Epoch', max=3), HTML(value='')))

epoch      trn_loss   val_loss   accuracy                      
    0      -0.746134  -0.700669  0.808389  
    1      -0.947916  -0.748495  0.808389                      
    2      -0.973677  -0.762684  0.810979                      
   accuracy      mapk      model
9  0.006732  0.004920     {lgbm}
8  0.003625  0.003107   {r34_10}
3  0.002071  0.001726    {xbg_3}
4  0.001036  0.003539  {r_raw_1}
5  0.001036  0.002071     {rx_7}
0 -0.001554 -0.000345     {rx_9}
2 -0.001554  0.000259     {rx_5}
1 -0.002589  0.000432  {r34ev_3}
6 -0.003107 -0.002762  {alex_16}
7 -0.004661 -0.003194   {r34_15}


#### View models in an ensemble

In [497]:
ensemble_ID = 569

In [498]:
freesound_ensemble.view_models_in_ensemble(ensemble_ID)

{'alex_16',
 'alex_6',
 'catboost',
 'lgbm',
 'r34_10',
 'r34_15',
 'r34_5',
 'r_10',
 'r_22',
 'r_raw_1',
 'rx_13',
 'rx_5',
 'rx_7',
 'rx_9',
 'xbg_3'}

In [None]:
'lgbm', 'rx_5', 'r34_10', 'rx_9', 'r34_14', 'r_33', 'rx_13', 'rx_7',
       'catboost', 'r34_5', 'r34_11', 'r34_15', 'r34_8', 'rx_2', 'r34_7',
       'rx_11', 'r34_3', 'r34ev_3', 'rx_3', 'r_18', 'alex_16', 'r_15', 'r34_6',
       'r_10', 'r_7', 'r_5', 'alex_6', 'r_6', 'r_2', 'r101_2', 'r_raw_1',
       'r_22', 'r_30', 'r_4', 'r34_12', 'r34b_1', 'r_26', 'r31', 'r34b_2',
       'xbg_1', 'xbg_2', 'xbg_3', 'rb_1', 'rb_2', 'rb_3'

#### Create a submission file

In [493]:
ensemble_ID = 569
name = 'submission_9.csv' 

freesound_ensemble.create_submission(ensemble_ID, name)

Submission saved to data/freesound/ensemble_files/submission_9.csv


In [494]:
submission_df = pd.read_csv(PATH/f'ensemble_files/{name}')

for idx, fname in bad_files_d.items():    
    row = pd.DataFrame({"fname": fname, "label": 'Laughter Hi-Hat Flute'}, index=[idx])
    submission_df = pd.concat([submission_df.iloc[:idx], row, submission_df.iloc[idx:]])

submission_df.to_csv(PATH/f'ensemble_files/{name}', index=False)
len(submission_df)

9400