In [1]:
import os, sys

# Step from the notebook's folder up to the project root, presumably so that the
# './hyperparams/...' and './checkpoint/...' paths used by later cells resolve.
# The guard makes the cell idempotent: an unconditional chdir climbs one more
# directory level every time the cell is re-run.
# NOTE(review): 'hyperparams' is used as the project-root marker because the
# config file is opened from './hyperparams/...' further down - confirm that this
# folder only exists at the project root.
if not os.path.isdir('hyperparams'):
    os.chdir('../')

In [2]:
import argparse
import torch
from tqdm import tqdm
import data_loader.data_loaders as module_data
import loss as module_loss
import model.metric as module_metric
import model.model as module_arch

import easydict
import torch.nn as nn
import torch.nn.functional as F
import sys
import os
import json
import random
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt 

import data_loader.data_loaders as module_data
import model.model as module_arch

from selection.svd_classifier import *
from selection.gmm import *
from selection.util import *

from utils.parse_config import ConfigParser
from utils.util import *
from utils.args import *

In [3]:
# Hyper-parameter file for this run; the parsed dictionary is what the cells
# below receive as `config`.
config_file = './hyperparams/multistep/config_cifar100_cce_rn34.json'
with open(config_file, 'r') as f:
    # Read the whole file first, then parse the text as JSON.
    config = json.loads(f.read())

# resume_path = './rn34/multistep_asym_40_elr.pth'

In [4]:
def decode(path):
    """Read the noise settings encoded in an underscore-separated file name.

    The name is split on '_'. Field 2 is compared with 'asym', field 3 is read
    as a noise percentage, and field 4 (up to its first '.') is returned as is.
    For a name shaped like 'a_b_asym_40_elr.pth' the result is (True, 0.4, 'elr').

    Returns:
        Tuple (is_asym, noise_fraction, tag).

    Raises:
        IndexError: the name has fewer than five '_'-separated fields.
        ValueError: field 3 is not a number.
    """
    fields = path.split('_')
    is_asym = fields[2] == 'asym'
    noise_fraction = float(fields[3]) * 0.01  # percent -> fraction
    tag, _, _ = fields[4].partition('.')
    return is_asym, noise_fraction, tag

In [5]:
def make_parse(resume_path, config, noise_rate, noisetype):
    """Build the option object for one checkpoint and record its noise settings.

    Args:
        resume_path: stored as ``load_name`` in the returned options.
        config: configuration mapping; ``config['trainer']`` is modified in place.
        noise_rate: written to ``config['trainer']['percent']``.
        noisetype: written to ``config['trainer']['asym']``.

    Returns:
        Tuple (options, config). ``options`` is an EasyDict with ``load_name``,
        ``reinit`` (always False) and ``distill_mode`` (always 'fine-kmeans');
        ``config`` is the same object that was passed in.
    """
    options = easydict.EasyDict(dict(
        load_name=resume_path,
        reinit=False,
        distill_mode='fine-kmeans',
    ))

    # The caller's mapping is updated directly, not a copy of it.
    for key, value in (('percent', noise_rate), ('asym', noisetype)):
        config['trainer'][key] = value

    return options, config

In [10]:
def return_statistics(dataloader, clean_labels, datanum):
    """Score a clean-sample selection against the flags from ``compute_noiseratio``.

    Flags equal to 1 are counted as clean and flags equal to 0 as noisy; a
    "positive" is a sample that was selected as clean. A two-part report is
    printed as a side effect.

    Args:
        dataloader: forwarded unchanged to ``compute_noiseratio``.
        clean_labels: iterable of sample indices that were selected as clean.
        datanum: total number of samples, i.e. the length of the selection mask.

    Returns:
        Tuple (selected, precision, recall, specificity, accuracy, F1). The four
        ratios in the middle are rounded to 4 decimals; F1 is returned unrounded.
    """
    # Selection mask: 1 where the index was picked as clean, 0 everywhere else.
    selected_mask = np.zeros(datanum)
    for sample_idx in clean_labels:
        selected_mask[sample_idx] = 1

    flags = compute_noiseratio(dataloader)
    flags_of_selected = flags[selected_mask == 1]
    flags_of_rejected = flags[selected_mask == 0]

    noisy_total = (flags == 0).sum()
    clean_total = (flags == 1).sum()

    tp = (flags_of_selected == 1).sum()  # selected and clean
    tn = (flags_of_rejected == 0).sum()  # rejected and noisy
    fp = noisy_total - tn                # noisy but selected
    fn = clean_total - tp                # clean but rejected

    print('Noisy: {}, Clean: {}'.format(noisy_total, clean_total))

    selected_count = int(fp + tp)
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    specificity = tn / (tn + fp)
    accuracy = (tp + tn) / (tp + tn + fp + fn)
    frac_clean = tp / (fp + tp)
    F1 = tp / (tp + (fp + fn) / 2)

    ratios = (precision, recall, specificity, accuracy, frac_clean)
    report = [selected_count] + [round(value, 4) for value in ratios]
    print('Selected samples: {} \nPrecision: {} \nRecall: {} \nSpecificity: {}\nAccuracy: {} \nFraction of clean samples/selected samples: {}'.format(*report))

    # frac_clean is only printed; F1 takes its place in the returned tuple.
    return tuple(report[:5]) + (F1,)

In [7]:
def extract_cleanidx(teacher, data_loader, parse, print_statistics = True):
    """Load a checkpoint into ``teacher``, select clean samples and score the selection.

    Args:
        teacher: model whose weights are replaced by the checkpoint's 'state_dict'.
            It is moved to the GPU and its parameters are frozen.
        data_loader: loader handed to the feature/loss helpers and to
            ``return_statistics``.
        parse: options object. ``load_name`` is the checkpoint path relative to
            './checkpoint/'; ``distill_mode`` picks the selection method ('fine' or
            'loss' must occur in it). ``reinit`` is not consulted: the checkpoint is
            always loaded, which is also what happened when it was consulted.
        print_statistics: when False, the report that ``return_statistics`` prints
            to stdout is suppressed. The statistics are computed and returned
            either way.

    Returns:
        Tuple (selected, precision, recall, specificity, accuracy, F1) exactly as
        produced by ``return_statistics``.

    Raises:
        NotImplementedError: ``parse.distill_mode`` contains neither 'fine' nor 'loss'.
    """
    import contextlib
    import io

    # NOTE(review): torch.load unpickles the file - only use trusted checkpoints.
    # The checkpoint is read once; a second, identical load added nothing.
    checkpoint = torch.load('./checkpoint/' + parse.load_name)
    teacher.load_state_dict(checkpoint['state_dict'])
    teacher = teacher.cuda()

    # The teacher is only evaluated here, so no gradients are needed.
    for params in teacher.parameters():
        params.requires_grad = False

    if 'fine' in parse.distill_mode:
        features, labels = get_features(teacher, data_loader)
        clean_labels = fine(current_features=features, current_labels=labels, fit=parse.distill_mode)
    elif 'loss' in parse.distill_mode:
        clean_labels, labels = cleansing_loss(teacher, data_loader)
    else:
        # `raise NotImplemented` would fail with a TypeError: it is a constant, not an exception.
        raise NotImplementedError('unsupported distill_mode: {!r}'.format(parse.distill_mode))

    if print_statistics:
        return return_statistics(data_loader, clean_labels, datanum=len(labels))

    # Callers always unpack six values, so compute them in silent mode too.
    with contextlib.redirect_stdout(io.StringIO()):
        return return_statistics(data_loader, clean_labels, datanum=len(labels))

In [8]:
def make_pd_list(root, config, log_filename):
    """Evaluate clean-sample selection for every 'c100' checkpoint found in ``root``.

    For each matching file the noise settings are decoded from its name, a fresh
    data loader is built with those settings, and ``extract_cleanidx`` scores the
    selection. One row per checkpoint is collected, and the table is written to
    ``log_filename`` after every checkpoint so an interrupted run keeps its
    finished rows.

    Args:
        root: directory that is listed for checkpoint files. The same files are
            the ones that get loaded.
        config: configuration mapping. Reads 'seed' and 'data_loader'; its
            'trainer' section is updated per checkpoint by ``make_parse``.
        log_filename: CSV path, rewritten after each checkpoint.

    Returns:
        pandas.DataFrame with the columns noisetype, noiserate, lossfunction,
        selected, precision, recall, specificity, accuracy and F1, one row per
        checkpoint in sorted file-name order.
    """
    seed = config['seed']
    random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    np.random.seed(seed)

    # os.listdir returns entries in arbitrary order; sorting keeps the row order
    # stable between runs and machines.
    pathlist = sorted(path for path in os.listdir(root) if 'c100' in path)

    # One model instance is reused; every checkpoint overwrites its weights.
    model = module_arch.resnet34(num_classes=100)

    logcolumns = ['noisetype', 'noiserate', 'lossfunction', 'selected', 'precision', 'recall', 'specificity', 'accuracy', 'F1']
    # Rows are collected as records and the frame is rebuilt from them. This avoids
    # writing strings into float-initialised columns and keeps placeholder zero
    # rows out of a partially written log.
    records = []
    log_pd = pd.DataFrame(records, columns=logcolumns)

    for filename in pathlist:
        noisetype, noiserate, lossfunction = decode(filename)

        # extract_cleanidx prepends './checkpoint/' to load_name, so express the
        # file relative to that folder. This makes `root` the single source of truth.
        load_name = os.path.relpath(os.path.join(root, filename), './checkpoint/')
        parse, config = make_parse(load_name, config, noiserate, noisetype)

        # The loader is rebuilt for every checkpoint because `config` now carries
        # this checkpoint's noise settings.
        loader_args = config['data_loader']['args']
        data_loader = getattr(module_data, config['data_loader']['type'])(
            loader_args['data_dir'],
            batch_size=100,
            shuffle=False,
            validation_split=0.0,
            num_batches=loader_args['num_batches'],
            training=True,
            num_workers=loader_args['num_workers'],
            pin_memory=loader_args['pin_memory'],
            config=config)

        selected, precision, recall, specificity, accuracy, F1 = extract_cleanidx(model, data_loader, parse)
        records.append([str(noisetype), str(noiserate), lossfunction, selected, precision, recall, specificity, accuracy, F1])

        # Save after every checkpoint so progress survives an interruption.
        log_pd = pd.DataFrame(records, columns=logcolumns)
        log_pd.to_csv(log_filename)

    return log_pd

In [None]:
# Score every 'c100' checkpoint under ./checkpoint/rn34/ and log the results to CSV.
# The returned DataFrame is the cell's last expression, so the notebook displays it.
make_pd_list(
    root='./checkpoint/rn34/',
    config=config,
    log_filename='c100_n_fine_pretrained_statistics.csv',
)

Files already downloaded and verified
Train: 50000 Val: 0


100%|██████████| 500/500 [00:31<00:00, 16.02it/s]
100%|██████████| 100/100 [00:09<00:00, 10.25it/s]
100%|██████████| 50000/50000 [00:00<00:00, 60268.81it/s]
100%|██████████| 500/500 [00:05<00:00, 105.49it/s]


Noisy: 9993, Clean: 40007
Selected samples: 37441 
Precision: 0.9972 
Recall: 0.9333 
Specificity: 0.9896
Accuracy: 0.9445 
Fraction of clean samples/selected samples: 0.9972
Files already downloaded and verified
Train: 50000 Val: 0


100%|██████████| 500/500 [00:24<00:00, 14.58it/s]
100%|██████████| 100/100 [00:07<00:00, 13.92it/s]
100%|██████████| 50000/50000 [00:00<00:00, 103425.98it/s]
100%|██████████| 500/500 [00:04<00:00, 117.35it/s]


Noisy: 9993, Clean: 40007
Selected samples: 37174 
Precision: 0.9838 
Recall: 0.9141 
Specificity: 0.9396
Accuracy: 0.9192 
Fraction of clean samples/selected samples: 0.9838
Files already downloaded and verified
Train: 50000 Val: 0


100%|██████████| 500/500 [00:38<00:00, 12.92it/s]
100%|██████████| 100/100 [00:07<00:00, 14.92it/s]
100%|██████████| 50000/50000 [00:00<00:00, 114075.96it/s]
100%|██████████| 500/500 [00:04<00:00, 107.90it/s]


Noisy: 9905, Clean: 40095
Selected samples: 35565 
Precision: 0.9491 
Recall: 0.8419 
Specificity: 0.8173
Accuracy: 0.837 
Fraction of clean samples/selected samples: 0.9491
Files already downloaded and verified
Train: 50000 Val: 0


100%|██████████| 500/500 [00:31<00:00, 13.37it/s]
100%|██████████| 100/100 [00:08<00:00, 12.61it/s]
100%|██████████| 50000/50000 [00:00<00:00, 78289.16it/s]
100%|██████████| 500/500 [00:04<00:00, 110.46it/s]


Noisy: 14857, Clean: 35143
Selected samples: 32598 
Precision: 0.9301 
Recall: 0.8627 
Specificity: 0.8465
Accuracy: 0.8579 
Fraction of clean samples/selected samples: 0.9301
Files already downloaded and verified
Train: 50000 Val: 0


100%|██████████| 500/500 [00:36<00:00, 11.03it/s]
100%|██████████| 100/100 [00:07<00:00, 13.85it/s]
100%|██████████| 50000/50000 [00:00<00:00, 97406.22it/s] 
100%|██████████| 500/500 [00:04<00:00, 121.36it/s]


Noisy: 19856, Clean: 30144
Selected samples: 28841 
Precision: 0.7914 
Recall: 0.7572 
Specificity: 0.697
Accuracy: 0.7333 
Fraction of clean samples/selected samples: 0.7914
Files already downloaded and verified
Train: 50000 Val: 0


100%|██████████| 500/500 [00:41<00:00,  9.45it/s]
100%|██████████| 100/100 [00:07<00:00, 13.13it/s]
100%|██████████| 50000/50000 [00:00<00:00, 83445.12it/s]
100%|██████████| 500/500 [00:04<00:00, 118.50it/s]


Noisy: 14857, Clean: 35143
Selected samples: 31463 
Precision: 0.9556 
Recall: 0.8556 
Specificity: 0.906
Accuracy: 0.8706 
Fraction of clean samples/selected samples: 0.9556
Files already downloaded and verified
Train: 50000 Val: 0


100%|██████████| 500/500 [00:22<00:00, 22.03it/s]
100%|██████████| 100/100 [00:07<00:00, 14.39it/s]
100%|██████████| 50000/50000 [00:00<00:00, 72596.13it/s]
100%|██████████| 500/500 [00:04<00:00, 103.16it/s]


Noisy: 19856, Clean: 30144
Selected samples: 31697 
Precision: 0.7314 
Recall: 0.7691 
Specificity: 0.5712
Accuracy: 0.6905 
Fraction of clean samples/selected samples: 0.7314
Files already downloaded and verified
Train: 50000 Val: 0


100%|██████████| 500/500 [00:30<00:00, 13.79it/s]
100%|██████████| 100/100 [00:07<00:00, 14.82it/s]
100%|██████████| 50000/50000 [00:00<00:00, 91061.35it/s] 
100%|██████████| 500/500 [00:05<00:00, 98.46it/s] 


Noisy: 29703, Clean: 20297
Selected samples: 28910 
Precision: 0.6517 
Recall: 0.9283 
Specificity: 0.661
Accuracy: 0.7695 
Fraction of clean samples/selected samples: 0.6517
Files already downloaded and verified
Train: 50000 Val: 0


100%|██████████| 500/500 [00:40<00:00, 12.42it/s]
100%|██████████| 100/100 [00:08<00:00, 11.72it/s]
100%|██████████| 50000/50000 [00:00<00:00, 60051.91it/s]
100%|██████████| 500/500 [00:04<00:00, 100.33it/s]


Noisy: 39597, Clean: 10403
Selected samples: 20358 
Precision: 0.4501 
Recall: 0.8808 
Specificity: 0.7173
Accuracy: 0.7513 
Fraction of clean samples/selected samples: 0.4501
Files already downloaded and verified
Train: 50000 Val: 0


100%|██████████| 500/500 [00:29<00:00, 16.88it/s]
100%|██████████| 100/100 [00:07<00:00, 12.53it/s]
100%|██████████| 50000/50000 [00:00<00:00, 76221.52it/s]
100%|██████████| 500/500 [00:04<00:00, 113.36it/s]


Noisy: 4969, Clean: 45031
Selected samples: 37850 
Precision: 0.9999 
Recall: 0.8404 
Specificity: 0.9992
Accuracy: 0.8562 
Fraction of clean samples/selected samples: 0.9999
Files already downloaded and verified
Train: 50000 Val: 0


100%|██████████| 500/500 [00:42<00:00, 11.79it/s]
100%|██████████| 100/100 [00:07<00:00, 15.09it/s]
100%|██████████| 50000/50000 [00:00<00:00, 81610.68it/s]
100%|██████████| 500/500 [00:04<00:00, 116.80it/s]


Noisy: 29703, Clean: 20297
Selected samples: 24285 
Precision: 0.771 
Recall: 0.9225 
Specificity: 0.8127
Accuracy: 0.8573 
Fraction of clean samples/selected samples: 0.771
Files already downloaded and verified
Train: 50000 Val: 0


100%|██████████| 500/500 [00:36<00:00, 13.78it/s]
100%|██████████| 100/100 [00:09<00:00, 10.84it/s]
100%|██████████| 50000/50000 [00:00<00:00, 81607.66it/s]
100%|██████████| 500/500 [00:04<00:00, 110.58it/s]


Noisy: 19797, Clean: 30203
Selected samples: 28494 
Precision: 0.9682 
Recall: 0.9134 
Specificity: 0.9542
Accuracy: 0.9295 
Fraction of clean samples/selected samples: 0.9682
Files already downloaded and verified
Train: 50000 Val: 0


100%|██████████| 500/500 [00:27<00:00, 18.37it/s]
100%|██████████| 100/100 [00:07<00:00, 10.89it/s]
100%|██████████| 50000/50000 [00:00<00:00, 66336.29it/s]
100%|██████████| 500/500 [00:04<00:00, 104.09it/s]


Noisy: 4969, Clean: 45031
Selected samples: 42437 
Precision: 0.9994 
Recall: 0.9418 
Specificity: 0.995
Accuracy: 0.9471 
Fraction of clean samples/selected samples: 0.9994
Files already downloaded and verified
Train: 50000 Val: 0


100%|██████████| 500/500 [00:35<00:00, 12.74it/s]
100%|██████████| 100/100 [00:07<00:00, 13.39it/s]
100%|██████████| 50000/50000 [00:00<00:00, 86719.59it/s]
100%|██████████| 500/500 [00:04<00:00, 124.12it/s]


Noisy: 19797, Clean: 30203
Selected samples: 30160 
Precision: 0.9518 
Recall: 0.9504 
Specificity: 0.9266
Accuracy: 0.941 
Fraction of clean samples/selected samples: 0.9518
Files already downloaded and verified
Train: 50000 Val: 0


100%|██████████| 500/500 [00:30<00:00, 16.37it/s]
100%|██████████| 100/100 [00:07<00:00, 11.10it/s]
100%|██████████| 50000/50000 [00:00<00:00, 73368.94it/s]
100%|██████████| 500/500 [00:04<00:00, 106.27it/s]


Noisy: 14857, Clean: 35143
Selected samples: 31936 
Precision: 0.9874 
Recall: 0.8973 
Specificity: 0.9729
Accuracy: 0.9198 
Fraction of clean samples/selected samples: 0.9874
Files already downloaded and verified
Train: 50000 Val: 0


100%|██████████| 500/500 [00:36<00:00, 12.72it/s]
100%|██████████| 100/100 [00:07<00:00, 13.22it/s]
100%|██████████| 50000/50000 [00:00<00:00, 90509.69it/s]
100%|██████████| 500/500 [00:05<00:00, 88.20it/s]


Noisy: 4969, Clean: 45031
Selected samples: 42584 
Precision: 0.9988 
Recall: 0.9445 
Specificity: 0.9899
Accuracy: 0.9491 
Fraction of clean samples/selected samples: 0.9988
Files already downloaded and verified
Train: 50000 Val: 0


100%|██████████| 500/500 [00:45<00:00, 11.09it/s]
100%|██████████| 100/100 [00:09<00:00, 10.56it/s]
100%|██████████| 50000/50000 [00:00<00:00, 70795.92it/s] 
100%|██████████| 500/500 [00:04<00:00, 116.44it/s]


Noisy: 9905, Clean: 40095
Selected samples: 39132 
Precision: 0.9844 
Recall: 0.9608 
Specificity: 0.9384
Accuracy: 0.9563 
Fraction of clean samples/selected samples: 0.9844
Files already downloaded and verified
Train: 50000 Val: 0


100%|██████████| 500/500 [00:29<00:00, 17.03it/s]
100%|██████████| 100/100 [00:08<00:00, 12.44it/s]
100%|██████████| 50000/50000 [00:00<00:00, 76433.79it/s]
100%|██████████| 500/500 [00:04<00:00, 107.47it/s]


Noisy: 9993, Clean: 40007
Selected samples: 36944 
Precision: 0.9898 
Recall: 0.9141 
Specificity: 0.9625
Accuracy: 0.9237 
Fraction of clean samples/selected samples: 0.9898
Files already downloaded and verified
Train: 50000 Val: 0


100%|██████████| 500/500 [00:37<00:00,  9.31it/s]
100%|██████████| 100/100 [00:07<00:00, 13.05it/s]
100%|██████████| 50000/50000 [00:00<00:00, 95166.55it/s] 
100%|██████████| 500/500 [00:04<00:00, 123.25it/s]


Noisy: 19856, Clean: 30144
Selected samples: 28525 
Precision: 0.8884 
Recall: 0.8407 
Specificity: 0.8397
Accuracy: 0.8403 
Fraction of clean samples/selected samples: 0.8884
Files already downloaded and verified
Train: 50000 Val: 0


100%|██████████| 500/500 [00:28<00:00, 18.61it/s]
100%|██████████| 100/100 [00:09<00:00, 10.33it/s]
100%|██████████| 50000/50000 [00:00<00:00, 76381.45it/s]
100%|██████████| 500/500 [00:05<00:00, 88.17it/s] 


Noisy: 19797, Clean: 30203
Selected samples: 29532 
Precision: 0.9377 
Recall: 0.9169 
Specificity: 0.9071
Accuracy: 0.913 
Fraction of clean samples/selected samples: 0.9377
Files already downloaded and verified
Train: 50000 Val: 0


100%|██████████| 500/500 [00:30<00:00, 16.34it/s]
100%|██████████| 100/100 [00:08<00:00,  9.38it/s]
100%|██████████| 50000/50000 [00:00<00:00, 92325.32it/s] 
100%|██████████| 500/500 [00:05<00:00, 99.47it/s] 


Noisy: 39597, Clean: 10403
Selected samples: 27976 
Precision: 0.3172 
Recall: 0.853 
Specificity: 0.5176
Accuracy: 0.5874 
Fraction of clean samples/selected samples: 0.3172
Files already downloaded and verified
Train: 50000 Val: 0


100%|██████████| 500/500 [00:41<00:00, 12.14it/s]
100%|██████████| 100/100 [00:07<00:00, 13.25it/s]
100%|██████████| 50000/50000 [00:00<00:00, 99548.20it/s] 
100%|██████████| 500/500 [00:04<00:00, 109.35it/s]


Noisy: 14857, Clean: 35143
Selected samples: 28934 
Precision: 0.9968 
Recall: 0.8207 
Specificity: 0.9938
Accuracy: 0.8721 
Fraction of clean samples/selected samples: 0.9968
Files already downloaded and verified
Train: 50000 Val: 0


100%|██████████| 500/500 [00:31<00:00, 16.02it/s]
100%|██████████| 100/100 [00:08<00:00, 12.38it/s]
100%|██████████| 50000/50000 [00:00<00:00, 79478.61it/s] 
100%|██████████| 500/500 [00:04<00:00, 111.16it/s]


Noisy: 19856, Clean: 30144
Selected samples: 29716 
Precision: 0.827 
Recall: 0.8153 
Specificity: 0.7411
Accuracy: 0.7858 
Fraction of clean samples/selected samples: 0.827
Files already downloaded and verified
Train: 50000 Val: 0


100%|██████████| 500/500 [00:42<00:00, 11.76it/s]
100%|██████████| 100/100 [00:08<00:00, 13.16it/s]
100%|██████████| 50000/50000 [00:00<00:00, 82704.56it/s] 
100%|██████████| 500/500 [00:04<00:00, 113.18it/s]


Noisy: 29703, Clean: 20297
Selected samples: 22816 
Precision: 0.8054 
Recall: 0.9054 
Specificity: 0.8505
Accuracy: 0.8728 
Fraction of clean samples/selected samples: 0.8054
Files already downloaded and verified
Train: 50000 Val: 0


100%|██████████| 500/500 [00:26<00:00, 18.63it/s]
100%|██████████| 100/100 [00:08<00:00, 14.05it/s]
100%|██████████| 50000/50000 [00:00<00:00, 72049.80it/s]
100%|██████████| 500/500 [00:04<00:00, 106.53it/s]


Noisy: 19797, Clean: 30203
Selected samples: 29452 
Precision: 0.9488 
Recall: 0.9252 
Specificity: 0.9238
Accuracy: 0.9246 
Fraction of clean samples/selected samples: 0.9488
Files already downloaded and verified
Train: 50000 Val: 0


100%|██████████| 500/500 [00:25<00:00, 19.90it/s]
100%|██████████| 100/100 [00:09<00:00, 13.66it/s]
100%|██████████| 50000/50000 [00:00<00:00, 72918.14it/s] 
100%|██████████| 500/500 [00:04<00:00, 106.72it/s]


Noisy: 39597, Clean: 10403
Selected samples: 27181 
Precision: 0.321 
Recall: 0.8386 
Specificity: 0.5339
Accuracy: 0.5973 
Fraction of clean samples/selected samples: 0.321
Files already downloaded and verified
Train: 50000 Val: 0


100%|██████████| 500/500 [00:39<00:00, 12.74it/s]
100%|██████████| 100/100 [00:08<00:00, 11.37it/s]
100%|██████████| 50000/50000 [00:00<00:00, 55961.06it/s]
100%|██████████| 500/500 [00:05<00:00, 116.42it/s]


Noisy: 29703, Clean: 20297
Selected samples: 22034 
Precision: 0.8561 
Recall: 0.9293 
Specificity: 0.8932
Accuracy: 0.9079 
Fraction of clean samples/selected samples: 0.8561
Files already downloaded and verified
Train: 50000 Val: 0


100%|██████████| 500/500 [00:28<00:00, 19.53it/s]
100%|██████████| 100/100 [00:08<00:00, 11.69it/s]
100%|██████████| 50000/50000 [00:00<00:00, 64309.08it/s]
100%|██████████| 500/500 [00:04<00:00, 115.17it/s]


Noisy: 4969, Clean: 45031
Selected samples: 42859 
Precision: 0.9957 
Recall: 0.9477 
Specificity: 0.9632
Accuracy: 0.9492 
Fraction of clean samples/selected samples: 0.9957
Files already downloaded and verified
Train: 50000 Val: 0


 49%|████▊     | 243/500 [00:20<00:23, 11.04it/s]