In [1]:
import os, sys
os.chdir('../')

In [2]:
import argparse
import torch
from tqdm import tqdm
import data_loader.data_loaders as module_data
import loss as module_loss
import model.metric as module_metric
import model.model as module_arch

import easydict
import torch.nn as nn
import torch.nn.functional as F
import sys
import os
import json
import random
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt 

import data_loader.data_loaders as module_data
import model.model as module_arch

from selection.svd_classifier import *
from selection.gmm import *
from selection.util import *

from utils.parse_config import ConfigParser
from utils.util import *
from utils.args import *

In [3]:
config_file = './hyperparams/multistep/config_cifar10_cce_rn34.json'
with open(config_file, 'r') as f:
    config = json.load(f)

# resume_path = './rn34/multistep_asym_40_elr.pth'

In [4]:
def decode(path):
    items = path.split('_')
    noisetype = True if items[1]=='asym' else False
    noiserate = float(items[2]) * 0.01
    
    return noisetype, noiserate, items[3].split('.')[0]

In [5]:
def make_parse(resume_path, config, noise_rate, noisetype):
    parse = easydict.EasyDict({
    "load_name" : resume_path,
    "reinit": False,
    "distill_mode": 'fine-kmeans'
    })
    
    config['trainer']['percent'] = noise_rate
    config['trainer']['asym'] = noisetype
    
    return parse, config

In [10]:
def return_statistics(dataloader, clean_labels, datanum):
    predict = np.zeros(datanum)
    for idx in clean_labels: predict[idx] = 1
        
    isNoisy_list = compute_noiseratio(dataloader)
    r_stats = []
    
    tp = (isNoisy_list[predict==1]==1).sum() # positive clean
    tn = (isNoisy_list[predict==0]==0).sum() # negative noisy
    fp = (isNoisy_list==0).sum() - tn
    fn = (isNoisy_list==1).sum() - tp
    
    print('Noisy: {}, Clean: {}'.format((isNoisy_list==0).sum(), (isNoisy_list==1).sum()))
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    specificity = tn / (tn + fp)
    accuracy = (tp + tn) / (tp + tn + fp + fn)
    sel_samples = int(fp + tp)
    frac_clean = tp / (fp + tp)
    F1 =  tp / (tp + (fp+fn)/2)

    r_stats.extend([sel_samples, round(precision, 4), round(recall, 4), round(specificity, 4), round(accuracy, 4), round(frac_clean, 4)])
    print('Selected samples: {} \nPrecision: {} \nRecall: {} \nSpecificity: {}\nAccuracy: {} \nFraction of clean samples/selected samples: {}'.format(r_stats[0], r_stats[1], r_stats[2], r_stats[3], r_stats[4], r_stats[5]))
    
    return r_stats[0], r_stats[1], r_stats[2], r_stats[3], r_stats[4], F1

In [7]:
def extract_cleanidx(teacher, data_loader, parse, print_statistics = True):
    teacher.load_state_dict(torch.load('./checkpoint/' + parse.load_name)['state_dict'])
    teacher = teacher.cuda()

    if not parse.reinit: teacher.load_state_dict(torch.load('./checkpoint/' + parse.load_name)['state_dict'])
    for params in teacher.parameters(): params.requires_grad = False
    
    if 'fine' in parse.distill_mode:
        features, labels = get_features(teacher, data_loader)
        clean_labels = fine(current_features=features, current_labels=labels, fit = parse.distill_mode)
    elif 'loss' in parse.distill_mode:
        clean_labels, labels = cleansing_loss(teacher, data_loader)
    else:
        raise NotImplemented 
    if print_statistics: 
        selected, precision, recall, specificity, accuracy, F1 = return_statistics(data_loader, clean_labels, datanum=len(labels))
    
    return selected, precision, recall, specificity, accuracy, F1

In [8]:
def make_pd_list(root, config, log_filename):
    random.seed(config['seed'])
    torch.manual_seed(config['seed'])
    torch.cuda.manual_seed_all(config['seed'])
    torch.backends.cudnn.deterministic = True
    np.random.seed(config['seed'])
    
    # load checkpoint path
    pathlist = os.listdir(root)
    pathlist = [path for path in pathlist if ('.pth' in path) and ('eigen' not in path) and ('kmeans' not in path) and ('c100') not in path]
    
    # initialize model
    model = module_arch.resnet34(num_classes=10)
    
    # make pandas file
    logcolumns = ['noisetype', 'noiserate', 'lossfunction', 'selected', 'precision', 'recall', 'specificity', 'accuracy', 'F1']
    log_pd = pd.DataFrame(np.zeros([len(pathlist), len(logcolumns)]), columns = logcolumns)
    
    # write pandas file
    noisetypelst = ['']
    for i in range(len(pathlist)):
        noisetype, noiserate, lossfunction = decode(pathlist[i])
        parse, config = make_parse('./rn34/' + pathlist[i], config, noiserate, noisetype)
        
        # load original dataloader
        data_loader = getattr(module_data, config['data_loader']['type'])(
        config['data_loader']['args']['data_dir'],
        batch_size= 100,
        shuffle=False,
        validation_split=0.0,
        num_batches=config['data_loader']['args']['num_batches'],
        training=True,
        num_workers=config['data_loader']['args']['num_workers'],
        pin_memory=config['data_loader']['args']['pin_memory'],
        config=config)
        
        selected, precision, recall, specificity, accuracy, F1 = extract_cleanidx(model, data_loader, parse)
        log_pd.loc[i] = [str(noisetype), str(noiserate), lossfunction, selected, precision, recall, specificity, accuracy, F1]
        log_pd.to_csv(log_filename)
        
    return log_pd

In [None]:
make_pd_list(root = './checkpoint/rn34/', config=config, log_filename = 'c10_n_fine_pretrained_statistics.csv')

Files already downloaded and verified
##############
[3 2 1 1 3 0 0 7 6 5]
[3 2 1 1 3 2 2 7 6 5]
Train: 50000 Val: 0


100%|██████████| 500/500 [00:28<00:00, 15.99it/s]
100%|██████████| 10/10 [00:23<00:00,  1.86s/it]
100%|██████████| 50000/50000 [00:00<00:00, 71004.19it/s]
100%|██████████| 500/500 [00:04<00:00, 105.88it/s]


Noisy: 9226, Clean: 40774
Selected samples: 40837 
Precision: 0.9349 
Recall: 0.9364 
Specificity: 0.7119
Accuracy: 0.8949 
Fraction of clean samples/selected samples: 0.9349
Files already downloaded and verified
##############
[8 9 1 9 4 8 3 6 3 6]
[3 2 1 1 3 2 2 7 6 5]
Train: 50000 Val: 0


100%|██████████| 500/500 [00:37<00:00,  9.46it/s]
100%|██████████| 10/10 [00:16<00:00,  1.66s/it]
100%|██████████| 50000/50000 [00:00<00:00, 79307.32it/s] 
100%|██████████| 500/500 [00:03<00:00, 125.98it/s]


Noisy: 36036, Clean: 13964
Selected samples: 12962 
Precision: 0.8709 
Recall: 0.8084 
Specificity: 0.9536
Accuracy: 0.913 
Fraction of clean samples/selected samples: 0.8709
Files already downloaded and verified
##############
[3 2 1 1 3 2 2 7 6 5]
[3 2 1 1 3 2 2 7 6 5]
Train: 50000 Val: 0


100%|██████████| 500/500 [00:36<00:00, 13.52it/s]
100%|██████████| 10/10 [00:21<00:00,  2.06s/it]
100%|██████████| 50000/50000 [00:01<00:00, 40224.03it/s]
100%|██████████| 500/500 [00:04<00:00, 114.02it/s]


Noisy: 2458, Clean: 47542
Selected samples: 45809 
Precision: 0.9969 
Recall: 0.9605 
Specificity: 0.9414
Accuracy: 0.9596 
Fraction of clean samples/selected samples: 0.9969
Files already downloaded and verified
##############
[3 9 1 1 3 8 2 7 6 5]
[3 2 1 1 3 2 2 7 6 5]
Train: 50000 Val: 0


100%|██████████| 500/500 [00:37<00:00, 13.23it/s]
100%|██████████| 10/10 [00:16<00:00,  1.63s/it]
100%|██████████| 50000/50000 [00:00<00:00, 83378.41it/s]
100%|██████████| 500/500 [00:03<00:00, 125.48it/s]


Noisy: 9006, Clean: 40994
Selected samples: 38698 
Precision: 0.9988 
Recall: 0.9428 
Specificity: 0.9948
Accuracy: 0.9522 
Fraction of clean samples/selected samples: 0.9988
Files already downloaded and verified
##############
[8 9 1 9 4 8 3 6 3 6]
[3 2 1 1 3 2 2 7 6 5]
Train: 50000 Val: 0


100%|██████████| 500/500 [00:41<00:00, 14.69it/s]
100%|██████████| 10/10 [00:21<00:00,  2.18s/it]
100%|██████████| 50000/50000 [00:00<00:00, 58989.08it/s]
100%|██████████| 500/500 [00:04<00:00, 109.23it/s]


Noisy: 36036, Clean: 13964
Selected samples: 12943 
Precision: 0.8942 
Recall: 0.8288 
Specificity: 0.962
Accuracy: 0.9248 
Fraction of clean samples/selected samples: 0.8942
Files already downloaded and verified
##############
[8 9 1 9 3 8 2 7 3 5]
[3 2 1 1 3 2 2 7 6 5]
Train: 50000 Val: 0


100%|██████████| 500/500 [00:26<00:00, 10.15it/s]
100%|██████████| 10/10 [00:17<00:00,  1.74s/it]
100%|██████████| 50000/50000 [00:00<00:00, 79150.45it/s] 
100%|██████████| 500/500 [00:04<00:00, 123.02it/s]


Noisy: 17975, Clean: 32025
Selected samples: 30524 
Precision: 0.9902 
Recall: 0.9438 
Specificity: 0.9834
Accuracy: 0.9581 
Fraction of clean samples/selected samples: 0.9902
Files already downloaded and verified
##############
[8 9 1 9 3 8 3 7 3 5]
[3 2 1 1 3 2 2 7 6 5]
Train: 50000 Val: 0


100%|██████████| 500/500 [00:40<00:00, 12.36it/s]
100%|██████████| 10/10 [00:17<00:00,  1.66s/it]
100%|██████████| 50000/50000 [00:00<00:00, 99461.23it/s]
100%|██████████| 500/500 [00:03<00:00, 126.21it/s]


Noisy: 27041, Clean: 22959
Selected samples: 21025 
Precision: 0.9645 
Recall: 0.8832 
Specificity: 0.9724
Accuracy: 0.9314 
Fraction of clean samples/selected samples: 0.9645
Files already downloaded and verified
##############
[3 9 1 1 3 8 2 7 6 5]
[3 2 1 1 3 2 2 7 6 5]
Train: 50000 Val: 0


100%|██████████| 500/500 [00:31<00:00, 11.41it/s]
100%|██████████| 10/10 [00:19<00:00,  1.81s/it]
100%|██████████| 50000/50000 [00:00<00:00, 73966.08it/s]
100%|██████████| 500/500 [00:04<00:00, 116.27it/s]


Noisy: 9006, Clean: 40994
Selected samples: 38366 
Precision: 0.9974 
Recall: 0.9335 
Specificity: 0.9891
Accuracy: 0.9435 
Fraction of clean samples/selected samples: 0.9974
Files already downloaded and verified
##############
[3 2 1 1 3 0 2 7 6 5]
[3 2 1 1 3 2 2 7 6 5]
Train: 50000 Val: 0


100%|██████████| 500/500 [00:42<00:00, 11.73it/s]
100%|██████████| 10/10 [00:20<00:00,  1.65s/it]
100%|██████████| 50000/50000 [00:00<00:00, 72988.13it/s]
100%|██████████| 500/500 [00:04<00:00, 120.10it/s]


Noisy: 7020, Clean: 42980
Selected samples: 40887 
Precision: 0.9816 
Recall: 0.9338 
Specificity: 0.8927
Accuracy: 0.928 
Fraction of clean samples/selected samples: 0.9816
Files already downloaded and verified
##############
[8 9 1 9 3 8 3 7 3 5]
[3 2 1 1 3 2 2 7 6 5]
Train: 50000 Val: 0


100%|██████████| 500/500 [00:31<00:00, 15.87it/s]
100%|██████████| 10/10 [00:20<00:00,  1.84s/it]
100%|██████████| 50000/50000 [00:00<00:00, 85406.42it/s]
100%|██████████| 500/500 [00:04<00:00, 110.40it/s]


Noisy: 27041, Clean: 22959
Selected samples: 22559 
Precision: 0.9495 
Recall: 0.933 
Specificity: 0.9579
Accuracy: 0.9464 
Fraction of clean samples/selected samples: 0.9495
Files already downloaded and verified
##############
[8 9 1 9 4 8 3 6 3 6]
[3 2 1 1 3 2 2 7 6 5]
Train: 50000 Val: 0


100%|██████████| 500/500 [00:35<00:00,  9.03it/s]
100%|██████████| 10/10 [00:18<00:00,  1.83s/it]
100%|██████████| 50000/50000 [00:00<00:00, 72336.09it/s]
100%|██████████| 500/500 [00:04<00:00, 114.66it/s]


Noisy: 36036, Clean: 13964
Selected samples: 16768 
Precision: 0.7241 
Recall: 0.8695 
Specificity: 0.8716
Accuracy: 0.871 
Fraction of clean samples/selected samples: 0.7241
Files already downloaded and verified
##############
[8 9 1 9 4 8 3 6 3 6]
[3 2 1 1 3 2 2 7 6 5]
Train: 50000 Val: 0


100%|██████████| 500/500 [00:36<00:00, 12.27it/s]
100%|██████████| 10/10 [00:19<00:00,  1.75s/it]
100%|██████████| 50000/50000 [00:00<00:00, 83723.72it/s]
100%|██████████| 500/500 [00:04<00:00, 109.97it/s]


Noisy: 36036, Clean: 13964
Selected samples: 19703 
Precision: 0.6529 
Recall: 0.9213 
Specificity: 0.8102
Accuracy: 0.8413 
Fraction of clean samples/selected samples: 0.6529
Files already downloaded and verified
##############
[3 2 1 1 3 0 2 7 6 5]
[3 2 1 1 3 2 2 7 6 5]
Train: 50000 Val: 0


100%|██████████| 500/500 [00:34<00:00, 14.66it/s]
100%|██████████| 10/10 [00:19<00:00,  1.63s/it]
100%|██████████| 50000/50000 [00:00<00:00, 76152.60it/s]
100%|██████████| 500/500 [00:05<00:00, 91.25it/s] 


Noisy: 7020, Clean: 42980
Selected samples: 41634 
Precision: 0.9783 
Recall: 0.9476 
Specificity: 0.8711
Accuracy: 0.9369 
Fraction of clean samples/selected samples: 0.9783
Files already downloaded and verified
##############
[3 9 1 1 3 8 2 7 6 5]
[3 2 1 1 3 2 2 7 6 5]
Train: 50000 Val: 0


100%|██████████| 500/500 [00:44<00:00, 11.23it/s]
100%|██████████| 10/10 [00:22<00:00,  2.11s/it]
100%|██████████| 50000/50000 [00:00<00:00, 62642.99it/s]
100%|██████████| 500/500 [00:05<00:00, 95.38it/s] 


Noisy: 9006, Clean: 40994
Selected samples: 39441 
Precision: 0.9975 
Recall: 0.9598 
Specificity: 0.9892
Accuracy: 0.9651 
Fraction of clean samples/selected samples: 0.9975
Files already downloaded and verified
##############
[3 2 1 1 3 0 2 7 6 5]
[3 2 1 1 3 2 2 7 6 5]
Train: 50000 Val: 0


100%|██████████| 500/500 [00:31<00:00, 16.12it/s]
100%|██████████| 10/10 [00:19<00:00,  1.74s/it]
100%|██████████| 50000/50000 [00:00<00:00, 76632.90it/s]
100%|██████████| 500/500 [00:04<00:00, 131.59it/s]


Noisy: 4786, Clean: 45214
Selected samples: 43125 
Precision: 0.994 
Recall: 0.9481 
Specificity: 0.9459
Accuracy: 0.9479 
Fraction of clean samples/selected samples: 0.994
Files already downloaded and verified
##############
[8 9 1 9 3 8 3 7 3 5]
[3 2 1 1 3 2 2 7 6 5]
Train: 50000 Val: 0


100%|██████████| 500/500 [00:38<00:00,  9.51it/s]
100%|██████████| 10/10 [00:22<00:00,  2.38s/it]
100%|██████████| 50000/50000 [00:00<00:00, 51725.81it/s]
100%|██████████| 500/500 [00:04<00:00, 109.51it/s]


Noisy: 27041, Clean: 22959
Selected samples: 20675 
Precision: 0.9823 
Recall: 0.8846 
Specificity: 0.9865
Accuracy: 0.9397 
Fraction of clean samples/selected samples: 0.9823
Files already downloaded and verified
##############
[8 9 1 9 3 8 2 7 3 5]
[3 2 1 1 3 2 2 7 6 5]
Train: 50000 Val: 0


100%|██████████| 500/500 [00:29<00:00, 17.04it/s]
100%|██████████| 10/10 [00:17<00:00,  1.69s/it]
100%|██████████| 50000/50000 [00:00<00:00, 84837.21it/s]
100%|██████████| 500/500 [00:03<00:00, 129.81it/s]


Noisy: 17975, Clean: 32025
Selected samples: 30919 
Precision: 0.9869 
Recall: 0.9528 
Specificity: 0.9775
Accuracy: 0.9617 
Fraction of clean samples/selected samples: 0.9869
Files already downloaded and verified
##############
[3 2 1 1 3 2 2 7 6 5]
[3 2 1 1 3 2 2 7 6 5]
Train: 50000 Val: 0


100%|██████████| 500/500 [00:41<00:00, 10.06it/s]
100%|██████████| 10/10 [00:17<00:00,  1.80s/it]
100%|██████████| 50000/50000 [00:00<00:00, 72485.85it/s]
100%|██████████| 500/500 [00:04<00:00, 117.78it/s]


Noisy: 2458, Clean: 47542
Selected samples: 44931 
Precision: 0.9963 
Recall: 0.9416 
Specificity: 0.9325
Accuracy: 0.9411 
Fraction of clean samples/selected samples: 0.9963
Files already downloaded and verified
##############
[3 2 1 1 3 0 2 7 6 5]
[3 2 1 1 3 2 2 7 6 5]
Train: 50000 Val: 0


100%|██████████| 500/500 [00:30<00:00, 14.23it/s]
100%|██████████| 10/10 [00:24<00:00,  2.20s/it]
100%|██████████| 50000/50000 [00:00<00:00, 83579.91it/s]
100%|██████████| 500/500 [00:04<00:00, 115.77it/s]


Noisy: 7020, Clean: 42980
Selected samples: 42301 
Precision: 0.9526 
Recall: 0.9376 
Specificity: 0.7144
Accuracy: 0.9062 
Fraction of clean samples/selected samples: 0.9526
Files already downloaded and verified
##############
[3 2 1 1 3 0 2 7 6 5]
[3 2 1 1 3 2 2 7 6 5]
Train: 50000 Val: 0


100%|██████████| 500/500 [00:42<00:00, 11.76it/s]
100%|██████████| 10/10 [00:21<00:00,  2.03s/it]
100%|██████████| 50000/50000 [00:00<00:00, 60094.55it/s]
100%|██████████| 500/500 [00:04<00:00, 100.54it/s]


Noisy: 4786, Clean: 45214
Selected samples: 42736 
Precision: 0.9917 
Recall: 0.9374 
Specificity: 0.9262
Accuracy: 0.9363 
Fraction of clean samples/selected samples: 0.9917
Files already downloaded and verified
##############
[8 9 1 9 3 8 3 7 3 5]
[3 2 1 1 3 2 2 7 6 5]
Train: 50000 Val: 0


100%|██████████| 500/500 [00:26<00:00, 14.26it/s]
100%|██████████| 10/10 [00:21<00:00,  2.17s/it]
100%|██████████| 50000/50000 [00:00<00:00, 56852.18it/s]
100%|██████████| 500/500 [00:04<00:00, 103.29it/s]


Noisy: 27041, Clean: 22959
Selected samples: 21718 
Precision: 0.967 
Recall: 0.9148 
Specificity: 0.9735
Accuracy: 0.9465 
Fraction of clean samples/selected samples: 0.967
Files already downloaded and verified
##############
[3 2 1 1 3 2 2 7 6 5]
[3 2 1 1 3 2 2 7 6 5]
Train: 50000 Val: 0


100%|██████████| 500/500 [00:39<00:00,  9.61it/s]
100%|██████████| 10/10 [00:17<00:00,  1.76s/it]
100%|██████████| 50000/50000 [00:00<00:00, 82207.02it/s]
100%|██████████| 500/500 [00:04<00:00, 110.79it/s]


Noisy: 2458, Clean: 47542
Selected samples: 45035 
Precision: 0.9969 
Recall: 0.9443 
Specificity: 0.943
Accuracy: 0.9443 
Fraction of clean samples/selected samples: 0.9969
Files already downloaded and verified
##############
[3 9 1 1 3 8 2 7 6 5]
[3 2 1 1 3 2 2 7 6 5]
Train: 50000 Val: 0


100%|██████████| 500/500 [00:30<00:00, 11.12it/s]
100%|██████████| 10/10 [00:19<00:00,  1.88s/it]
100%|██████████| 50000/50000 [00:00<00:00, 69430.26it/s]
100%|██████████| 500/500 [00:04<00:00, 119.43it/s]


Noisy: 9006, Clean: 40994
Selected samples: 39960 
Precision: 0.9979 
Recall: 0.9728 
Specificity: 0.9908
Accuracy: 0.976 
Fraction of clean samples/selected samples: 0.9979
Files already downloaded and verified
##############
[8 9 1 9 3 8 2 7 3 5]
[3 2 1 1 3 2 2 7 6 5]
Train: 50000 Val: 0


 40%|████      | 202/500 [00:12<00:26, 11.04it/s]