In [1]:
import argparse
import torch
import sys
import os
import json
import random
import numpy as np
import pandas as pd
import pickle
import seaborn as sns
import matplotlib.pyplot as plt 
import sklearn.covariance
import scipy
import pdb

import data_loader.data_loaders as module_data
import loss as module_loss
import model.metric as module_metric
import model.model as module_arch
import torch.nn as nn
import torch.nn.functional as F
import model.model as module_arch

from sklearn import metrics
from sklearn import cluster
from tqdm import tqdm
from torch.autograd import Variable
from parse_config import ConfigParser


In [2]:
def return_statistics(isNoisy_list, predict):
    r_stats = []
    
    tp = (isNoisy_list[predict==0]==0).sum()
    tn = isNoisy_list[predict==1].sum()
    fp = isNoisy_list.sum() - tn
    fn = ((isNoisy_list.shape - isNoisy_list.sum()) - tp).item()
    
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    specificity = tn / (tn + fp)
    accuracy = (tp + tn) / (tp + tn + fp + fn)
    sel_samples = int(fp + tp)
    frac_clean = tp / (fp + tp)

    r_stats.extend([sel_samples, round(precision, 5), round(recall, 5), round(specificity, 5), round(accuracy, 5), round(frac_clean, 5)])
    
    return r_stats

# return_statistics(isNoisy_list, sing_lbl)

def stat_summary(name, stat_list):
    print('Dataset: {}, Net: {}, Noise{}_{}, Loss: {}'
      .format(config['data_loader']['type'], config['arch']['type'], config['trainer']['asym'], config['trainer']['percent'], config['train_loss']['type']))
    
    print("="*50, name , "="*50)

    print('Selected samples by {}: {} \nPrecision: {} \nRecall: {} \nSpecificity: {}\nAccuracy: {} \nFraction of clean samples/selected samples: {}'
                  .format(name, stat_list[0], stat_list[1], stat_list[2], stat_list[3], stat_list[4], stat_list[5]))


# Config, Model Path Setting

In [3]:
# config_file = './hyperparams/multistep/config_cifar10_cce_rn34.json'
config_file = './saved/models/cifar10/resnet34/MultiStepLR/CCELoss/sym/80/config_123.json' #cifar 100
with open(config_file, 'r') as f:
    config = json.load(f)
    
device = torch.device('cuda:0')
config['n_gpu'] = 0

# resume_path = './saved/models/cifar10/resnet34/MultiStepLR/ELRLoss/sym/80/model_best123.pth'
resume_path = './checkpoint/cifar10_rn34_multistep_sym_80_cce.pth'
base_model = getattr(module_arch, config["arch"]['type'])()
checkpoint = torch.load(resume_path)
state_dict = checkpoint['state_dict']
base_model.load_state_dict(state_dict)

config

{'name': 'cifar10_resnet34_multistep',
 'n_gpu': 0,
 'seed': 123,
 'arch': {'type': 'resnet34', 'args': {'num_classes': 10}},
 'num_classes': 10,
 'data_loader': {'type': 'CIFAR10DataLoader',
  'args': {'data_dir': './dir_to_data',
   'batch_size': 128,
   'shuffle': True,
   'num_batches': 0,
   'validation_split': 0,
   'num_workers': 8,
   'pin_memory': True}},
 'optimizer': {'type': 'SGD',
  'args': {'lr': 0.02, 'momentum': 0.9, 'weight_decay': 0.001}},
 'train_loss': {'type': 'CCELoss'},
 'val_loss': 'CrossEntropyLoss',
 'metrics': ['my_metric', 'my_metric2'],
 'lr_scheduler': {'type': 'MultiStepLR',
  'args': {'milestones': [40, 80], 'gamma': 0.01}},
 'trainer': {'epochs': 120,
  'warmup': 0,
  'save_dir': 'saved/',
  'save_period': 1,
  'verbosity': 2,
  'label_dir': 'saved/',
  'monitor': 'max test_my_metric',
  'early_stop': 2000,
  'tensorboard': False,
  'mlflow': True,
  '_percent': 'Percentage of noise',
  'percent': 0.8,
  '_begin': 'When to begin updating labels',
  'beg

In [4]:
# set seed
random.seed(config['seed'])
torch.manual_seed(config['seed'])
torch.cuda.manual_seed_all(config['seed'])
torch.backends.cudnn.deterministic = True
np.random.seed(config['seed'])

data_loader = getattr(module_data, config['data_loader']['type'])(
    config['data_loader']['args']['data_dir'],
    batch_size= 100,
    shuffle=config['data_loader']['args']['shuffle'],
    validation_split=0.0,
    num_batches=config['data_loader']['args']['num_batches'],
    training=True,
    num_workers=config['data_loader']['args']['num_workers'],
    pin_memory=config['data_loader']['args']['pin_memory'],
    config=config
)

if hasattr(data_loader.dataset, 'num_raw_example'):
    num_examp = data_loader.dataset.num_raw_example
else:
    num_examp = len(data_loader.dataset)

critenrion = nn.CrossEntropyLoss()

Files already downloaded and verified
Train: 50000 Val: 0


# Base ResNet

In [5]:
class Represent(nn.Module):
    def __init__(self, base_model):
        super(Represent, self).__init__()
        self.conv1 = base_model.conv1
        self.bn1 = base_model.bn1
        self.layer1 = base_model.layer1
        self.layer2 = base_model.layer2
        self.layer3 = base_model.layer3
        self.layer4 = base_model.layer4
        self.linear = base_model.linear
        
        
    def forward(self, x):
        out = F.relu(self.bn1(self.conv1(x)))
        out = self.layer1(out)
        out = self.layer2(out)
        out = self.layer3(out)
        out = self.layer4(out)
        out = F.avg_pool2d(out, 4)
        y = out.view(out.size(0), -1)
        
        return y
    
    #Feature Extractting
    def feature_list(self, x):
        output_list = []
        out = F.relu(self.bn1(self.conv1(x)))
        for name, module in self.layer1._modules.items():
            out = module(out)
        for name, module in self.layer2._modules.items():
            out = module(out)
        for name, module in self.layer3._modules.items():
            out = module(out)
        for name, module in self.layer4._modules.items():
            out = module(out)
            output_list.append(out)
        out = F.avg_pool2d(out, 4)
        out = out.view(out.size(0), -1)
        y = self.linear(out)
        return y, output_list

In [6]:
model = Represent(base_model)

In [7]:
isNoisy_list = np.empty((0,))
isFalse_list = np.empty((0,))
label_list = np.empty((0,))
gt_list = np.empty((0,))
conf_list = np.empty((0,))
loss_list = np.empty((0,))

# SAME Threshold

In [8]:
model.eval()
model.to(device)
loss = 0
with tqdm(data_loader) as progress:
    for batch_idx, (data, label, index, label_gt) in enumerate(progress):
        data = data.to(device)
        label, label_gt = label.long().to(device), label_gt.long().to(device)
        output = model(data)
        _, prediction = base_model(data)
        loss = torch.nn.CrossEntropyLoss(reduction='none')(prediction, label)
        confidence, _ = torch.max(torch.nn.functional.softmax(prediction, dim=1), dim=1)
        isNoisy  = label != label_gt
        
        gt_list = np.concatenate((gt_list, label_gt.cpu()))
        label_list = np.concatenate((label_list, label.cpu()))
        isNoisy_list = np.concatenate((isNoisy_list, isNoisy.cpu()))#clean = 0 noise = 1
        conf_list = np.concatenate((conf_list, confidence.detach().cpu()))
        loss_list = np.concatenate((loss_list, loss.detach().cpu()))

        if batch_idx == 0:
            out_list = output.detach().cpu()
        else:
            out_list = np.concatenate((out_list, output.detach().cpu()), axis=0)

100%|██████████| 500/500 [00:29<00:00, 17.01it/s]


In [9]:
def get_singular_value_vector(label_list, out_list):
    
    singular_dict = {}
    v_ortho_dict = {}
    
    for index in np.unique(label_list):
        u, s, v = np.linalg.svd(out_list[label_list==index])
        singular_dict[index] = s[0] / s[1]
        v_ortho_dict[index] = torch.from_numpy(v[:2])

    return singular_dict, v_ortho_dict

In [10]:
# Max eigen vector 보다 작으면 1(Noisy Label)
def singular_label(v_ortho_dict, model_represents, label):
    
    model_represents = torch.from_numpy(model_represents).to(device)
    sing_lbl = torch.zeros(model_represents.shape[0]) 
    sin_score_lbl = torch.zeros(model_represents.shape[0])
    
    for i, data in enumerate(model_represents):
        sin_score_lbl[i] = torch.dot(v_ortho_dict[label[i]][0], data).abs() - torch.dot(v_ortho_dict[label[i]][1], data).abs()
        if torch.dot(v_ortho_dict[label[i]][0], data).abs() < torch.dot(v_ortho_dict[label[i]][1], data).abs():
            sing_lbl[i] = 1
        
    return sing_lbl, sin_score_lbl

In [11]:
def same_score(v_ortho_dict, model_represents, label):
    
    model_represents = torch.from_numpy(model_represents).to(device)
    top1 = torch.zeros(model_represents.shape[0])
    top1_abs = torch.zeros(model_represents.shape[0])
    top12 = torch.zeros(model_represents.shape[0])
    
    for i, data in enumerate(model_represents):
        top1[i] = torch.dot(v_ortho_dict[label[i]][0], data)
        top1_abs[i] = torch.dot(v_ortho_dict[label[i]][0], data).abs()
        top12[i] = torch.dot(v_ortho_dict[label[i]][0], data).abs() - torch.dot(v_ortho_dict[label[i]][1], data).abs()
        
    return top1, top1_abs, top12

In [12]:
def kmean_singular_label(v_ortho_dict, model_represents, label):
    
    model_represents = torch.from_numpy(model_represents).to(device)
    sing_lbl = torch.zeros(model_represents.shape[0])
    sin_score_lbl = torch.zeros(model_represents.shape[0])
    
    for i, data in enumerate(model_represents):
        sin_score_lbl[i] = torch.dot(v_ortho_dict[label[i]][0], data).abs() - torch.dot(v_ortho_dict[label[i]][1], data).abs()
        
    kmeans = cluster.KMeans(n_clusters=2, random_state=0).fit(sin_score_lbl.reshape(-1, 1))
    
    return kmeans, sin_score_lbl

In [13]:
singular_dict, v_ortho_dict = get_singular_value_vector(label_list, out_list)

for key in v_ortho_dict.keys():
    v_ortho_dict[key] = v_ortho_dict[key].to(device)

#For SAME Kmenas
# ksin_score_lbl, sin_score_lbl = kmean_singular_label(v_ortho_dict, out_list, label_list)

top1_score, top1_abs_score, top12_score = same_score(v_ortho_dict, out_list, label_list)

# (SAME) Classwise # number of clean samples(top 1%, 3% , 5%)

In [15]:
# class_lbl_list = []
# class_isNoisy_list = []
# class_sin_score = []
# class_sel_list = [] #top-k 뽑은 것들의 noise(1) clean(0) 여부 

def same_topk(lbl_list, noise_list, score_metric, topk_frac):
    class_lbl_list, class_isNoisy_list = [], []
    class_sin_score, class_sel_list = [], []

    for i in range(0, config['num_classes']):
        frac = int(topk_frac * np.sum(lbl_list == i))
        
        class_lbl_list.append(lbl_list == i)
        tmp_isNoisy = noise_list[class_lbl_list[i]]
        tmp_isNoisy = torch.from_numpy(tmp_isNoisy)
        class_isNoisy_list.append(tmp_isNoisy)

        class_sin_score.append(score_metric[class_lbl_list[i]])
        tmp_sort, tmp_idx = torch.sort(class_sin_score[i], descending=True)

        tmp_nc = torch.index_select(tmp_isNoisy, 0, tmp_idx[:frac])
        
        class_sel_list.append(tmp_nc)

    return class_lbl_list, class_isNoisy_list, class_sin_score, class_sel_list

In [16]:
# 각 class 별 purity reporting 하고, 전체 purtiy 뽑아보기 / Trust set이 정말 믿을 수있는지를 판단하기 위해서.

In [17]:
topk_lbl_list, topk_isNoisy_list, topk_sin_list, topk_list = same_topk(label_list, isNoisy_list, top1_abs_score, 0.01)

In [18]:
#classwise clean samples fractions(clean samples / selected samples)

In [19]:
selected_data, selected_noise = 0, 0

for i in range (0, len(topk_list)):
    tmp_selected, tmp_noise = topk_list[i].shape[0], torch.sum(topk_list[i], 0)
    
    selected_data += tmp_selected
    selected_noise += tmp_noise
    print('class {} top {}% clean/selected: {}' \
          .format(i, round(100*tmp_selected / topk_sin_list[i].shape[0]), 1 - (tmp_noise / tmp_selected)))
    print((tmp_selected - tmp_noise).item())
    print(tmp_selected)
    
print('Total top {}% clean/selected: {}'.format(1, 1 - (selected_noise / selected_data)))
print(selected_data)
print((selected_data - selected_noise).item())

class 0 top 1% clean/selected: 1.0
49.0
49
class 1 top 1% clean/selected: 1.0
50.0
50
class 2 top 1% clean/selected: 0.9795918367346939
48.0
49
class 3 top 1% clean/selected: 0.98
49.0
50
class 4 top 1% clean/selected: 1.0
50.0
50
class 5 top 1% clean/selected: 0.6799999999999999
34.0
50
class 6 top 1% clean/selected: 0.98
49.0
50
class 7 top 1% clean/selected: 1.0
50.0
50
class 8 top 1% clean/selected: 1.0
49.0
49
class 9 top 1% clean/selected: 1.0
49.0
49
Total top 1% clean/selected: 0.9616935483870968
496
477.0


In [20]:
# class 0
class_0 = label_list == 5
print(class_0.shape)
print(np.sum(class_0))
isNoisy_0 = isNoisy_list[class_0]
print(np.sum(isNoisy_0))
isNoisy_0 = torch.from_numpy(isNoisy_0)

sin_score_0 = top1_abs_score[class_0]

(50000,)
5027
3650.0


In [21]:
sort_0, idx_0 = torch.sort(sin_score_0, descending=True)
print(sort_0)
print(idx_0)

tensor([1.9518, 1.9359, 1.8716,  ..., 0.5528, 0.5469, 0.5421])
tensor([2676, 3015, 2944,  ..., 2650, 2022, 2700])


In [22]:
torch.index_select(isNoisy_0, 0, idx_0[:50])

tensor([1., 1., 0., 1., 0., 0., 1., 1., 0., 0., 0., 1., 1., 0., 0., 0., 0., 1.,
        0., 1., 0., 0., 0., 0., 1., 0., 0., 1., 0., 0., 0., 1., 1., 0., 0., 1.,
        0., 0., 0., 1., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.],
       dtype=torch.float64)

# (Loss) Classwise # number of clean samples(top 1%, 3%, 5%)

In [23]:
# class_lbl_list = []
# class_isNoisy_list = []
# class_sin_score = []
# class_sel_list = [] #top-k 뽑은 것들의 noise(1) clean(0) 여부 

def loss_topk(lbl_list, noise_list, loss_score, topk_frac):
    class_lbl_list, class_isNoisy_list = [], []
    class_sin_score, class_sel_list = [], []

    for i in range(0, config['num_classes']):
        frac = int(topk_frac * np.sum(lbl_list == i))
        
        class_lbl_list.append(lbl_list == i)
        tmp_isNoisy = noise_list[class_lbl_list[i]]
        tmp_isNoisy = torch.from_numpy(tmp_isNoisy)
        class_isNoisy_list.append(tmp_isNoisy)

        class_sin_score.append(loss_score[class_lbl_list[i]])
        tmp_sort, tmp_idx = torch.sort(class_sin_score[i])

        tmp_nc = torch.index_select(tmp_isNoisy, 0, tmp_idx[:frac])
        
        class_sel_list.append(tmp_nc)

    return class_lbl_list, class_isNoisy_list, class_sin_score, class_sel_list

In [24]:
t_loss_list = torch.from_numpy(loss_list)
topk_lbl_list, topk_isNoisy_list, topk_sin_list, topk_list = loss_topk(label_list, isNoisy_list, t_loss_list, 0.5)

In [25]:
selected_data, selected_noise = 0, 0

for i in range (0, len(topk_list)):
    tmp_selected, tmp_noise = topk_list[i].shape[0], torch.sum(topk_list[i], 0)
    
    selected_data += tmp_selected
    selected_noise += tmp_noise
    print('class {} top {}% clean/selected: {}' \
          .format(i, round(100*tmp_selected / topk_sin_list[i].shape[0]), 1 - (tmp_noise / tmp_selected)))
    print((tmp_selected - tmp_noise).item())
    print(tmp_selected)
    
print('Total top {}% clean/selected: {}'.format(1, 1 - (selected_noise / selected_data)))
print(selected_data)
print((selected_data - selected_noise).item())

class 0 top 50% clean/selected: 0.5534565916398714
1377.0
2488
class 1 top 50% clean/selected: 0.5756979944946914
1464.0
2543
class 2 top 50% clean/selected: 0.5213501423342822
1282.0
2459
class 3 top 50% clean/selected: 0.5252485089463221
1321.0
2515
class 4 top 50% clean/selected: 0.5234685759745425
1316.0
2514
class 5 top 50% clean/selected: 0.5220851571826501
1312.0
2513
class 6 top 50% clean/selected: 0.541335453100159
1362.0
2516
class 7 top 50% clean/selected: 0.5359712230215827
1341.0
2502
class 8 top 50% clean/selected: 0.5494728304947283
1355.0
2466
class 9 top 50% clean/selected: 0.5384925433293026
1336.0
2481
Total top 1% clean/selected: 0.5387046445573469
24997
13466.0
