In [1]:
import argparse
import torch
import sys
import os
import json
import random
import numpy as np
import pandas as pd
import pickle
import seaborn as sns
import matplotlib.pyplot as plt 
import sklearn.covariance
import scipy
import pdb

import data_loader.data_loaders as module_data
import loss as module_loss
import model.metric as module_metric
import model.model as module_arch
import torch.nn as nn
import torch.nn.functional as F
import model.model as module_arch

from sklearn import metrics
from sklearn import cluster
from tqdm import tqdm
from torch.autograd import Variable
from parse_config import ConfigParser

In [2]:
config_file = './hyperparams/multistep/config_cifar10_gce_rn34.json'
with open(config_file, 'r') as f:
    config = json.load(f)
    
device = torch.device('cuda:0')

In [3]:
config['trainer']['percent'] = 0.4
config['trainer']['asym'] = False
config['train_loss']['type'] = 'GCELoss'
config

{'name': 'cifar10_resnet34_multistep',
 'n_gpu': 1,
 'seed': 123,
 'arch': {'type': 'resnet34', 'args': {'num_classes': 10}},
 'num_classes': 10,
 'data_loader': {'type': 'CIFAR10DataLoader',
  'args': {'data_dir': './dir_to_data',
   'batch_size': 128,
   'shuffle': True,
   'num_batches': 0,
   'validation_split': 0,
   'num_workers': 8,
   'pin_memory': True}},
 'optimizer': {'type': 'SGD',
  'args': {'lr': 0.02, 'momentum': 0.9, 'weight_decay': 0.001}},
 'train_loss': {'type': 'GCELoss',
  'args': {'q': 0.7, 'k': 0.5, 'truncated': False}},
 'val_loss': 'CrossEntropyLoss',
 'metrics': ['my_metric', 'my_metric2'],
 'lr_scheduler': {'type': 'MultiStepLR',
  'args': {'milestones': [40, 80], 'gamma': 0.01}},
 'trainer': {'epochs': 120,
  'warmup': 0,
  'save_dir': 'saved/',
  'save_period': 1,
  'verbosity': 2,
  'label_dir': 'saved/',
  'monitor': 'max test_my_metric',
  'early_stop': 2000,
  'tensorboard': False,
  'mlflow': True,
  '_percent': 'Percentage of noise',
  'percent': 0.4,

In [4]:
resume_path = './saved/models/cifar10/resnet34/MultiStepLR/GCELoss/sym/40/model_best123.pth'
base_model = getattr(module_arch, config["arch"]['type'])()
checkpoint = torch.load(resume_path)
state_dict = checkpoint['state_dict']
base_model.load_state_dict(state_dict)

<All keys matched successfully>

In [5]:
# set seed
random.seed(config['seed'])
torch.manual_seed(config['seed'])
torch.cuda.manual_seed_all(config['seed'])
torch.backends.cudnn.deterministic = True
np.random.seed(config['seed'])

data_loader = getattr(module_data, config['data_loader']['type'])(
    config['data_loader']['args']['data_dir'],
    batch_size= 100,
    shuffle=config['data_loader']['args']['shuffle'],
    validation_split=0.0,
    num_batches=config['data_loader']['args']['num_batches'],
    training=True,
    num_workers=config['data_loader']['args']['num_workers'],
    pin_memory=config['data_loader']['args']['pin_memory'],
    config=config
)

if hasattr(data_loader.dataset, 'num_raw_example'):
    num_examp = data_loader.dataset.num_raw_example
else:
    num_examp = len(data_loader.dataset)


criterion = getattr(module_loss, 'GCELoss')(q=config['train_loss']['args']['q'],
                                            k=config['train_loss']['args']['k'],
                                            trainset_size=num_examp,
                                            truncated=config['train_loss']['args']['truncated'])

# criterion = getattr(module_loss, 'GCELoss')(q=config['train_loss']['args']['q'],
#                                                      k=config['train_loss']['args']['k'],
#                                                      truncated=False)

Files already downloaded and verified
Train: 50000 Val: 0


# Base Resnet

In [6]:
class Represent(nn.Module):
    def __init__(self, base_model):
        super(Represent, self).__init__()
        self.conv1 = base_model.conv1
        self.bn1 = base_model.bn1
        self.layer1 = base_model.layer1
        self.layer2 = base_model.layer2
        self.layer3 = base_model.layer3
        self.layer4 = base_model.layer4
        self.linear = base_model.linear
        
        
    def forward(self, x):
        out = F.relu(self.bn1(self.conv1(x)))
        out = self.layer1(out)
        out = self.layer2(out)
        out = self.layer3(out)
        out = self.layer4(out)
        out = F.avg_pool2d(out, 4)
        y = out.view(out.size(0), -1)
        
        return y
    
    #Feature Extractting
    def feature_list(self, x):
        output_list = []
        out = F.relu(self.bn1(self.conv1(x)))
        for name, module in self.layer1._modules.items():
            out = module(out)
        for name, module in self.layer2._modules.items():
            out = module(out)
        for name, module in self.layer3._modules.items():
            out = module(out)
        for name, module in self.layer4._modules.items():
            out = module(out)
            output_list.append(out)
        out = F.avg_pool2d(out, 4)
        out = out.view(out.size(0), -1)
        y = self.linear(out)
        return y, output_list

In [7]:
model = Represent(base_model)

In [8]:
isNoisy_list = np.empty((0,))
isFalse_list = np.empty((0,))
label_list = np.empty((0,))
gt_list = np.empty((0,))

In [9]:
noisy_label = data_loader.dataset.train_labels
gt_label = data_loader.dataset.train_labels_gt

In [10]:
#Noise rate check
tmp = 0
for i in range(len(gt_label)):
    if noisy_label[i] != gt_label[i]:
        tmp += 1
print('Noise rate: ', tmp/len(gt_label))

Noise rate:  0.3606


# FNAES / K-means

In [12]:
isNoisy_list = np.empty((0,))
isFalse_list = np.empty((0,))
label_list = np.empty((0,))
gt_list = np.empty((0,))
conf_list = np.empty((0,))
loss_list = np.empty((0,))

model.eval()
model.to(device)

with tqdm(data_loader) as progress:
    for batch_idx, (data, label, index, label_gt) in enumerate(progress):
        data = data.to(device)
        label, label_gt = label.long().to(device), label_gt.long().to(device)
        output = model(data)
        _,prediction = base_model(data)
        loss = torch.nn.CrossEntropyLoss(reduction='none')(prediction, label)
        confidence, _ = torch.max(torch.nn.functional.softmax(prediction, dim=1), dim=1)
        isNoisy = label != label_gt
        
        gt_list = np.concatenate((gt_list, label_gt.cpu()))
        label_list = np.concatenate((label_list, label.cpu()))
        isNoisy_list = np.concatenate((isNoisy_list, isNoisy.cpu()))
        conf_list = np.concatenate((conf_list, confidence.detach().cpu()))
        loss_list = np.concatenate((loss_list, loss.detach().cpu()))
        if batch_idx == 0:
            out_list = output.detach().cpu()
        else:
            out_list = np.concatenate((out_list, output.detach().cpu()), axis=0)

100%|██████████| 500/500 [00:29<00:00, 16.79it/s]


In [13]:
#check noisy rate
isNoisy_list.sum()/len(data_loader.dataset.targets)

0.3606

In [14]:
def get_singular_value_vector(label_list, out_list):
    
    singular_dict = {}
    v_ortho_dict = {}
    
    for index in np.unique(label_list):
        u, s, v = np.linalg.svd(out_list[label_list==index])
        singular_dict[index] = s[0] / s[1]
        v_ortho_dict[index] = torch.from_numpy(v[:2])

    return singular_dict, v_ortho_dict

In [15]:
def singular_label(v_ortho_dict, model_represents, label):
    
    model_represents = torch.from_numpy(model_represents).to(device)
    sing_lbl = torch.zeros(model_represents.shape[0]) 
    sin_score_lbl = torch.zeros(model_represents.shape[0])
    
    for i, data in enumerate(model_represents):
        sin_score_lbl[i] = torch.dot(v_ortho_dict[label[i]][0], data).abs() - torch.dot(v_ortho_dict[label[i]][1], data).abs()
        if torch.dot(v_ortho_dict[label[i]][0], data).abs() < torch.dot(v_ortho_dict[label[i]][1], data).abs():
            sing_lbl[i] = 1
        
    return sing_lbl, sin_score_lbl

In [16]:
singular_dict, v_ortho_dict = get_singular_value_vector(label_list, out_list)

for key in v_ortho_dict.keys():
    v_ortho_dict[key] = v_ortho_dict[key].to(device)

sing_lbl, sin_score_lbl = singular_label(v_ortho_dict, out_list, label_list)
kmeans = cluster.KMeans(n_clusters=2, random_state=0).fit(loss_list.reshape(-1,1))

In [17]:
len(isNoisy_list)

50000

In [48]:
def return_statistics(isNoisy_list, predict):
    r_stats = []
    
    tp = isNoisy_list[predict==1].sum()
    tn = (isNoisy_list[predict==0]==0).sum()
    fp = (isNoisy_list.shape - isNoisy_list.sum()) - tn
    fn = isNoisy_list.sum() - tp
    
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    specificity = tn / (tn + fp)
    accuracy = (tp + tn) / (tp + tn + fp + fn)
    sel_samples = fn + tn
    frac_clean = tn / (fn + tn)
    
    r_stats.extend([sel_samples, round(precision[0], 5), round(recall, 5), round(specificity[0], 5), round(accuracy[0], 5), round(frac_clean, 5)])
    
    return r_stats

return_statistics(isNoisy_list, 1-kmeans.labels_)

[32274.0, 0.9538, 0.93771, 0.97438, 0.96116, 0.9652]

In [44]:
def stat_summary(name, stat_list):
    print('Dataset: {}, Net: {}, Noise{}_{}, Loss: {}'
      .format(config['data_loader']['type'], config['arch']['type'], config['trainer']['asym'], config['trainer']['percent'], config['train_loss']['type']))
    
    print("="*50, name , "="*50)

    print('Selected samples by {}: {} \nPrecision: {} \nRecall: {} \nSpecificity: {}\nAccuracy: {} \nFraction of clean samples/selected samples: {}'
                  .format(name, stat_list[0], stat_list[1], stat_list[2], stat_list[3], stat_list[4], stat_list[5]))


In [45]:
k_mean_stat = return_statistics(isNoisy_list, 1-kmeans.labels_)
stat_summary('K-means Clustering', k_mean_stat)

Dataset: CIFAR10DataLoader, Net: resnet34, NoiseFalse_0.4, Loss: GCELoss
Selected samples by K-means Clustering: 32274.0 
Precision: 0.9538 
Recall: 0.93771 
Specificity: 0.97438
Accuracy: 0.96116 
Fraction of clean samples/selected samples: 0.9652


In [46]:
fnaes_stat = return_statistics(isNoisy_list, sing_lbl)
stat_summary('FNAES', fnaes_stat)

Dataset: CIFAR10DataLoader, Net: resnet34, NoiseFalse_0.4, Loss: GCELoss
Selected samples by FNAES: 32143.0 
Precision: 0.94994 
Recall: 0.94082 
Specificity: 0.97204
Accuracy: 0.96078 
Fraction of clean samples/selected samples: 0.9668


# MCD

In [49]:
#get raw data

for batch_idx, (data, target, index, label_gt) in enumerate(data_loader):
    data, target, label_gt = data.cuda(), target.cuda(), label_gt.cuda()
    if batch_idx == 0:
        total_data = data
        total_target = target
        total_label_gt = label_gt
    else:
        total_data = torch.cat((total_data, data), 0)
        total_target = torch.cat((total_target, target), 0)
        total_label_gt = torch.cat((total_label_gt, label_gt), 0)
        
    

In [50]:
#Noise rate check
tmp = 0
for i in range(len(total_label_gt)):
    if total_target[i] != total_label_gt[i]:
        tmp += 1
print('Noise rate: ', tmp/len(total_label_gt))

Noise rate:  0.3606


In [51]:
model.eval()
temp_x = torch.rand(2,3,32,32).cuda()
temp_x = Variable(temp_x, volatile=True)
temp_list = model.feature_list(temp_x)[1]
num_output = len(temp_list) # Number of layers that extracts feature
total_final_feature = [0]*num_output #Extracted Features
total = 0
batch_size = 100

for data_index in range(int(np.floor(total_data.size(0)/batch_size))):
    data = total_data[total : total + batch_size]
    data = Variable(data, volatile=True)

    _, out_features = model.feature_list(data)
    for i in range(num_output):
        out_features[i] = out_features[i].view(out_features[i].size(0), out_features[i].size(1), -1)
        out_features[i] = torch.mean(out_features[i].data, 2)
        if total == 0:
            total_final_feature[i] = out_features[i].cpu().clone()
        else:
            total_final_feature[i] = torch.cat((total_final_feature[i], out_features[i].cpu().clone()), 0)
    total += batch_size



  temp_x = Variable(temp_x, volatile=True)
  data = Variable(data, volatile=True)


In [52]:
total_final_feature[0].shape

torch.Size([50000, 512])

### Random Initailzation of samples

In [53]:
#Random Sample Mean
def random_sample_mean(feature, total_label, num_classes):
    
    group_lasso = sklearn.covariance.EmpiricalCovariance(assume_centered = False)    
    new_feature, fraction_list = [], []
    frac = 0.7
    sample_mean_per_class = torch.Tensor(num_classes, feature.size(1)).fill_(0).cuda()
    total_label = total_label.cuda()

    
    total_selected_list = []
    for i in range(num_classes):
        index_list = total_label.eq(i)
        temp_feature = feature[index_list.nonzero(), :]
        temp_feature = temp_feature.view(temp_feature.size(0), -1)
        shuffler_idx = torch.randperm(temp_feature.size(0))
        index = shuffler_idx[:int(temp_feature.size(0)*frac)]
        fraction_list.append(int(temp_feature.size(0)*frac))
        total_selected_list.append(index_list.nonzero()[index.cuda()])

        selected_feature = torch.index_select(temp_feature, 0, index.cuda())
        new_feature.append(selected_feature)
        sample_mean_per_class[i].copy_(torch.mean(selected_feature, 0))
    
    total_covariance = 0
    for i in range(num_classes):
        flag = 0
        X = 0
        for j in range(fraction_list[i]):
            temp_feature = new_feature[i][j]
            temp_feature = temp_feature - sample_mean_per_class[i]
            temp_feature = temp_feature.view(-1,1)
            if flag  == 0:
                X = temp_feature.transpose(0,1)
                flag = 1
            else:
                X = torch.cat((X,temp_feature.transpose(0,1)),0)
            # find inverse            
        group_lasso.fit(X.cpu().numpy())
        inv_sample_conv = group_lasso.covariance_
        inv_sample_conv = torch.from_numpy(inv_sample_conv).float().cuda()
        if i == 0:
            total_covariance = inv_sample_conv*fraction_list[i]
        else:
            total_covariance += inv_sample_conv*fraction_list[i]
        total_covariance = total_covariance/sum(fraction_list)
    new_precision = scipy.linalg.pinvh(total_covariance.cpu().numpy())
    new_precision = torch.from_numpy(new_precision).float().cuda()
    
    return sample_mean_per_class, new_precision, total_selected_list


In [55]:
print('Random Sample Mean')
sample_mean_list, sample_precision_list = [], []
total_label_list = [total_target for i in range(num_output)]

for index in range(num_output):
    sample_mean, sample_precision, _ = random_sample_mean(total_final_feature[index].cuda(), total_label_list[index].cuda(), config['num_classes'])
    sample_mean_list.append(sample_mean)
    sample_precision_list.append(sample_precision)

Random Sample Mean


	nonzero()
Consider using one of the following signatures instead:
	nonzero(*, bool as_tuple) (Triggered internally at  /opt/conda/conda-bld/pytorch_1607370172916/work/torch/csrc/utils/python_arg_parser.cpp:882.)
  temp_feature = feature[index_list.nonzero(), :]


In [56]:
#mean for each entity corresponding to feature vector
sample_mean_list[2][1].shape

torch.Size([512])

### Minimum Covariance determinent

In [58]:
#MCD single
def MCD_single(feature, sample_mean, inverse_covariance):
    group_lasso = sklearn.covariance.EmpiricalCovariance(assume_centered=False)
    temp_batch = 100
    total, mahalanobis_score = 0, 0
    frac = 0.7 #fraction N_c+d+1 / 2
    for data_index in range(int(np.ceil(feature.size(0)/temp_batch))):
        temp_feature = feature[total : total + temp_batch].cuda()        
        gaussian_score = 0
        batch_sample_mean = sample_mean
        zero_f = temp_feature - batch_sample_mean
        term_gau = -0.5*torch.mm(torch.mm(zero_f, inverse_covariance), zero_f.t()).diag()
        # concat data
        if total == 0:
            mahalanobis_score = term_gau.view(-1,1)
        else:
            mahalanobis_score = torch.cat((mahalanobis_score, term_gau.view(-1,1)), 0)
        total += temp_batch
        
    mahalanobis_score = mahalanobis_score.view(-1)
    feature = feature.view(feature.size(0), -1)
    _, selected_idx = torch.topk(mahalanobis_score, int(feature.size(0)*frac))
    selected_feature = torch.index_select(feature, 0, selected_idx.cuda())
    new_sample_mean = torch.mean(selected_feature, 0)
    
    # compute covariance matrix
    X = 0
    flag = 0
    for j in range(selected_feature.size(0)):
        temp_feature = selected_feature[j]
        temp_feature = temp_feature - new_sample_mean
        temp_feature = temp_feature.view(-1,1)
        if flag  == 0:
            X = temp_feature.transpose(0,1)
            flag = 1
        else:
            X = torch.cat((X, temp_feature.transpose(0,1)),0)
    # find inverse            
    group_lasso.fit(X.cpu().numpy())
    new_sample_cov = group_lasso.covariance_
    
    return new_sample_mean, new_sample_cov, selected_idx

In [61]:
print('Single MCD and merge the parameters')
new_sample_mean_list = []
new_sample_precision_list = []
selected_feature = []
layer_selected_index = []
for index in range(num_output):
    tmp_selected_idx = []

    new_sample_mean = torch.Tensor(config['num_classes'], total_final_feature[index].size(1)).fill_(0).cuda()
    new_covariance = 0
    for i in range(config['num_classes']):
        index_list = total_label_list[index].eq(i)
        temp_feature = total_final_feature[index][index_list.nonzero(), :]
        tmp_idx_list = index_list.nonzero().view(-1).detach().cpu()
        print(temp_feature.shape)
        temp_feature = temp_feature.view(temp_feature.size(0), -1)
        temp_mean, temp_cov, tmp_idx = MCD_single(temp_feature.cuda(), sample_mean_list[index][i], sample_precision_list[index])
        print('selcted index for class', i, ':', tmp_idx.shape)
        new_sample_mean[i].copy_(temp_mean)
        tmp_real_idx = tmp_idx_list[tmp_idx.detach().cpu()]
        tmp_selected_idx.extend(tmp_real_idx.tolist())


        if i  == 0:
            new_covariance = temp_feature.size(0)*temp_cov
        else:
            new_covariance += temp_feature.size(0)*temp_cov
        
    layer_selected_index.append(tmp_selected_idx)
            
    new_covariance = new_covariance / total_final_feature[index].size(0)
    new_precision = scipy.linalg.pinvh(new_covariance)
    new_precision = torch.from_numpy(new_precision).float().cuda()
    new_sample_mean_list.append(new_sample_mean)
    new_sample_precision_list.append(new_precision)

G_soft_list = []
target_mean = new_sample_mean_list 
target_precision = new_sample_precision_list

Single MCD and merge the parameters
torch.Size([5035, 1, 512])
selcted index for class 0 : torch.Size([3524])
torch.Size([4992, 1, 512])
selcted index for class 1 : torch.Size([3494])
torch.Size([4882, 1, 512])
selcted index for class 2 : torch.Size([3417])
torch.Size([4955, 1, 512])
selcted index for class 3 : torch.Size([3468])
torch.Size([5044, 1, 512])
selcted index for class 4 : torch.Size([3530])
torch.Size([4969, 1, 512])
selcted index for class 5 : torch.Size([3478])
torch.Size([4988, 1, 512])
selcted index for class 6 : torch.Size([3491])
torch.Size([5040, 1, 512])
selcted index for class 7 : torch.Size([3528])
torch.Size([4995, 1, 512])
selcted index for class 8 : torch.Size([3496])
torch.Size([5100, 1, 512])
selcted index for class 9 : torch.Size([3570])
torch.Size([5035, 1, 512])
selcted index for class 0 : torch.Size([3524])
torch.Size([4992, 1, 512])
selcted index for class 1 : torch.Size([3494])
torch.Size([4882, 1, 512])
selcted index for class 2 : torch.Size([3417])
to

In [67]:
new_sample_mean_list[0].shape

print(len(new_sample_mean_list))
print(len(tmp_selected_idx))
print(tmp_real_idx)

3
34996
tensor([13872, 35878,  3958,  ..., 44155,  9615, 41085])


In [68]:
len(layer_selected_index[0])

34996

In [69]:
layer4_0 = layer_selected_index[0]
layer4_1 = layer_selected_index[1]
layer4_2 = layer_selected_index[2]

print(len(layer4_0))
print(len(layer4_1))
print(len(layer4_2))

34996
34996
34996


In [72]:
#Check for same index
import collections
for i in range(len(layer_selected_index)):
    print(len([item for item, count in collections.Counter(layer_selected_index[i]).items() if count > 1]))

0
0
0


In [82]:
print('Selected data Difference \n4_0 vs 4_1: {} \n4_1 vs 4_2: {} \n4_0 vs 4_2: {}'
      .format(len(set(layer4_0) & set(layer4_1)), len(set(layer4_1) & set(layer4_2)), len(set(layer4_0) & set(layer4_2))))


Selected data Difference 
4_0 vs 4_1: 32036 
4_1 vs 4_2: 33097 
4_0 vs 4_2: 30842


In [100]:
saved_path = './saved' 
tmp_asym = 'asym' if config['trainer']['asym'] else 'sym'
data_name, net_fam = config['name'].split('_')[0], config['name'].split('_')[1]

if not os.path.isdir(saved_path):
    os.mkdir(saved_path)
next_path = os.path.join(saved_path, 'mahalanobis')
if not os.path.isdir(next_path):
    os.mkdir(next_path)
next_path = os.path.join(next_path, data_name)
if not os.path.isdir(next_path):
    os.mkdir(next_path)
next_path = os.path.join(next_path, net_fam)
if not os.path.isdir(next_path):
    os.mkdir(next_path)
next_path = os.path.join(next_path, config['lr_scheduler']['type'])
if not os.path.isdir(next_path):
    os.mkdir(next_path)
next_path = os.path.join(next_path, config['train_loss']['type'])
if not os.path.isdir(next_path):
    os.mkdir(next_path)
next_path = os.path.join(next_path, tmp_asym)
if not os.path.isdir(next_path):
    os.mkdir(next_path)
file_root = os.path.join(next_path, str(config['trainer']['percent']))
if not os.path.isdir(file_root):
    os.mkdir(file_root)


In [85]:
#Save output_feature, target_noise, label_gt
for i in range(num_output):
    file_name_data = '%s/%s_feature_4_%s.npy' % (file_root, data_name, str(i))
    total_feature = total_final_feature[i].numpy()
    np.save(file_name_data , total_feature)

file_name_label = '%s/%s_target_noise.npy' % (file_root, data_name)
np.save(file_name_label, total_target.detach().cpu())

file_name_gt = '%s/%s_label_gt.npy' % (file_root, data_name)
np.save(file_name_gt, total_label_gt.detach().cpu())
    

In [86]:
#Generate predicted noise index(Unselected)

total_index = [i for i in range(len(total_target))]
len(total_index)
total_index[-1]

predicted_noise_layer = []
layer_unselected_index = []

for layer in layer_selected_index:
    tmpp = set(total_index) - set(layer)
    layer_unselected_index.append(list(tmpp))

print(len(layer_unselected_index[0]) + len(layer_selected_index[0]))
print(len(layer_unselected_index[1]) + len(layer_selected_index[1]))
print(len(layer_unselected_index[2]) + len(layer_selected_index[2]))

for layer in layer_unselected_index:
    tmpp_noisy = []
    num_noisy = 0
    for i in layer:
        if total_target[i] != total_label_gt[i]:
            num_noisy += 1 
            tmpp_noisy.append(1)
        else:
            tmpp_noisy.append(0)
    predicted_noise_layer.append(tmpp_noisy)

50000
50000
50000


In [87]:
print('Dataset: {}, Net: {}, Noise{}_{}, Loss: {}'
      .format(config['data_loader']['type'], config['arch']['type'], config['trainer']['asym'], config['trainer']['percent'], config['train_loss']['type']))
print("="*50, 'Mahalanobis Distance', "="*50)

flag = 0
predicted_clean_layer = []

for layer in layer_selected_index:
    tmp_noisy = []
    num_noisy = 0
    for i in layer:
        if total_target[i] != total_label_gt[i]:
            num_noisy +=1 
            tmp_noisy.append(1)
        else:
            tmp_noisy.append(0)
    print('layer4_{} \nSelected samples by Mahalanobis distance: {} \nFraction of clean samples/selected samples: {}'
          .format(flag, len(layer), 1-(num_noisy/len(layer))))
    print(num_noisy)
    flag += 1
    predicted_clean_layer.append(tmp_noisy)
# config['trainer']['percent']

print("="*100)
print("="*50, 'Mahalanobis Distance', "="*50)


Dataset: CIFAR10DataLoader, Net: resnet34, NoiseFalse_0.4, Loss: GCELoss
layer4_0 
Selected samples by Mahalanobis distance: 34996 
Fraction of clean samples/selected samples: 0.6937364270202309
10718
layer4_1 
Selected samples by Mahalanobis distance: 34996 
Fraction of clean samples/selected samples: 0.7062807177963195
10279
layer4_2 
Selected samples by Mahalanobis distance: 34996 
Fraction of clean samples/selected samples: 0.7173962738598697
9890


In [90]:
recall, specificity, precision, accuracy, frac_clean, sel_samples = [], [], [], [], [], []

for i in range(len(predicted_clean_layer)):

    tp, fn = sum(predicted_noise_layer[i]), sum(predicted_clean_layer[i])
    fp, tn = len(predicted_noise_layer[i]) - tp, len(predicted_clean_layer[i]) - fn

    frac_clean.append( round(tn / (fn + tn), 5))
    recall.append(round(tp / (tp + fn), 5))
    precision.append(round(tp / (tp + fp), 5))
    specificity.append(round(tn / (tn + fp), 5))
    accuracy.append(round((tp + tn) / (tp + tn + fp + fn), 5))
    sel_samples.append(fn + tn)


In [91]:
sel_samples

[34996, 34996, 34996]

In [92]:
frac_clean

[0.69374, 0.70628, 0.7174]

In [93]:
recall

[0.40555, 0.42989, 0.45147]

In [94]:
precision

[0.48734, 0.5166, 0.54252]

In [95]:
specificity

[0.7594, 0.77313, 0.7853]

In [96]:
def report_metric(sel_samples, precision, recall, specificity, accuracy, frac_clean):
    print('Dataset: {}, Net: {}, Noise{}_{}, Loss: {}'
      .format(config['data_loader']['type'], config['arch']['type'], config['trainer']['asym'], config['trainer']['percent'], config['train_loss']['type']))
    print("="*50, 'Mahalanobis Distance', "="*50)
    
    if len(recall) > 1:
        for i in range(len(recall)):
            print('layer4_{} \nSelected samples by Mahalanobis distance: {} \nPrecision: {} \nRecall: {} \nSpecificity: {}\nAccuracy: {} \nFraction of clean samples/selected samples: {}'
                  .format(i, sel_samples[i], precision[i], recall[i], specificity[i], accuracy[i], frac_clean[i]))
    else:
        print('layer4_{} \nSelected samples by Mahalanobis distance: {} \nPrecision: {} \nRecall: {} \nSpecificity: {}\nAccuracy: {} \nFraction of clean samples/selected samples: {}'
                  .format(sel_samples, precision, recall, specificity, accuracy, frac_clean))

In [97]:
report_metric(sel_samples, precision, recall, specificity, accuracy, frac_clean)

Dataset: CIFAR10DataLoader, Net: resnet34, NoiseFalse_0.4, Loss: GCELoss
layer4_0 
Selected samples by Mahalanobis distance: 34996 
Precision: 0.48734 
Recall: 0.40555 
Specificity: 0.7594
Accuracy: 0.6318 
Fraction of clean samples/selected samples: 0.69374
layer4_1 
Selected samples by Mahalanobis distance: 34996 
Precision: 0.5166 
Recall: 0.42989 
Specificity: 0.77313
Accuracy: 0.64936 
Fraction of clean samples/selected samples: 0.70628
layer4_2 
Selected samples by Mahalanobis distance: 34996 
Precision: 0.54252 
Recall: 0.45147 
Specificity: 0.7853
Accuracy: 0.66492 
Fraction of clean samples/selected samples: 0.7174


In [102]:
# Save as txt
df = pd.DataFrame(columns = ['Layer4_0', 'Layer4_1', 'Layer4_2'])

df.loc[len(df)] = sel_samples
df.loc[len(df)] = precision
df.loc[len(df)] = recall
df.loc[len(df)] = specificity
df.loc[len(df)] = accuracy
df.loc[len(df)] = frac_clean
df.insert(0, 'Metric', ['Samples', 'Precision', 'Recall', 'Specificity', 'Accuracy', 'Fraction'])

print(df)
print(file_root)


        Metric Layer4_0 Layer4_1 Layer4_2
0      Samples    34996    34996    34996
1    Precision  0.48734   0.5166  0.54252
2       Recall  0.40555  0.42989  0.45147
3  Specificity   0.7594  0.77313   0.7853
4     Accuracy   0.6318  0.64936  0.66492
5     Fraction  0.69374  0.70628   0.7174
./saved/mahalanobis/cifar10/resnet34/MultiStepLR/GCELoss/sym/0.4


In [103]:
df.to_csv(file_root+'/metric.txt', index=False, header=True, sep="\t")