In [None]:
import torch
from datasets import IndexedDataset, WeightedDataset
from torch.utils.data import DataLoader, DistributedSampler

from utils import get_args
from architectures import load_architecture

from samplers import DistributedCustomSampler
from losses import trades_loss
from tqdm.notebook import tqdm
from architectures import load_architecture, load_statedict, add_lora

args = get_args()

# Model / data selection, applied before the architecture is instantiated.
for _option, _value in (
    ('arch', 'LeNet5'),
    ('dataset', 'MNIST'),
    ('selection_method', 'uncertainty'),
):
    setattr(args, _option, _value)

# Example usage: build the network and move it to the GPU.
model, target_layers = load_architecture(args)
model.to('cuda')

# Disabled for now: load a saved state dict into the model and call add_lora.
# statedict = load_statedict(args)
# model.load_state_dict(statedict)
# add_lora(target_layers, model)

# Pruning / batching options, applied before the dataset and sampler are created.
for _option, _value in (
    ('pruning_ratio', 0),
    ('delta', 1),
    ('batch_size', 128),
    ('pruning_strategy', 'random'),
    ('batch_strategy', 'random'),
    ('sample_size', 128),
):
    setattr(args, _option, _value)

# train_dataset = IndexedDataset()
print('init weighted dataset')
train_dataset = WeightedDataset(args, train=True, prune_ratio=args.pruning_ratio)

# Sampler configured as rank 0 of 2 replicas, even though this notebook runs one process.
train_sampler = DistributedCustomSampler(
    args,
    train_dataset,
    num_replicas=2,
    rank=0,
    drop_last=True,
)

print('init dataloder')
# batch_size=None switches off the DataLoader's automatic batching, so whatever
# the sampler yields is handed to the dataset as a single key.
# NOTE(review): presumably the sampler yields whole batches of indices - confirm.
trainloader = DataLoader(train_dataset, batch_size=None, sampler=train_sampler)

In [None]:
from losses import get_loss, get_eval_loss

# Number of passes over the sampler; the pass index is forwarded to set_epoch.
iterations = 1
# Device string used for the batches; it is also passed to update_scores below.
rank = 'cuda'

# SGD with Nesterov momentum; the hyper-parameters come from the parsed arguments.
optimizer = torch.optim.SGD(
    model.parameters(),
    lr=args.init_lr,
    weight_decay=args.weight_decay,
    momentum=args.momentum,
    nesterov=True,
)

for iteration in range(iterations):

    model.train()
    train_sampler.set_epoch(iteration)

    # tqdm wraps the loader itself rather than the enumerate() generator:
    # a generator hides the length of the underlying iterable, which would
    # leave the progress bar without a total or an ETA.
    for batch_id, batch in enumerate(tqdm(trainloader)):

        optimizer.zero_grad()

        data, target, idxs = batch

        # Debug output: shows which sample indices the sampler put in this batch.
        print(idxs)

        data, target = data.to(rank), target.to(rank)

        loss_values, clean_values, robust_values, logits_nat, logits_adv = get_loss(args, model, data, target, optimizer)

        # assert torch.isfinite(loss_values).all(), "Loss contains NaNs!"
        # assert torch.isfinite(logits_nat).all(), "Logits_nat contains NaNs!"
        # assert torch.isfinite(logits_adv).all(), "Logits_adv contains NaNs!"

        # train_dataset.update_scores(iteration, idxs,loss_values)
        train_dataset.update_scores(rank, idxs, clean_values, robust_values, loss_values, logits_nat, logits_adv)

        # NOTE: the parameter update is currently disabled, so this loop only
        # records per-sample scores and never changes the model weights.
        # loss = train_dataset.compute_loss(idxs, loss_values)

        # loss.backward()
        # optimizer.step()

In [None]:
from torchvision import datasets, transforms

# Only tensor conversion is applied. The normalisation step is commented out:
#   transforms.Normalize(mean=(0.4914, 0.4822, 0.4465), std=(0.2471, 0.2435, 0.2616))
pipeline_steps = [transforms.ToTensor()]
transform = transforms.Compose(pipeline_steps)

# CIFAR-10 training split rooted at args.data_dir; download=True lets
# torchvision fetch the files if they are missing.
cifar_kwargs = dict(
    root=args.data_dir,
    train=True,
    download=True,
    transform=transform,
)
dataset = datasets.CIFAR10(**cifar_kwargs)


In [None]:
import numpy as np
import torch

# Archive to inspect.
# NOTE(review): this is an absolute, user-specific path - consider making it configurable.
npz_path = '/home/mheuillet/Downloads/1m.npz'
npz_file = np.load(npz_path)

# List the names of the arrays stored in the archive.
print("Available arrays in the .npz file:", npz_file.files)

# Extract the array stored under the selected key.
array_key = 'image'
numpy_array = npz_file[array_key]



# Training-time augmentation: random 32-pixel crop with 4 pixels of padding,
# horizontal flip with probability 0.5, then conversion to a tensor.
augmentation_steps = [
    transforms.RandomCrop(size=32, padding=4),
    transforms.RandomHorizontalFlip(p=0.5),
    transforms.ToTensor(),
]
train_transform = transforms.Compose(augmentation_steps)





In [None]:
# transform = transforms.Compose([
#                 transforms.ToTensor(),
#                 transforms.Normalize( mean=(0.4914, 0.4822, 0.4465), std=(0.2471, 0.2435, 0.2616) )  ])
# # 
#         if train:
#             dataset = datasets.CIFAR10(root=args.data_dir, train=True, download=True, transform=transform)
#         else:
#             dataset = datasets.CIFAR10(root=args.data_dir, train=False, download=True, transform=transform)

#         # pool_dataset = IndexedDataset('CIFAR10', train_folder, transform= transform ) 
#         # test_dataset = IndexedDataset('CIFAR10', test_folder, transform= transform) 
#         N = 10

#         print('load dataloader')

: 

In [None]:
import torch

# How many random indices to draw.
N = 10
# Draw N integers uniformly from the half-open range [0, 10).
indices = torch.randint(low=0, high=10, size=(N,))
print("indices shape:", indices.shape)

# Trailing expression: the notebook displays the number of drawn indices.
indices.size(0)
# clean_loss_val = torch.randn((N, ))
# print("clean_loss_val shape:", clean_loss_val.shape)

# robust_loss_val = torch.randn((N, ))
# print("robust_loss_val shape:", robust_loss_val.shape)

# clean_pred = torch.randn((N, 5))
# print("clean_pred shape:", clean_pred.shape)

# robust_pred = torch.randn((N, 5))
# print("robust_pred shape:", robust_pred.shape)



# # Concatenating all tensors along the column dimension (dim=1)
# iv = torch.cat([indices_reshaped,
#                 clean_loss_val_reshaped,
#                 robust_loss_val_reshaped,
#                 clean_pred,
#                 robust_pred], dim=1)

# print("iv shape:", iv.shape)


In [None]:
import torch

def check_for_nans(tensors, tensor_names):
    """Print a warning line for every tensor that holds at least one NaN.

    Args:
        tensors: iterable of torch tensors to inspect.
        tensor_names: iterable of labels, paired positionally with ``tensors``
            (pairing stops at the shorter of the two iterables).

    Returns:
        None. The only effect is one printed line per offending tensor.
    """
    paired = zip(tensors, tensor_names)
    # Lazy filter: each tensor is tested just before its message is printed.
    offenders = (label for values, label in paired if torch.isnan(values).any())
    for label in offenders:
        print(f"{label} contains NaNs!")

# Random stand-ins named after the outputs of a training step.
loss_values = torch.randn(10, 1)
clean_values = torch.randn(10)
robust_values = torch.randn(10)
logits_nat = torch.randn(10, 5)
logits_adv = torch.randn(10, 5)

# Plant a single NaN so the checker has something to report.
loss_values[0, 0] = float('nan')

# Label every tensor once, then split the mapping into the two parallel
# lists that check_for_nans takes.
named_tensors = {
    'loss_values': loss_values,
    'clean_values': clean_values,
    'robust_values': robust_values,
    'logits_nat': logits_nat,
    'logits_adv': logits_adv,
}
tensors = list(named_tensors.values())
tensor_names = list(named_tensors.keys())

check_for_nans(tensors, tensor_names)



In [None]:
# Inspect the `global_scores2` attribute of the training dataset; being the
# cell's only expression, its value is rendered as the cell output.
train_dataset.global_scores2

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Score history taken from the training dataset. The plot treats it as a 2-D
# array with one row per epoch and one column per observation.
# NOTE(review): that layout is assumed from how the array is indexed below - confirm.
np.random.seed(0)  # For reproducibility
tensor = train_dataset.global_scores2  # np.random.rand(10, 60000)  # Simulating loss values

# Read the dimensions from the data instead of hard-coding 10 x 60000, so the
# cell also works for a different number of epochs or observations.
n_epochs, n_observations = tensor.shape[0], tensor.shape[1]

# Draw up to 60,000 distinct observations; the cap is lowered when fewer
# columns exist, because sampling without replacement cannot exceed the population.
n_sampled = min(60000, n_observations)
sample_indices = np.random.choice(n_observations, size=n_sampled, replace=False)

sampled_tensor = tensor[:, sample_indices]

# Plotting the evolution of the loss for the sampled observations over all epochs
plt.figure(figsize=(12, 6))

# One thin, mostly transparent line per observation keeps overlaps readable.
epoch_axis = range(n_epochs)
for i in tqdm(range(sampled_tensor.shape[1])):
    plt.plot(epoch_axis, sampled_tensor[:, i], alpha=0.25, linewidth=0.5)

plt.xlabel('Epochs')
plt.ylabel('Loss Value')
plt.yscale('log')  # Setting y-axis to log scale

# The title reports the actual counts; for a (10, 60000) history it reads
# exactly as before.
plt.title(f'Evolution of Loss for {n_sampled:,} Observations Over {n_epochs} Epochs')
plt.grid(True)
plt.tight_layout()

plt.show()


In [None]:
# Echo `tensor` so the notebook renders its value as this cell's output.
tensor

In [None]:

# for i in range(len(train_dataset)):
#     print(train_dataset[i])
import numpy as np

def obtain_latent_dataset(model, dataset, batch_size=32, device='cuda'):
    """Compute the latent representation of every sample in ``dataset``.

    Args:
        model: object exposing ``get_latent_representation(batch)``. The
            result is read as ``shape[1]``, so it must have at least two
            dimensions, with the latent size on axis 1.
        dataset: object supporting ``len()``, integer indexing that returns
            an ``(image, label, idx)`` triple, and indexing with a list of
            integers that returns an ``(images, labels, idxs)`` triple whose
            first element has a ``.to(device)`` method.
        batch_size: number of samples sent through the model per step.
        device: device the inputs are moved to before the forward pass.
            Defaults to ``'cuda'``, the value that used to be hard-coded.

    Returns:
        A CPU tensor of shape ``(len(dataset), latent_dim)`` whose row ``i``
        is the latent representation of ``dataset[i]``.

    Note:
        The model's train/eval mode is left exactly as the caller set it.
    """
    num_samples = len(dataset)

    # No gradients are needed for feature extraction. Without no_grad every
    # forward pass would build an autograd graph that is thrown away.
    with torch.no_grad():
        # Probe a single sample to discover the latent dimensionality.
        image, _, _ = dataset[0]
        image = torch.Tensor(image).to(device).unsqueeze(0)
        print(image.shape)
        first_latent_rep = model.get_latent_representation(image)
        latent_dim = first_latent_rep.shape[1]
        print(first_latent_rep.shape)

        # Preallocate the output on the CPU and fill it batch by batch.
        latent_dataset = torch.zeros((num_samples, latent_dim))

        for start in tqdm(range(0, num_samples, batch_size)):
            # The last batch may be shorter than batch_size.
            stop = min(start + batch_size, num_samples)
            batch_indices = list(range(start, stop))
            images, _, _ = dataset[batch_indices]
            images = images.to(device)

            batch_latent_reps = model.get_latent_representation(images)

            latent_dataset[start:stop] = batch_latent_reps.detach().cpu()

    return latent_dataset

# Extract latent features for train_dataset in batches of 64, then hand them
# to the dataset through define_latent_features.
features = obtain_latent_dataset(model, train_dataset, batch_size=64)
train_dataset.define_latent_features(features)

