<a href="https://colab.research.google.com/github/MT-Blachetta/clPcl_SingleGPU/blob/main/clPcl.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install spherecluster
!pip install scikit-learn==0.20.0
!wget https://github.com/MT-Blachetta/clPcl_SingleGPU/archive/refs/heads/main.zip
!unzip main.zip
!rm main.zip

Collecting spherecluster
  Downloading spherecluster-0.1.7-py3-none-any.whl (14 kB)
Collecting nose
  Downloading nose-1.3.7-py3-none-any.whl (154 kB)
[?25l[K     |██▏                             | 10 kB 26.1 MB/s eta 0:00:01[K     |████▎                           | 20 kB 14.4 MB/s eta 0:00:01[K     |██████▍                         | 30 kB 13.4 MB/s eta 0:00:01[K     |████████▌                       | 40 kB 12.8 MB/s eta 0:00:01[K     |██████████▋                     | 51 kB 4.8 MB/s eta 0:00:01[K     |████████████▊                   | 61 kB 5.6 MB/s eta 0:00:01[K     |██████████████▉                 | 71 kB 5.7 MB/s eta 0:00:01[K     |█████████████████               | 81 kB 6.4 MB/s eta 0:00:01[K     |███████████████████             | 92 kB 7.0 MB/s eta 0:00:01[K     |█████████████████████▏          | 102 kB 5.3 MB/s eta 0:00:01[K     |███████████████████████▎        | 112 kB 5.3 MB/s eta 0:00:01[K     |█████████████████████████▍      | 122 kB 5.3 MB/s eta 0:0

In [3]:
#--config_env configs/env.yml 
#--config_exp configs/pretext/clPcl_stl10.yml

import argparse
import os
import torch
import numpy as np

from utils.config import create_config
from utils.common_config import get_criterion, get_backbone_model , get_instance_model,get_group_model, get_train_dataset,\
                                get_val_dataset, get_train_dataloader,\
                                get_val_dataloader, get_train_transformations,\
                                get_val_transformations, get_optimizer,\
                                adjust_learning_rate, get_clustering
from utils.evaluate_utils import contrastive_evaluate
from utils.memory import MemoryBank
from utils.train_utils import pcl_cld_train
from utils.utils import fill_memory_bank
from termcolor import colored

In [4]:
    #1# Retrieve config file
p = create_config("configs/env.yml", "configs/pretext/clPcl_stl10.yml")
print(colored(p, 'red'))
print(colored('Retrieve model', 'blue'))
    
backbone = get_backbone_model(p)
print('Model is {}'.format(backbone.__class__.__name__))
#print('Model parameters: {:.2f}M'.format(sum(p.numel() for p in backbone.parameters()) / 1e6))
print(backbone)
    
    
instance_model = get_instance_model(p, backbone)
instance_head = instance_model.get_head()
    
group_model = get_group_model(p, backbone)
group_head = group_model.get_head()
backbone_model = group_model.get_backbone()
print('Model is {}'.format(instance_model.__class__.__name__))
print('Model parameters: {:.2f}M'.format(sum(p.numel() for p in instance_model.parameters()) / 1e6))
print(instance_model)
print('Model is {}'.format(group_model.__class__.__name__))
print('Model parameters: {:.2f}M'.format(sum(p.numel() for p in group_model.parameters()) / 1e6))
print(group_model)
    #instance_model = instance_model.cuda()
    #group_model = group_model.cuda()
   


[31m{'setup': 'clPcl', 'clustering': [2, 4, 8, 16], 'backbone': 'resnet18', 'model_kwargs': {'head': 'linear', 'features_dim': 128}, 'train_db_name': 'stl-10', 'val_db_name': 'stl-10', 'num_classes': 10, 'criterion': 'clPcl', 'criterion_kwargs': {'temperature': 0.1}, 'epochs': 500, 'optimizer': 'sgd', 'optimizer_kwargs': {'nesterov': False, 'weight_decay': 0.0001, 'momentum': 0.9, 'lr': 0.4}, 'scheduler': 'cosine', 'scheduler_kwargs': {'lr_decay_rate': 0.1}, 'batch_size': 512, 'num_workers': 0, 'augmentation_strategy': 'simclr', 'augmentation_kwargs': {'random_resized_crop': {'size': 96, 'scale': [0.2, 1.0]}, 'color_jitter_random_apply': {'p': 0.8}, 'color_jitter': {'brightness': 0.4, 'contrast': 0.4, 'saturation': 0.4, 'hue': 0.1}, 'random_grayscale': {'p': 0.2}, 'normalize': {'mean': [0.485, 0.456, 0.406], 'std': [0.229, 0.224, 0.225]}}, 'transformation_kwargs': {'crop_size': 96, 'normalize': {'mean': [0.485, 0.456, 0.406], 'std': [0.229, 0.224, 0.225]}}, 'pretext_dir': 'RESULTS/stl

AttributeError: ignored

In [None]:
     #> CUDNN
print(colored('Set CuDNN benchmark', 'blue')) 
torch.backends.cudnn.benchmark = True    
    ###


#3# Dataset                                                       OK
    #A - get transformormations for the dataset
print(colored('Retrieve dataset', 'blue'))
train_transforms = get_train_transformations(p) 
print('Train transforms:', train_transforms)
val_transforms = get_val_transformations(p)
    
    #B - get Dataset from files
print('Validation transforms:', val_transforms)
split_ = 'train'

if p['train_db_name'] == 'stl-10': 
  split_ = 'train+unlabeled'

train_dataset = get_train_dataset(p, train_transforms, to_augmented_dataset=True,
                                        split=split_) # Split is for stl-10
                                        
val_dataset = get_val_dataset(p, val_transforms)
    
#C - put the dataset to the dataloader for training purposes
train_dataloader = get_train_dataloader(p, train_dataset)
val_dataloader = get_val_dataloader(p, val_dataset)
print('Dataset contains {}/{} train/val samples'.format(len(train_dataset), len(val_dataset)))
        #4# Memory Bank
print(colored('Build MemoryBank', 'blue'))
base_dataset = get_train_dataset(p, val_transforms, split='train') # Dataset w/o augs for knn eval
base_dataloader = get_val_dataloader(p, base_dataset) 
    
memory_bank_base = MemoryBank(len(base_dataset), 
                                p['model_kwargs']['features_dim'],
                                p['num_classes'], p['criterion_kwargs']['temperature'])

    
memory_bank_val = MemoryBank(len(val_dataset),
                                p['model_kwargs']['features_dim'],
                                p['num_classes'], p['criterion_kwargs']['temperature'])

iloss = torch.nn.CrossEntropyLoss()
iloss.cuda()

# Criterion
print(colored('Retrieve criterion', 'blue'))
criterion = get_criterion(p)
print('Criterion is {}'.format(criterion.__class__.__name__))
criterion = criterion.cuda()

    # Optimizer and scheduler                                       
print(colored('Retrieve optimizer', 'blue'))
optimizer = get_optimizer(p, group_model)
print(optimizer)
    ###
M_num_clusters = get_clustering(p)


In [None]:
    #6# Checkpoint to continue last training phase                             OK
if os.path.exists(p['pretext_checkpoint_backbone']):
  print(colored('Restart from checkpoint (backbone) {}'.format(p['pretext_checkpoint_backbone']), 'blue'))
  checkpoint = torch.load(p['pretext_checkpoint_backbone'], map_location='cpu')
  optimizer.load_state_dict(checkpoint['optimizer'])
  backbone_model.load_state_dict(checkpoint['model'])
	instance_model.set_backbone(backbone_model)
	group_model.set_backbone(backbone_model)
        #backbone.cuda()
  start_epoch = checkpoint['epoch']

else:
  print(colored('No checkpoint file at {}'.format(p['pretext_checkpoint']), 'blue'))
  start_epoch = 0
      
  
if os.path.exists(p['pretext_checkpoint_instance']):
  print(colored('Restart from checkpoint (instance_model) {}'.format(p['pretext_checkpoint_instance']), 'blue'))
  checkpoint = torch.load(p['pretext_checkpoint_instance'], map_location='cpu')
  optimizer.load_state_dict(checkpoint['optimizer'])
  instance_head.load_state_dict(checkpoint['model'])  
        
	instance_model.set_head(instance_head)
  instance_model = instance_model.cuda()
  start_epoch = checkpoint['epoch']

else:
  print(colored('No checkpoint file at {}'.format(p['pretext_checkpoint']), 'blue'))
  start_epoch = 0
  instance_model = instance_model.cuda()
        
if os.path.exists(p['pretext_checkpoint_group']):
  print(colored('Restart from checkpoint (group_model) {}'.format(p['pretext_checkpoint_group']), 'blue'))
  checkpoint = torch.load(p['pretext_checkpoint_group'], map_location='cpu')
  optimizer.load_state_dict(checkpoint['optimizer'])
  group_head.load_state_dict(checkpoint['model'])
	
	group_model.set_head(group_head)
  group_model = group_model.cuda()
  start_epoch = checkpoint['epoch']
        
else:
  print(colored('No checkpoint file at {}'.format(p['pretext_checkpoint']), 'blue'))
  start_epoch = 0
  group_model = group_model.cuda()
        
    ###

In [None]:
print(colored('Starting main loop', 'blue'))

for epoch in range(start_epoch, p['epochs']):
  print(colored('Epoch %d/%d' %(epoch, p['epochs']), 'yellow'))
  print(colored('-'*15, 'yellow'))

        #a - Adjust lr
  lr = adjust_learning_rate(p, optimizer, epoch)
  print('Adjusted learning rate to {:.5f}'.format(lr))
        
        #b - Train the model with the clPcl method for one epoch (iteration)
  print('Train ...')
  pcl_cld_train(train_loader = train_dataloader, instance_branch = instance_model, group_branch = group_model, criterion = criterion, optimizer = optimizer, epoch = epoch, M_num_clusters = M_num_clusters)

        #c - Fill memory bank (Data Structure for nearest neighbors of input-instances)
  print('Fill memory bank for kNN...')
  fill_memory_bank(base_dataloader, group_model, memory_bank_base)

        #d - Evaluate (To monitor progress - Not for validation)
  print('Evaluate ...')
  top1 = contrastive_evaluate(val_dataloader, group_model, memory_bank_base)
  print('Result of kNN evaluation is %.2f' %(top1)) 
        
        #e - Checkpoint
  print('Checkpoint ...')
        
  torch.save({'optimizer': optimizer.state_dict(), 'model': group_model.get_backbone().state_dict(), 
                    'epoch': epoch + 1}, p['pretext_checkpoint_backbone'])
        
  torch.save({'optimizer': optimizer.state_dict(), 'model': instance_model.get_head().state_dict(), 
                    'epoch': epoch + 1}, p['pretext_checkpoint_instance'])
                    
  torch.save({'optimizer': optimizer.state_dict(), 'model': group_model.get_head().state_dict(), 
                    'epoch': epoch + 1}, p['pretext_checkpoint_group'])
                    
        

    # Save final model
  torch.save(instance_model.state_dict(), p['pretext_model_instance'])
  torch.save(group_model.state_dict(),p['pretext_model_group'])

# single run

In [None]:
        #7# Training
print(colored('Starting main loop', 'blue'))
#for epoch in range(start_epoch, 1):
    #print(colored('Epoch %d/%d' %(epoch, p['epochs']), 'yellow'))
    #print(colored('-'*15, 'yellow'))

        #a - Adjust lr
lr = adjust_learning_rate(p, optimizer, 1)
print('Adjusted learning rate to {:.5f}'.format(lr))
        
        #b - Train the model with the clPcl method for one epoch (iteration)
print('Train ...')
#def pcl_cld_train(args, train_loader, instance_branch, group_branch, criterion, optimizer, epoch, M_num_clusters):
import torch
import numpy as np
#from spherecluster import VonMisesFisherMixture
import nltk
from nltk.cluster.kmeans import KMeansClusterer
from utils.utils import AverageMeter, ProgressMeter
#backbone = get_backbone_model(p)
#instance_model = get_instance_model(p, backbone)
#group_model = get_group_model(p, backbone)
#losses = AverageMeter('Loss', ':.4e')
#progress = ProgressMeter(len(train_dataloader),[losses],prefix="Epoch: [{}]".format(1))
        
instance_model.train()
group_model.train()
#instance_model = instance_model.cuda()
#group_model = group_model.cuda()
print("initialized pcl_cld_train")

i = 1
batch = next(iter(train_dataloader))
originImage_batch = batch['image']
augmentedImage_batch = batch['image_augmented']
originImage_batch = originImage_batch.cuda(non_blocking=True)
augmentedImage_batch = augmentedImage_batch.cuda(non_blocking=True)
print("batch_image_shape: "+str(originImage_batch.shape))
type(batch)

[34mStarting main loop[0m
Adjusted learning rate to 0.40000
Train ...
initialized pcl_cld_train
batch_image_shape: torch.Size([64, 3, 96, 96])


dict

In [None]:

#print('Model is {}'.format(backbone.__class__.__name__))
#print('Model parameters: {:.2f}M'.format(sum(p.numel() for p in backbone.parameters()) / 1e6))
#print(backbone)
#originImage_batch.cuda()
#augmentedImage_batch.cuda()

logits, labels = instance_model(originImage_batch,augmentedImage_batch)
logits

IN MoCo->_dequeue_and_enqueue: batch_size = 64
queue_ptr = 0
next ptr = 64


tensor([[13.6168,  1.0472,  0.1281,  ..., -0.1120,  0.1565, -1.8276],
        [13.7274,  1.5451, -0.0224,  ..., -0.9390, -0.2487, -1.7784],
        [13.8654,  2.0759,  0.0908,  ..., -1.5972, -0.1473, -2.2851],
        ...,
        [12.9380,  1.9026,  0.2363,  ..., -1.4277,  0.0797, -1.6944],
        [13.8260,  1.3726,  0.0316,  ..., -0.5932, -0.0624, -1.8486],
        [13.8354,  1.0272,  0.1188,  ..., -0.5746,  0.0777, -2.1932]],
       device='cuda:0', grad_fn=<DivBackward0>)

In [None]:
#import torch.nn.functional as F
#labels.cuda() 
#instance_loss = F.cross_entropy(logits,labels)
instance_loss = iloss(logits,labels)
instance_loss

tensor(0.2057, device='cuda:0', grad_fn=<NllLossBackward0>)

In [None]:
original_view = group_model(originImage_batch)
augmented_view = group_model(augmentedImage_batch)

M_kmeans_results = []
MI_kmeans_results = []
concentration_matrices = []
concentration_matrices_I = []
M_labels = []
M_labels_I = []

feature_dim = len(original_view[0])
batch_size = len(original_view)

# ov = original_view.cpu().detach().numpy()
# av = augmented_view.cpu().detach().numpy()
print(feature_dim)
print(batch_size)
print(original_view.shape)

128
64
torch.Size([64, 128])


In [None]:
from spherecluster import SphericalKMeans
from utils.utils import AverageMeter, ProgressMeter
import copy

alpha = 0.1
divzero = 0.1
ov = original_view.cpu().detach().numpy()
print(ov.shape)
av = augmented_view.cpu().detach().numpy()

for k in M_num_clusters:
  #from spherecluster import SphericalKMeans
  skm = SphericalKMeans(n_clusters=k)
  skm.fit(ov)
  skm_I = SphericalKMeans(n_clusters=k)
  skm_I.fit(av)

  M_kmeans_results.append(torch.Tensor(skm.cluster_centers_))
  MI_kmeans_results.append(torch.Tensor(skm_I.cluster_centers_))
            # c -> k
  center = [ torch.Tensor(skm.cluster_centers_[i]) for i in range(len(skm.cluster_centers_)) ]
  center_I = [ torch.Tensor(skm_I.cluster_centers_[i]) for i in range(len(skm_I.cluster_centers_)) ]
  cdat = [ x.unsqueeze(0).expand(batch_size,feature_dim) for x in center]
  cmatrix = torch.cat(cdat,1)
  cdat_I = [ x.unsqueeze(0).expand(batch_size,feature_dim) for x in center_I]
  cmatrix_I = torch.cat(cdat_I,1)

  original_cpu = original_view.cpu()
  augmented_cpu = augmented_view.cpu()          
  fmatrix = torch.Tensor(copy.deepcopy(ov))
  fmatrix_I = torch.Tensor(copy.deepcopy(av))
  #fmatrix = copy.deepcopy(original_cpu)
  #fmatrix_I = copy.deepcopy(augmented_cpu)

  for _ in range(1,k): fmatrix = torch.cat((fmatrix,original_cpu),1)
  for _ in range(1,k): fmatrix_I = torch.cat((fmatrix_I,augmented_cpu),1)
                
  cmatrix = cmatrix.cuda()
  fmatrix = fmatrix.cuda()
  cmatrix_I = cmatrix_I.cuda()
  fmatrix_I = fmatrix_I.cuda()
            
  zmatrix = fmatrix-cmatrix
  zmatrix = zmatrix*zmatrix
  result = zmatrix.flatten(0).view(batch_size,k,feature_dim)
  result = torch.sum(result,2)

  zmatrix_I = fmatrix_I-cmatrix_I
  zmatrix_I = zmatrix_I*zmatrix_I
  result_I = zmatrix_I.flatten(0).view(batch_size,k,feature_dim)
  result_I = torch.sum(result_I,2)
  assign = torch.zeros(batch_size,k)
  assign_I = torch.zeros(batch_size,k)

  for i in range(batch_size):
    assign[i][ int(skm.labels_[i]) ] = 1
    assign_I[i][ int(skm_I.labels_[i]) ] = 1
                
  assign = assign.cuda()
  assign_I = assign_I.cuda()
            
  avgDistance = torch.sum(assign*result,0)
  Z = torch.sum(assign,0) + 1
  Zlog = torch.log(Z+alpha)
  divisor = Z*Zlog
  concentrations = (avgDistance/divisor) + divzero
  concentrations = concentrations.cpu()
            #avgDistance = avgDistance.cuda()
            #divisor = divisor.cuda()
  avgDistance_I = torch.sum(assign_I*result_I,0)
  Z_I = torch.sum(assign_I,0) + 1
  Zlog_I = torch.log(Z_I+alpha)
  divisor_I = Z_I*Zlog_I
  concentrations_I = (avgDistance_I/divisor_I) + divzero
  concentrations_I = concentrations_I.cpu()
            
  concentration_matrices.append(concentrations)
  concentration_matrices_I.append(concentrations_I)
            
  M_labels.append( skm.labels_ )
  M_labels_I.append( skm_I.labels_ )

(64, 128)


In [None]:
# concentration_matrices - list of Tensors
# M_kmeans_results - list of Tensors
# M_labels - list of numpy.ndarray's
# features = original view = cuda-Tensor
concentration_matrices

[tensor([0.1138, 0.1156], grad_fn=<ToCopyBackward0>),
 tensor([0.1164, 0.1114, 0.1126, 0.1159], grad_fn=<ToCopyBackward0>),
 tensor([0.1115, 0.1150, 0.1143, 0.1157, 0.1159, 0.1136, 0.1106, 0.1000],
        grad_fn=<ToCopyBackward0>),
 tensor([0.1106, 0.1120, 0.1000, 0.1139, 0.1095, 0.1096, 0.1072, 0.1093, 0.1132,
         0.1000, 0.1133, 0.1000, 0.1103, 0.1000, 0.1127, 0.1000],
        grad_fn=<ToCopyBackward0>)]

# **Group Loss (debugging)**

In [None]:
#group_loss = criterion
#import math

features = original_view 
features_I = augmented_view 
M_kmeans = M_kmeans_results  
M_kmeans_I = MI_kmeans_results 
concentrations = concentration_matrices 
concentrations_I = concentration_matrices_I 
labels = M_labels 
labels_I = M_labels_I
lb = 1
M_num = len(concentrations)
print(M_num)
batch_size = features.size()[0]
#batch_size = original_view.size()[0]
M_kmeans = M_kmeans_results 
M_kmeans_I = MI_kmeans_results

M_logits = []
M_logits_I = []

#if k == 2: print()
        
for k in range(M_num):
  c = len(concentrations[k]) # c = num_clusters of Mk
  M_cmatrix = torch.zeros(c,batch_size)
  MI_cmatrix = torch.zeros(c,batch_size)
  for i in range(c):
    M_cmatrix[i,:] = 1/concentrations[k][i]
    MI_cmatrix[i,:] = 1/concentrations_I[k][i]

  #if k == 2: print(M_cmatrix)          
  M_cmatrix = M_cmatrix.cuda()
  MI_cmatrix = MI_cmatrix.cuda()     
  centroids = M_kmeans[k].cuda()
  centroids_I = M_kmeans_I[k].cuda()
  gLoss_or = torch.mm(centroids,features_I.T) # OK 
  gLoss_au = torch.mm(centroids_I,features.T)
  #gLoss_or = torch.mm(centroids,augmented_view.T) # OK 
  #gLoss_au = torch.mm(centroids_I,original_view.T)
  print("gLoss_or type: "+str(type(gLoss_or)) )
  print("gLoss_or shape: "+str(gLoss_or.shape) )
#--------------------------------------------------------
  summing_logits = gLoss_or * M_cmatrix # OK
  summing_logits_I = gLoss_au * MI_cmatrix
            
  exp_logits = torch.exp(summing_logits)
  exp_logits_I = torch.exp(summing_logits_I)
  log_sum = torch.sum(exp_logits,0)
  print("log_sum type: "+str(type(log_sum)))
  print("log_sum shape: "+str(log_sum.shape))
  log_sum_I = torch.sum(exp_logits_I,0)
    
  positive_pair = torch.zeros(batch_size)
  positive_pair_I = torch.zeros(batch_size)
  
  exlogCPU = exp_logits.cpu()
  exlogCPU_I = exp_logits_I.cpu()
  #lcpu = labels[k].cuda()
  #lcpu_ = labels_I[k].cuda()
  for l in range(batch_size):
    positive_pair[l] = exlogCPU[int(labels[k][l])][l]
    positive_pair_I[l] = exlogCPU_I[int(labels_I[k][l])][l]
  
  positive_pair = positive_pair.cuda()
  positive_pair_I = positive_pair_I.cuda()
            #positive_pair = torch.exp(torch.mm(positive_pair,gLoss_or))
            #positive_pair_I = torch.exp(torch.mm(positive_pair_I,gLoss_au))
            
  M_logits.append( torch.sum( torch.log(positive_pair/log_sum) ).cpu() ) # +0.0001 ),0).cpu()       ) # check type shape size and len !!!!!!
  M_logits_I.append( torch.sum( torch.log(positive_pair_I/log_sum_I) ).cpu() ) # +0.0001 ),0).cpu() ) 
            
result = lb*(-1/M_num)*0.5*( sum(M_logits) + sum(M_logits_I) )

# if math.isnan(result): result = 1000000
loss = instance_loss + result

result

4
gLoss_or type: <class 'torch.Tensor'>
gLoss_or shape: torch.Size([2, 64])
log_sum type: <class 'torch.Tensor'>
log_sum shape: torch.Size([64])
gLoss_or type: <class 'torch.Tensor'>
gLoss_or shape: torch.Size([4, 64])
log_sum type: <class 'torch.Tensor'>
log_sum shape: torch.Size([64])
gLoss_or type: <class 'torch.Tensor'>
gLoss_or shape: torch.Size([8, 64])
log_sum type: <class 'torch.Tensor'>
log_sum shape: torch.Size([64])
gLoss_or type: <class 'torch.Tensor'>
gLoss_or shape: torch.Size([16, 64])
log_sum type: <class 'torch.Tensor'>
log_sum shape: torch.Size([64])


tensor(107.0621, grad_fn=<MulBackward0>)

In [None]:
#import math
#if math.isnan(result): print("hit")

#M_logits[3]
gLoss_or[15]

tensor([0.7720, 0.8382, 0.9848, 0.9325, 0.9402, 0.9391, 0.9725, 0.7768, 0.9377,
        0.9297, 0.9822, 0.7676, 0.8109, 0.8661, 0.9301, 0.8629, 0.9385, 0.9184,
        0.9419, 0.8366, 0.9571, 0.9364, 0.7582, 0.9660, 0.9743, 0.8421, 0.8107,
        0.9721, 0.8074, 0.9561, 0.8316, 0.6752, 0.8958, 0.6500, 0.8663, 0.9596,
        0.9185, 0.8886, 0.9598, 0.6563, 0.9769, 0.8240, 0.9089, 0.7385, 0.7065,
        0.7262, 0.9192, 0.9111, 0.7214, 0.9165, 0.9661, 0.8357, 0.8414, 0.7493,
        0.9021, 0.9420, 0.8866, 0.9535, 0.9483, 0.8401, 0.8741, 0.9685, 0.9406,
        0.9285], device='cuda:0', grad_fn=<SelectBackward0>)

In [None]:
#losses.update(loss.item())

optimizer.zero_grad()
loss.backward()
optimizer.step()

#if i % 25 == 0:
#  progress.display(i)