In [1]:
# pytorch optimizer 让动量参与计算，以及手动修改lr

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import argparse
import numpy as np

import ray
import resnet.models as models
import random,time
from time import sleep
import copy 
import datetime
import argparse
import sys


import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
from torch.autograd import Variable
import numpy as np
import os
import shutil
from torch.utils.tensorboard import SummaryWriter
from filelock import FileLock

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [14]:
def generate_train_loader(batch_size,kwargs):
    train_loader = torch.utils.data.DataLoader(
        datasets.CIFAR10('./data.cifar10', train=True, download=True,
        transform=transforms.Compose([
            transforms.Pad(4),
            transforms.RandomCrop(32),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010))
        ])),
    batch_size=batch_size, shuffle=True, **kwargs)
    return train_loader

def generate_test_loader(test_batch_size):
    test_loader = torch.utils.data.DataLoader(
        datasets.CIFAR10('./data.cifar10', train=False, transform=transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010))
        ])),batch_size=test_batch_size, shuffle=True)
    return test_loader

def _get_params(model):
    bns = {}
    non_bns = {}
    param_count = 0.
    bn_param_count = 0.
    for name,param in model.named_parameters():
        param_count += len(param)
        if 'bn' in name:
            bns[name] = param
            bn_param_count += len(param)
        else:
            non_bns[name] = param
#     print("bn params occupies: ",bn_param_count/param_count)
    return bns,non_bns

@ray.remote
class ParameterServer():
    def __init__(self,args,test_loader):
        self.model = models.__dict__["resnet"](dataset="cifar10",depth=args.depth)
        self.stalness_table = [0] * args.num_workers
        self.stalness_limit = args.stalness_limit 
        self.global_step = 0
        self.lr = args.lr
        self.args = args
        self.eva_model = models.__dict__["resnet"](dataset="cifar10",depth=args.depth)
        self.optimizer = optim.SGD(self.model.parameters(),
                          lr=args.lr,
                          momentum=args.momentum,
                          weight_decay=args.weight_decay)
        self.test_loader = test_loader
        self.model.cpu()
        self.eva_model.cpu()
        self.ps_writer = SummaryWriter(os.path.join(os.getcwd(),(args.tb_path+'/ps')))
        self.save_path = args.save
        self.num_workers = args.num_workers
        
        # get point to all non_bns parameters
        self.non_bns = [param.data for name,param in self.model.named_parameters() if 'bn' not in name]
        self.bns = [param.data for name,param in self.model.named_parameters() if 'bn' in name]
        self.bns_sync = [None] * args.num_workers
        
        if args.resume:
            if os.path.isfile(args.resume):
                print("=> loading checkpoint '{}'".format(args.resume))
                checkpoint = torch.load(args.resume)
                self.global_step = checkpoint['global_step']
                self.model.load_state_dict(checkpoint['state_dict'])
                self.optimizer.load_state_dict(checkpoint['optimizer'])
                self.stalness_table = [self.global_step/args.num_workers] * args.num_workers
                print("=> loaded checkpoint '{}' (global step: {})".format(args.resume, checkpoint['global_step']))                
                if 'epoch' in checkpoint: print("epoch: {}".format(checkpoint['epoch']))
            else:
                print("=> no checkpoint found at '{}'".format(args.resume))                    
                
    def apply_gradients(self, iter_diff, wk_idx, epoch):
        if args.debug: print("applying gradients from the ",wk_idx, " worker")
        
        # updata all params
        for idx, p in enumerate(self.model.parameters()):
            p.data -= iter_diff[idx]

#         if wk_idx == 0:
#             # updata all params
#             for idx, p in enumerate(self.model.parameters()):
#                 p.data -= iter_diff[idx]
#         else:
#             # only update non_bns params
#             for idx, tensor in enumerate(self.non_bns):
#                 tensor -= iter_diff[idx]
        
#         # only update non_bns params
#         for idx, tensor in enumerate(self.non_bns):
#             tensor -= iter_diff[idx]
        
        self.stalness_table[wk_idx] += 1
        self.global_step += 1
        if args.debug: print("finished applying gradients from the ",wk_idx, " worker")
        if self.global_step % 1000 == 0:
#             print("global_step: ",self.global_step," and prepare evaluate")
#             self.evaluate()
            self.save_ckpt({
                'epoch':epoch,
                'global_step':self.global_step,
                'state_dict':self.model.state_dict(),
                'optimizer':self.optimizer.state_dict()
            },filepath=os.path.join(os.getcwd(),self.save_path))
    
    
    def apply_gradients_non_bns(self,iter_diff,wk_idx,epoch):
        for i in range(len(self.non_bns)):
            self.non_bns[i] -= iter_diff[i]
#         print(wk_idx,"finished updating non bns on ps")
        self.stalness_table[wk_idx] += 1
        self.global_step += 1

        
    def apply_gradients_bns(self,iter_diff,wk_idx,epoch):
        for idx, tensor in enumerate(self.bns):
#             if idx == len(iter_diff) / 2:
#                 print(wk_idx,"is in the middle of updating bns on ps")
            tensor -= iter_diff[idx]
#         print(wk_idx,"finished updating non bns on ps")
#         self.stalness_table[wk_idx] += 1
#         self.global_step += 1

    def apply_gradients_partical_bns(self, iter_diff, wk_idx, epoch):
        if wk_idx == 0 :
            for i in range(0,len(self.bns)/self.num_workers):
                self.bns[i] -= iter_diff[i]
        elif wk_idx == self.num_workers:
            for i in range(len(self.bns) * wk_idx / self.num_workers + 1 , len(self.bns)):
                self.bns[i] -= iter_diff[i]
        else:
            for i in range(len(self.bns) * wk_idx / self.num_workers + 1 , len(self.bns) * (wk_idx + 1) / self.num_workers):
                self.bns[i] -= iter_diff[i]
        
    def pull_weights(self):
        return self.model.state_dict()
    
    def pull_non_bn_weights(self):
        self.non_bns = [param.data for name,param in self.model.named_parameters() if 'bn' not in name]
        return copy.deepcopy(self.non_bns)
    
    def pull_bn_weights(self):
        self.bns = [param.data for name,param in self.model.named_parameters() if 'bn' in name]
        return copy.deepcopy(self.bns)
    
    def get_optim(self):
        return self.optimizer
    
    def pull_optimizer_state(self):
        return self.optimizer.state_dict()

    def check_stalness(self,wk_idx):
        min_iter = min(self.stalness_table)
        return self.stalness_table[wk_idx] - min_iter < self.stalness_limit
        
    def get_stalness(self):
        return min(self.stalness_table)
    
    def get_stalness_table(self):
        return self.stalness_table
    
    def get_global_step(self):
        return self.global_step
    
    def get_model(self):
        return self.model
    
    def save_ckpt(self,state,filepath):
        torch.save(state,os.path.join(filepath,'checkpoint.pth.tar'))
    
    def get_bns_ready(self):
        return any(self.bns_sync) == False
        
    def aggregate_bns(self,wk_bns,worker_index):
        self.bns_sync[worker_index] = wk_bns
        if all(self.bns_sync):
            for i in range(len(self.bns)):
                tmp = copy.deepcopy(self.bns_sync[0][i])
                for j in range(1, self.num_workers):
                    tmp += self.bns_sync[j][i]
                self.bns[i] = tmp / self.num_workers    
            self.bns_sync = [None] * self.num_workers
        return self.bns_sync
    
    def evaluate(self):
        print("going to evaluate")
        test_loss = 0.
        correct = 0.
        print("pulled weights")
        self.eva_model.load_state_dict(copy.deepcopy(self.model.state_dict()))
        print("loaded weights")
        print("length of the test_loader dataset is : ",len(self.test_loader.dataset))
        self.eva_model.eval()
        count = 0
        for data,target in self.test_loader:
            count += 1
            if count % 20 == 0: print("in eval, the batch is: ",count)
            data, target = Variable(data,volatile=True),Variable(target)
            output = self.eva_model(data)
            batch_loss = F.cross_entropy(output, target, size_average=False).data
            test_loss += batch_loss
            pred = output.data.max(1, keepdim=True)[1]
            correct += pred.eq(target.data.view_as(pred)).cpu().sum()
        len_testset = len(self.test_loader.dataset)
        test_loss /= len_testset 
        accuracy = correct / len_testset
        # log 
        print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.1f}%)\n'.format(
        test_loss, correct, len_testset,accuracy))

        self.ps_writer.add_scalar('Accuracy/eval', accuracy, self.global_step)
        self.ps_writer.add_scalar('Loss/eval',test_loss , self.global_step)
        
        



IndentationError: expected an indented block (<ipython-input-14-3dfd6e541c5a>, line 130)

In [3]:
@ray.remote(num_gpus=1)
def worker_task(args,ps,worker_index, train_loader):
    # Initialize the model.
#     if args.debug: print(worker_index, " worker is going to sleep ",worker_index*5000)
#     time.sleep(worker_index * 5000)
    
    model = models.__dict__["resnet"](dataset="cifar10",depth=args.depth)
    local_step = 0
    optimizer = optim.SGD(model.parameters(),
                          lr=args.lr,
                          momentum=args.momentum,
                          weight_decay=args.weight_decay)
    wk_non_bns = [param.data for name,param in model.named_parameters() if 'bn' not in name]
    wk_bns = [param.data for name,param in model.named_parameters() if 'bn' in name]
    
    if args.cuda:
        starttime = datetime.datetime.now()
        model.cuda()
        endtime = datetime.datetime.now()
        time_cost = (endtime - starttime).seconds
        if args.debug: print("move model to gpu takes: ", time_cost, "seconds")
    if args.resume:
        checkpoint = torch.load(args.resume)
        local_step = checkpoint['global_step'] / args.num_workers
        optimizer.load_state_dict(checkpoint['optimizer'])
        if 'epoch' in checkpoint:
            args.start_epoch = checkpoint['epoch']

    wk_writer = SummaryWriter(os.path.join(os.getcwd(),args.tb_path,('wk_'+str(worker_index))))
    print("worker #",worker_index," is online")
    
    # all workers owns the same init values
    init_wei = ray.get(ps.pull_weights.remote())
    model.load_state_dict(init_wei)
    sync_bns_flag = False
    
    
    for epoch in range(args.start_epoch,args.epochs):
        avg_loss = 0.
        train_correct = 0.
        for batch_idx,(data,target) in enumerate(train_loader):
            if args.cuda:
                starttime = datetime.datetime.now()
                data,target = data.cuda(),target.cuda()
                mid = datetime.datetime.now()
                if args.debug: print("move data to gpu takes: ", (mid - starttime).seconds, "seconds")
                model.cuda()
                endtime = datetime.datetime.now()
                time_cost = (endtime - starttime).seconds
                if args.debug: print("move model to gpu takes: ", time_cost, "seconds")
                
            while(local_step - ray.get(ps.get_stalness.remote()) > args.stalness_limit):
                sleep(1)
            

            # Get all weights from the parameter server.
            if args.debug: print("the ",worker_index," pulls wei from ps.")
            init_wei = ray.get(ps.pull_weights.remote())
            model.load_state_dict(init_wei)
            if args.debug: print("the ",worker_index," loaded the latest wei from ps.")
                
#             # Get only non-bn weights from the parameter server.
#             ps_non_bns = ray.get(ps.pull_non_bn_weights.remote())
#             print(worker_index,"pulled non bns from ps")
#             assert len(ps_non_bns) == len(wk_non_bns)
#             for i in range(len(ps_non_bns)):
#                 if i == len(ps_non_bns) / 2:
#                     print(worker_index,"is in the middle of updating non bns")
#                 wk_non_bns[i] = ps_non_bns[i]
#             print(worker_index,"updated non bns from ps and is going to pull bns ")
            
#             # Get only bn weights from the parameter server.
#             ps_bns = ray.get(ps.pull_bn_weights.remote())
#             print(worker_index,"pulled bns from ps")
#             assert len(ps_bns) == len(wk_bns)
#             for i in range(len(ps_bns)):
#                 if i == len(ps_bns) / 2 :
#                     print(worker_index, "is in the middle of updating bns")
#                 wk_bns[i] = ps_bns[i]
#             print(worker_index,"updated bns from ps")

                
            # Compute an update and push it to the parameter server.        
            data, target = Variable(data), Variable(target)
            optimizer.zero_grad()
            if args.debug: print(worker_index,' is generating output')
            output = model(data)
            if args.debug: print(worker_index,' generated output done and going to calculate loss')
            loss = F.cross_entropy(output,target)
            avg_loss += loss
            pred = output.data.max(1,keepdim=True)[1]
            batch_correct = pred.eq(target.data.view_as(pred)).cpu().sum()
            train_correct += batch_correct
            if args.debug: print(worker_index,' calculated loss and going to bp')
            loss.backward()
            if args.debug: print(worker_index,' bp done')
            starttime = datetime.datetime.now()
            model.cpu()
            endtime = datetime.datetime.now()
            time_cost = (endtime - starttime).seconds
            if args.debug: print("move model to cpu takes: ", time_cost, "seconds")
            
            
            if worker_index == 0:
                # calculate difference for this iteration
                old_tensors = copy.deepcopy([p.data for p in model.parameters()])    
                optimizer.step()
                new_tensors = [p.data for p in model.parameters()]
                iter_diff = [(old_tensor - new_tensor)/args.num_workers for (old_tensor, new_tensor) in zip(old_tensors,new_tensors)]
                # print("passing sizes: ",sys.getsizeof(iter_diff))
                ps.apply_gradients.remote(iter_diff,worker_index,epoch)
            
            else:
                old_tensors_non_bns = copy.deepcopy([param.data for name,param in model.named_parameters() if 'bn' not in name])
                optimizer.step()
                wk_non_bns = [param.data for name,param in model.named_parameters() if 'bn' not in name]
                iter_diff_non_bns = [(old_tensor_non_bns - new_tensor_non_bns)/args.num_workers for (old_tensor_non_bns, new_tensor_non_bns) in zip(old_tensors_non_bns,wk_non_bns)]
                ps.apply_gradients_non_bns.remote(iter_diff_non_bns,worker_index,epoch)
        

                
            
#             if worker_index == 0:
# #                 print(worker_index, "passing all params")
#                 # calculate difference for this iteration
#                 old_tensors = copy.deepcopy([p.data for p in model.parameters()])    
#                 optimizer.step()
#                 new_tensors = [p.data for p in model.parameters()]
#                 iter_diff = [(old_tensor - new_tensor)/args.num_workers for (old_tensor, new_tensor) in zip(old_tensors,new_tensors)]
#                 # print("passing sizes: ",sys.getsizeof(iter_diff))
#                 ps.apply_gradients.remote(iter_diff,worker_index,epoch)
#             else:
# #                 print(worker_index, "passing non bn params")
#                 #calculate only non-bn parameters difference 
#                 old_tensors = copy.deepcopy([param.data for name,param in model.named_parameters() if 'bn' not in name])
#                 optimizer.step()
#                 new_tensors = [param.data for name,param in model.named_parameters() if 'bn' not in name]
#                 iter_diff = [(old_tensor - new_tensor)/args.num_workers for (old_tensor, new_tensor) in zip(old_tensors,new_tensors)]
#                 # print("passing sizes: ",sys.getsizeof(iter_diff))
#                 ps.apply_gradients.remote(iter_diff,worker_index,epoch)

              # calculate parameters difference 
#             old_tensors_non_bns = copy.deepcopy([param.data for name,param in model.named_parameters() if 'bn' not in name])
# #             old_tensors_bns = copy.deepcopy([param.data for name,param in model.named_parameters() if 'bn' in name])
#             optimizer.step()
#             new_tensors_non_bns = [param.data for name,param in model.named_parameters() if 'bn' not in name]
# #             new_tensors_bns = [param.data for name,param in model.named_parameters() if 'bn' in name]
            
#             iter_diff_non_bns = [(old_tensor_non_bns - new_tensor_non_bns)/args.num_workers for (old_tensor_non_bns, new_tensor_non_bns) in zip(old_tensors_non_bns,new_tensors_non_bns)]
# #             iter_diff_bns = [(old_tensor_bns - new_tensor_bns)/args.num_workers for (old_tensor_bns, new_tensor_bns) in zip(old_tensors_bns,new_tensors_bns)]
#             # print("passing sizes: ",sys.getsizeof(iter_diff))
#             ps.apply_gradients_non_bns.remote(iter_diff_non_bns,worker_index,epoch)
#             print(worker_index, "pushed non bns to ps and is going to push bns to ps")
#             ps.apply_gradients_bns.remote(iter_diff_bns,worker_index,epoch)
#             print(worker_index, "pushed bns to ps")
            
            
#             # aggregate and sync bn parameters    
#             if sync_bns_flag:
#                 print("SYNC BNS: worker #",worker_index," is checking to pull bns from ps ")
#                 if ray.get(ps.get_bns_ready.remote()):
#                     print("SYNC BNS: worker #",worker_index," is pulling bns from ps ")
#                     ps_wei = ray.get(ps.pull_weights.remote())
#                     model.load_state_dict(ps_wei)
#                     sync_bns_flag = False
#             if local_step % args.sync_bns == 0:
#                 sync_bns_flag = True
#                 model.cpu()
#                 if args.debug: print("SYNC BNS: goint to aggregate bns")
#                 wk_bns = [param.data for name,param in model.named_parameters() if 'bn' in name]
#                 bns_sync = ray.get(ps.aggregate_bns.remote(wk_bns,worker_index))
#                 print("SYNC BNS: bns of worker #",worker_index," have been pushed")
                
#                 if any(bns_sync) == False: # when all workers have pushed their own bns parameters to ps
#                     print("SYNC BNS: worker #",worker_index," is last worker pushed its bns")
#                     ps_wei = ray.get(ps.pull_weights.remote())
#                     model.load_state_dict(ps_wei)
#                     sync_bns_flag = False
#                 if args.cuda: model.cuda()
            
                    
                
                
                
            local_step += 1
            if batch_idx % args.log_interval == 0:
                print('The {} worker, Train Epoch: {} [{}/{} ({:.1f}%)]\tLoss: {:.6f}'.format(
                worker_index, epoch, batch_idx * len(data), len(train_loader.dataset),
                100. * batch_idx / len(train_loader), loss.data))
                
                for name,param in model.named_parameters():
                    wk_writer.add_histogram(name, param, local_step)

                wk_writer.add_scalar("Loss/worker_train",loss,local_step)
                wk_writer.add_scalar("Accuracy/worker_train",batch_correct.float()/len(data),local_step)
                
        print("The {} worker finished its {} epoch with loss: {} and accuracy: {}".format(
            worker_index,
            epoch,
            avg_loss / float(len(train_loader.dataset)),
            train_correct.float() / float(len(train_loader.dataset))
        ))

In [9]:
parser = argparse.ArgumentParser(description='Distributed SSP CIFAR-10 Restnet train with network slimming')
parser.add_argument('--ray-master',type=str,default='127.0.0.1')
parser.add_argument('--redis-port',type=str,default='6379')
parser.add_argument('--batch-size',type=int,default=64)
parser.add_argument('--test-batch-size', type=int, default=64)
parser.add_argument('--epochs', type=int, default=160)
parser.add_argument('--start-epoch', default=0, type=int)
parser.add_argument('--lr', type=float, default=0.1)
parser.add_argument('--momentum', type=float, default=0.9)
parser.add_argument('--weight-decay', '--wd', default=1e-4, type=float)
parser.add_argument('--resume', default=None, type=str) 
parser.add_argument('--no-cuda', action='store_true', default=False)
parser.add_argument('--save', default='./logs', type=str)
parser.add_argument('--depth', default=164, type=int)
parser.add_argument('--tb-path', default='./logs', type=str)
parser.add_argument('--log-interval', type=int, default=100)
parser.add_argument('--num-workers',type=int,default=1)
parser.add_argument('--stalness-limit',type=int,default=5)
parser.add_argument('--debug',action='store_true',default=False)
parser.add_argument('--sync-bns',type=int, default=194)

args = parser.parse_args(args=['--num-workers=3'])
# '--resume=/userhome/34/gyu/logs/checkpoint.pth.tar'
# '--tb-path=logs_no_bns','--save=logs_no_bns'

args.cuda = not args.no_cuda and torch.cuda.is_available()

In [10]:
if ray.is_initialized():
    ray.shutdown()

In [11]:
ray.init(address=args.ray_master+':'+args.redis_port)

    

{'node_ip_address': '10.21.5.171',
 'redis_address': '10.21.5.171:6379',
 'object_store_address': '/tmp/ray/session_2019-12-05_14-53-20_978335_17868/sockets/plasma_store',
 'raylet_socket_name': '/tmp/ray/session_2019-12-05_14-53-20_978335_17868/sockets/raylet',
 'webui_url': 'http://10.21.5.171:8080/?token=6afbafabcdbb1b0f41cd13f86a44c962d1cc3f4a279b841d',
 'session_dir': '/tmp/ray/session_2019-12-05_14-53-20_978335_17868'}

In [12]:
kwargs = {'num_workers': 1, 'pin_memory': True} if args.cuda else {}

test_loader = generate_test_loader(args.test_batch_size)
train_loaders = [generate_train_loader(args.batch_size,kwargs) for _ in range(args.num_workers)]

resume_from_ckpt = args.resume if (args.resume and os.path.isfile(args.resume)) else None

ps = ParameterServer.remote(args,test_loader)



Files already downloaded and verified
Files already downloaded and verified
Files already downloaded and verified




In [13]:
worker_tasks = [worker_task.remote(args,ps,idx,train_loaders[idx]) for idx in range(args.num_workers)]

[2m[36m(pid=29165, ip=10.21.5.173)[0m worker # 0  is online
[2m[36m(pid=17901)[0m worker # 1  is online
[2m[36m(pid=13523, ip=10.21.5.172)[0m worker # 2  is online
[2m[36m(pid=17901)[0m The 1 worker finished its 0 epoch with loss: 0.04055042937397957 and accuracy: 0.2582800090312958
[2m[36m(pid=29165, ip=10.21.5.173)[0m The 0 worker finished its 0 epoch with loss: 0.038807328790426254 and accuracy: 0.26263999938964844
[2m[36m(pid=13523, ip=10.21.5.172)[0m The 2 worker finished its 0 epoch with loss: 0.03870898485183716 and accuracy: 0.2603200078010559
[2m[36m(pid=29165, ip=10.21.5.173)[0m The 0 worker finished its 1 epoch with loss: 0.0270216204226017 and accuracy: 0.37373998761177063
[2m[36m(pid=17901)[0m The 1 worker finished its 1 epoch with loss: 0.02700280211865902 and accuracy: 0.37303999066352844
[2m[36m(pid=13523, ip=10.21.5.172)[0m The 2 worker finished its 1 epoch with loss: 0.026932919397950172 and accuracy: 0.3771800100803375


[2m[36m(pid=17901)[0m The 1 worker finished its 2 epoch with loss: 0.024714648723602295 and accuracy: 0.43050000071525574
[2m[36m(pid=29165, ip=10.21.5.173)[0m The 0 worker finished its 2 epoch with loss: 0.024671431630849838 and accuracy: 0.4288400113582611
[2m[36m(pid=13523, ip=10.21.5.172)[0m The 2 worker finished its 2 epoch with loss: 0.024657024070620537 and accuracy: 0.4309000074863434
[2m[36m(pid=29165, ip=10.21.5.173)[0m The 0 worker finished its 3 epoch with loss: 0.023182259872555733 and accuracy: 0.46522000432014465
[2m[36m(pid=17901)[0m The 1 worker finished its 3 epoch with loss: 0.023138610646128654 and accuracy: 0.4679200053215027
[2m[36m(pid=13523, ip=10.21.5.172)[0m The 2 worker finished its 3 epoch with loss: 0.023221731185913086 and accuracy: 0.4657000005245209
[2m[36m(pid=29165, ip=10.21.5.173)[0m The 0 worker finished its 4 epoch with loss: 0.021674249321222305 and accuracy: 0.5038800239562988
[2m[36m(pid=17901)[0m The 1 worker finished its

[2m[36m(pid=17901)[0m The 1 worker finished its 5 epoch with loss: 0.019414596259593964 and accuracy: 0.5579800009727478
[2m[36m(pid=29165, ip=10.21.5.173)[0m The 0 worker finished its 5 epoch with loss: 0.019409097731113434 and accuracy: 0.5584999918937683
[2m[36m(pid=13523, ip=10.21.5.172)[0m The 2 worker finished its 5 epoch with loss: 0.019476091489195824 and accuracy: 0.5571799874305725
[2m[36m(pid=29165, ip=10.21.5.173)[0m The 0 worker finished its 6 epoch with loss: 0.01763073168694973 and accuracy: 0.6018400192260742
[2m[36m(pid=17901)[0m The 1 worker finished its 6 epoch with loss: 0.017630264163017273 and accuracy: 0.6019999980926514
[2m[36m(pid=13523, ip=10.21.5.172)[0m The 2 worker finished its 6 epoch with loss: 0.01774599216878414 and accuracy: 0.5967599749565125
[2m[36m(pid=29165, ip=10.21.5.173)[0m The 0 worker finished its 7 epoch with loss: 0.016106506809592247 and accuracy: 0.6348199844360352
[2m[36m(pid=17901)[0m The 1 worker finished its 7 e

[2m[36m(pid=29165, ip=10.21.5.173)[0m The 0 worker finished its 8 epoch with loss: 0.013450773432850838 and accuracy: 0.698199987411499
[2m[36m(pid=17901)[0m The 1 worker finished its 8 epoch with loss: 0.013541066087782383 and accuracy: 0.6970999836921692
[2m[36m(pid=13523, ip=10.21.5.172)[0m The 2 worker finished its 8 epoch with loss: 0.013458478264510632 and accuracy: 0.699940025806427


In [35]:
print(1)

1
[2m[36m(pid=17901)[0m The 1 worker finished its 21 epoch with loss: 0.00375644164159894 and accuracy: 0.9160599708557129
[2m[36m(pid=29165, ip=10.21.5.173)[0m The 0 worker finished its 21 epoch with loss: 0.003850584849715233 and accuracy: 0.9132000207901001
[2m[36m(pid=13523, ip=10.21.5.172)[0m The 2 worker finished its 21 epoch with loss: 0.00378871476277709 and accuracy: 0.915880024433136
[2m[36m(pid=17901)[0m The 1 worker finished its 22 epoch with loss: 0.0036460247356444597 and accuracy: 0.9182599782943726
[2m[36m(pid=29165, ip=10.21.5.173)[0m The 0 worker finished its 22 epoch with loss: 0.0036702940706163645 and accuracy: 0.9179999828338623
[2m[36m(pid=13523, ip=10.21.5.172)[0m The 2 worker finished its 22 epoch with loss: 0.0036380342207849026 and accuracy: 0.9184600114822388
[2m[36m(pid=29165, ip=10.21.5.173)[0m The 0 worker finished its 23 epoch with loss: 0.0034960494376719 and accuracy: 0.9209799766540527
[2m[36m(pid=17901)[0m The 1 worker finishe

[2m[36m(pid=17901)[0m The 1 worker finished its 24 epoch with loss: 0.0033512385562062263 and accuracy: 0.9241600036621094
[2m[36m(pid=29165, ip=10.21.5.173)[0m The 0 worker finished its 24 epoch with loss: 0.003363002324476838 and accuracy: 0.9261000156402588
[2m[36m(pid=13523, ip=10.21.5.172)[0m The 2 worker finished its 24 epoch with loss: 0.0033660982735455036 and accuracy: 0.9248200058937073
[2m[36m(pid=29165, ip=10.21.5.173)[0m The 0 worker finished its 25 epoch with loss: 0.0032768428791314363 and accuracy: 0.9257799983024597
[2m[36m(pid=17901)[0m The 1 worker finished its 25 epoch with loss: 0.0032924541737884283 and accuracy: 0.9259799718856812
[2m[36m(pid=13523, ip=10.21.5.172)[0m The 2 worker finished its 25 epoch with loss: 0.0032450456637889147 and accuracy: 0.9266200065612793


In [None]:
print(1)

In [50]:
local_test_loader = generate_test_loader(64)
local_train_loader = generate_train_loader(64,{'num_workers': 1, 'pin_memory': True})

test_writer = SummaryWriter()

local_test_model = models.__dict__["resnet"](dataset="cifar10",depth=args.depth)
checkpoint = torch.load('/userhome/34/gyu/logs/checkpoint.pth.tar')
local_test_model.load_state_dict(checkpoint['state_dict'])
local_test_model.cuda()


Files already downloaded and verified


ResNet(
  (conv1): Conv2d(3, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
  (bn1): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace)
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(16, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace)
      (conv2): Conv2d(16, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (1): BasicBlock(
      (conv1): Conv2d(16, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace)
      (conv2): Conv2d(16, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2):

In [51]:
def test_model(model, test_dataloader):
    local_test_model = model
    local_test_loader = test_dataloader
    # local_test_model.train()
    local_test_model.eval()
    # test dataset loader 
    test_loss = 0.
    correct = 0.
    batch_count = 0.
    for data, target in local_test_loader:
        data,target = data.cuda(),target.cuda()
        batch_count += 1
        data, target = Variable(data, volatile=True), Variable(target)
        output = local_test_model(data)
        test_loss += F.cross_entropy(output, target, size_average=False).data # sum up batch loss
        pred = output.data.max(1, keepdim=True)[1] # get the index of the max log-probability
        batch_correct = pred.eq(target.data.view_as(pred)).sum()
        correct += batch_correct
#         if batch_count % 100  == 0:
#             print("        with model.eval(), batch num: ",batch_count, " with correct: ",int(batch_correct.data), " / ",len(data))

    test_loss /= len(local_test_loader.dataset)

    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.6f})\n'.format(
        test_loss, correct, len(local_test_loader.dataset),
        correct / float(len(local_test_loader.dataset))))

In [52]:
# local_test_model.train()
# test_model(local_test_model,local_test_loader)


In [53]:
# train dataset loader, but set model.eval(), acc=11% for one epoch 
# local_test_model.eval()
local_test_model.train()
# train dataset loader

test_loss = 0.
correct = 0.
train_batch_count = 0.
num_batch = 0
for data, target in local_train_loader:
    if train_batch_count % 10 == 0:
        print(train_batch_count)
        local_test_model.eval()
        test_model(local_test_model,local_test_loader)
        
    if train_batch_count==100:
        break
        
    num_batch += 1   
    local_test_model.train()
    data,target = data.cuda(),target.cuda()
    train_batch_count += 1
    data, target = Variable(data, volatile=True), Variable(target)
    output = local_test_model(data)
    test_loss += F.cross_entropy(output, target, size_average=False).data # sum up batch loss
    pred = output.data.max(1, keepdim=True)[1] # get the index of the max log-probability
    batch_correct = pred.eq(target.data.view_as(pred)).sum()
    correct += batch_correct
    
    for name,param in local_test_model.named_parameters():
        test_writer.add_histogram(name, param, num_batch)

    
#     if train_batch_count % 100  == 0:
#         print("With model.train(), batch num: ",train_batch_count, " , with correct: ",int(batch_correct.data), " / ", len(data))

test_loss /= len(local_train_loader.dataset)
print('\nTrain set: Average loss: {:.4f}, Accuracy: {}/{} ({:.6f})\n'.format(
    test_loss, correct, len(local_train_loader.dataset),
    correct / float(len(local_train_loader.dataset))))

0.0


  del sys.path[0]



Test set: Average loss: 5.4273, Accuracy: 1396/10000 (0.000000)





10.0

Test set: Average loss: 2.2098, Accuracy: 1824/10000 (0.000000)

20.0

Test set: Average loss: 0.9871, Accuracy: 7004/10000 (0.000000)

30.0

Test set: Average loss: 0.4489, Accuracy: 8526/10000 (0.000000)

40.0

Test set: Average loss: 0.4247, Accuracy: 8711/10000 (0.000000)

50.0

Test set: Average loss: 0.4270, Accuracy: 8738/10000 (0.000000)

60.0

Test set: Average loss: 0.4355, Accuracy: 8719/10000 (0.000000)

70.0

Test set: Average loss: 0.4320, Accuracy: 8730/10000 (0.000000)

80.0

Test set: Average loss: 0.4339, Accuracy: 8728/10000 (0.000000)

90.0

Test set: Average loss: 0.4339, Accuracy: 8722/10000 (0.000000)

100.0

Test set: Average loss: 0.4405, Accuracy: 8713/10000 (0.000000)


Train set: Average loss: 0.0241, Accuracy: 5975/50000 (0.000000)



In [None]:
local_bns

In [None]:
local_bns2,local_nonbns2 = _get_params(local_test_model)


In [None]:
local_bns2

In [None]:
state_dict = ray.get(ps.pull_weights.remote())

In [None]:
non_bn={}
for ele in state_dict:
    if 'bn' not in ele:
        non_bn[ele]=state_dict[ele]


In [None]:
init_wei = ray.get(ps.pull_weights.remote())

In [None]:
init_wei[]