In [1]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import argparse
import numpy as np

import ray
import resnet.models as models
import random,time
from time import sleep
import copy 

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
from torch.autograd import Variable
import argparse
import numpy as np
import os
import shutil

# Shut down any Ray runtime that is already running before starting a new one.
if ray.is_initialized():
    ray.shutdown()

# 5e9 bytes for worker memory and 3e9 bytes for the object store.
ray.init(memory=5 * 10**9, object_store_memory=3 * 10**9)


2019-10-31 22:17:52,795	INFO resource_spec.py:205 -- Starting Ray with 4.64 GiB memory available for workers and up to 2.79 GiB for objects. You can adjust these settings with ray.init(memory=<bytes>, object_store_memory=<bytes>).


{'node_ip_address': '10.21.5.177',
 'redis_address': '10.21.5.177:28261',
 'object_store_address': '/tmp/ray/session_2019-10-31_22-17-52_788610_13636/sockets/plasma_store',
 'raylet_socket_name': '/tmp/ray/session_2019-10-31_22-17-52_788610_13636/sockets/raylet',
 'webui_url': None,
 'session_dir': '/tmp/ray/session_2019-10-31_22-17-52_788610_13636'}

In [2]:
# ---- Model ----
arch = "resnet"
depth = 56

# ---- Runtime ----
# Training is pinned to the CPU; use `torch.cuda.is_available()` here to auto-detect a GPU.
cuda = False
seed = 1
save = "./logs"
log_interval = 100

# ---- Data ----
dataset = "cifar10"
batch_size = 1000
test_batch_size = 1000
# Extra DataLoader options are only supplied when running on CUDA.
if cuda:
    kwargs = {'num_workers': 1, 'pin_memory': True}
else:
    kwargs = {}

# ---- Optimisation ----
lr = 0.1
momentum = 0.9
weight_decay = 1e-4
start_epoch = 0
epochs = 160

In [3]:
# Create the output directory (including parents) only when nothing exists at that path yet.
save_dir_present = os.path.exists(save)
if not save_dir_present:
    os.makedirs(save)

In [4]:
if dataset == "cifar10":
    # Both splits are normalised with the same per-channel mean / std.
    cifar10_root = './data.cifar10'
    cifar10_normalize = transforms.Normalize((0.4914, 0.4822, 0.4465),
                                             (0.2023, 0.1994, 0.2010))

    # Training split: pad + random crop + horizontal flip augmentation, then normalise.
    cifar10_train_transform = transforms.Compose([
        transforms.Pad(4),
        transforms.RandomCrop(32),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        cifar10_normalize,
    ])
    cifar10_train_set = datasets.CIFAR10(cifar10_root, train=True, download=True,
                                         transform=cifar10_train_transform)
    train_loader = torch.utils.data.DataLoader(
        cifar10_train_set, batch_size=batch_size, shuffle=True, **kwargs)

    # Test split: no augmentation, only tensor conversion and normalisation.
    cifar10_test_transform = transforms.Compose([
        transforms.ToTensor(),
        cifar10_normalize,
    ])
    cifar10_test_set = datasets.CIFAR10(cifar10_root, train=False,
                                        transform=cifar10_test_transform)
    test_loader = torch.utils.data.DataLoader(
        cifar10_test_set, batch_size=test_batch_size, shuffle=True, **kwargs)

Files already downloaded and verified


In [5]:
@ray.remote(num_cpus=1)
class ParameterServer():
    """Ray actor that owns the global model and applies gradients pushed by workers.

    The server keeps one update counter per worker (``stalness_table``) so that
    callers can bound how far any worker runs ahead of the slowest one, and it
    evaluates the current weights on the test set every 100 applied updates.

    Args:
        lr: step size used when applying pushed gradients.
        num_workers: number of workers; sizes the per-worker counter table.
        stalness_limit: maximum lead (in updates) allowed by ``check_stalness``.
        test_loader: iterable of ``(data, target)`` batches used by ``evaluate``;
            it must also expose ``.dataset`` so the sample count can be read.
    """

    def __init__(self,lr,num_workers,stalness_limit,test_loader):
        self.lr = lr
        self.model = models.__dict__[arch](dataset=dataset,depth=depth)
        self.stalness_table = [0] * num_workers
        self.stalness_limit = stalness_limit
        self.global_step = 0
        # Separate model instance so evaluation never toggles the mode of the live model.
        self.eva_model = models.__dict__[arch](dataset=dataset,depth=depth)
        # NOTE(review): this optimizer is constructed but apply_gradients performs a
        # plain SGD step itself, so momentum / weight_decay are currently not applied.
        self.optimizer = optim.SGD(self.model.parameters(),
                          lr=lr,
                          momentum=momentum,
                          weight_decay=weight_decay)
        self.test_loader = test_loader

    def apply_gradients(self, gradients, wk_idx):
        """Apply one worker's gradients to the global model with a plain SGD step.

        Args:
            gradients: one entry per model parameter, in ``model.parameters()``
                order; an entry may be ``None`` for a parameter without gradient.
            wk_idx: index of the pushing worker in ``stalness_table``.

        Raises:
            ValueError: if the number of gradients differs from the number of
                model parameters.
        """
        params = list(self.model.parameters())
        gradients = list(gradients)
        if len(gradients) != len(params):
            raise ValueError(
                "expected {} gradients, got {}".format(len(params), len(gradients)))
        # Use the gradients sent by the worker: the server model never runs
        # backward(), so its own ``p.grad`` attributes are always None.
        with torch.no_grad():
            for param, grad in zip(params, gradients):
                if grad is None:
                    continue
                param.data -= self.lr * grad
        self.stalness_table[wk_idx] += 1
        self.global_step += 1
        if self.global_step % 100 == 0:
            print("global_step: ",self.global_step," and prepare evaluate")
            self.evaluate()

    def pull_weights(self):
        """Return the state dict of the global model."""
        return self.model.state_dict()

    def check_stalness(self,wk_idx):
        """Return True while worker ``wk_idx`` leads the slowest worker by less than the limit."""
        min_iter = min(self.stalness_table)
        return self.stalness_table[wk_idx] - min_iter < self.stalness_limit

    def get_stalness(self):
        """Return the update count of the slowest worker."""
        return min(self.stalness_table)

    def evaluate(self):
        """Load the current global weights into the evaluation model and print test loss / accuracy."""
        self.eva_model.load_state_dict(self.pull_weights())
        self.eva_model.eval()
        # Use the loader handed to this actor rather than a driver-side global.
        num_samples = len(self.test_loader.dataset)
        test_loss = 0.
        correct = 0
        with torch.no_grad():
            for data,target in self.test_loader:
                if cuda:
                    data,target = data.cuda(),target.cuda()
                output = self.eva_model(data)
                # Sum per-sample losses as Python floats; averaged over the dataset below.
                test_loss += F.cross_entropy(output,target,reduction='sum').item()
                pred = output.max(1, keepdim=True)[1] # index of the highest score per sample
                correct += pred.eq(target.view_as(pred)).sum().item()
        test_loss /= num_samples
        print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.1f}%)\n'.format(
            test_loss,
            correct,
            num_samples,
            100. * correct / num_samples))


        
@ray.remote(num_cpus=1)
def worker_task(ps,worker_index,stale_limit, train_loader,lr,momentum,weight_decay,batch_size=50):
    """Ray task that computes gradients on local batches and pushes them to the parameter server.

    For every batch the worker waits until it is no more than ``stale_limit``
    steps ahead of the slowest worker, pulls the current global weights,
    computes the cross-entropy gradients and sends them to ``ps``.

    Args:
        ps: handle of the ``ParameterServer`` actor.
        worker_index: index of this worker in the server's staleness table.
        stale_limit: maximum number of steps this worker may lead the slowest one.
        train_loader: iterable yielding ``(data, target)`` batches.
        lr, momentum, weight_decay, batch_size: accepted for interface
            compatibility; unused here because the server performs the update.
    """
    # Initialize the model.
    model = models.__dict__[arch](dataset=dataset,depth=depth)
    local_step = 0
    for batch_idx,(data,target) in enumerate(train_loader):
        if cuda:
            data,target = data.cuda(),target.cuda()
        # Block while this worker is too far ahead of the slowest worker.
        while local_step - ray.get(ps.get_stalness.remote()) > stale_limit:
            print(worker_index," works too fast")
            sleep(1)
        # Get the current weights from the parameter server.
        model.load_state_dict(ray.get(ps.pull_weights.remote()))

        # Compute an update and push it to the parameter server.
        model.zero_grad()
        loss = F.cross_entropy(model(data),target)
        loss.backward()
        grad = [p.grad for p in model.parameters()]
        local_step += 1
        # No local optimizer step: the weights are replaced by the next pull
        # (and the model is discarded on return), so a local step has no effect.
        ps.apply_gradients.remote(grad,worker_index)
        print(worker_index,"has finished update")


In [6]:
num_worker = 1
stalness_table = [0] * num_worker
stalness_limit = 4

# Create the parameter-server actor with the configured learning rate, so that
# changing `lr` in the configuration cell also changes the server's step size.
ps = ParameterServer.remote(lr,num_worker,stalness_limit,test_loader)



In [7]:
# Submit one training task per worker and keep the handles returned by `.remote()`.
worker_tasks = []
for worker_id in range(num_worker):
    worker_tasks.append(
        worker_task.remote(ps,worker_id,stalness_limit,train_loader,lr,momentum,weight_decay))

[2m[36m(pid=13674)[0m THCudaCheck FAIL file=/pytorch/aten/src/THC/THCGeneral.cpp line=51 error=38 : no CUDA-capable device is detected


2019-10-31 22:18:23,513	ERROR worker.py:1719 -- Possible unhandled error from worker: [36mray_worker[39m (pid=13674, host=gpu-comp-207)
  File "<ipython-input-5-71b42fe33335>", line 82, in worker_task
  File "/userhome/34/gyu/anaconda3/envs/pytorch_env/lib/python3.7/site-packages/torch/nn/modules/module.py", line 493, in __call__
    result = self.forward(*input, **kwargs)
  File "/userhome/34/gyu/git_folder/dissertation/src/SSP/resnet/models/resnet.py", line 119, in forward
    x = self.layer2(x)  # 16x16
  File "/userhome/34/gyu/anaconda3/envs/pytorch_env/lib/python3.7/site-packages/torch/nn/modules/module.py", line 493, in __call__
    result = self.forward(*input, **kwargs)
  File "/userhome/34/gyu/anaconda3/envs/pytorch_env/lib/python3.7/site-packages/torch/nn/modules/container.py", line 92, in forward
    input = module(input)
  File "/userhome/34/gyu/anaconda3/envs/pytorch_env/lib/python3.7/site-packages/torch/nn/modules/module.py", line 493, in __call__
    result = self.forw