In [20]:
import matplotlib.pylab as plt
%matplotlib inline
import numpy as np
import torch
import tqdm
from sklearn.model_selection import StratifiedShuffleSplit
import json

In [4]:
#!rm *txt
#!rm *jsonl


In [32]:
EPOCH_NUM = 1000
TEST_NUM = 2000
TRAIN_NUM = 16 # training dataset size
POLY_NUM = 3
D = 4 # number of domains
HEAD_NUM = D
RUN_NUM = 5
GRAD_CLIP = 1500.0 # for unstable gradients pruning

def rmse(y, y_hat):
    return torch.sqrt(((y-y_hat)**2).mean())
crit2 = rmse # eval criterion

In [33]:
def make_angles():
    if D == 2:
        return [0, 180] # hard-coding angles for 2D cases (to be opposite)
    step = 360/D
    return [i*step for i in range(D)]
make_angles()

[0.0, 90.0, 180.0, 270.0]

In [34]:
# data generation
from scipy.ndimage import rotate
def gen(N1, N2, sigma=.1, seed=42, angle = 0.0):
    """
    N1: train size
    N2: test size
    """
    angle = np.pi/2 * angle/90
    N = N1+N2
    rs = np.random.RandomState(seed)
    x = rs.randn(N, 2) 
    y = x[:,0]*0.25 + x[:,1]*0.75 # y = X1 * 0.25 + x2 * 0.75, other  X components are irrelevant 
    
    if angle: # making rotation if angle != 0
        M = np.array([[np.cos(angle), -np.sin(angle)], [np.sin(angle), np.cos(angle)]])
        x[:,:2] = (M@x[:,:2].T).T
    # shuffling
    elems = list(range(len(x)))
    rs.shuffle(elems)
    trainval, test = elems[:N1], elems[N1:]
    xtrainval, ytrainval = x[trainval], y[trainval]
    xtest,ytest = x[test], y[test]
    elems = list(range(len(xtrainval)))
    rs.shuffle(elems)
    # actually this code is a bit messy, it was inherited from classification task
    train, val = elems[:N1//2], elems[N1//2:]
    
    xtrain, ytrain = xtrainval[train], ytrainval[train]
    xval, yval = xtrainval[val], ytrainval[val]
    return (xtrain, ytrain), (xval, yval), (xtest, ytest)
xy, _,_ = gen(4, 2, angle=0)


In [24]:
def poly_generate(x):
    result = [x]
    for poly in range(POLY_NUM):
        deg = poly+2 
        new_x = x ** deg#/(deg)
        result.append(new_x)
    result =  torch.hstack(result)
    return result
poly_generate(torch.tensor(xy[0])).std(0)

tensor([0.6789, 0.2316, 0.0226, 0.1399, 0.1570, 0.0696, 0.0104, 0.0330],
       dtype=torch.float64)

In [25]:
# one-model for strictly one domain, no NAS
accs = []
for k in range(RUN_NUM):
    torch.manual_seed(42+k)
    (x_train, y_train), (x_val, y_val), (x_test, y_test) = gen(TRAIN_NUM, TEST_NUM, seed=42+k)
    x_train = np.concatenate([x_train, x_val]) # we don't use train/validation step
    y_train = np.concatenate([y_train, y_val])
    
    lin_model = torch.nn.Linear(2*(POLY_NUM+1),1)
    model = lambda x: lin_model(poly_generate(x))
    opt = torch.optim.SGD(lin_model.parameters(), lr=1e-3)
    crit = torch.nn.MSELoss()
    tq = tqdm.tqdm_notebook(range(EPOCH_NUM))
    
    for e in tq:
        opt.zero_grad()
        out = model(torch.tensor(x_train).float())[:,0]
        loss = crit(out, torch.tensor(y_train).float())
        loss.backward()
        torch.nn.utils.clip_grad_value_(lin_model.parameters(), GRAD_CLIP)
        opt.step()
        tq.set_description(str(loss.item()))
    acc = crit2(model(torch.tensor(x_test).float())[:,0], torch.tensor(y_test).float()).item()
    accs.append(acc)
    
print (accs, np.mean(accs)) 

with open('one_model.txt', 'a') as out:
    out.write(f'D={D}, HEAD_NUM={HEAD_NUM}, RUN_NUM={RUN_NUM}, CRIT={np.mean(accs)}+-{np.std(accs)}.   MIN={np.min(accs)}, MAX={np.max(accs)}\n')
with open('one_model.jsonl', 'a') as out:
    out.write(json.dumps({'D': D, 'HEAD_NUM':HEAD_NUM, 'RUN_NUM': RUN_NUM, 'accs': accs})+'\n')
    

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  tq = tqdm.tqdm_notebook(range(EPOCH_NUM))


  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

[0.5906937122344971, 0.5878298878669739, 1.6036850214004517, 0.7943270206451416, 2.0315659046173096] 1.1216203093528747


In [35]:
# shared-model:
# one-model for all the domains, without any rotation, without NAS
accs = []
for k in range(RUN_NUM):
    torch.manual_seed(42+k)
    x_train, y_train, x_test, y_test = [],[],[],[]
    for angle in make_angles():
        (_x_train, _y_train), (x_val, y_val), (_x_test, _y_test) = gen(TRAIN_NUM,
                                                                       TEST_NUM, seed=42+k+int(angle),
                                                                       angle=angle)
        
            
        
        _x_train = np.concatenate([_x_train, x_val]) # we don't use train/validation step
        _y_train = np.concatenate([_y_train, y_val])
        x_train.extend(_x_train)
        y_train.extend(_y_train)
        x_test.extend(_x_test)
        y_test.extend(_y_test)
    
    lin_model = torch.nn.Linear(2*(1+POLY_NUM),1) 
    opt = torch.optim.SGD(lin_model.parameters(), lr=1e-3)
    model = lambda x: lin_model(poly_generate(x))
    crit = torch.nn.MSELoss()
    tq = tqdm.tqdm_notebook(range(EPOCH_NUM  * D)) # in darts we make D iterations per epoch. So here we multiply EPOCH_NUM  * D for compensation
    
    for e in tq:
        opt.zero_grad()
        out = model(torch.tensor(x_train).float())[:,0]
        loss = crit(out, torch.tensor(y_train).float())
        loss.backward()
        torch.nn.utils.clip_grad_value_(lin_model.parameters(), GRAD_CLIP)
        opt.step()
        tq.set_description(str(loss.item()))
    acc = crit2(model(torch.tensor(x_test).float())[:,0], torch.tensor(y_test).float()).item()
    accs.append(acc)
    print (acc)
print (accs, np.mean(accs)) 

with open('shared_model.txt', 'a') as out:
    out.write(f'D={D}, HEAD_NUM={HEAD_NUM}, RUN_NUM={RUN_NUM},  CRIT={np.mean(accs)}+-{np.std(accs)}. MIN={np.min(accs)}, MAX={np.max(accs)}\n')
with open('shared_model.jsonl', 'a') as out:
    out.write(json.dumps({'D': D, 'HEAD_NUM':HEAD_NUM, 'RUN_NUM': RUN_NUM, 'accs': accs})+'\n')

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  tq = tqdm.tqdm_notebook(range(EPOCH_NUM  * D)) # in darts we make D iterations per epoch. So here we multiply EPOCH_NUM  * D for compensation


  0%|          | 0/4000 [00:00<?, ?it/s]

1.3885122537612915


  0%|          | 0/4000 [00:00<?, ?it/s]

1.4878994226455688


  0%|          | 0/4000 [00:00<?, ?it/s]

0.8789204359054565


  0%|          | 0/4000 [00:00<?, ?it/s]

1.004591941833496


  0%|          | 0/4000 [00:00<?, ?it/s]

0.8532770276069641
[1.3885122537612915, 1.4878994226455688, 0.8789204359054565, 1.004591941833496, 0.8532770276069641] 1.1226402163505553


In [None]:
lin_model.weight

In [36]:
# perfect-case:
# one model for all the data, but the data is perfectly rotated
accs = []
for k in range(RUN_NUM):
    torch.manual_seed(42+k)
    x_train, y_train, x_test, y_test = [],[],[],[]
    for angle in make_angles():
        (_x_train, _y_train), (x_val, y_val), (_x_test, _y_test) = gen(TRAIN_NUM, 2000, seed=42+k+int(angle),
                                                                       angle=0.0)
        
        _x_train = np.concatenate([_x_train, x_val]) # we don't use train/validation step
        _y_train = np.concatenate([_y_train, y_val])
        x_train.extend(_x_train)
        y_train.extend(_y_train)
        x_test.extend(_x_test)
        y_test.extend(_y_test)
    
    
    lin_model = torch.nn.Linear(2*(POLY_NUM+1),1) 
    opt = torch.optim.SGD(lin_model.parameters(), lr=1e-3)
    model = lambda x: lin_model(poly_generate(x))
    crit = torch.nn.MSELoss()
    tq = tqdm.tqdm_notebook(range(EPOCH_NUM * D))
    
    for e in tq:
        opt.zero_grad()
        out = model(torch.tensor(x_train).float())[:,0]
        loss = crit(out, torch.tensor(y_train).float())
        loss.backward()
        torch.nn.utils.clip_grad_value_(lin_model.parameters(), GRAD_CLIP)
        opt.step()
        tq.set_description(str(loss.item()))
    acc = crit2(model(torch.tensor(x_test).float())[:,0], torch.tensor(y_test).float()).item()
    accs.append(acc)
    print (acc)
print(np.mean(accs)) 
with open('perfect_model.txt', 'a') as out:
    out.write(f'D={D}, HEAD_NUM={HEAD_NUM}, RUN_NUM={RUN_NUM}, CRIT={np.mean(accs)}+-{np.std(accs)}.   MIN={np.min(accs)}, MAX={np.max(accs)}\n')
with open('perfect_model.jsonl', 'a') as out:
    out.write(json.dumps({'D': D, 'HEAD_NUM':HEAD_NUM, 'RUN_NUM': RUN_NUM, 'accs': accs})+'\n')

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  tq = tqdm.tqdm_notebook(range(EPOCH_NUM * D))


  0%|          | 0/4000 [00:00<?, ?it/s]

0.0724780336022377


  0%|          | 0/4000 [00:00<?, ?it/s]

0.35640406608581543


  0%|          | 0/4000 [00:00<?, ?it/s]

0.11975093930959702


  0%|          | 0/4000 [00:00<?, ?it/s]

0.16009056568145752


  0%|          | 0/4000 [00:00<?, ?it/s]

0.0829872339963913
0.1583421677350998


In [37]:
# rotation module for torch. Angles are in degrees
class Rot(torch.nn.Module):
    def __init__(self, angle = None):
        super().__init__()
        if angle is None:
            angle = torch.randn(1)*90
        else:
            angle = torch.tensor(angle).float()
        self.angle = torch.nn.Parameter(angle)
    
    def forward(self, x):
        angle = self.angle * np.pi/2 / 90
        c = torch.cos(angle)
        s = torch.sin(angle)
        M = torch.vstack([torch.hstack([c, -s]), torch.hstack([s, c])])
        result = x * 1.0
        result[:,:2] = (M@x[:,:2].T).T
        return result


In [38]:
# helper for make inference for a model

def make_inference(x, rots, models, gammas, use_softmax=True, only_angle=False, c_head = False):
    if use_softmax:
        g0 = torch.nn.functional.softmax(gammas[0])
    else:
        g0 = (gammas[0] == gammas[0].max()) * 1
        
    x = torch.sum(torch.cat([(r(x)*g0[i]).unsqueeze(2) for i, r in enumerate(rots)], 2), 2)
    if only_angle:
        return x 
    x = poly_generate(x)
    if c_head:
        g1 = torch.zeros(gammas[1].shape)
        g1[0] += 0
    if use_softmax:
        g1 = torch.nn.functional.softmax(gammas[1])
    else:
        g1 = (gammas[1] == gammas[1].max()) * 1

    x = torch.sum(torch.cat([(m(x)*g1[i]).unsqueeze(2) for i, m in enumerate(models)], 2), 2)
    
    return x


In [39]:
nn = torch.nn
# simple triplet loss: for each objects takes a random triplet (with triplet conditions)
class MdTripletLoss(nn.Module):
    def __init__(self, m =0.0, p=2, subset_size = 9999, sample_num = 1):
        super(MdTripletLoss, self).__init__()
        self.triplet_loss = nn.TripletMarginLoss(margin=m, p=p)
        self.rs = np.random.RandomState(21)
        self.subset_size = subset_size
        self.sample_num = sample_num
    
    def forward(self, h1: torch.Tensor, h2: torch.Tensor, labels1: torch.LongTensor, labels2: torch.LongTensor):
        """
        :param: h1: hidden representations of size (bs, *), anchors
        :param: h1: hidden representations of size (bs, *), positives and negatives candidates
        """
        FEATURE_TRIPLET_DIM = 2
        h1 = h1[:, :FEATURE_TRIPLET_DIM]
        h2 = h2[:, :FEATURE_TRIPLET_DIM]
        
        bs = h1.size(0)
        if bs > self.subset_size:
            elems = list(range(bs))
            self.rs.shuffle(elems)
            h1 = h1[elems[:self.subset_size]]
            labels1 = labels1[elems[:self.subset_size]]
            self.rs.shuffle(elems)
            h2 = h2[elems[:self.subset_size]]
            labels2 = labels2[elems[:self.subset_size]]
            bs = self.subset_size
            
        
        h1 = h1.view(bs, -1)
        h2 = h2.view(bs, -1)
        
        loss = 0.0
        anch =  []
        pos = []
        neg = []
        ids1 = list(range(len(h1)))
        
        ids2 = list(range(len(h2)))
        rs = self.rs
        for _ in range(self.sample_num):
           
            #ids2 = ids2[::-1]
            rs.shuffle(ids1)
            rs.shuffle(ids2)
            h1 = h1[ids1]
            h2 = h2[ids2]
            labels1 = labels1[ids1]
            labels2 = labels2[ids2]

            for i in range(len(h1)):
                found = False 
                for j in range(len(h2)):

                    if found:
                        break
                    for k in range(j+1, len(h2)):
                        if   abs(labels1[i] - labels2[j]) > abs(labels1[i] - labels2[k]): #\
                        #and torch.linalg.norm(h1[i] - h2[j]) < torch.linalg.norm(h1[i] - h2[k]):
                            anch.append(h1[i])
                            pos.append(h2[k])
                            neg.append(h2[j])

                            found = True
                            break
        
        if len(anch) >0:
            a = torch.vstack(anch)
            p = torch.vstack(pos)
            n = torch.vstack(neg)
            loss = self.triplet_loss(a, p, n)
            return loss
        else:
            return 0.0

In [40]:
# our architecture
# consists of HEAD_NUM rotations and HEAD_NUN linear models

class ArchModel(torch.nn.Module):
    def __init__(self):
        super().__init__()
        
        self.gammas = [torch.nn.Parameter(torch.rand(2, HEAD_NUM)) for _ in range(D)]
        self.rots = torch.nn.ModuleList([Rot() for _ in range(HEAD_NUM)])
        self.models = torch.nn.ModuleList([torch.nn.Linear(2*(POLY_NUM+1), 1) for _ in range(HEAD_NUM)])
    def forward(self, x, d, gammas = None, only_angle = False, use_softmax = True):
        if gammas is None:
            gammas = self.gammas[d]
        return make_inference(x, self.rots, self.models, self.gammas[d],
                              use_softmax=use_softmax, only_angle=only_angle)
   

def multi_js_divergence(alphas) -> torch.tensor:
    js = 0.0
    for j in range(2): # rotation and head
        alpha_full = torch.stack([a[j] for a in alphas], dim=0)
        alpha_full = torch.softmax(alpha_full, dim=1)
        
        centroid = torch.mean(alpha_full, 0)
        
        d_centroid = torch.distributions.Categorical(probs=centroid)
        
        for alpha in alphas:
            d = torch.distributions.Categorical(probs=torch.softmax(alpha[j], 0))
            js += 1.0/len(alphas) * \
                torch.distributions.kl.kl_divergence(d, d_centroid)
    return js

# param number calculation   
def calc_params(gammas):
    cnt = 0
    rots = set()
    for i in range(D):
        rots.add(gammas[i][0].argmax().item())
    
    linears = set()
    for i in range(D):
        linears.add(gammas[i][1].argmax().item())
    return len(rots) + len(linears)*2

#gammas = [torch.randn(2, 5) for _ in range(3)]
#multi_js_divergence(gammas)

In [41]:
def dartslike(fname, triplet_coef_search = 0.0, triplet_coef_train = 0.0, js_coef = 0.0, unroll: bool = True,
              lr=1e-3, lr0=0.1, lr2=.01):
    h = []
    accs = []
    param_num = []

    
    tl = MdTripletLoss(p=2.0, subset_size=9999, sample_num=1) # oleg
    
    for k in range(RUN_NUM):
        torch.manual_seed(42+k)
        x_train, y_train, x_val, y_val, x_test, y_test = [],[],[],[],[],[]
        for angle in make_angles():
            (_x_train, _y_train), (_x_val, _y_val), (_x_test, _y_test) = gen(TRAIN_NUM, TEST_NUM, seed=42+k+int(angle),
                                                                             angle=angle)
            x_train.append(_x_train)
            y_train.append(_y_train)
            x_val.append(_x_val)
            y_val.append(_y_val)
            x_test.append(_x_test)
            y_test.append(_y_test)
        model = ArchModel()
        
    
        rs = np.random.RandomState(42+k)
        
        opt = torch.optim.SGD([{'params': model.rots.parameters(), 'lr': lr0},
                              {'params': model.models.parameters(), 'lr': lr}],
                              lr=lr)
                              
                              
        opt2 = torch.optim.SGD(model.gammas, lr=lr2)

        crit = torch.nn.MSELoss()
        tq = tqdm.tqdm_notebook(range(EPOCH_NUM )) # taking half of epoch num for NAS
        losses = []
        losses2 = []
        for e in tq:
            
            
            for d in range(D):
            
                opt2.zero_grad()
                d2 = rs.choice([i for i in range(D) if i != d])
            
                # unrolling step: done manually without approximation
                if unroll:
                    opt.zero_grad()
                    x = torch.tensor(x_train[d]).float()
                    y = torch.tensor(y_train[d]).float()

                    x2 = torch.tensor(x_train[d2]).float()
                    y2 = torch.tensor(y_train[d2]).float()

                    out_angle1 = model(x, d, only_angle=True)
                    out_angle2 = model(x2, d2, only_angle=True)

                    out = model(x, d)[:,0]
                    loss = crit(out, y)
                    if triplet_coef_search:
                        tloss = tl(out_angle1, out_angle2, y, y2) * triplet_coef_search
                        loss += tloss
                    grads = torch.autograd.grad(loss, model.parameters())
                    grads = [torch.clamp(g, -GRAD_CLIP, GRAD_CLIP) for g in grads]
                        
                    with torch.no_grad():
                        for p,g in zip(model.parameters(), grads):
                            p = p + lr * g
                            
                x = torch.tensor(x_val[d]).float()
                y = torch.tensor(y_val[d]).float()

                x2 = torch.tensor(x_val[d2]).float()
                y2 = torch.tensor(y_val[d2]).float()

                out_angle1 = model(x, d, only_angle=True)
                out_angle2 = model(x2, d2, only_angle=True)
                

                out = model(x, d)[:,0]
                loss2 = crit(out, y) 
                if triplet_coef_search:
                        tloss = tl(out_angle1, out_angle2, y, y2) * triplet_coef_search
                        loss2 += tloss
                if js_coef:
                    loss2 += multi_js_divergence(model.gammas).mean() * js_coef
                loss2.backward()
                torch.nn.utils.clip_grad_value_(model.gammas, GRAD_CLIP)
                opt2.step()
                
                if unroll:
                    with torch.no_grad():
                        for i,g in zip(model.parameters(), grads):
                            p = p - lr * g
                            
                            
                opt.zero_grad()
                x = torch.tensor(x_train[d]).float()
                y = torch.tensor(y_train[d]).float()

                x2 = torch.tensor(x_train[d2]).float()
                y2 = torch.tensor(y_train[d2]).float()

                out_angle1 = model(x, d, only_angle=True)
                out_angle2 = model(x2, d2, only_angle=True)
                
                out = model(x, d)[:,0]
                loss = crit(out, y)
                if triplet_coef_search:
                    tloss = tl(out_angle1, out_angle2, y, y2) * triplet_coef_search
                    loss += tloss
                loss.backward()
                torch.nn.utils.clip_grad_value_(model.parameters(), GRAD_CLIP)
                opt.step()
                losses.append(loss.item())
                losses2.append(loss2.item())
            
            tq.set_description(str(np.mean(losses))+';'+str(np.mean(losses2)))

        tq = tqdm.tqdm_notebook(range(EPOCH_NUM))
        
        # search step. Now we concatenate train and validation
        x_train = np.concatenate([x_train, x_val], axis=1)
        y_train =  np.concatenate([y_train, y_val], axis=1)
        
        
        for g in model.gammas:
            print (g.argmax(1))
        
        for g in model.gammas:
            print (g)
        
        opt = torch.optim.SGD([{'params': model.rots.parameters(), 'lr': lr0},
                              {'params': model.models.parameters(), 'lr': lr}],
                              lr=lr)
        
        for e in tq:

            for d in range(D):
                loss = 0.0
                opt.zero_grad()

                d2 = rs.choice([i for i in range(D) if i != d])

                x = torch.tensor(x_train[d]).float()
                y = torch.tensor(y_train[d]).float()
                out = model(x, d, use_softmax=False)[:,0]

               
                loss = crit(out, y) 
                if triplet_coef_train:
                    x2 = torch.tensor(x_train[d2]).float()
                    y2 = torch.tensor(y_train[d2]).float()

                    out_angle1 = model(x, d, only_angle=True, use_softmax=False) # note, using one-hot here
                    out_angle2 = model(x2, d2, only_angle=True, use_softmax=False)

                    tloss = tl(out_angle1, out_angle2, y, y2)* triplet_coef_train
                    loss += tloss
                
                
                loss.backward()
                torch.nn.utils.clip_grad_value_(model.parameters(), GRAD_CLIP)
                opt.step()
            angles= [str(r.angle.item()) for r in model.rots]
            #print (angles)
            tq.set_description(' '.join(angles))
        total = 0
        for d in range(D):

            out = model(torch.tensor(x_test[d]).float(), d, use_softmax=False)
            total += crit2(out[:,0], torch.tensor(y_test[d]).float()).item()
        
        acc = total/D
        accs.append(acc)
        param_num.append(calc_params(model.gammas))
        print (acc, param_num[-1])
        
    with open(fname+'.txt', 'a') as out:
        out.write(f'D={D}, HEAD_NUM={HEAD_NUM}, RUN_NUM={RUN_NUM}, CRIT={np.mean(accs)}+-{np.std(accs)}.  MIN={np.min(accs)}, MAX={np.max(accs)}')
        out.write(f' PARAMS={np.mean(param_num)}+-{np.std(param_num)}. MIN={np.min(param_num)}, MAX={np.max(param_num)}\n')
    
    with open(fname+'.jsonl', 'a') as out:
        out.write(json.dumps({'D': D, 'HEAD_NUM': HEAD_NUM, 'RUN_NUM': RUN_NUM, 'accs': accs, 'params': param_num})+'\n')
    print (model.gammas)
    for r in model.rots:
        print (r.angle)
    return (np.mean(accs), np.mean(param_num))


In [42]:
dartslike('triplets', 1000.0, 1000.0)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  tq = tqdm.tqdm_notebook(range(EPOCH_NUM )) # taking half of epoch num for NAS


  0%|          | 0/1000 [00:00<?, ?it/s]

  g0 = torch.nn.functional.softmax(gammas[0])
  g1 = torch.nn.functional.softmax(gammas[1])
Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  tq = tqdm.tqdm_notebook(range(EPOCH_NUM))


  0%|          | 0/1000 [00:00<?, ?it/s]

tensor([1, 0])
tensor([1, 3])
tensor([2, 1])
tensor([0, 3])
Parameter containing:
tensor([[-0.5450,  5.6498, -1.0318, -0.9335],
        [ 0.8440,  0.6595, -0.2009,  0.7390]], requires_grad=True)
Parameter containing:
tensor([[-3.7440,  6.5023, -2.2566,  2.1004],
        [ 0.5722,  0.7642,  0.3720,  0.8991]], requires_grad=True)
Parameter containing:
tensor([[ 0.1720, -2.2740,  4.3540,  0.1014],
        [ 1.0386,  1.9601,  0.3613, -1.5203]], requires_grad=True)
Parameter containing:
tensor([[ 5.1786, -1.4467, -1.1263, -1.6726],
        [ 0.1536,  0.5927, -0.4586,  1.2925]], requires_grad=True)
0.6260390132665634 9


  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

tensor([2, 1])
tensor([2, 1])
tensor([1, 1])
tensor([3, 1])
Parameter containing:
tensor([[-0.1755, -1.8504,  4.1967, -0.2531],
        [ 0.2956,  0.5995, -0.0516,  0.0474]], requires_grad=True)
Parameter containing:
tensor([[-1.7492, -1.7782,  7.6723, -1.6989],
        [-0.2116,  1.9990, -0.1802,  0.6888]], requires_grad=True)
Parameter containing:
tensor([[-2.6362,  4.5177,  0.5698, -0.7518],
        [-0.0456,  1.0035,  0.1959,  0.8928]], requires_grad=True)
Parameter containing:
tensor([[ 0.1449, -0.2666, -0.8514,  2.9188],
        [ 0.6412,  0.7672,  0.3593,  0.2858]], requires_grad=True)
0.5557030439376831 5


  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

tensor([3, 2])
tensor([2, 3])
tensor([0, 0])
tensor([1, 0])
Parameter containing:
tensor([[-1.2398, -2.9122, -0.0332,  6.5977],
        [ 0.1499,  0.6569,  0.6831,  0.3672]], requires_grad=True)
Parameter containing:
tensor([[ 0.3881,  0.3012,  4.2401, -1.3249],
        [ 0.4381,  0.5296,  0.2632,  0.6955]], requires_grad=True)
Parameter containing:
tensor([[ 8.8492, -3.1909, -2.4418, -0.8935],
        [ 1.6096,  1.0026,  0.0514,  0.2434]], requires_grad=True)
Parameter containing:
tensor([[-0.4569,  4.0598, -1.1960, -0.7801],
        [ 1.5661, -0.1842,  0.6582,  0.1257]], requires_grad=True)
0.663034051656723 10


  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

tensor([3, 1])
tensor([1, 2])
tensor([1, 3])
tensor([3, 3])
Parameter containing:
tensor([[-0.1021,  0.5795,  0.9668,  1.2863],
        [-0.1052,  0.6368,  0.6271,  0.2901]], requires_grad=True)
Parameter containing:
tensor([[-0.6868,  1.2273,  0.8721, -0.3378],
        [ 0.2414,  0.2085,  0.8157,  0.7384]], requires_grad=True)
Parameter containing:
tensor([[-1.4587,  3.3782,  1.3327, -0.3203],
        [ 0.5155, -0.1004,  0.5077,  0.7816]], requires_grad=True)
Parameter containing:
tensor([[-0.1149, -2.3247, -3.5448,  8.1482],
        [ 0.3302,  0.5819,  0.4224,  0.6882]], requires_grad=True)
0.9763320237398148 8


  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

tensor([3, 1])
tensor([2, 1])
tensor([0, 1])
tensor([2, 1])
Parameter containing:
tensor([[-1.0204, -2.5542, -2.6133,  7.5860],
        [-0.2484,  1.4200, -0.2140,  1.1670]], requires_grad=True)
Parameter containing:
tensor([[-5.8587, -2.3483,  8.9698,  0.6944],
        [-1.7546,  2.8447, -0.7633,  2.7994]], requires_grad=True)
Parameter containing:
tensor([[ 7.0474, -2.7662, -0.5033, -1.0859],
        [-1.5815,  2.2077, -0.7931,  1.7473]], requires_grad=True)
Parameter containing:
tensor([[-1.1246, -0.5024,  5.1507, -1.5494],
        [-0.1911,  2.4940,  0.0985,  1.3463]], requires_grad=True)
0.834810271859169 5
[Parameter containing:
tensor([[-1.0204, -2.5542, -2.6133,  7.5860],
        [-0.2484,  1.4200, -0.2140,  1.1670]], requires_grad=True), Parameter containing:
tensor([[-5.8587, -2.3483,  8.9698,  0.6944],
        [-1.7546,  2.8447, -0.7633,  2.7994]], requires_grad=True), Parameter containing:
tensor([[ 7.0474, -2.7662, -0.5033, -1.0859],
        [-1.5815,  2.2077, -0.7931,  1.

(0.7311836808919907, 7.4)

In [43]:
# no reg
dartslike('darts', 0.0, 0.0)


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  tq = tqdm.tqdm_notebook(range(EPOCH_NUM )) # taking half of epoch num for NAS


  0%|          | 0/1000 [00:00<?, ?it/s]

  g0 = torch.nn.functional.softmax(gammas[0])
  g1 = torch.nn.functional.softmax(gammas[1])
Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  tq = tqdm.tqdm_notebook(range(EPOCH_NUM))


  0%|          | 0/1000 [00:00<?, ?it/s]

tensor([0, 3])
tensor([0, 1])
tensor([1, 2])
tensor([2, 0])
Parameter containing:
tensor([[1.3760, 0.6154, 0.3194, 0.8286],
        [0.4620, 0.6247, 0.2585, 0.6963]], requires_grad=True)
Parameter containing:
tensor([[ 2.7518,  0.0973, -0.0495, -0.1975],
        [ 0.1993,  0.8832,  0.6939,  0.8311]], requires_grad=True)
Parameter containing:
tensor([[ 1.0950,  1.3785, -0.1733,  0.0532],
        [ 0.6176,  0.2571,  0.7935,  0.1713]], requires_grad=True)
Parameter containing:
tensor([[-0.7687, -0.0573,  0.9944,  0.7646],
        [ 1.1316, -0.0844,  1.0183, -0.4853]], requires_grad=True)
1.1496404632925987 11


  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

tensor([0, 1])
tensor([0, 3])
tensor([2, 3])
tensor([1, 0])
Parameter containing:
tensor([[ 0.7240,  0.1086,  0.6455,  0.4397],
        [ 0.3698,  0.5162, -0.0872,  0.0920]], requires_grad=True)
Parameter containing:
tensor([[ 2.3149, -0.1684,  0.3712, -0.0717],
        [ 1.0257,  0.2602, -0.0260,  1.0360]], requires_grad=True)
Parameter containing:
tensor([[0.1942, 0.4512, 0.9081, 0.1460],
        [0.2315, 0.5798, 0.4533, 0.7820]], requires_grad=True)
Parameter containing:
tensor([[0.0329, 0.8912, 0.6840, 0.3375],
        [0.9199, 0.4316, 0.4013, 0.3007]], requires_grad=True)
0.7164190113544464 9


  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

tensor([0, 1])
tensor([2, 2])
tensor([1, 0])
tensor([0, 0])
Parameter containing:
tensor([[ 1.7535, -0.1089,  0.1089,  0.6589],
        [-0.0323,  1.5207,  0.0304,  0.3383]], requires_grad=True)
Parameter containing:
tensor([[0.6923, 0.7000, 1.4854, 0.7269],
        [0.2688, 0.2941, 1.0996, 0.2638]], requires_grad=True)
Parameter containing:
tensor([[0.4889, 1.1944, 0.5466, 0.0931],
        [1.3101, 0.4202, 0.7186, 0.4581]], requires_grad=True)
Parameter containing:
tensor([[ 0.9441,  0.2945, -0.3949,  0.7831],
        [ 1.4184,  0.1568, -0.2173,  0.8080]], requires_grad=True)
0.47139111906290054 9


  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

tensor([3, 2])
tensor([0, 2])
tensor([2, 3])
tensor([1, 3])
Parameter containing:
tensor([[ 0.2083,  0.8950,  0.6677,  0.9595],
        [-0.1378,  0.4993,  0.9532,  0.1340]], requires_grad=True)
Parameter containing:
tensor([[ 1.1704, -0.5107,  0.2067,  0.2084],
        [ 0.5086, -0.0138,  1.2280,  0.2812]], requires_grad=True)
Parameter containing:
tensor([[ 0.6607,  0.1486,  1.4681,  0.6545],
        [ 0.2189,  0.1418, -0.3130,  1.6567]], requires_grad=True)
Parameter containing:
tensor([[ 0.3060,  1.4262,  0.3766,  0.0549],
        [-0.0018,  0.9141,  0.1177,  0.9926]], requires_grad=True)
0.8816287368535995 8


  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

tensor([0, 1])
tensor([2, 2])
tensor([3, 1])
tensor([2, 1])
Parameter containing:
tensor([[ 1.6632,  0.6114, -0.3357, -0.5408],
        [ 0.4265,  1.5512,  0.0771,  0.0698]], requires_grad=True)
Parameter containing:
tensor([[-0.0809,  0.1934,  1.2518,  0.0928],
        [ 1.1669, -0.0302,  1.3727,  0.6169]], requires_grad=True)
Parameter containing:
tensor([[-0.1293, -0.0062,  1.1343,  1.6931],
        [-0.3275,  1.4808,  0.9123, -0.4852]], requires_grad=True)
Parameter containing:
tensor([[-0.2930,  0.8055,  1.1971,  0.2647],
        [ 0.8067,  1.7141,  0.6835,  0.5435]], requires_grad=True)
0.73158098757267 7
[Parameter containing:
tensor([[ 1.6632,  0.6114, -0.3357, -0.5408],
        [ 0.4265,  1.5512,  0.0771,  0.0698]], requires_grad=True), Parameter containing:
tensor([[-0.0809,  0.1934,  1.2518,  0.0928],
        [ 1.1669, -0.0302,  1.3727,  0.6169]], requires_grad=True), Parameter containing:
tensor([[-0.1293, -0.0062,  1.1343,  1.6931],
        [-0.3275,  1.4808,  0.9123, -0.4

(0.7901320636272431, 8.8)

In [44]:
dartslike('js', js_coef=100.0)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  tq = tqdm.tqdm_notebook(range(EPOCH_NUM )) # taking half of epoch num for NAS


  0%|          | 0/1000 [00:00<?, ?it/s]

  g0 = torch.nn.functional.softmax(gammas[0])
  g1 = torch.nn.functional.softmax(gammas[1])
Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  tq = tqdm.tqdm_notebook(range(EPOCH_NUM))


  0%|          | 0/1000 [00:00<?, ?it/s]

tensor([0, 2])
tensor([0, 2])
tensor([0, 2])
tensor([0, 2])
Parameter containing:
tensor([[1.1091, 0.7148, 0.6006, 0.7149],
        [0.5554, 0.4120, 0.5713, 0.5029]], requires_grad=True)
Parameter containing:
tensor([[0.9797, 0.5843, 0.4620, 0.5763],
        [0.6948, 0.5548, 0.7095, 0.6485]], requires_grad=True)
Parameter containing:
tensor([[0.9127, 0.5223, 0.4020, 0.5164],
        [0.5053, 0.3613, 0.5222, 0.4508]], requires_grad=True)
Parameter containing:
tensor([[0.5484, 0.1579, 0.0561, 0.1705],
        [0.4421, 0.2950, 0.4591, 0.3840]], requires_grad=True)
1.1573370695114136 3


  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

tensor([2, 0])
tensor([2, 0])
tensor([2, 0])
tensor([2, 0])
Parameter containing:
tensor([[0.5614, 0.3842, 0.6961, 0.2760],
        [0.4548, 0.1816, 0.0270, 0.2274]], requires_grad=True)
Parameter containing:
tensor([[0.6960, 0.5133, 0.8301, 0.4065],
        [0.8072, 0.5307, 0.3795, 0.5786]], requires_grad=True)
Parameter containing:
tensor([[0.5052, 0.3297, 0.6452, 0.2194],
        [0.7429, 0.4682, 0.3200, 0.5155]], requires_grad=True)
Parameter containing:
tensor([[0.5662, 0.3923, 0.7052, 0.2819],
        [0.7447, 0.4712, 0.3202, 0.5175]], requires_grad=True)
1.0676323920488358 3


  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

tensor([0, 0])
tensor([0, 0])
tensor([0, 0])
tensor([0, 0])
Parameter containing:
tensor([[0.7070, 0.5347, 0.6424, 0.5284],
        [0.6471, 0.4184, 0.3792, 0.4125]], requires_grad=True)
Parameter containing:
tensor([[0.9998, 0.8365, 0.9460, 0.8223],
        [0.6650, 0.4341, 0.3985, 0.4288]], requires_grad=True)
Parameter containing:
tensor([[0.6795, 0.5204, 0.6236, 0.4995],
        [0.9141, 0.6766, 0.6422, 0.6742]], requires_grad=True)
Parameter containing:
tensor([[0.5092, 0.3433, 0.4457, 0.3286],
        [0.7293, 0.4918, 0.4554, 0.4893]], requires_grad=True)
0.8324178159236908 3


  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

tensor([1, 3])
tensor([1, 3])
tensor([1, 3])
tensor([1, 3])
Parameter containing:
tensor([[0.6653, 0.7172, 0.6979, 0.6501],
        [0.1079, 0.3525, 0.3305, 0.6578]], requires_grad=True)
Parameter containing:
tensor([[0.2533, 0.2976, 0.2861, 0.2377],
        [0.2470, 0.4913, 0.4685, 0.7973]], requires_grad=True)
Parameter containing:
tensor([[0.7154, 0.7660, 0.7515, 0.6989],
        [0.1729, 0.4181, 0.3898, 0.7236]], requires_grad=True)
Parameter containing:
tensor([[0.5226, 0.5763, 0.5585, 0.5064],
        [0.2487, 0.4968, 0.4738, 0.8034]], requires_grad=True)
1.2857360243797302 3


  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

tensor([2, 1])
tensor([2, 1])
tensor([2, 1])
tensor([2, 1])
Parameter containing:
tensor([[-0.1266,  0.1240,  1.0099,  0.3909],
        [ 0.2539,  1.1208,  0.6721,  0.0778]], requires_grad=True)
Parameter containing:
tensor([[-0.1168,  0.1367,  1.0286,  0.4087],
        [ 0.5063,  1.3687,  0.9232,  0.3280]], requires_grad=True)
Parameter containing:
tensor([[ 0.1814,  0.4358,  1.3474,  0.7274],
        [ 0.1160,  0.9901,  0.5367, -0.0625]], requires_grad=True)
Parameter containing:
tensor([[0.0223, 0.2815, 1.1465, 0.5240],
        [0.6602, 1.5279, 1.0759, 0.4837]], requires_grad=True)
0.8407305777072906 3
[Parameter containing:
tensor([[-0.1266,  0.1240,  1.0099,  0.3909],
        [ 0.2539,  1.1208,  0.6721,  0.0778]], requires_grad=True), Parameter containing:
tensor([[-0.1168,  0.1367,  1.0286,  0.4087],
        [ 0.5063,  1.3687,  0.9232,  0.3280]], requires_grad=True), Parameter containing:
tensor([[ 0.1814,  0.4358,  1.3474,  0.7274],
        [ 0.1160,  0.9901,  0.5367, -0.0625]],

(1.0367707759141922, 3.0)