## See README.md file for further details about the project and the environment.

### State-Action Description

### State
State s is an array with give components

* s[0]:  constraint matrix $A$of the current LP ($\max  -c^Tx \text{ s.t. }Ax \le  b$) . Dimension is $m \times n$. See by printing s[0].shape. Here $n$ is the (fixed) number of variables. For instances of size 60 by 60 used in the above command, $n$ will remain fixed as 60. And $m$ is the current number of constraints. Initially, $m$ is to the number of constraints in the IP instance. (For instances generated with --num-c=60, $m$ is 60 at the first step).  But $m$ will increase by one in every step of the episode as one new constraint (cut) is added on taking an action.
* s[1]: rhs $b$ for the current LP ($Ax\le b$). Dimension same as the number $m$ in matrix A.
* s[2]: coefficient vector $c$ from the LP objective ($-c^Tx$). Dimension same as the number of variables, i.e., $n$.
* s[3],  s[4]: Gomory cuts available in the current round of Gomory's cutting plane algorithm. Each cut $i$ is of the form $D_i x\le d_i$.   s[3] gives the matrix $D$ (of dimension $k \times n$) of cuts and s[4] gives the rhs $d$ (of dimension $k$). The number of cuts $k$ available in each round changes, you can find it out by printing the size of last component of state, i.e., s[4].size or s[-1].size.

### Actions
There are k=s[4].size actions available in each state $s$, with $i^{th}$ action corresponding to the $i^{th}$ cut with inequality $D_i x\le d_i$ in $s[3], s[4]$.

In [1]:
!pip install -i https://pypi.gurobi.com gurobipy

Looking in indexes: https://pypi.gurobi.com


In [2]:
!pip install wandb -qqq

In [3]:
import policy_network as PN
import helper as H

In [4]:
import wandb
wandb.login()

[34m[1mwandb[0m: Currently logged in as: [33mleonli66[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [5]:
import gymenv_v2
from gymenv_v2 import make_multiple_env
import numpy as np
import os
import torch

# Training

In [23]:
one_config = {
    "load_dir"        : 'instances/train_10_n60_m60',
    "idx_list"        : list(range(1)),
    "timelimit"       : 50,
    "reward_type"     : 'obj'
}


if __name__ == "__main__":
    # create env
    
    var_size = 61
    attention_size = 32
    k = 16
    hidden_size= 64
    lr = 3e-4
    Policy = PN.Policy_Network(var_size = var_size, attention_size = attention_size, k = k, hidden_size = hidden_size, lr = lr)
    env = make_multiple_env(**one_config)
    sigma = 5
    gamma = 0.99
    # To record traectories generated from current policy
    
    for e in range(25): 

        CONSTRAINTS = []  
        CANDIDATES = []
        ACTS = []
        PROBABILITY = []  
        REWARDS = [] 
        total_loss = 0

        s = env.reset()   # samples a random instance every time env.reset() is called
        d = False
        t = 0
        repisode = 0

        while not d:
            #Take a random action
            A, b, c0, cuts_a, cuts_b = s
            # find attention score
            a_b = np.concatenate((A,np.expand_dims(b,-1)),1)
            d_e = np.concatenate((cuts_a,np.expand_dims(cuts_b,-1)),1)
            total = np.concatenate((a_b, d_e),0)
            
            total = (total - np.mean(total)) / np.std(total)
            #total / np.linalg.norm(total)
            
            constraint = total[:len(a_b)]
            candidate = total[len(a_b):]

            CONSTRAINTS.append(constraint)
            CANDIDATES.append(candidate)
            attention_score = Policy.compute_attention(constraint, candidate)
            prob = Policy.compute_prob(attention_score)
            
            a = np.array([np.argmax(prob)])
            ACTS.append(a)

            s, r, d, _ = env.step(a)
            #print('episode', e, 'step', t, 'reward', r)            
            REWARDS.append(r)

            t += 1
            repisode += r
            
        
        #Below is for logging training performance
        rrecord.append(np.sum(REWARDS))
        
        # TODO:  Use discounted_rewards function to compute \hat{V}s/\hat{Q}s  from instant rewards in rews
        discounted_r = H.discounted_rewards(REWARDS, gamma)
        Q_s = H.evolution_strategies(discounted_r, sigma)
        
        for contraint,candidate,act,q_s in zip(CONSTRAINTS,CANDIDATES,ACTS,Q_s):
            loss = Policy.train(contraint,candidate,act,np.array([q_s]))
            total_loss += loss
            
        print(e, ": ", repisode, total_loss)

loading training instances, dir instances/train_10_n60_m60 idx 0
0 :  0.8867413289885917 3.6272668851686043
1 :  0.8867413289876822 3.6096191418001737
2 :  0.8867413289876822 3.5833160888870066
3 :  0.8867413289876822 3.5255436908672793
4 :  0.8867413289876822 3.3922460090016315
5 :  0.8867413289876822 3.0778622636797093
6 :  0.8867413289876822 2.3560433394804288
7 :  0.8867413289876822 0.9867708687056597
8 :  0.8867413289876822 0.07066179840363455
9 :  0.8867413289876822 0.028855769218037758
10 :  0.8867413289876822 0.017719559609595384
11 :  0.8867413289876822 0.012389322038243373
12 :  0.8867413289876822 0.009285832973272123
13 :  0.8867413289876822 0.007276146883599551
14 :  0.8867413289876822 0.0058821269498526155
15 :  0.8867413289876822 0.004867384410170958
16 :  0.8867413289876822 0.004101366931505351
17 :  0.8867413289876822 0.0035063805577455363
18 :  0.8867413289876822 0.0030338183994709877
19 :  0.8867413289876822 0.0026513060541010595
20 :  0.8867413289876822 0.00233715860

## Easy Mode

In [25]:
run=wandb.init(project="finalproject", entity="orcs4529", tags=["training-easy"])
#run=wandb.init(project="finalproject", entity="orcs4529", tags=["training-hard"])
#run=wandb.init(project="finalproject", entity="orcs4529", tags=["test"])

### TRAINING

# Easy Setup: Use the following environment settings. We will evaluate your agent with the same easy config below:
easy_config = {
    "load_dir"        : 'instances/train_10_n60_m60',
    "idx_list"        : list(range(10)),
    "timelimit"       : 50,
    "reward_type"     : 'obj'
}


if __name__ == "__main__":
    # create env
    
    var_size = 61
    attention_size = 32
    k = 16
    hidden_size= 64
    lr = 3e-4
    #Policy = PN.Policy_Network(var_size = var_size, attention_size = attention_size, k = k, hidden_size = hidden_size, lr = lr)
    env = make_multiple_env(**easy_config)
    sigma = 5
    gamma = 0.99
    rrecord = []

    # To record traectories generated from current policy
    
    for e in range(50): 

        CONSTRAINTS = []  
        CANDIDATES = []
        ACTS = []
        PROBABILITY = []  
        REWARDS = []  

        s = env.reset()   # samples a random instance every time env.reset() is called
        d = False
        t = 0
        repisode = 0
        total_loss = 0
        
        while not d:
            #Take a random action
            A, b, c0, cuts_a, cuts_b = s
            # find attention score
            a_b = np.concatenate((A,np.expand_dims(b,-1)),1)
            d_e = np.concatenate((cuts_a,np.expand_dims(cuts_b,-1)),1)
            total = np.concatenate((a_b, d_e),0)
            
            total = (total - np.mean(total)) / np.std(total)
            #total / np.linalg.norm(total)
            
            constraint = total[:len(a_b)]
            candidate = total[len(a_b):]

            CONSTRAINTS.append(constraint)
            CANDIDATES.append(candidate)
            attention_score = Policy.compute_attention(constraint, candidate)
            prob = Policy.compute_prob(attention_score)
            
            a = np.array([np.argmax(prob)])
            ACTS.append(a)

            s, r, d, _ = env.step(a)
            #print('episode', e, 'step', t, 'reward', r)            
            REWARDS.append(r)

            t += 1
            repisode += r
            
        
        #Below is for logging training performance
        rrecord.append(np.sum(REWARDS))
        
        # TODO:  Use discounted_rewards function to compute \hat{V}s/\hat{Q}s  from instant rewards in rews
        discounted_r = H.discounted_rewards(REWARDS, gamma)
        Q_s = H.evolution_strategies(discounted_r, sigma)
        
        for contraint,candidate,act,q_s in zip(CONSTRAINTS,CANDIDATES,ACTS,Q_s):
            loss = Policy.train(contraint,candidate,act,np.array([q_s]))
            total_loss += loss
        print(e, ": ", repisode, total_loss)

        fixedWindow=10
        movingAverage=0
        if len(rrecord) >= fixedWindow:
            movingAverage=np.mean(rrecord[len(rrecord)-fixedWindow:len(rrecord)-1])

        #wandb logging
        wandb.log({ "training reward" : rrecord[-1], "training reward moving average" : movingAverage})
        

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
training reward,▅███▅▇▅▅▅▇▅█▅▅█▅▅▁▁▁▁▁▁▁▅▁▁▁▁▁▂▁▅▁▁▁▁▁▂▅
training reward moving average,▁▁▁▁▁▁▁▁██▇▇█▇████▇▆▅▄▃▃▁▂▂▂▂▂▂▂▂▃▃▃▃▃▃▂

0,1
training reward,0.60725
training reward moving average,0.08968


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.016751438200784226, max=1.0…

loading training instances, dir instances/train_10_n60_m60 idx 0
loading training instances, dir instances/train_10_n60_m60 idx 1
loading training instances, dir instances/train_10_n60_m60 idx 2
loading training instances, dir instances/train_10_n60_m60 idx 3
loading training instances, dir instances/train_10_n60_m60 idx 4
loading training instances, dir instances/train_10_n60_m60 idx 5
loading training instances, dir instances/train_10_n60_m60 idx 6
loading training instances, dir instances/train_10_n60_m60 idx 7
loading training instances, dir instances/train_10_n60_m60 idx 8
loading training instances, dir instances/train_10_n60_m60 idx 9
0 :  0.9216160135867995 1.3803602398846528e-11
1 :  0.6072478073228922 2.4138366226215577e-11
2 :  0.7058461509368499 1.3432304682352258e-11
3 :  0.7082653196380306 3.263610366594053e-11
4 :  1.115349081867862 0.31411148370545194
5 :  0.7022608542010857 1.192556964027512e-11
6 :  0.8867413289876822 1.924783975805301e-11
7 :  0.6151751669476653 2.89

In [14]:
cwd = os.getcwd()
PATH = cwd + '/Policy/easy_model'
torch.save(Policy, PATH)

## Curriculum Training

In [None]:
curriculum_10 = {
    "load_dir"        : 'instances/train_100_n60_m60',
    "idx_list"        : list(range(10)),
    "timelimit"       : 50,
    "reward_type"     : 'obj'
}

curriculum_40 = {
    "load_dir"        : 'instances/train_100_n60_m60',
    "idx_list"        : list(range(40)),
    "timelimit"       : 50,
    "reward_type"     : 'obj'
}

curriculum_70 = {
    "load_dir"        : 'instances/train_100_n60_m60',
    "idx_list"        : list(range(70)),
    "timelimit"       : 50,
    "reward_type"     : 'obj'
}

c = [curriculum_10, curriculum_40,curriculum_70]
if __name__ == "__main__":
    # create env
    
    var_size = 61
    attention_size = 32
    k = 16
    hidden_size= 64
    lr = 3e-4
    sigma = 5
    gamma = 0.95
    
    Policy = PN.Policy_Network(var_size = var_size, attention_size = attention_size, k = k, hidden_size = hidden_size, lr = lr)
    
    for i in range(3):
        print('currect curriculum: ',i+1)
        env = make_multiple_env(**c[i])

        # To record traectories generated from current policy

        for e in range(30): 

            CONSTRAINTS = []  
            CANDIDATES = []
            ACTS = []
            PROBABILITY = []  
            REWARDS = []  

            s = env.reset()   # samples a random instance every time env.reset() is called
            d = False
            t = 0
            repisode = 0
            total_loss = 0

            while not d:
                #Take a random action
                A, b, c0, cuts_a, cuts_b = s
                # find attention score
                a_b = np.concatenate((A,np.expand_dims(b,-1)),1)
                d_e = np.concatenate((cuts_a,np.expand_dims(cuts_b,-1)),1)
                total = np.concatenate((a_b, d_e),0)

                total = (total - np.mean(total)) / np.std(total)

                constraint = total[:len(a_b)]
                candidate = total[len(a_b):]

                CONSTRAINTS.append(constraint)
                CANDIDATES.append(candidate)
                attention_score = Policy.compute_attention(constraint, candidate)
                prob = Policy.compute_prob(attention_score)

                a = np.array([np.argmax(prob)])
                ACTS.append(a)

                s, r, d, _ = env.step(a)
                #print('episode', e, 'step', t, 'reward', r)            
                REWARDS.append(r)

                t += 1
                repisode += r

            # TODO:  Use discounted_rewards function to compute \hat{V}s/\hat{Q}s  from instant rewards in rews
            discounted_r = H.discounted_rewards(REWARDS, gamma)
            Q_s = H.evolution_strategies(discounted_r, sigma)

            for contraint,candidate,act,q_s in zip(CONSTRAINTS,CANDIDATES,ACTS,Q_s):
                loss = Policy.train(contraint,candidate,act,np.array([q_s]))
                total_loss += loss
            print(e, ": ", repisode, total_loss)


currect curriculum:  1
loading training instances, dir instances/train_100_n60_m60 idx 0
loading training instances, dir instances/train_100_n60_m60 idx 1
loading training instances, dir instances/train_100_n60_m60 idx 2
loading training instances, dir instances/train_100_n60_m60 idx 3
loading training instances, dir instances/train_100_n60_m60 idx 4
loading training instances, dir instances/train_100_n60_m60 idx 5
loading training instances, dir instances/train_100_n60_m60 idx 6
loading training instances, dir instances/train_100_n60_m60 idx 7
loading training instances, dir instances/train_100_n60_m60 idx 8
loading training instances, dir instances/train_100_n60_m60 idx 9
0 :  0.6072478073228922 2.4941501629267506
1 :  0.7022608541983573 2.8626708992901575
2 :  0.6151751669476653 2.4876391899464974
3 :  0.7082653196353021 2.8016526706404523
4 :  0.9216160135867995 3.54339528182613
5 :  1.115349081867862 24.645779786632524
6 :  0.8867413289876822 3.3233959258353956e-11
7 :  1.10131109

0 :  0.7777448278875454 1.9222378753922837e-11
1 :  0.7356487929473587 3.584179839697475e-11
2 :  1.193922700984558 0.3731177820986345
3 :  0.9216160135867995 2.5296037911520374e-12
4 :  0.2728014378433272 2.6544219351093895e-11
5 :  0.37488180020409345 5.305273261674978e-14
6 :  0.8157422834224235 7.938566394581424e-12
7 :  1.115349081867862 0.01928077951636756
8 :  1.0726768699150853 4.282721874068697e-13
9 :  0.6064157747389345 2.6604850262568412e-12


In [25]:
cwd = os.getcwd()
PATH = cwd + '/Policy/curriculum_model'
torch.save(Policy, PATH)

## Hard Mode

In [28]:
run=wandb.init(project="finalproject", entity="orcs4529", tags=["training-hard"])

### TRAINING

# Hard Setup: Use the following environment settings. We will evaluate your agent with the same hard config below:
hard_config = {
    "load_dir"        : 'instances/train_100_n60_m60',
    "idx_list"        : list(range(99)),
    "timelimit"       : 50,
    "reward_type"     : 'obj'
}

if __name__ == "__main__":
    # create env
    
    lr = 3e-4
    PATH = cwd + '/Policy/curriculum_model'
    Policy = torch.load(PATH)
    env = make_multiple_env(**hard_config)
    sigma = 5
    gamma = 0.99
    rrecord = []

    # To record traectories generated from current policy
    
    for e in range(40): 

        CONSTRAINTS = []  
        CANDIDATES = []
        ACTS = []
        PROBABILITY = []  
        REWARDS = []  

        s = env.reset()   # samples a random instance every time env.reset() is called
        d = False
        t = 0
        repisode = 0
        total_loss
        while not d:
            #Take a random action
            A, b, c0, cuts_a, cuts_b = s
            # find attention score
            a_b = np.concatenate((A,np.expand_dims(b,-1)),1)
            d_e = np.concatenate((cuts_a,np.expand_dims(cuts_b,-1)),1)
            total = np.concatenate((a_b, d_e),0)

            total = (total - np.mean(total)) / np.std(total)
            #total / np.linalg.norm(total)
            
            constraint = total[:len(a_b)]
            candidate = total[len(a_b):]

            CONSTRAINTS.append(constraint)
            CANDIDATES.append(candidate)
            attention_score = Policy.compute_attention(constraint, candidate)
            prob = Policy.compute_prob(attention_score)
            
            a = np.array([np.argmax(prob)])
            ACTS.append(a)

            s, r, d, _ = env.step(a)
            #print('episode', e, 'step', t, 'reward', r)            
            REWARDS.append(r)

            t += 1
            repisode += r
            
        
        #Below is for logging training performance
        rrecord.append(np.sum(REWARDS))
        
        # TODO:  Use discounted_rewards function to compute \hat{V}s/\hat{Q}s  from instant rewards in rews
        discounted_r = H.discounted_rewards(REWARDS, gamma)
        Q_s = H.evolution_strategies(discounted_r, sigma)
        
        for contraint,candidate,act,q_s in zip(CONSTRAINTS,CANDIDATES,ACTS,Q_s):
            loss = Policy.train(contraint,candidate,act,np.array([q_s]))
            total_loss += loss
        print(e, ": ", repisode, total_loss)

        fixedWindow=10
        movingAverage=0
        if len(rrecord) >= fixedWindow:
            movingAverage=np.mean(rrecord[len(rrecord)-fixedWindow:len(rrecord)-1])

        #wandb logging
        wandb.log({ "training reward" : rrecord[-1], "training reward moving average" : movingAverage})
        

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
training reward,▅▆█▃▃▅▃▃▆▁▅▆▄▄▁▆█▃▆▅▃▆▃▄▃▅▅▅▄▄▃▅▃█▃▄▄▆▆▆
training reward moving average,▁▁▁▁▁▁▁▁██▇▇▇▇▇▇▇█▇▇▇▇█▇▆▇▆▆▇▇▇▇████▇▇▇▇

0,1
training reward,1.25878
training reward moving average,0.92723


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.01675102846735778, max=1.0)…

loading training instances, dir instances/train_100_n60_m60 idx 0
loading training instances, dir instances/train_100_n60_m60 idx 1
loading training instances, dir instances/train_100_n60_m60 idx 2
loading training instances, dir instances/train_100_n60_m60 idx 3
loading training instances, dir instances/train_100_n60_m60 idx 4
loading training instances, dir instances/train_100_n60_m60 idx 5
loading training instances, dir instances/train_100_n60_m60 idx 6
loading training instances, dir instances/train_100_n60_m60 idx 7
loading training instances, dir instances/train_100_n60_m60 idx 8
loading training instances, dir instances/train_100_n60_m60 idx 9
loading training instances, dir instances/train_100_n60_m60 idx 10
loading training instances, dir instances/train_100_n60_m60 idx 11
loading training instances, dir instances/train_100_n60_m60 idx 12
loading training instances, dir instances/train_100_n60_m60 idx 13
loading training instances, dir instances/train_100_n60_m60 idx 14
loadi

In [27]:
cwd = os.getcwd()
PATH = cwd + '/Policy/hard_model'
torch.save(Policy, PATH)

## Test

In [None]:
run=wandb.init(project="finalproject", entity="orcs4529", tags=["test"])

### testing

custom_config = {
    "load_dir"        : 'instances/randomip_n60_m60',   # this is the location of the randomly generated instances (you may specify a different directory)
    "idx_list"        : list(range(20)),                # take the first 20 instances from the directory
    "timelimit"       : 50,                             # the maximum horizon length is 50
    "reward_type"     : 'obj'                           # DO NOT CHANGE reward_type
}


if __name__ == "__main__":
    # create env
    
    PATH = cwd + '/Policy/curriculum_model'
    Policy = torch.load(PATH)
    env = make_multiple_env(**custom_config)
    rrecord = []

    # To record traectories generated from current policy
    
    for e in range(40): 
        s = env.reset()   # samples a random instance every time env.reset() is called
        d = False
        t = 0
        repisode = 0
        while not d:
            #Take a random action
            A, b, c0, cuts_a, cuts_b = s
            # find attention score
            a_b = np.concatenate((A,np.expand_dims(b,-1)),1)
            d_e = np.concatenate((cuts_a,np.expand_dims(cuts_b,-1)),1)
            total = np.concatenate((a_b, d_e),0)

            total = (total - np.mean(total)) / np.std(total)
            #total / np.linalg.norm(total)
            
            constraint = total[:len(a_b)]
            candidate = total[len(a_b):]

            attention_score = Policy.compute_attention(constraint, candidate)
            prob = Policy.compute_prob(attention_score)
            a = np.array([np.argmax(prob)])

            s, r, d, _ = env.step(a)
            t += 1
            repisode += r
            
        
        #Below is for logging training performance
        rrecord.append(repisode)
        
        print(e, ": ", repisode)

        fixedWindow=10
        movingAverage=0
        if len(rrecord) >= fixedWindow:
            movingAverage=np.mean(rrecord[len(rrecord)-fixedWindow:len(rrecord)-1])

        #wandb logging
        wandb.log({ "training reward" : rrecord[-1], "training reward moving average" : movingAverage})
        