## See README.md file for further details about the project and the environment.

### State-Action Description

### State
State s is an array with give components

* s[0]:  constraint matrix $A$of the current LP ($\max  -c^Tx \text{ s.t. }Ax \le  b$) . Dimension is $m \times n$. See by printing s[0].shape. Here $n$ is the (fixed) number of variables. For instances of size 60 by 60 used in the above command, $n$ will remain fixed as 60. And $m$ is the current number of constraints. Initially, $m$ is to the number of constraints in the IP instance. (For instances generated with --num-c=60, $m$ is 60 at the first step).  But $m$ will increase by one in every step of the episode as one new constraint (cut) is added on taking an action.
* s[1]: rhs $b$ for the current LP ($Ax\le b$). Dimension same as the number $m$ in matrix A.
* s[2]: coefficient vector $c$ from the LP objective ($-c^Tx$). Dimension same as the number of variables, i.e., $n$.
* s[3],  s[4]: Gomory cuts available in the current round of Gomory's cutting plane algorithm. Each cut $i$ is of the form $D_i x\le d_i$.   s[3] gives the matrix $D$ (of dimension $k \times n$) of cuts and s[4] gives the rhs $d$ (of dimension $k$). The number of cuts $k$ available in each round changes, you can find it out by printing the size of last component of state, i.e., s[4].size or s[-1].size.

### Actions
There are k=s[4].size actions available in each state $s$, with $i^{th}$ action corresponding to the $i^{th}$ cut with inequality $D_i x\le d_i$ in $s[3], s[4]$.

In [1]:
!pip install -i https://pypi.gurobi.com gurobipy

Looking in indexes: https://pypi.gurobi.com


In [2]:
!pip install wandb -qqq

In [3]:
import policy_network as PN
import helper as H

In [4]:
import wandb
wandb.login()

[34m[1mwandb[0m: Currently logged in as: [33mleonli66[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [5]:
import gymenv_v2
from gymenv_v2 import make_multiple_env
import numpy as np
import os
import torch

# Training

## Easy Mode

In [57]:
run=wandb.init(project="finalproject", entity="orcs4529", tags=["training-easy"])
#run=wandb.init(project="finalproject", entity="orcs4529", tags=["training-hard"])
#run=wandb.init(project="finalproject", entity="orcs4529", tags=["test"])

### TRAINING

# Easy Setup: Use the following environment settings. We will evaluate your agent with the same easy config below:
easy_config = {
    "load_dir"        : 'instances/train_10_n60_m60',
    "idx_list"        : list(range(10)),
    "timelimit"       : 50,
    "reward_type"     : 'obj'
}


if __name__ == "__main__":
    # create env
    
    var_size = 61
    attention_size = 32
    k = 16
    hidden_size= 64
    lr = 3e-4
    Policy = PN.Policy_Network(var_size = var_size, attention_size = attention_size, k = k, hidden_size = hidden_size, lr = lr)
    env = make_multiple_env(**easy_config)
    sigma = 5
    gamma = 0.99
    rrecord = []

    # To record traectories generated from current policy
    
    for e in range(30): 

        CONSTRAINTS = []  
        CANDIDATES = []
        ACTS = []
        PROBABILITY = []  
        REWARDS = []  

        s = env.reset()   # samples a random instance every time env.reset() is called
        d = False
        t = 0
        repisode = 0
        total_loss = 0
        
        while not d:
            #Take a random action
            A, b, c0, cuts_a, cuts_b = s
            # find attention score
            a_b = np.concatenate((A,np.expand_dims(b,-1)),1)
            d_e = np.concatenate((cuts_a,np.expand_dims(cuts_b,-1)),1)
            total = np.concatenate((a_b, d_e),0)
            
            total = (total - np.mean(total)) / np.std(total)
            #total / np.linalg.norm(total)
            
            constraint = total[:len(a_b)]
            candidate = total[len(a_b):]

            CONSTRAINTS.append(constraint)
            CANDIDATES.append(candidate)
            attention_score = Policy.compute_attention(constraint, candidate)
            prob = Policy.compute_prob(attention_score)
            
            a = np.array([np.argmax(prob)])
            ACTS.append(a)

            s, r, d, _ = env.step(a)
            #print('episode', e, 'step', t, 'reward', r)            
            REWARDS.append(r)

            t += 1
            repisode += r
            
        
        #Below is for logging training performance
        rrecord.append(np.sum(REWARDS))
        
        # TODO:  Use discounted_rewards function to compute \hat{V}s/\hat{Q}s  from instant rewards in rews
        discounted_r = H.discounted_rewards(REWARDS, gamma)
        Q_s = H.evolution_strategies(discounted_r, sigma)
        
        for contraint,candidate,act,q_s in zip(CONSTRAINTS,CANDIDATES,ACTS,Q_s):
            loss = Policy.train(contraint,candidate,act,np.array([q_s]))
            total_loss += loss
        print(e, ": ", repisode, total_loss)

        fixedWindow=5
        movingAverage=0
        if len(rrecord) >= fixedWindow:
            movingAverage=np.mean(rrecord[len(rrecord)-fixedWindow:len(rrecord)-1])

        #wandb logging
        wandb.log({ "training reward" : rrecord[-1], "training reward moving average" : movingAverage})
        

0,1
training reward,▆██▆█▁
training reward moving average,▁▁▁▁██

0,1
training reward,0.00029
training reward moving average,0.0563


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.016752340266248212, max=1.0…

loading training instances, dir instances/train_10_n60_m60 idx 0
loading training instances, dir instances/train_10_n60_m60 idx 1
loading training instances, dir instances/train_10_n60_m60 idx 2
loading training instances, dir instances/train_10_n60_m60 idx 3
loading training instances, dir instances/train_10_n60_m60 idx 4
loading training instances, dir instances/train_10_n60_m60 idx 5
loading training instances, dir instances/train_10_n60_m60 idx 6
loading training instances, dir instances/train_10_n60_m60 idx 7
loading training instances, dir instances/train_10_n60_m60 idx 8
loading training instances, dir instances/train_10_n60_m60 idx 9
0 :  0.9216160135863447 3.783835412850882
1 :  0.7082653196353021 2.8696675312820448
2 :  0.6072478073228922 2.468401910800032
3 :  1.1013110904359564 27.475209237605
4 :  0.7022608542010857 2.0944960128247048
5 :  1.115349081867862 17.461775616708774
6 :  0.7058461509368499 3.571921094096939e-11
7 :  0.6151751669476653 8.879925617057352e-11
8 :  0

In [58]:
cwd = os.getcwd()
PATH = cwd + '/Policy/easy_model'
torch.save(Policy, PATH)

## Curriculum Training

In [27]:
curriculum_10 = {
    "load_dir"        : 'instances/curriculum_10',
    "idx_list"        : list(range(10)),
    "timelimit"       : 50,
    "reward_type"     : 'obj'
}

curriculum_40 = {
    "load_dir"        : 'instances/curriculum_40',
    "idx_list"        : list(range(40)),
    "timelimit"       : 50,
    "reward_type"     : 'obj'
}

curriculum_70 = {
    "load_dir"        : 'instances/curriculum_70',
    "idx_list"        : list(range(70)),
    "timelimit"       : 50,
    "reward_type"     : 'obj'
}

c = [curriculum_10, curriculum_40,curriculum_70]
if __name__ == "__main__":
    # create env
    
    var_size = 61
    attention_size = 32
    k = 16
    hidden_size= 64
    lr = 3e-4
    sigma = 5
    gamma = 0.95
    
    Policy = PN.Policy_Network(var_size = var_size, attention_size = attention_size, k = k, hidden_size = hidden_size, lr = lr)
    
    for i in range(3):
        print('currect curriculum: ',i+1)
        env = make_multiple_env(**c[i])

        # To record traectories generated from current policy

        for e in range(30): 

            CONSTRAINTS = []  
            CANDIDATES = []
            ACTS = []
            PROBABILITY = []  
            REWARDS = []  

            s = env.reset()   # samples a random instance every time env.reset() is called
            d = False
            t = 0
            repisode = 0
            total_loss = 0

            while not d:
                #Take a random action
                A, b, c0, cuts_a, cuts_b = s
                # find attention score
                a_b = np.concatenate((A,np.expand_dims(b,-1)),1)
                d_e = np.concatenate((cuts_a,np.expand_dims(cuts_b,-1)),1)
                total = np.concatenate((a_b, d_e),0)

                total = (total - np.mean(total)) / np.std(total)

                constraint = total[:len(a_b)]
                candidate = total[len(a_b):]

                CONSTRAINTS.append(constraint)
                CANDIDATES.append(candidate)
                attention_score = Policy.compute_attention(constraint, candidate)
                prob = Policy.compute_prob(attention_score)

                a = np.array([np.argmax(prob)])
                ACTS.append(a)

                s, r, d, _ = env.step(a)
                #print('episode', e, 'step', t, 'reward', r)            
                REWARDS.append(r)

                t += 1
                repisode += r

            # TODO:  Use discounted_rewards function to compute \hat{V}s/\hat{Q}s  from instant rewards in rews
            discounted_r = H.discounted_rewards(REWARDS, gamma)
            Q_s = H.evolution_strategies(discounted_r, sigma)

            for contraint,candidate,act,q_s in zip(CONSTRAINTS,CANDIDATES,ACTS,Q_s):
                loss = Policy.train(contraint,candidate,act,np.array([q_s]))
                total_loss += loss
            print(e, ": ", repisode, total_loss)


currect curriculum:  1
loading training instances, dir instances/train_100_n60_m60 idx 0
loading training instances, dir instances/train_100_n60_m60 idx 1
loading training instances, dir instances/train_100_n60_m60 idx 2
loading training instances, dir instances/train_100_n60_m60 idx 3
loading training instances, dir instances/train_100_n60_m60 idx 4
loading training instances, dir instances/train_100_n60_m60 idx 5
loading training instances, dir instances/train_100_n60_m60 idx 6
loading training instances, dir instances/train_100_n60_m60 idx 7
loading training instances, dir instances/train_100_n60_m60 idx 8
loading training instances, dir instances/train_100_n60_m60 idx 9
0 :  0.6072478073228922 2.4941501629267506
1 :  0.7022608541983573 2.8626708992901575
2 :  0.6151751669476653 2.4876391899464974
3 :  0.7082653196353021 2.8016526706404523
4 :  0.9216160135867995 3.54339528182613
5 :  1.115349081867862 24.645779786632524
6 :  0.8867413289876822 3.3233959258353956e-11
7 :  1.10131109

0 :  0.7777448278875454 1.9222378753922837e-11
1 :  0.7356487929473587 3.584179839697475e-11
2 :  1.193922700984558 0.3731177820986345
3 :  0.9216160135867995 2.5296037911520374e-12
4 :  0.2728014378433272 2.6544219351093895e-11
5 :  0.37488180020409345 5.305273261674978e-14
6 :  0.8157422834224235 7.938566394581424e-12
7 :  1.115349081867862 0.01928077951636756
8 :  1.0726768699150853 4.282721874068697e-13
9 :  0.6064157747389345 2.6604850262568412e-12
10 :  0.6537341319417465 2.0944082830404186e-11
11 :  0.6677158110928758 1.9080130871369904e-11
12 :  1.3055174419891955 0.6225112277242013
13 :  0.6903666877369687 2.5129790517690563e-11
14 :  0.8683939133684362 1.0385172297351456e-11
15 :  0.22786578133172952 6.514733455759635e-12
16 :  0.27260375314835983 7.531164663122189e-11
17 :  1.1392343260004054 0.009053985601105534
18 :  0.7797230435419351 8.36558483194453e-12
19 :  0.6151751669476653 1.7788081022651586e-11
20 :  0.9502730507106207 1.4696053695319184e-11
21 :  1.21861914333339

In [28]:
cwd = os.getcwd()
PATH = cwd + '/Policy/curriculum_model'
torch.save(Policy, PATH)

## Hard Mode

In [41]:
run=wandb.init(project="finalproject", entity="orcs4529", tags=["training-hard"])

### TRAINING

# Hard Setup: Use the following environment settings. We will evaluate your agent with the same hard config below:
hard_config = {
    "load_dir"        : 'instances/train_100_n60_m60',
    "idx_list"        : list(range(99)),
    "timelimit"       : 50,
    "reward_type"     : 'obj'
}

if __name__ == "__main__":
    # create env
    
    lr = 3e-4
    PATH = cwd + '/Policy/curriculum_model'
    Policy = torch.load(PATH)
    env = make_multiple_env(**hard_config)
    sigma = 5
    gamma = 0.99
    rrecord = []

    # To record traectories generated from current policy
    
    for e in range(30): 

        CONSTRAINTS = []  
        CANDIDATES = []
        ACTS = []
        PROBABILITY = []  
        REWARDS = []  

        s = env.reset()   # samples a random instance every time env.reset() is called
        d = False
        t = 0
        repisode = 0
        total_loss = 0
        while not d:
            #Take a random action
            A, b, c0, cuts_a, cuts_b = s
            # find attention score
            a_b = np.concatenate((A,np.expand_dims(b,-1)),1)
            d_e = np.concatenate((cuts_a,np.expand_dims(cuts_b,-1)),1)
            total = np.concatenate((a_b, d_e),0)

            total = (total - np.mean(total)) / np.std(total)
            #total / np.linalg.norm(total)
            
            constraint = total[:len(a_b)]
            candidate = total[len(a_b):]

            CONSTRAINTS.append(constraint)
            CANDIDATES.append(candidate)
            attention_score = Policy.compute_attention(constraint, candidate)
            prob = Policy.compute_prob(attention_score)
            
            a = np.array([np.argmax(prob)])
            ACTS.append(a)

            s, r, d, _ = env.step(a)
            #print('episode', e, 'step', t, 'reward', r)            
            REWARDS.append(r)

            t += 1
            repisode += r
            
        
        #Below is for logging training performance
        rrecord.append(np.sum(REWARDS))
        
        # TODO:  Use discounted_rewards function to compute \hat{V}s/\hat{Q}s  from instant rewards in rews
        discounted_r = H.discounted_rewards(REWARDS, gamma)
        Q_s = H.evolution_strategies(discounted_r, sigma)
        
        for contraint,candidate,act,q_s in zip(CONSTRAINTS,CANDIDATES,ACTS,Q_s):
            loss = Policy.train(contraint,candidate,act,np.array([q_s]))
            total_loss += loss
        print(e, ": ", repisode, total_loss)

        fixedWindow=5
        movingAverage=0
        if len(rrecord) >= fixedWindow:
            movingAverage=np.mean(rrecord[len(rrecord)-fixedWindow:len(rrecord)-1])

        #wandb logging
        wandb.log({ "training reward" : rrecord[-1], "training reward moving average" : movingAverage})
        

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
training reward,▁█
training reward moving average,▁▁

0,1
training reward,0.90482
training reward moving average,0.0


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.01671237568370998, max=1.0)…

loading training instances, dir instances/train_100_n60_m60 idx 0
loading training instances, dir instances/train_100_n60_m60 idx 1
loading training instances, dir instances/train_100_n60_m60 idx 2
loading training instances, dir instances/train_100_n60_m60 idx 3
loading training instances, dir instances/train_100_n60_m60 idx 4
loading training instances, dir instances/train_100_n60_m60 idx 5
loading training instances, dir instances/train_100_n60_m60 idx 6
loading training instances, dir instances/train_100_n60_m60 idx 7
loading training instances, dir instances/train_100_n60_m60 idx 8
loading training instances, dir instances/train_100_n60_m60 idx 9
loading training instances, dir instances/train_100_n60_m60 idx 10
loading training instances, dir instances/train_100_n60_m60 idx 11
loading training instances, dir instances/train_100_n60_m60 idx 12
loading training instances, dir instances/train_100_n60_m60 idx 13
loading training instances, dir instances/train_100_n60_m60 idx 14
loadi

In [50]:
cwd = os.getcwd()
PATH = cwd + '/Policy/hard_model'
torch.save(Policy, PATH)

## Extra: 200 Instances

In [42]:
## Hard Mode
run=wandb.init(project="finalproject", entity="orcs4529", tags=["training-hard"])

### TRAINING

# Hard Setup: Use the following environment settings. We will evaluate your agent with the same hard config below:
Hhard_config = {
    "load_dir"        : 'instances/200_instance',
    "idx_list"        : list(range(200)),
    "timelimit"       : 50,
    "reward_type"     : 'obj'
}

if __name__ == "__main__":
    # create env
    
    lr = 3e-4
    PATH = cwd + '/Policy/curriculum_model'
    Policy = torch.load(PATH)
    env = make_multiple_env(**Hhard_config)
    sigma = 5
    gamma = 0.99
    rrecord = []

    # To record traectories generated from current policy
    
    for e in range(30): 

        CONSTRAINTS = []  
        CANDIDATES = []
        ACTS = []
        PROBABILITY = []  
        REWARDS = []  

        s = env.reset()   # samples a random instance every time env.reset() is called
        d = False
        t = 0
        repisode = 0
        total_loss = 0
        while not d:
            #Take a random action
            A, b, c0, cuts_a, cuts_b = s
            # find attention score
            a_b = np.concatenate((A,np.expand_dims(b,-1)),1)
            d_e = np.concatenate((cuts_a,np.expand_dims(cuts_b,-1)),1)
            total = np.concatenate((a_b, d_e),0)

            total = (total - np.mean(total)) / np.std(total)
            #total / np.linalg.norm(total)
            
            constraint = total[:len(a_b)]
            candidate = total[len(a_b):]

            CONSTRAINTS.append(constraint)
            CANDIDATES.append(candidate)
            attention_score = Policy.compute_attention(constraint, candidate)
            prob = Policy.compute_prob(attention_score)
            
            a = np.array([np.argmax(prob)])
            ACTS.append(a)

            s, r, d, _ = env.step(a)
            #print('episode', e, 'step', t, 'reward', r)            
            REWARDS.append(r)

            t += 1
            repisode += r
            
        
        #Below is for logging training performance
        rrecord.append(np.sum(REWARDS))
        
        # TODO:  Use discounted_rewards function to compute \hat{V}s/\hat{Q}s  from instant rewards in rews
        discounted_r = H.discounted_rewards(REWARDS, gamma)
        Q_s = H.evolution_strategies(discounted_r, sigma)
        
        for contraint,candidate,act,q_s in zip(CONSTRAINTS,CANDIDATES,ACTS,Q_s):
            loss = Policy.train(contraint,candidate,act,np.array([q_s]))
            total_loss += loss
        print(e, ": ", repisode, total_loss)

        fixedWindow=5
        movingAverage=0
        if len(rrecord) >= fixedWindow:
            movingAverage=np.mean(rrecord[len(rrecord)-fixedWindow:len(rrecord)-1])

        #wandb logging
        wandb.log({ "training reward" : rrecord[-1], "training reward moving average" : movingAverage})

VBox(children=(Label(value='0.001 MB of 0.019 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=0.045392…

0,1
training reward,▃▃▂▂▇▁▆▃▅▅▅▃▃▂▃▁▁▄▅▂▃▃▁▄▃█▃▂▃▄
training reward moving average,▁▁▁▁▅▇▆▇█▇███▇▇▆▅▅▅▆▆▇▇▅▆▆▇█▇▇

0,1
training reward,1.07268
training reward moving average,1.00083


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.01670797708405492, max=1.0)…

loading training instances, dir instances/200_instance idx 0
loading training instances, dir instances/200_instance idx 1
loading training instances, dir instances/200_instance idx 2
loading training instances, dir instances/200_instance idx 3
loading training instances, dir instances/200_instance idx 4
loading training instances, dir instances/200_instance idx 5
loading training instances, dir instances/200_instance idx 6
loading training instances, dir instances/200_instance idx 7
loading training instances, dir instances/200_instance idx 8
loading training instances, dir instances/200_instance idx 9
loading training instances, dir instances/200_instance idx 10
loading training instances, dir instances/200_instance idx 11
loading training instances, dir instances/200_instance idx 12
loading training instances, dir instances/200_instance idx 13
loading training instances, dir instances/200_instance idx 14
loading training instances, dir instances/200_instance idx 15
loading training i

0 :  0.5162786186374433 7.485547466475e-12
1 :  1.001470740719924 1.065125953462011e-11
2 :  0.8364482100646455 3.58470454708127e-11
3 :  0.9014635093624292 3.727691075888632e-11
4 :  0.8925334688728981 5.1760232964848814e-11
5 :  1.2037116561025414 0.7290326998490219
6 :  1.0741382596181666 0.00012864584862755583
7 :  0.9939785146214035 1.4373194774636387e-11
8 :  0.6009325061550044 3.710366591310265e-11
9 :  0.5212616680737483 1.6190230429421627e-11
10 :  0.6276721296799224 7.76603247338741e-11
11 :  0.5094071355547385 3.503106257659047e-11
12 :  0.2961509774725073 2.9568693249457325e-12
13 :  0.4994096325872306 3.5804823068894446e-11
14 :  0.3593805646942201 3.524202513938324e-11
15 :  0.9351388868612958 1.535153711830229e-11
16 :  1.3563837944557235 1.2208617960283001
17 :  1.0245019860603861 0.0001209560289006004
18 :  0.8967601663971436 1.476503758373959e-11
19 :  1.3792470687105833 1.2280564537005034
20 :  0.12642602396863367 4.692451398273828e-13
21 :  0.5817997123542682 1.6412

## Test

In [67]:
custom_config = {
    "load_dir"        : 'instances/randomip_n60_m60',   # this is the location of the randomly generated instances (you may specify a different directory)
    "idx_list"        : list(range(50)),                # take the first 20 instances from the directory
    "timelimit"       : 50,                             # the maximum horizon length is 50
    "reward_type"     : 'obj'                           # DO NOT CHANGE reward_type
}


if __name__ == "__main__":
    # create env
    
    PATH = cwd + '/Policy/easy_model'
    #PATH = cwd + '/Policy/hard_model'
    Policy = torch.load(PATH)
    env = make_multiple_env(**custom_config)

    # To record traectories generated from current policy
    
    s = env.reset()   # samples a random instance every time env.reset() is called
    d = False
    t = 0
    repisode = 0
    while not d:
        #Take a random action
        A, b, c0, cuts_a, cuts_b = s
        # find attention score
        a_b = np.concatenate((A,np.expand_dims(b,-1)),1)
        d_e = np.concatenate((cuts_a,np.expand_dims(cuts_b,-1)),1)
        total = np.concatenate((a_b, d_e),0)

        total = (total - np.mean(total)) / np.std(total)
        #total / np.linalg.norm(total)

        constraint = total[:len(a_b)]
        candidate = total[len(a_b):]

        attention_score = Policy.compute_attention(constraint, candidate)
        prob = Policy.compute_prob(attention_score)
        a = np.array([np.argmax(prob)])

        s, r, d, _ = env.step(a)
        t += 1
        print('step', t, 'reward', r, 'action space size', s[-1].size)
        repisode += r

    #Below is for logging training performance
    rrecord.append(repisode)

    print('total episode reward: ', repisode)
        

loading training instances, dir instances/randomip_n60_m60 idx 0
loading training instances, dir instances/randomip_n60_m60 idx 1
loading training instances, dir instances/randomip_n60_m60 idx 2
loading training instances, dir instances/randomip_n60_m60 idx 3
loading training instances, dir instances/randomip_n60_m60 idx 4
loading training instances, dir instances/randomip_n60_m60 idx 5
loading training instances, dir instances/randomip_n60_m60 idx 6
loading training instances, dir instances/randomip_n60_m60 idx 7
loading training instances, dir instances/randomip_n60_m60 idx 8
loading training instances, dir instances/randomip_n60_m60 idx 9
loading training instances, dir instances/randomip_n60_m60 idx 10
loading training instances, dir instances/randomip_n60_m60 idx 11
loading training instances, dir instances/randomip_n60_m60 idx 12
loading training instances, dir instances/randomip_n60_m60 idx 13
loading training instances, dir instances/randomip_n60_m60 idx 14
loading training ins