In [1]:
import numpy as np

from StickerEnsemble import EnsembleStickerCube


class Environment(EnsembleStickerCube):
    """
    CUBE ENVIRONMENT

    Wraps an ``EnsembleStickerCube`` as an episodic environment: an episode
    starts with a random scramble and ends when the first cube of the ensemble
    reports solved or when ``MAX_TURNS`` actions have been applied.
    """
    # Episode is cut off after this many calls to state_and_reward.
    MAX_TURNS = 50
    # Extra reward added on top of the per-face score when the cube is solved.
    SOLVED_BONUS = 20

    def __init__(self):
        super(Environment, self).__init__(randomize_representation = True)
        # Index into this list is the action id used by state_and_reward.
        self.valid_turns = ["U", "U'", "R", "R'", "L", "L'",
                       "F", "F'", "B", "B'", "D", "D'"]
        self.turns_thusfar = 0

    def make_start_state(self, number):
        """
        Reset the cubes, apply `number` randomly chosen turns and return the
        result of ``self.visualize()`` for the scrambled state.

        The per-episode turn counter is reset here as well, so an episode that
        was abandoned before reaching `done` cannot shorten the next one.
        """
        self.reset()
        self.turns_thusfar = 0
        turn_arr = list(np.random.choice(self.valid_turns, size = number))
        turns = " ".join(turn_arr)
        self(turns)
        return self.visualize()

    def _get_reward(self):
        """
        Score the first cube of the ensemble.

        For each of the six consecutive 9-element slices of
        ``cube.current_state`` the count of the most frequent value is added
        to the reward; ``SOLVED_BONUS`` is added when ``cube.is_solved()``.

        Returns:
            (reward, done) where `done` is the value of ``cube.is_solved()``.
        """
        cube = self.cubes[0]
        sticker_list = cube.current_state
        done = cube.is_solved()
        reward = self.SOLVED_BONUS if done else 0
        for face in range(6):
            side = sticker_list[(face*9):((face+1)*9)]
            counts = dict()
            for stick in side:
                counts[stick] = counts.get(stick, 0) + 1
            reward += max(counts.values())
        return reward, done

    def state_and_reward(self, current_state, picked_action):
        """
        Apply one action and return ``(new_state, reward, done)``.

        Args:
            current_state: unused; the environment keeps its own cube state.
            picked_action: index into ``self.valid_turns``.

        `done` is True when the cube is solved or the turn limit is reached;
        in both cases the turn counter is reset for the next episode.
        """
        self.turns_thusfar += 1

        actual_action = self.valid_turns[picked_action]
        self(actual_action)
        reward, done = self._get_reward()

        # >= rather than == so a counter that somehow overshoots still terminates.
        if self.turns_thusfar >= self.MAX_TURNS or done:
            done = True
            self.turns_thusfar = 0
        return self.visualize(), reward, done
        
        
    

In [2]:
import torch as T
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
from torch.distributions import Categorical


# Hyperparameters (first definition; a later cell in this notebook assigns several of these again).
# NOTE(review): the "video" / "LS vector" wording below matches no code visible in this notebook
# and looks inherited from another project - confirm before relying on it.
TRAJECTORY_LENGTH = 30 # Not referenced by the code visible in this notebook. Original note: "Approximately 10 seconds".
MIDPOINTS = 2 # Not referenced by the visible code. Original note: splits video data into trajectories of the length above; this sets the overlap across trajectories.

EPSILON_PERTURBATIONS = False  # Not referenced by the visible code. Original note: whether the network predicts how to perturb the LS vector.
ITERATIONS = 100 # Passed to DpnTraining.train as the iteration count (kind of like epochs).
BATCH_SIZE = 10   # Not referenced by the visible code; might be the same thing as EPISODES.
EPISODES = 20     # How many trajectories to roll out per job, essentially to get a better estimate of the expected reward.
DISCOUNT = 0.99   # Discount factor applied to future rewards.
ALPHA = 0.001     # Learning rate handed to the Adam optimizers.
INPUT_SIZE = 3888 # Length of the flattened observation fed to the network; the launch cell passes the literal 3888.
#weight = torch.Tensor([0.5])

class DPN(nn.Module):
    """AGENTS

    Policy network: four square fully connected layers with leaky-ReLU
    activations and additive skip connections, followed by a linear layer
    and a softmax over the last dimension.

    Args:
        alpha: learning rate for the Adam optimizer stored on ``self.optimizer``.
        input_size: number of input features (also the hidden width).
        output_size: number of actions, i.e. the size of the output distribution.
    """
    def __init__(self, alpha, input_size, output_size):
        super(DPN, self).__init__()

        self.fc1 = nn.Linear(input_size, input_size)
        self.fc2 = nn.Linear(input_size, input_size)
        self.fc3 = nn.Linear(input_size, input_size)
        self.fc4 = nn.Linear(input_size, input_size)
        self.fc5 = nn.Linear(input_size, output_size)
        self.optimizer = optim.Adam(self.parameters(), lr=alpha)
        self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu')
        self.to(self.device)

    def forward(self, x):
        """
        Return action probabilities for `x`.

        `x` is a float tensor whose last dimension has `input_size` entries;
        the result has the same leading dimensions and sums to 1 along the
        last one.
        """
        # The parameters live on self.device, so the input has to be there too.
        x = x.to(self.device)
        h = F.leaky_relu(self.fc1(x)) + x
        # Second block adds both the previous activation and the raw input.
        h = F.leaky_relu(self.fc2(h)) + h + x
        h = F.leaky_relu(self.fc3(h)) + h
        h = F.leaky_relu(self.fc4(h)) + h
        # Explicit dim: the implicit choice is deprecated and picks axis 0 for 3-D input.
        return F.softmax(self.fc5(h), dim=-1)

In [9]:

import numpy as np
from random import random
from scipy.stats import norm

from torch.distributions import Categorical
#from torch.distributions.independent import Independent
#from torch.distributions.normal import Normal
import torch

# Hyperparameters for the DPN training cell.
# NOTE: ITERATIONS and ALPHA are assigned different values here than in the earlier model cell
# (100 -> 1000 and 0.001 -> 3e-3); when the notebook runs top to bottom these values win.
# NOTE(review): the "video" / "LS vector" wording below matches no code visible in this notebook
# and looks inherited from another project - confirm before relying on it.
DATA_FILE_NAME = "trajectory_dict.pickle" # Not referenced by the code visible in this notebook.
TRAJECTORY_LENGTH = 30 # Not referenced by the visible code. Original note: "Approximately 10 seconds".
MIDPOINTS = 2 # Not referenced by the visible code. Original note: splits video data into trajectories of the length above; this sets the overlap across trajectories.

EPSILON_PERTURBATIONS = False  # Not referenced by the visible code. Original note: whether the network predicts how to perturb the LS vector.
ITERATIONS = 1000 # Passed to DpnTraining.train as the iteration count (kind of like epochs).
BATCH_SIZE = 10   # Not referenced by the visible code; might be the same thing as EPISODES.
EPISODES = 20     # How many trajectories to roll out per job, essentially to get a better estimate of the expected reward.
DISCOUNT = 0.99   # Discount factor applied to future rewards in the valuation.
ALPHA = 3e-3     # Learning rate used by DpnTraining for the network and its Adam optimizer.

def curried_valuation(length_of_longest_trajectory, discount=None):
    '''
    Build the function that computes discounted returns for one episode,
    padding the episode to `length_of_longest_trajectory` first.
    Intended use: map(valuation, episodes_array), giving one array per episode.

    Args:
        length_of_longest_trajectory: target length every episode is padded to.
        discount: discount factor; when None the module-level DISCOUNT is used.
    '''
    def valuation(episode):
        '''
        Return the discounted return at every step of `episode`.

        input: list of step tuples whose element [2] is the reward, e.g.
               [(s_0, a_0, r_0, ...), ..., (s_t, a_t, r_t, ...)], possibly with
               t < length_of_longest_trajectory
        output: numpy array [v_0, v_1, ..., v_L] with v_i = r_i + gamma * v_{i+1}

        NOTE: a short episode is padded IN PLACE with (0, 0, 0) entries, so the
        caller's list grows. Callers in this notebook rely on that, so the
        mutation is kept; record the true length before calling if it is needed.
        '''
        gamma = DISCOUNT if discount is None else discount

        missing = length_of_longest_trajectory - len(episode)
        if missing > 0:
            # Padding steps carry zero reward so they do not change earlier returns.
            episode.extend([(0, 0, 0)] * missing)

        rewards = [step[2] for step in episode]
        out = np.zeros(len(rewards))
        if not rewards:
            # Nothing to accumulate; indexing out[-1] would fail on an empty array.
            return out

        out[-1] = rewards[-1]
        for i in reversed(range(len(rewards) - 1)): # walk backwards from the end
            out[i] = rewards[i] + gamma * out[i + 1]
        return out
    return valuation

def weights_init_uniform_rule(m):
    """
    Initialise a layer in place; meant to be passed to ``Module.apply``.

    Any module whose class name contains 'Linear' gets weights drawn uniformly
    from [-1/sqrt(n), 1/sqrt(n)), where n is ``m.in_features``, and a zeroed
    bias. Other modules are left untouched.
    """
    if 'Linear' not in m.__class__.__name__:
        return
    bound = 1.0/np.sqrt(m.in_features)
    m.weight.data.uniform_(-bound, bound)
    # Layers built with bias=False expose bias as None.
    if m.bias is not None:
        m.bias.data.fill_(0)



class DpnTraining:
    """
    Policy-gradient trainer that rolls out a policy network on ``Environment``
    and updates it with a baseline-subtracted, spread-normalised log-prob loss.

    Logged per job (one entry per call of the inner loop in train_on_jobs):
        rewards / rewards_last:   mean discounted return over the EPISODES
                                  rollouts at the first / last time step.
        variance / variance_last: standard deviation (despite the name) of the
                                  discounted return at the first / last step.
    """
    def __init__(self, INPUT_SIZE, policy_net):
        '''
        INPUT_SIZE = length of the flattened observation fed to the network
        policy_net = callable (alpha, input_size, output_size) -> nn.Module

        The output size is the number of turns the environment accepts.
        Module-level names in CAPS (ALPHA, EPISODES, ...) are hyperparameters.
        '''
        self.env = Environment()

        # Define the network
        self.network = policy_net(ALPHA, INPUT_SIZE, len(self.env.valid_turns))
        self.network.apply(weights_init_uniform_rule)
        # logging
        self.eps = np.finfo(np.float32).eps.item()
        self.rewards = []
        self.variance = []
        self.rewards_last = []
        self.variance_last = []


    def train(self, ITERATIONS, checkpoint_path="dpn_checkpoint.pt"):
        '''
        Run ITERATIONS + 1 training iterations. Iteration i starts from a
        scramble of i random turns, so the scramble depth grows with i.

        Every 100th iteration the network's state_dict is saved to
        `checkpoint_path` (overwriting the previous checkpoint).
        '''
        # We use this optimizer to update the weights of the model.
        optimizer = torch.optim.Adam(self.network.parameters(), lr = ALPHA)
        for i in range(ITERATIONS + 1):
            first_frame = self.env.make_start_state(i)
            # train_on_jobs expects a collection of start states; we have one.
            jobs = [first_frame]
            self.train_on_jobs(jobs, optimizer)
            print("Iteration "+str(i+1)+" Completed with reward: " + str(self.rewards[-1]) + " Variance of :" + str(self.variance[-1]))
            print("Average last avg reward: " + str(self.rewards_last[-1]) + " last variance avg: " + str(self.variance_last[-1]))
            if (i + 1) % 100 == 0:
                torch.save(self.network.state_dict(), checkpoint_path)



    def fix_obs(self, observation):
        '''
        Flatten an array observation, scale it by 1/255 and return it as a
        float32 torch tensor.
        '''
        observation = observation.flatten()
        observation = observation/255
        # np.float was removed from NumPy; float32 is what the network consumes.
        observation = torch.from_numpy(observation.astype(np.float32))
        return observation

    def forward(self, state):
        '''
        Forward pass of the network on a raw observation. Returns the
        network's output (action probabilities) for that state.
        '''
        state = self.fix_obs(state)
        # The policy network may keep its parameters on a GPU; follow it if it says so.
        device = getattr(self.network, "device", None)
        if device is not None:
            state = state.to(device)
        probs = self.network(state)
        return probs


    def trajectory(self, current_state):
        '''
        Roll the policy out until the environment reports done.

        Returns a list with one entry per step:
        [(s_0, a_0, r_0, logp_0), ..., (s_L, a_L, r_L, logp_L)]
        where logp_t is the log-probability of the sampled action and still
        carries the autograd graph.
        '''
        output_history = []
        while True:
            probs = self.forward(current_state)
            distribution = Categorical(probs)
            action = distribution.sample().detach()
            # Reward and next observation produced by taking the action.
            new_state, reward, done = self.env.state_and_reward(current_state, action)
            log_prob = distribution.log_prob(action)
            output_history.append( (current_state, action, reward, log_prob) )
            if done: # solved, or the environment's turn limit was hit
                break
            current_state = new_state
        return output_history

    def train_on_jobs(self,jobset, optimizer):
        '''
        One optimizer step per start state in `jobset`.

        For each start state, EPISODES trajectories are sampled; their
        discounted returns are padded to a common length, e.g.
        [
        [1, 2, 3, 4, 5, 0, 0],
        [1, 2, 3, 4, 5, 6, 7],
        [1, 2, 3, 0, 0, 0, 0]
        ]
        The per-step mean over episodes is the baseline and the per-step
        standard deviation normalises the advantage.

        NOTE(review): the environment is not reset between the EPISODES
        rollouts, so only the first one really starts from `job_start`; later
        rollouts are handed `job_start` as first observation while the cube
        continues from wherever the previous rollout stopped. Left as is
        because no way to restore a given state is visible here - confirm.
        '''
        for job_start in jobset:
            optimizer.zero_grad()
            # EPISODES trajectories of (state, action, reward, log_prob) steps.
            episode_array = [self.trajectory(job_start) for _ in range(EPISODES)]
            # Remember the real lengths now: the valuation function pads short
            # episodes in place, after which padding and real steps look alike.
            episode_lengths = [len(episode) for episode in episode_array]
            longest_trajectory = max(episode_lengths)
            valuation_fun = curried_valuation(longest_trajectory)
            cum_values = np.array([valuation_fun(ep) for ep in episode_array]) # EPISODES x longest_trajectory
            # Baseline and spread of the return at every time step.
            baseline_array = cum_values.mean(axis=0)
            var = cum_values.std(axis=0) # population std; name kept from the original logging
            #log
            self.rewards.append(baseline_array[0])
            self.variance.append(var[0])
            self.variance_last.append(var[-1])
            self.rewards_last.append(baseline_array[-1])
            #policy updates
            # Floor the spread so near-deterministic steps do not blow up the advantage.
            scale = np.maximum(var, 1e-4) + self.eps
            advantages = (cum_values - baseline_array) / scale
            loss_terms = []
            for i, episode in enumerate(episode_array):
                # Only real steps have a log-probability; padded steps are skipped
                # instead of silently reusing the previous step's log-probability.
                for t in range(episode_lengths[i]):
                    log_pro = episode[t][3]
                    # Negative sign: the optimizer minimises, we want to maximise return.
                    loss_terms.append(-float(advantages[i][t]) * log_pro)
            loss = torch.stack(loss_terms).sum()
            loss.backward() # accumulate the gradient of the summed loss
            optimizer.step() # apply the update to the network weights

In [None]:
# Build the trainer around the DPN policy and run it for ITERATIONS iterations.
# INPUT_SIZE is the constant defined with the other hyperparameters (3888).
dpn = DpnTraining(INPUT_SIZE = INPUT_SIZE, policy_net = DPN)
dpn.train(ITERATIONS)



HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
Iteration 1 Completed with reward: 741.5671450871705 Variance of :31.33594842367577
Average last avg reward: 19.35 last variance avg: 1.878163997099295
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
Iteration 2 Completed with reward: 1362.2328448417509 Variance of :4.012312525796688
Average last avg reward: 37.0 last variance avg: 5.0
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
Iteration 3 Completed with reward: 1362.2328448417509 Variance of :4.012312525796688
Average last avg reward: 37.0 last variance avg: 5.0
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
HALF

HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
Iteration 28 Completed with reward: 701.0646084774974 Variance of :0.4133897753849283
Average last avg reward: 18.0 last variance avg: 1.0
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
Iteration 29 Completed with reward: 770.1389243750539 Variance of :0.8024625051595534
Average last avg reward: 20.0 last variance avg: 1.0
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
Iteration 30 Completed with reward: 859.062181622483 Variance of :0.4133897753850988
Average last avg reward: 22.0 last variance avg: 1.0
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
Iteration 31 Completed with reward: 612.3398406435679 Va

HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
Iteration 56 Completed with reward: 760.3136984068674 Variance of :0.40933693445009567
Average last avg reward: 19.5 last variance avg: 0.5
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
Iteration 57 Completed with reward: 641.8155185481281 Variance of :0.4012312525795778
Average last avg reward: 16.5 last variance avg: 0.5
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
Iteration 58 Completed with reward: 622.1650666117541 Variance of :0.40123125257974834
Average last avg reward: 15.5 last variance avg: 0.5
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
HALFI

HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
Iteration 83 Completed with reward: 730.6395310888075 Variance of :1.604925010318823
Average last avg reward: 19.0 last variance avg: 2.0
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
Iteration 84 Completed with reward: 740.5640017637442 Variance of :0.3971784116446316
Average last avg reward: 19.0 last variance avg: 1.0
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
Iteration 85 Completed with reward: 779.8649056364914 Variance of :1.2077465986740774
Average last avg reward: 21.0 last variance avg: 1.0
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
HALFIES
Iteration 86 Completed with reward: 681.2156671276248 Variance o

In [None]:
# Actor-critic training run with a scramble-depth curriculum.
# NOTE(review): `Agent` is not defined in the visible part of this notebook;
# it must come from another cell or module - confirm before Run All.
env = Environment()
agent = Agent(alpha=0.3,beta=0.3, input_dims = 3888, output_dims = len(env.valid_turns))

n_games = 50000

# Checkpoints are written below this directory.
run_name = "FAST"
import os
os.makedirs(run_name, exist_ok=True)
#print(agent.actor.state_dict())

# Curriculum: scramble depth per game, rising linearly from 1 to 26.
x = [int(round(depth,0)) for depth in np.linspace(0,25, n_games)+1]

scores = []
# The episode index (not the scramble depth, which never exceeds 26) drives
# the periodic target update and checkpointing.
for episode_idx, scramble_depth in enumerate(x):
    done = False
    observation = env.make_start_state(scramble_depth)
    score = 0
    n_steps = 0
    actor_total_loss = None
    agent.actor.optimizer.zero_grad()
    while not done:
        action = agent.choose_action(observation)
        observation_, reward, done = env.state_and_reward(observation ,action)
        actor_loss = agent.learn(observation, reward, observation_, done)
        if actor_total_loss is None:
            actor_total_loss = actor_loss
        else:
            actor_total_loss = actor_total_loss + actor_loss
        score += reward
        n_steps += 1
        observation = observation_
    # Back-propagate the loss summed over the whole episode; previously only
    # the final step's loss was used and the accumulated total was discarded.
    actor_total_loss.backward()
    agent.actor.optimizer.step()
    if episode_idx % 20 == 0:
        agent.update_critic_target()

    if episode_idx > 0 and episode_idx % 1000 == 0:
        avg = np.mean(scores[-1000:])
        location = "./"+run_name+ "/"+str(avg)+"_avg_"+str(episode_idx)+"_ngames.pt"
        torch.save(agent.actor.state_dict(), location)
    scores.append(score)

    avg_score = np.mean(scores[-100:])
    # The loop body runs at least once, so n_steps >= 1.
    print('episode ', episode_idx, 'score %.1f' % score,
            'average score %.1f' % avg_score, "    avg single score: ", score/n_steps)

In [None]:
import matplotlib.pyplot as plt

# Raw per-episode score curve, saved next to the checkpoints.
# `location` is reused by the smoothed-curve cell that follows.
location = run_name + "/" + run_name
fig, ax = plt.subplots()
ax.plot(scores)
ax.set(title="Training score per episode", xlabel="episode", ylabel="score")
fig.savefig(location +"train.png")

In [None]:
# Smoothed score curve: each point is the plain average of the 100 scores
# that precede it, so the first 100 episodes produce no point.
def mean(x):
    return sum(x)/len(x)

y1 = scores
y2 = [mean(y1[(end-100):end]) for end in range(100, len(y1))]

plt.plot(y2)
#plt.show()
plt.savefig(location+"train_smooth.png")