In [1]:
import numpy as np

from StickerEnsemble import EnsembleStickerCube


class Environment(EnsembleStickerCube):
    """
    CUBE ENVIRONMENT
    """
    def __init__(self):
        super(Environment, self).__init__(randomize_representation = True)
        self.valid_turns = ["U", "U'", "R", "R'", "L", "L'",
                       "F", "F'", "B", "B'", "D", "D'"]
        self.turns_thusfar = 0
        
    def make_start_state(self, number):
        """Resets the cubes. Generate a 'random' scramble. Return the image."""
        self.reset()
        turn_arr = list(np.random.choice(self.valid_turns, size = number))
        turns = " ".join(turn_arr)
        self.__call__(turns)
        return self.visualize()
    
    def _get_reward(self):
        cube = self.cubes[0]
        sticker_list = cube.current_state
        reward = 0
        done = cube.is_solved()
        if done:
            reward += 20
        for i in range(6):
            side = sticker_list[(i*9):((i+1)*9)]
            count = dict()
            for stick in side:
                if stick in count:
                    count[stick]+=1
                else:
                    count[stick] = 1
            reward += max((y for x, y in count.items()))
        return reward, done
    
    def state_and_reward(self, current_state, picked_action):
        """
        Should take the current state and the action and return the new state and the reward.
        """
        self.turns_thusfar += 1
            
        actual_action = self.valid_turns[picked_action]
        self.__call__(actual_action)
        reward, done = self._get_reward()
        if self.turns_thusfar == 25:
            print("HALFIES")
        if self.turns_thusfar == 50 or done:
            done = True
            self.turns_thusfar = 0
        return self.visualize(), reward, done
        
        
    

In [2]:
import torch as T
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
from torch.distributions import Categorical


def weights_init_uniform_rule(m):
    classname = m.__class__.__name__
    # for every Linear layer in a model..
    if classname.find('Linear') != -1:
        # get the number of the inputs
        n = m.in_features
        y = 1.0/np.sqrt(n)
        m.weight.data.uniform_(-y, y)
        m.bias.data.fill_(0)

class DPN(nn.Module):
    """AGENTS"""
    def __init__(self, alpha, input_size, output_size):
        super(DPN, self).__init__()

        self.fc1 = nn.Linear(input_size, input_size)
        self.fc2 = nn.Linear(input_size, input_size)
        self.fc3 = nn.Linear(input_size, input_size)
        self.fc4 = nn.Linear(input_size, input_size)
        self.fc5 = nn.Linear(input_size, output_size)
        self.optimizer = optim.Adam(self.parameters(), lr=alpha)
        self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu')
        self.to(self.device)

    def forward(self, x):
        #x = T.tensor(x)
        residual = x
        h = F.leaky_relu(self.fc1(x)) + x
        h = F.leaky_relu(self.fc2(h)) + h + x
        h = F.leaky_relu(self.fc3(h)) +h
        h = F.leaky_relu(self.fc4(h)) +h
        h = F.softmax(self.fc5(h))
        return h


class Critic(nn.Module):
    """CRITIC"""
    def __init__(self, beta, input_size):
        super(Critic, self).__init__()

        self.fc1 = nn.Linear(input_size, input_size)
        self.fc2 = nn.Linear(input_size, input_size)
        self.fc7 = nn.Linear(input_size, 1)
        self.optimizer = optim.Adam(self.parameters(), lr=beta, weight_decay=0.1)
        self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu')
        self.to(self.device)

    def forward(self, x):
        #x = T.tensor(x)
        h = F.leaky_relu(self.fc1(x))
        h = F.leaky_relu(self.fc2(h))
        out = self.fc7(h)
        return out






class Agent(object):
    def __init__(self, alpha, beta, input_dims,output_dims, gamma=0.99):
        self.gamma = gamma
        self.log_probs = None
        self.alpha = alpha
        self.beta = beta
        self.actor = DPN(alpha, input_dims,output_dims)
        self.critic = Critic(beta, input_dims)
        self.actor.apply(weights_init_uniform_rule)
        self.critic.apply(weights_init_uniform_rule)
        self.critic_target = Critic(beta, input_dims)
        self.update_critic_target()
    
    def update_critic_target(self):
        self.critic_target.load_state_dict(self.critic_target.state_dict())

    def fix_obs(self, observation):
        observation = observation.flatten()
        observation = observation/255
        observation = T.from_numpy(observation.astype(np.float)).float()
        return observation
    
    def choose_action(self, observation):
        observation = self.fix_obs(observation)
        probs  = self.actor.forward(observation)#.to(self.actor.device)
        action_probs = Categorical(probs)
        action = action_probs.sample()
        self.log_probs = action_probs.log_prob(action).to(self.actor.device)
        return action.item()

    def learn(self, state, reward, new_state, done):
        state = self.fix_obs(state)
        new_state = self.fix_obs(new_state)
        #self.actor.optimizer.zero_grad()
        self.critic.optimizer.zero_grad()

        critic_value_ = self.critic.forward(new_state)
        critic_value = self.critic.forward(state)
        reward = T.tensor(reward, dtype=T.float).to(self.actor.device)
        delta = ((reward + -self.gamma*critic_value_*(1-int(done))) + \
                                                                critic_value)

        critic_loss = delta**2
        critic_loss.backward(retain_graph = True)
        #(actor_loss + critic_loss).backward()
        #T.nn.utils.clip_grad_norm_(self.critic.parameters(), 0.01)
        #print(critic_value)
        self.critic.optimizer.step()
        actor_loss = -1*self.log_probs * delta.detach()
        #actor_loss.backward()
        #T.nn.utils.clip_grad_norm_(self.actor.parameters(), 0.01)
        #self.actor.optimizer.step()
        return actor_loss

In [3]:
env = Environment()
agent = Agent(alpha=0.3,beta=0.3, input_dims = 3888, output_dims = len(["U", "U'", "R", "R'", "L", "L'",
                       "F", "F'", "B", "B'", "D", "D'"]))

n_games = 50000
run_name = "please_finish_by_morning"
import os
if not os.path.exists(run_name):
    os.makedirs(run_name)
#print(agent.actor.state_dict())

x = [int(round(x,0)) for x in np.linspace(0,25, ngames)+1]

run_name = "FAST"
import os
if not os.path.exists(run_name):
    os.makedirs(run_name)

scores = []
for i in x:
    done = False
    observation = env.make_start_state(i)
    score = 0
    agent.actor.optimizer.zero_grad()
    while not done:
        action = agent.choose_action(observation)
        observation_, reward, done = env.state_and_reward(observation ,action)
        actor_loss = agent.learn(observation, reward, observation_, done)
        if score == 0:
            actor_total_loss = actor_loss
        else:
            actor_total_loss += actor_loss
        score += reward
        observation = observation_
    actor_loss = actor_loss
    actor_loss.backward()
    agent.actor.optimizer.step()
    if i%20 == 0:
        agent.update_critic_target()

            
    if i % 1000 == 0:
        if i == 0:
            pass
        else:
            avg = np.mean(scores[-1000:])
            location = "./"+run_name+ "/"+str(avg)+"_avg_"+str(i)+"_ngames.pt"
            torch.save(agent.actor.state_dict(), location)
    scores.append(score)

    avg_score = np.mean(scores[-100:])
    print('episode ', i, 'score %.1f' % score,
            'average score %.1f' % avg_score, "    avg single score: ", score/30)

  return torch._C._cuda_getDeviceCount() > 0


HALFIES


  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


episode  0 score 874.0 average score 874.0     avg single score:  29.133333333333333  test:  nan
HALFIES
episode  1 score 1063.0 average score 968.5     avg single score:  35.43333333333333  test:  nan
HALFIES
episode  2 score 850.0 average score 929.0     avg single score:  28.333333333333332  test:  nan
HALFIES
episode  3 score 962.0 average score 937.2     avg single score:  32.06666666666667  test:  nan
HALFIES
episode  4 score 825.0 average score 914.8     avg single score:  27.5  test:  nan
HALFIES
episode  5 score 949.0 average score 920.5     avg single score:  31.633333333333333  test:  nan
HALFIES
episode  6 score 1089.0 average score 944.6     avg single score:  36.3  test:  nan
HALFIES
episode  7 score 886.0 average score 937.2     avg single score:  29.533333333333335  test:  nan
HALFIES
episode  8 score 876.0 average score 930.4     avg single score:  29.2  test:  nan
HALFIES
episode  9 score 902.0 average score 927.6     avg single score:  30.066666666666666  test:  nan


KeyboardInterrupt: 

In [None]:
import matplotlib.pyplot as plt

x = [i+1 for i in range(n_games-100)]
plt.plot(scores)
location = run_name + "/" + run_name
plt.savefig(location +"train.png")

In [None]:
y1 = scores
y2 = []
mean = lambda x: sum(x)/len(x)
for i in range(len(y1)):
    if i < 100:
        pass
    else:
        avg = mean(y1[(i-100):i])
        y2.append(avg)
        
plt.plot(y2)
#plt.show()
plt.savefig(location+"train_smooth.png")