In [22]:
%load_ext autoreload
%autoreload 2

import torch as th
from typing import Dict

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [28]:
class ArtificialHumanEnv():
    """
    Environment that runs the virtual humans and calculates the value of the common good.

    All per-episode tensors are kept in the ``state`` dict. Attribute reads and
    writes for names that are keys of ``state`` are transparently redirected to
    that dict (see ``__getattr__`` / ``__setattr__``), so ``env.contributions``
    is the same object as ``env.state['contributions']``.

    Indices:
        t: agent types [0..1]
    """
    state_dimensions = {
        'punishments': ['agent'],
        'contributions': ['agent'],
        'payoffs': ['agent'],
        'valid': ['agent'],
        'common_good': ['agent'],
        'episode_step': ['agent'],
    }

    def __init__(
            self, *, contributors_model, n_agents, max_contribution, max_punishment, episode_steps, device):
        """
        Args:
            contributors_model: object whose ``act(**state)`` returns the next contributions
            n_agents: number of agents; length of the per-agent state tensors
            max_contribution: stored on the instance; not enforced by this class
            max_punishment: upper bound asserted on the punishments passed to ``step``
            episode_steps: number of ``step`` calls after which an episode is done
            device: torch device on which the state tensors are allocated
        """
        self.episode = 0
        self.episode_steps = episode_steps
        self.device = device
        self.max_contribution = max_contribution
        self.max_punishment = max_punishment
        self.contributors_model = contributors_model
        self.n_agents = n_agents
        self.reset_state()

    def reset_state(self):
        """Replace ``state`` with freshly zeroed tensors on ``self.device``."""
        self.state = {
            'punishments': th.zeros(self.n_agents, dtype=th.int16, device=self.device),
            'contributions': th.zeros(self.n_agents, dtype=th.int16, device=self.device),
            'payoffs': th.zeros(self.n_agents, dtype=th.float32, device=self.device),
            'valid': th.zeros(self.n_agents, dtype=th.bool, device=self.device),
            'common_good': th.tensor(0, dtype=th.float32, device=self.device),
            'episode_step': th.tensor(0, dtype=th.int16, device=self.device)
        }

    def __getattr__(self, name):
        """
        Fallback lookup (only called when normal lookup fails): serve state entries.

        Raises:
            AttributeError: if ``name`` is not a key of ``state``. Raising
                AttributeError (instead of returning None or leaking a KeyError)
                keeps ``hasattr``, ``getattr(obj, name, default)``, ``copy`` and
                ``pickle`` working as expected.
        """
        state = self.__dict__.get('state')
        if state is not None and name in state:
            return state[name]
        raise AttributeError(f"{type(self).__name__!r} object has no attribute {name!r}")

    def __setattr__(self, name, value):
        """Write to ``state[name]`` if ``name`` is a state key, otherwise set a normal attribute."""
        state = self.__dict__.get('state')
        if state is not None and name in state:
            state[name] = value
        else:
            object.__setattr__(self, name, value)

    @staticmethod
    def calc_common_good(contributions, punishments):
        """Return ``1.6 * sum(contributions) + sum(punishments)``."""
        return contributions.sum() * 1.6 + punishments.sum()

    @staticmethod
    def calc_payout(contributions, punishments, commond_good):
        """Return the per-agent payoff ``20 - contribution - punishment + 0.25 * common good``."""
        # TODO: check how to handle missing values
        return 20 - contributions - punishments + 0.25 * commond_good

    def get_contributions(self):
        """Ask the contributors model for contributions, passing the full state as keyword arguments."""
        return self.contributors_model.act(**self.state)

    def init_episode(self):
        """
        Start a new episode: bump the episode counter, zero the state and draw
        the first contributions.

        Returns:
            The state dict (the same object that later ``step`` calls mutate).
        """
        self.episode += 1
        # reset_state also sets 'episode_step' back to zero
        self.reset_state()
        self.contributions = self.get_contributions()
        return self.state

    def step(self, punishments):
        """
        Apply the punishments to the current contributions and advance one step.

        Args:
            punishments: int64 tensor with values of at most ``max_punishment``

        Returns:
            Tuple ``(state, common_good, done)``; ``done`` is True on the step
            number ``episode_steps``.

        Raises:
            ValueError: if called again after the episode is already done.
        """
        self.episode_step += 1

        assert punishments.max() <= self.max_punishment
        assert punishments.dtype == th.int64

        if (self.episode_step == self.episode_steps):
            done = True
        elif self.episode_step > self.episode_steps:
            raise ValueError('Environment is done already.')
        else:
            done = False

        self.punishments = punishments
        self.common_good = self.calc_common_good(self.contributions, self.punishments)
        self.payoffs = self.calc_payout(self.contributions, self.punishments, self.common_good)
        # contributions for the *next* round; payoffs above used the previous ones
        self.contributions = self.get_contributions()

        return self.state, self.common_good, done


In [None]:
# memory
import collections
import numpy as np

class Memory():
    """
    Fixed-size ring buffer that stores whole episodes in pre-allocated tensors.

    Every stored key is held as one tensor of shape
    ``(n_episodes, n_episode_steps, *value_shape)``. ``episode_queue`` holds the
    buffer rows ordered from the most recent episode (index 0) to the oldest.
    When the buffer is about to wrap around, the filled rows are written to disk
    (if ``output_file`` is set).
    """

    def __init__(self, device, n_episodes, n_episode_steps, output_file):
        """
        Args:
            device: torch device of the buffer tensors
            n_episodes: number of episodes (rows) the buffer can hold
            n_episode_steps: number of steps stored per episode
            output_file: path prefix for ``write``; nothing is written if falsy
        """
        self.memory = None
        self.n_episodes = n_episodes
        self.n_episode_steps = n_episode_steps
        self.device = device
        self.output_file = output_file
        self.current_row = 0
        # id of the episode currently being recorded; None until the first episode starts
        self.episode = None
        self.episode_queue = collections.deque([], maxlen=self.n_episodes)

    def init_store(self, state):
        """
        Allocate the buffer tensors, using ``state`` as a template for keys,
        shapes and dtypes, plus the bookkeeping keys 'episode' and 'episode_steps'.
        """
        grid = (self.n_episodes, self.n_episode_steps)
        self.memory = {
            k: th.empty((*grid, *t.shape), dtype=t.dtype, device=self.device)
            for k, t in state.items()
        }
        self.memory['episode'] = th.empty(grid, dtype=th.int64, device=self.device)
        self.memory['episode_steps'] = th.empty(grid, dtype=th.int64, device=self.device)

    def next_episode(self, episode):
        """
        Start recording a new episode with id ``episode``.

        The very first episode is stored in row 0; each later call advances to
        the next row. Before wrapping around from the last row, the filled buffer
        is written out.
        """
        if self.episode_queue:
            if self.current_row == (self.n_episodes - 1):
                self.write()
            self.current_row = (self.current_row + 1) % self.n_episodes
        self.episode = episode
        self.episode_queue.appendleft(self.current_row)

    def add(self, state, episode_step):
        """Store ``state`` at position ``episode_step`` of the current episode row."""
        self.memory['episode'][self.current_row,episode_step] = self.episode
        self.memory['episode_steps'][self.current_row,episode_step] = episode_step
        for k, t in state.items():
            self.memory[k][self.current_row,episode_step] = t

    def sample(self, batch_size, horizon, **kwargs):
        """
        Draw ``batch_size`` distinct episodes uniformly from the ``horizon`` most
        recent ones.

        Returns:
            Dict of tensors (see ``get_relative``), or None if fewer than
            ``batch_size`` episodes are available within the horizon.
        """
        eff_horizon = min(len(self), horizon)
        if batch_size > eff_horizon:
            # sampling without replacement is impossible; let the caller skip the update
            return None
        relative_episode = np.random.choice(eff_horizon, batch_size, replace=False)
        return self.get_relative(relative_episode, **kwargs)

    def last(self, batch_size, **kwargs):
        """Return the ``batch_size`` most recent episodes, newest first."""
        # compare against the number of stored episodes, not the capacity
        assert batch_size <= len(self)
        relative_episodes = np.arange(batch_size)
        return self.get_relative(relative_episodes, **kwargs)

    def get_relative(self, relative_episode, keys=None):
        """
        Gather episodes by recency index (0 = most recent).

        Args:
            relative_episode: iterable of integer positions into ``episode_queue``
            keys: optional subset of stored keys to return; all keys if None
        """
        if keys is None:
            keys = self.memory.keys()
        hist_idx = th.tensor(
            [self.episode_queue[rp] for rp in relative_episode], dtype=th.int64, device=self.device)
        return {k: v[hist_idx] for k, v in self.memory.items() if k in keys}

    def rec(self, state, episode, episode_steps):
        """
        Record one step, allocating the buffer on first use.

        If ``episode`` differs from the episode currently being recorded, a new
        episode row is started first, so calling ``next_episode`` beforehand is
        optional.
        """
        if self.memory is None:
            self.init_store(state)
        if episode != self.episode:
            self.next_episode(episode)
        self.add(state, episode_steps)

    def __len__(self):
        """Number of episodes currently addressable through ``episode_queue``."""
        return len(self.episode_queue)

    def write(self):
        """
        Save rows ``0..current_row`` (inclusive) to
        ``'{output_file}_{episode}.pt'``. Does nothing if no output file is
        configured or nothing has been recorded yet.
        """
        if self.output_file and self.memory is not None and self.episode_queue:
            th.save(
                {
                    # clone so only the filled rows are serialised, not the whole buffer storage
                    k: t[:self.current_row + 1].clone() for k, t in self.memory.items()
                },
                f'{self.output_file}_{self.episode}.pt'
            )

    def __del__(self):
        # getattr guards against a partially constructed instance
        if getattr(self, 'memory', None) is not None and getattr(self, 'episode_queue', None):
            self.write()

In [None]:
import torch as th

def shift_obs(tensor_dict):
    """
    Split every tensor into a "previous" and a "current" view along axis 1.

    Args:
        tensor_dict: mapping of name -> tensor; each tensor needs to have the
            episode_step dimension at second position

    Returns:
        Tuple ``(previous, current)`` of dicts with the same keys. ``previous``
        drops the last entry along axis 1, ``current`` drops the first one.
    """
    previous, current = {}, {}
    for name, tensor in tensor_dict.items():
        previous[name] = tensor[:, :-1]
        current[name] = tensor[:, 1:]
    return previous, current


class DQN():
    """
    Deep Q-learning controller with a policy network and a periodically
    synchronised target network.
    """

    def __init__(
            self, *, manager_model_args, opt_args, gamma, target_update_freq, device):
        """
        Args:
            manager_model_args: keyword arguments forwarded to ``get_manager_model``
            opt_args: keyword arguments forwarded to ``th.optim.RMSprop``
            gamma: discount factor applied to the next-state values
            target_update_freq: the target network is synchronised whenever
                ``episode % target_update_freq == 0``
            device: torch device for both networks
        """
        self.device = device

        # NOTE(review): get_manager_model is not defined in the visible part of
        # this notebook; it has to be in scope before DQN is instantiated.
        self.policy_model = get_manager_model(device=device, **manager_model_args).to(device)
        self.target_model = get_manager_model(device=device, **manager_model_args).to(device)

        self.target_model.eval()
        self.optimizer = th.optim.RMSprop(self.policy_model.parameters(), **opt_args)
        self.gamma = gamma
        self.target_update_freq = target_update_freq

    def init_episode(self, episode):
        """Synchronise the target network with the policy network every ``target_update_freq`` episodes."""
        if (episode % self.target_update_freq == 0):
            # copy policy net to target net
            self.target_model.load_state_dict(self.policy_model.state_dict())

        # TODO: add for rnn
        # self.policy_model.reset()
        # self.target_model.reset()

    def get_q(self, **observations):
        """Return the policy network's output for ``observations`` without tracking gradients."""
        with th.no_grad():
            return self.policy_model(**observations)

    def update(self, observations, actions, rewards):
        """
        Perform one optimisation step on the policy network.

        Args:
            observations: dict of tensors with the episode step at axis 1; it is
                split into previous/current observations by ``shift_obs``
            actions: integer tensor indexing the last axis of the policy output
                for the *previous* observations
                (assumes one step fewer than ``observations`` — TODO confirm with caller)
            rewards: tensor that must broadcast against the target network's
                per-state maximum
        """
        previous_obs, current_obs = shift_obs(observations)

        self.policy_model.reset()
        self.target_model.reset()

        # Q(s_t, a_t) of the actions that were actually taken
        policy_state_action_values = self.policy_model(
            **previous_obs).gather(-1, actions.unsqueeze(-1))

        # max_a Q_target(s_{t+1}, a); no gradient flows into the target network
        with th.no_grad():
            next_state_values = self.target_model(**current_obs).max(-1)[0]

        # Compute the expected Q values
        expected_state_action_values = (next_state_values * self.gamma) + rewards

        # Compute Huber loss
        loss = th.nn.functional.smooth_l1_loss(policy_state_action_values,
                                               expected_state_action_values.unsqueeze(-1))

        # Optimize the model
        self.optimizer.zero_grad()
        loss.backward()
        # clamp gradients to [-1, 1]; unlike a manual loop over `param.grad`,
        # this skips parameters that did not receive a gradient
        th.nn.utils.clip_grad_value_(self.policy_model.parameters(), 1)
        self.optimizer.step()

In [None]:
from itertools import count


def int_to_ordinal(arr, n_levels):
    """
    Turns an integer series into an ordinal encoding.

    Level ``i`` is encoded as ``i`` ones followed by zeros, using
    ``n_levels - 1`` columns in total.

    Args:
        arr: integer array of levels, used to index the lookup table
        n_levels: number of distinct levels

    Returns:
        Array with one encoded row per entry of ``arr``.
    """
    rows = []
    for level in range(n_levels):
        n_zeros = n_levels - level - 1
        rows.append([1] * level + [0] * n_zeros)
    lookup = np.array(rows)

    return lookup[arr]


def int_to_onehot(arr, n_levels):
    """
    One-hot encode an integer array.

    Args:
        arr: integer array; entry ``i`` selects the column set to 1 in row ``i``
        n_levels: number of columns of the encoding

    Returns:
        Float array of shape ``(arr.size, n_levels)``.
    """
    n_rows = arr.size
    onehot = np.zeros((n_rows, n_levels))
    row_idx = np.arange(n_rows)
    onehot[row_idx, arr] = 1
    return onehot


def _encoding_items(encodings):
    """Iterate ``(key, kwargs)`` pairs from either a mapping or an iterable of pairs."""
    return encodings.items() if isinstance(encodings, dict) else encodings


def encoder(state, encodings):
    """
    Return a copy of ``state`` in which the listed entries are replaced by
    their encoded version.

    Args:
        state: dict of tensors
        encodings: mapping (or iterable of pairs) ``key -> kwargs`` where
            ``kwargs`` are the keyword arguments of ``single_encode``
            (e.g. ``{'contributions': {'encoding': 'ordinal', 'n_levels': 21}}``)

    Returns:
        New dict with all keys of ``state``; entries not listed in
        ``encodings`` are passed through unchanged.
    """
    return {
        **state,
        **{
            k: single_encode(state[k], **encoding) for k, encoding in _encoding_items(encodings)
        }
    }

def joined_encoder(state, encoding):
    """
    Encode the listed entries of ``state`` and concatenate them along the last axis.

    Args:
        state: dict of tensors
        encoding: mapping (or iterable of pairs) ``key -> kwargs`` for
            ``single_encode``; the order determines the column order

    Returns:
        Single tensor holding all encodings side by side. Numeric entries get a
        trailing axis by default so they can be concatenated with the
        multi-column encodings.
    """
    encoded = [
        single_encode(state[k], **{'add_axis': True, **e})
        for k, e in _encoding_items(encoding)
    ]
    return th.cat(encoded, dim=-1)


def single_encode(tensor, encoding='numeric', n_levels=None, add_axis=False):
    """
    Encode a tensor of integer levels.

    Args:
        tensor: torch tensor; it is cast to ``th.long`` before encoding
        encoding: one of
            'ordinal' - level ``i`` becomes ``i`` ones followed by zeros
                        (``n_levels - 1`` columns, same layout as ``int_to_ordinal``)
            'onehot'  - standard one-hot with ``n_levels`` columns
            'numeric' - the integer values themselves
        n_levels: number of levels; required for 'ordinal' and 'onehot'
        add_axis: for 'numeric' only, append a trailing axis of size one

    Returns:
        ``th.long`` tensor; 'ordinal' and 'onehot' append one trailing axis.

    Raises:
        ValueError: if ``encoding`` is unknown.
    """
    if encoding not in ('ordinal', 'onehot', 'numeric'):
        raise ValueError(f"Encoding type {encoding} is unknown.")

    levels = tensor.long()
    if encoding == 'ordinal':
        assert n_levels is not None, 'Number of levels not provided.'
        # column j is 1 iff j < level
        thresholds = th.arange(n_levels - 1, device=levels.device)
        return (thresholds < levels.unsqueeze(-1)).long()
    if encoding == 'onehot':
        assert n_levels is not None, 'Number of levels not provided.'
        return th.nn.functional.one_hot(levels, num_classes=n_levels)
    # encoding == 'numeric'
    return levels.unsqueeze(-1) if add_axis else levels



def eps_greedy(q_values, eps, device):
    """
    Epsilon-greedy action selection.

    Args:
        q_values: Tensor of type `th.float` and arbitrary shape, last dimension reflects the actions.
        eps: fraction of actions sampled at random
        device: device on which the random tensors are created
    Returns:
        actions: Tensor of type `th.long` with the same dimensions as q_values, except for the last.
    """
    batch_shape = q_values.shape[:-1]
    n_actions = q_values.shape[-1]

    best = q_values.argmax(-1)
    # draw the candidate random action first, then the coin flip that decides
    # whether it replaces the greedy one
    explore_actions = th.randint(0, n_actions, size=batch_shape, device=device)
    explore = th.rand(size=batch_shape, device=device) < eps

    return th.where(explore, explore_actions, best)


def run(env, controller, encoder, memory, n_episodes, eps, sample_args, device):
    """
    Training loop: roll out episodes with an epsilon-greedy policy, record every
    step in ``memory`` and update the controller at the end of each episode.

    Args:
        env: environment with ``init_episode() -> state`` and
            ``step(actions) -> (state, rewards, done)``
        controller: object with ``init_episode(episode)``, ``get_q(**obs)`` and
            ``update(observations, actions, rewards)``
        encoder: callable turning a state dict into the dict of model inputs
        memory: replay memory with ``next_episode``, ``rec`` and ``sample``
        n_episodes: number of episodes to run
        eps: exploration rate passed to ``eps_greedy``
        sample_args: keyword arguments for ``memory.sample``
        device: torch device used for action sampling
    """
    for episode in range(n_episodes):
        print(f'Start episode {episode}.')
        # init_episode returns only the initial state
        state = env.init_episode()

        # initialize episode for all controller
        controller.init_episode(episode)
        memory.next_episode(episode)

        for step in count():
            # Get observations
            state_enc = encoder(state)

            # Get q values from controller
            q_values = controller.get_q(**state_enc)

            # Sample a action
            selected_action = eps_greedy(q_values=q_values, eps=eps, device=device)
            # pass actions to environment and advance by one step
            state, rewards, done = env.step(selected_action)
            # store the observation together with the action taken on it and the reward received
            memory.rec(
                {**state_enc, 'action': selected_action, 'rewards': rewards}, episode, step)

            if done:
                # allow all controller to update themself
                sample = memory.sample(**sample_args)
                if sample is not None:
                    observations = {k: sample[k] for k in state_enc}
                    # NOTE(review): update() shifts the observations by one step, so
                    # the last action/reward of each episode has no successor
                    # observation and is dropped here - confirm the intended alignment.
                    controller.update(
                        observations, sample['action'][:, :-1], sample['rewards'][:, :-1])
                break

In [29]:

from aimanager.model.neural.random import RandomArtificialHumans

# Everything in this demo runs on the CPU; `rec_device` is defined alongside
# `device` but is not used further in this cell.
device = th.device('cpu')
rec_device = th.device('cpu')

# contributors model handed to the environment below
rah = RandomArtificialHumans(max_contribution=20, device=device)

# four agents, 16 steps per episode
env = ArtificialHumanEnv(
    contributors_model=rah,
    n_agents=4,
    max_contribution=20,
    max_punishment=30,
    episode_steps=16,
    device=device,
)

In [30]:
# Smoke test: run one full episode with uniformly random punishments.
state = env.init_episode()
print(state)
done = False
while not done:
    # one punishment per agent, drawn from the full valid range [0, max_punishment];
    # the bounds come from the environment instead of being repeated as literals
    punishments = th.randint(0, env.max_punishment + 1, (env.n_agents,), device=device)
    state, reward, done = env.step(punishments)
    print(state, reward, done)
    

{'punishments': tensor([0, 0, 0, 0], dtype=torch.int16), 'contributions': tensor([ 6,  5, 19,  8]), 'payoffs': tensor([0., 0., 0., 0.]), 'valid': tensor([False, False, False, False]), 'common_good': tensor(0.), 'episode_step': tensor(0, dtype=torch.int16)}
{'punishments': tensor([ 8,  7, 27,  8]), 'contributions': tensor([ 7, 14,  1, 19]), 'payoffs': tensor([33.7000, 35.7000,  1.7000, 31.7000]), 'valid': tensor([False, False, False, False]), 'common_good': tensor(110.8000), 'episode_step': tensor(1, dtype=torch.int16)} tensor(110.8000) False
{'punishments': tensor([28,  8,  7,  0]), 'contributions': tensor([15, 18,  2, 16]), 'payoffs': tensor([12.1500, 25.1500, 39.1500, 28.1500]), 'valid': tensor([False, False, False, False]), 'common_good': tensor(108.6000), 'episode_step': tensor(2, dtype=torch.int16)} tensor(108.6000) False
{'punishments': tensor([10, 27, 30, 10]), 'contributions': tensor([18,  6, 14, 17]), 'payoffs': tensor([34.6500, 14.6500, 27.6500, 33.6500]), 'valid': tensor([Fa