In [1]:
import sys, os
import shutil

sys.path.append('/Users/jasmineli/Desktop/moral-ai-irl')
sys.path.append('/Users/jasmineli/Desktop/moral-ai-irl/human_aware_rl_master')
import torch
from torch import nn
import numpy as np
import pickle
import argparse
import matplotlib.pyplot as plt
from human_aware_rl.ppo.ppo_rllib_client import run
from human_aware_rl_master.human_aware_rl.human.process_dataframes import *
from human_aware_rl.dummy.rl_agent import *
from human_aware_rl.rllib.utils import get_base_ae
from overcooked_ai_py.agents.agent import AgentPair
from human_aware_rl.irl.config_model import get_train_config
from overcooked_ai_py.mdp.overcooked_mdp import OvercookedState

In [2]:
class TorchLinearReward(nn.Module):
    """Small reward network: ``Linear -> ELU -> Linear``.

    Despite the name, the model is not purely linear: an ELU activation sits
    between the two affine layers. The output layer has no activation.

    Args:
        n_input: size of one input feature vector. Anything convertible with
            ``int()`` is accepted (e.g. a one-element tensor), because
            ``nn.Linear`` expects plain integers for its sizes.
        n_h1: width of the hidden layer.
        n_h2: width of the output layer (1 gives a scalar reward per state).
    """

    def __init__(self, n_input, n_h1=400, n_h2=1):
        super().__init__()
        # Normalise the size up front so `in_features` is always a real int.
        n_input = int(n_input)
        self.fc1 = nn.Linear(in_features=n_input, out_features=n_h1, bias=True)
        self.fc2 = nn.Linear(in_features=n_h1, out_features=n_h2, bias=True)
        self.act = nn.ELU()

    def forward(self, x):
        """Map a batch of feature vectors to rewards (gradient-tracking)."""
        x = self.fc1(x)
        x = self.act(x)
        # No activation after the last layer: the reward is unbounded.
        return self.fc2(x)

    def get_theta(self):
        """Return a one-element list holding a detached view of ``fc1.weight``.

        Only the first layer's weight matrix is returned; ``fc2`` and the
        biases are not included.
        """
        return [self.fc1.weight.detach()]

    def get_rewards(self, states):
        """Evaluate rewards without building an autograd graph.

        Args:
            states: a ``torch.Tensor`` (used as-is) or any array-like
                (``np.ndarray``, nested list/tuple of numbers), which is
                converted to a float32 tensor first.

        Returns:
            A tensor of rewards with ``requires_grad == False``.
        """
        if not isinstance(states, torch.Tensor):
            # isinstance-based check also covers ndarray subclasses and plain
            # Python sequences, which the exact-type comparison rejected.
            states = torch.tensor(states, dtype=torch.float)
        with torch.no_grad():
            # Calling the module (not .forward) keeps nn.Module hooks working.
            return self(states)


In [3]:
def _loadEnvironment(config):
    """Build the environment described by ``config["environment_params"]``.

    The ``mdp_params`` and ``env_params`` entries are handed to
    ``get_base_ae`` and the ``env`` attribute of the returned object is
    returned.
    """
    environment_params = config["environment_params"]
    mdp_params, env_params = environment_params["mdp_params"], environment_params["env_params"]
    return get_base_ae(mdp_params, env_params).env

def _loadProcessedHumanData(data_path, view_traj=False):
    assert os.path.isfile(data_path)
    with open(data_path, 'rb') as file:
        human_data = pickle.load(file)
    
    gridworld = human_data['gridworld']
    trajectory = human_data['trajectory']

    states = []
    actions = []
    scores = []
    for i in range(len(trajectory)):
        state = []
        action = []
        score = []
        for j in range(len(trajectory[i])):
            state_dict = trajectory[i][j]
            s = state_dict['state']
            a = state_dict['joint_action']
            sc = state_dict['score']

            s = OvercookedState.from_dict(s)
            state.append(s)
            action.append(a)
            score.append(sc)
            
            if view_traj:
                print(gridworld.state_string(s))
        states.append(state)
        actions.append(action)
        scores.append(score)

    assert len(states) == len(trajectory)
    assert len(actions) == len(trajectory)
    assert len(scores) == len(trajectory)
    return states, actions, scores

def _convertAction2Index(actions):
    act = []
    for traj in actions:
        temp = []
        for idx in traj:
            act_0 = tuple(idx[0]) if type(idx[0]) == list else idx[0]
            act_1 = tuple(idx[1]) if type(idx[1]) == list else idx[1]
            temp.append([Action.ACTION_TO_INDEX[act_0], Action.ACTION_TO_INDEX[act_1]])
        act.append(temp)
    return act

def getVisitation(states, joint_action, scores, env, target_player_idx=0, max_logged_games=10):
    """Count how often each reward-feature vector is visited, averaged per game.

    For every step, ``env.human_coop_state_encoding(state, action, score)`` is
    called and the entry at ``target_player_idx`` is used (as a tuple) as the
    key of the visitation table.

    Args:
        states: one sequence of states per game.
        joint_action: one sequence of joint actions per game, aligned with
            ``states``.
        scores: one sequence of scores per game, aligned with ``states``.
        env: object providing ``human_coop_state_encoding(s, a, sc)``.
        target_player_idx: which player's encoding to count (default 0, the
            value that used to be hard-coded).
        max_logged_games: print the length of at most this many games.

    Returns:
        dict mapping feature tuple -> visit count divided by ``len(states)``.
        Empty when no steps were seen.
    """
    num_game = len(states)
    print(f'number of games={num_game}')

    freq = {}
    for game_idx, (game, actions, score) in enumerate(zip(states, joint_action, scores)):
        # Log each of the first few games once. The old `while i < 10` loop
        # printed the first game's length ten times and never looked at the
        # other games.
        if game_idx < max_logged_games:
            print(f'length of games={len(game)}')
        for s, a, sc in zip(game, actions, score):
            encoding = env.human_coop_state_encoding(s, a, sc)[target_player_idx]
            features = tuple(encoding)  # tuples are hashable, so usable as dict keys
            freq[features] = freq.get(features, 0) + 1

    # Normalise by the number of games. With no games the table is empty, so
    # no division takes place.
    return {features: count / num_game for features, count in freq.items()}

def getExpertVisitation(env, data_path):
    """Compute the visitation table of the recorded human (expert) games.

    Loads the trajectories stored at ``data_path``, converts the joint actions
    to action indices and passes everything to ``getVisitation``.
    """
    expert_states, raw_actions, expert_scores = _loadProcessedHumanData(data_path, view_traj=False)
    action_indices = _convertAction2Index(raw_actions)
    return getVisitation(expert_states, action_indices, expert_scores, env)

def getAgentVisitation(train_config, env):
    '''
    Train an RL agent with the current reward function and compute the
    visitation table of its evaluation rollouts.

    - train_config: the configuration passed to ``run`` (the rllib trainer entry point)
    - env: environment handed to ``getVisitation`` for encoding the states

    The result of ``run`` must contain an ``'evaluation'`` entry with the keys
    ``'states'``, ``'actions'`` and ``'sparse_reward'``.

    Returns the visitation dict produced by ``getVisitation``, or ``None`` when
    training or post-processing raised an exception. In that case the error
    message is printed and the full traceback is written to stderr, so callers
    must check for ``None`` before using the result.
    '''
    import traceback

    # train and get rollouts
    try:
        results = run(train_config)
        evaluation = results['evaluation']
        states = evaluation['states']
        actions = _convertAction2Index(evaluation['actions'])
        scores = evaluation['sparse_reward']
        return getVisitation(states, actions, scores, env)
    except Exception as e:
        # Best-effort by design: report the failure instead of raising, but
        # keep the traceback so the cause can actually be diagnosed.
        print('ERROR: could not get Agent Visitation. -->' + str(e))
        traceback.print_exc()
        return None

def getStatesAndGradient(expert_sv, agent_sv):
    """Turn two visitation tables into network inputs and per-state gradients.

    For every feature tuple that occurs in either table the value
    ``agent_sv[state] - expert_sv[state]`` is computed (a missing entry counts
    as 0). Agent states come first, followed by expert-only states, each in
    the iteration order of its table.

    Returns:
        states: float tensor with one row per feature tuple.
        grad: float tensor of shape ``(n_states, 1)`` holding the differences,
            row-aligned with ``states``.
    """
    # mu_agent - mu_expert, keyed by feature tuple
    diff = dict(agent_sv)
    for feature, expert_count in expert_sv.items():
        diff[feature] = diff.get(feature, 0.0) - expert_count

    # organize into NN input
    states = torch.stack([torch.tensor(feature, dtype=torch.float) for feature in diff])
    grad = torch.tensor(list(diff.values()), dtype=torch.float).unsqueeze(dim=1)
    return states, grad

In [4]:
trial = '_notebook'
data_path = '/home/jasmine/moral-ai-irl/overcooked_participants_data/cleaned/trajectories_onion_cook_2022-07-31_15:05:31.data'

# init
n_epochs = 10
i = 1

# directory to save results
cwd = os.getcwd()
save_dir = f'{cwd}/result/human/T{trial}'
# makedirs also creates the missing `result/human` parents and, with
# exist_ok=True, is a no-op when the directory already exists (os.mkdir
# failed on a fresh checkout and raced with the exists() check).
os.makedirs(save_dir, exist_ok=True)
# make a copy of the config file next to the results
path = os.path.join(save_dir, 'config.py')
shutil.copy('config_model.py', path)

print('initiating models and optimizers...')
reward_obs_shape = torch.tensor([15])       # change if reward shape changed.
# nn.Linear expects a plain int for its input size, so unwrap the tensor.
reward_model = TorchLinearReward(reward_obs_shape.item())
# NOTE(review): weight_decay=0.9 is far larger than commonly used values and
# the rewards printed after the first step change by well over an order of
# magnitude - confirm these hyper-parameters are intended.
optim = torch.optim.SGD(reward_model.parameters(), lr=0.02, momentum=0.9, weight_decay=0.9)

print('loading training configurations...')
config = get_train_config()
print(config['training_params']['num_gpus'])
print(config['model_params']['use_lstm'])

# The RL trainer asks the current reward network for the rewards.
config['environment_params']['multi_agent_params']['custom_reward_func'] = reward_model.get_rewards

print('getting expert trajectory and state visitation...')
env = _loadEnvironment(config)
# expert_state_visit = getExpertVisitation(env, data_path)
print('complete')

initiating models and optimizers...
loading training configurations...
0
False
getting expert trajectory and state visitation...
complete


Train 1 epoch:

In [5]:
# Visitation table of the recorded human games (feature tuple -> visits per game).
expert_state_visit = getExpertVisitation(env, data_path)
print('{} states in expert state visitations'.format(len(expert_state_visit)))

# Dump the reward-network parameters before any optimizer step.
print('model parameters:')
for param in reward_model.parameters():
    print(param)

number of games=1922
length of games=21
length of games=21
length of games=21
length of games=21
length of games=21
length of games=21
length of games=21
length of games=21
length of games=21
length of games=21
765 states in expert state visitations
model parameters:
Parameter containing:
tensor([[-0.1803, -0.0027, -0.1179,  ...,  0.2176, -0.1291,  0.0154],
        [-0.1754,  0.2035,  0.0118,  ...,  0.0039,  0.1261,  0.0601],
        [-0.0435,  0.2515, -0.1740,  ...,  0.0430, -0.1235, -0.0320],
        ...,
        [ 0.0504,  0.0936, -0.2381,  ...,  0.1450, -0.2033, -0.1953],
        [ 0.0209,  0.2391,  0.2537,  ..., -0.0946, -0.1037,  0.2535],
        [-0.0997,  0.0147, -0.1818,  ..., -0.0778, -0.1428, -0.1709]],
       requires_grad=True)
Parameter containing:
tensor([-0.0951, -0.0182,  0.1727,  0.1131,  0.0899,  0.2304, -0.0089, -0.1329,
         0.1795, -0.0579,  0.0200,  0.0404, -0.1870,  0.1478, -0.1582,  0.0759,
         0.0011, -0.1054,  0.0184,  0.1065,  0.2506, -0.0499, -0.20

In [6]:
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
# This notebook run expects the CPU-only, non-recurrent training configuration.
assert config['training_params']['num_gpus'] == 0
assert config['model_params']['use_lstm'] == False

agent_state_visit = getAgentVisitation(config, env)
# getAgentVisitation reports failures by printing and returning None; stop here
# with a clear message instead of failing later with an unrelated TypeError.
if agent_state_visit is None:
    raise RuntimeError('getAgentVisitation failed; see the error printed above')

DummyPolicy: layout=coop_experiment_1, agent=MAIDumbAgentRightCoop
DummyPolicy: layout=coop_experiment_1, agent=MAIDumbAgentRightCoop
0: ep rew mean=-4.867697918054613, max=-1.2578230644576252, min=-15.397325423546135
the policies are: ['ppo', 'dummy']
number of games=3
length of games=15
length of games=15
length of games=15
length of games=15
length of games=15
length of games=15
length of games=15
length of games=15
length of games=15
length of games=15


In [7]:
# Number of distinct feature tuples the agent visited, followed by the table itself.
print(len(agent_state_visit), agent_state_visit, sep='\n')


37
{(4.0, 3.0, 8.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0): 1.0, (3.0, 3.0, 7.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0): 0.6666666666666666, (3.0, 3.0, 7.0, 2.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0): 0.6666666666666666, (4.0, 3.0, 7.0, 3.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0): 0.6666666666666666, (4.0, 3.0, 6.0, 3.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0): 0.6666666666666666, (4.0, 3.0, 7.0, 3.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0): 0.3333333333333333, (4.0, 3.0, 7.0, 3.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0): 0.3333333333333333, (3.0, 3.0, 7.0, 4.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0): 0.3333333333333333, (3.0, 3.0, 7.0, 5.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0): 0.3333333333333333, (4.0, 3.0, 7.0, 6.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0): 0.3333333333333333, (4.0, 3.0, 8.0, 6.0, 0.0, 0.0, 1.0, 0.0, 

In [8]:
# Pair every visited feature vector with its (agent - expert) visitation
# difference; `states` is the network input, `grad_r` the per-state gradient.
states, grad_r = getStatesAndGradient(expert_state_visit, agent_state_visit)
print(states, grad_r, sep='\n')

tensor([[ 4.,  3.,  8.,  ...,  0.,  0.,  0.],
        [ 3.,  3.,  7.,  ...,  0.,  0.,  0.],
        [ 3.,  3.,  7.,  ...,  0.,  0.,  0.],
        ...,
        [ 2.,  1.,  8.,  ...,  0.,  0.,  1.],
        [ 3.,  3., 10.,  ...,  0.,  0.,  0.],
        [ 3.,  3.,  9.,  ...,  0.,  0.,  0.]])
tensor([[ 1.0000e+00],
        [ 6.6615e-01],
        [ 6.6667e-01],
        [ 6.6407e-01],
        [ 6.5990e-01],
        [ 3.2969e-01],
        [ 2.1211e-01],
        [ 3.3281e-01],
        [ 3.3229e-01],
        [ 3.2969e-01],
        [ 3.3125e-01],
        [ 2.5269e-01],
        [ 3.3229e-01],
        [ 3.3333e-01],
        [ 3.3229e-01],
        [ 3.3073e-01],
        [ 3.3333e-01],
        [ 6.6615e-01],
        [ 3.3281e-01],
        [ 3.3281e-01],
        [ 3.3229e-01],
        [ 3.3333e-01],
        [ 3.3333e-01],
        [ 3.3073e-01],
        [ 3.3229e-01],
        [ 2.9431e-01],
        [ 5.4440e-01],
        [ 1.5175e-01],
        [ 3.3073e-01],
        [ 3.3229e-01],
        [ 3.3125e-01

In [9]:
# Gradient-tracking forward pass over all visited states. Call the module
# itself rather than .forward() so nn.Module's call machinery (hooks) runs.
reward = reward_model(states)
print(f'rewards = {reward}')

rewards = tensor([[-0.0326],
        [-0.0823],
        [-0.0408],
        [-0.2622],
        [-0.2532],
        [-0.5684],
        [-0.4476],
        [-0.3029],
        [-0.3815],
        [-0.4874],
        [-0.8296],
        [-0.7917],
        [-0.9597],
        [-0.1160],
        [-0.2011],
        [-0.2033],
        [-0.2391],
        [-0.5575],
        [-0.3283],
        [-0.3629],
        [-0.4073],
        [-0.7869],
        [-0.8135],
        [-0.7939],
        [-0.1161],
        [-0.1275],
        [-0.1311],
        [-0.1199],
        [-0.2197],
        [-0.5492],
        [-0.5342],
        [-0.1281],
        [-0.2017],
        [-0.2704],
        [-0.8108],
        [-0.8271],
        [-0.8376],
        [-0.3653],
        [-0.3383],
        [-0.4509],
        [-0.3522],
        [-0.3233],
        [-0.2440],
        [-0.2019],
        [-0.1339],
        [-0.2446],
        [-0.1758],
        [-0.4530],
        [-0.6260],
        [-0.3846],
        [-0.4026],
        [-0.2522],
  

In [10]:
# gradient descent
# Clear gradients accumulated by any earlier backward pass.
optim.zero_grad()
# `reward` has one row per state, so backward() needs an explicit upstream
# gradient: grad_r (agent minus expert visitation, from getStatesAndGradient)
# weights each state's reward when gradients are propagated to the parameters.
reward.backward(gradient=grad_r)
# Apply one SGD update to the reward-network parameters.
optim.step()

In [11]:
# Dump the reward-network parameters again, now after the optimizer step.
print('model parameters:')
for param in reward_model.parameters():
    print(param)

model parameters:
Parameter containing:
tensor([[-0.1776, -0.0067, -0.1200,  ...,  0.2136, -0.1268,  0.0154],
        [-0.1743,  0.1899,  0.0041,  ...,  0.0039,  0.1239,  0.0600],
        [-0.0362,  0.2701, -0.1467,  ...,  0.0422, -0.1213, -0.0333],
        ...,
        [ 0.0495,  0.0919, -0.2338,  ...,  0.1424, -0.1996, -0.1917],
        [ 0.0207,  0.2336,  0.2489,  ..., -0.0929, -0.1018,  0.2491],
        [-0.0982,  0.0123, -0.1801,  ..., -0.0764, -0.1403, -0.1676]],
       requires_grad=True)
Parameter containing:
tensor([-9.4050e-02, -1.9384e-02,  1.7323e-01,  1.1227e-01,  8.8740e-02,
         2.2495e-01, -8.2278e-03, -1.3150e-01,  1.7644e-01, -5.5884e-02,
         1.9842e-02,  4.0598e-02, -1.8294e-01,  1.4501e-01, -1.5486e-01,
         7.4371e-02,  1.1939e-03, -1.0314e-01,  1.7847e-02,  1.0529e-01,
         2.4615e-01, -4.9848e-02, -2.0044e-01, -1.8187e-01, -1.3087e-01,
         1.9807e-01,  4.1375e-02,  3.0948e-02, -1.5169e-01, -2.1891e-01,
        -2.3173e-01,  1.0497e-01,  1.73

In [12]:
# Re-evaluate the rewards of the same states with the updated parameters.
# Call the module itself rather than .forward() so nn.Module's call
# machinery (hooks) runs.
reward = reward_model(states)
print(f'rewards = {reward}')

rewards = tensor([[ -9.1838],
        [-10.4790],
        [-12.2273],
        [-14.4247],
        [-14.1154],
        [-14.4948],
        [-13.8008],
        [-16.2893],
        [-18.1002],
        [-19.8660],
        [-20.0323],
        [-16.5329],
        [-17.6014],
        [-14.0788],
        [-14.8102],
        [-14.5213],
        [-14.4973],
        [-14.8755],
        [-16.6379],
        [-18.4745],
        [-20.6102],
        [-20.7840],
        [-20.8070],
        [-20.8197],
        [-10.4281],
        [ -8.9988],
        [-13.7246],
        [-13.3956],
        [-14.1525],
        [-14.5515],
        [-14.9178],
        [-15.8546],
        [-17.6738],
        [-19.4545],
        [-20.4160],
        [-20.4694],
        [-20.4429],
        [-18.8878],
        [-16.8653],
        [-17.6302],
        [-16.1726],
        [-16.5063],
        [-14.6494],
        [-11.3701],
        [ -9.4943],
        [ -7.0167],
        [ -5.1615],
        [ -5.9708],
        [ -6.6312],
        [ 