In [None]:
import numpy as np
import math

class GridWorld:
    """Square grid world with a wall cell, a goal, a danger cell and coins.

    Positions are ``(row, col)`` tuples. Actions are integers:
    0 = up (row - 1), 1 = down (row + 1), 2 = left (col - 1),
    3 = right (col + 1). An episode ends when the agent reaches the goal,
    reaches the danger cell, or ``horizon`` steps have elapsed.

    Args:
        k: Number of bins used to quantize the feedback in ``get_feedback``.
        d: Stored on the instance; not used by this class itself.
        noise: Label-noise level used by ``get_feedback``. With probability
            ``1 - noise + noise / k`` the true bin is reported, otherwise one
            of the other bins is reported uniformly.
        size: Side length of the grid.
        danger: Terminal cell whose feedback reward is zero.
        goal: Terminal cell that adds a bonus to the reward.
        wall: Cell that cannot be entered.
        coins: Cells holding a coin at the start of every episode.
        horizon: Maximum number of steps per episode.
        feedback_every: Stored on the instance for callers; not read here.
    """

    def __init__(self, k, d, noise, size=8, danger=(7, 1), goal=(4, 5), wall=(2, 5),
                 coins=((1, 6), (4, 2), (5, 5)), horizon=50, feedback_every=10):
        self.noise = noise
        self.k = k
        self.d = d
        self.size = size
        self.horizon = horizon
        self.goal = tuple(goal)
        self.danger = tuple(danger)
        self.wall = tuple(wall)
        self._init_coins = tuple(map(tuple, coins))
        self.coins = set(self._init_coins)
        self.collected_coins = set()
        self.done = 0
        self.feedback_every = feedback_every
        self.gamma = 1
        self.previous_pos = None
        # Episode state is initialised here as well as in reset(), so that
        # step()/get_feedback() on a fresh instance do not fail with
        # AttributeError. No random numbers are drawn.
        self.t = 0
        self.collected = 0
        self.pos = (7, 0)

    def reset(self, random=False):
        """Start a new episode and return the starting position.

        Args:
            random: If true, sample the start uniformly among cells that are
                not the goal, danger or wall; otherwise start at ``(7, 0)``.
        """
        self.done = 0
        self.t = 0
        self.collected = 0
        self.collected_coins = set()
        self.coins = set(self._init_coins)
        self.previous_pos = None

        if random:
            blocked = {self.goal, self.danger, self.wall}
            while True:
                r = np.random.randint(0, self.size)
                c = np.random.randint(0, self.size)
                # Rejection-sample until the cell is not terminal or a wall.
                if (r, c) not in blocked:
                    self.pos = (r, c)
                    break
        else:
            self.pos = (7, 0)

        return self.pos

    def step(self, intended_action):
        """Apply one (possibly slipped) action and return ``(pos, done)``.

        The intended action is executed with probability 0.91; each of the
        other three actions is executed with probability 0.03. Moves off the
        grid are clipped and moves into the wall leave the agent in place.
        """
        self.previous_pos = self.pos
        probs = np.full(4, 0.03)
        probs[intended_action] = 0.91
        action = np.random.choice(4, p=probs)
        x, y = self.pos
        if action == 0:
            x = max(0, x - 1)  # up
        elif action == 1:
            x = min(self.size - 1, x + 1)  # down
        elif action == 2:
            y = max(0, y - 1)  # left
        elif action == 3:
            y = min(self.size - 1, y + 1)  # right
        if (x, y) != self.wall:
            self.pos = (x, y)
        if self.pos in self.coins:
            self.collected += 1
            self.collected_coins.add(self.pos)
            self.coins.remove(self.pos)
        self.t += 1
        self.done = (
            (self.t >= self.horizon)
            or (self.pos == self.goal)
            or (self.pos == self.danger)
        )
        return self.pos, self.done

    def get_feedback(self):
        """Return ``(noisy_bin, true_bin)`` for the current state.

        The reward is computed from the coins collected, a goal bonus and a
        time penalty (zero at the danger cell), doubled, and quantized into
        ``k`` equal-width bins over ``[0, 32.92]``. Values above the range go
        to the last bin and values below it go to the first bin. The reported
        bin is then corrupted by label noise.
        """
        # NOTE(review): only four weights are defined, so collecting more
        # than three coins raises IndexError - confirm intended coin count.
        weights = [0.1, 1.0, 2.0, 3.0]
        at_danger = self.pos == self.danger
        at_goal = self.pos == self.goal

        true_reward = (1 - at_danger) * (
            weights[self.collected] * (self.collected + 1.32 * at_goal)
            - 5 * (self.t - 14) / self.horizon
            + 5 * 36 / 50
        )
        scaled_reward = 2 * true_reward

        # Quantize into k bins.
        edges = np.linspace(0, 32.92, self.k + 1)
        if scaled_reward < edges[0]:
            # Below-range rewards belong to the lowest bin, not the highest.
            feedback = 0
        else:
            feedback = self.k - 1  # at or above the last edge
            for i in range(self.k):
                if edges[i] <= scaled_reward < edges[i + 1]:
                    feedback = i
                    break

        # Label noise: keep the true bin with probability
        # 1 - noise + noise / k and spread the remainder over the other bins.
        if self.k > 1:
            keep_prob = 1 - self.noise + self.noise / self.k
            other_prob = (1 - keep_prob) / (self.k - 1)
            probs = [other_prob] * self.k
            probs[feedback] = keep_prob
        else:
            # A single bin leaves nothing to confuse; avoids dividing by zero.
            probs = [1.0]

        feedback_list = list(range(self.k))
        feedback_given = np.random.choice(feedback_list, p=probs)
        return feedback_given, feedback

    def _features(self):
        """Return the feature vector for the current state.

        Layout: ``[dist_to_goal, dist_to_danger, at_goal, at_danger]``
        followed by one 0/1 indicator per initial coin (1 if collected).
        Distances are Manhattan distances.
        """
        x, y = self.pos
        xg, yg = self.goal
        xd, yd = self.danger
        dist_to_goal = abs(x - xg) + abs(y - yg)
        dist_to_danger = abs(x - xd) + abs(y - yd)
        at_danger = int(self.pos == self.danger)
        at_goal = int(self.pos == self.goal and (at_danger == 0))
        coin_indicator = [int(c in self.collected_coins) for c in self._init_coins]
        return np.array(
            [dist_to_goal, dist_to_danger, at_goal, at_danger] + coin_indicator,
            dtype=float,
        )
        

def softmax(logits):
    """Return the softmax of ``logits``.

    The maximum is subtracted before exponentiating so that large logits
    do not overflow.
    """
    shifted = logits - np.max(logits)
    unnormalized = np.exp(shifted)
    total = np.sum(unnormalized)
    return unnormalized / total

class Policy:
    """Tabular softmax policy with one row of logits per grid cell.

    ``theta`` has shape ``(grid_size * grid_size, action_dim)`` and starts
    as all ones, so every action is initially equally likely.
    """

    def __init__(self, grid_size, action_dim):
        self.grid_size = grid_size
        self.action_dim = action_dim
        self.state_dim = grid_size * grid_size
        self.theta = np.ones((self.state_dim, self.action_dim))

    def state_index(self, state):
        """Map a ``(row, col)`` state to its row index in ``theta``."""
        row, col = state[0], state[1]
        return row * self.grid_size + col

    def act(self, state):
        """Sample an action for ``state``; return ``(action, probs)``."""
        probs = softmax(self.theta[self.state_index(state)])
        n_actions = len(probs)
        action = np.random.choice(n_actions, p=probs)
        return action, probs

    def grad_log_prob(self, state, action):
        """Return ``(state_index, grad_row)`` for log pi(action | state).

        ``grad_row`` has shape ``(action_dim,)`` with
        ``grad_row[j] = 1{j == action} - pi(j | state)``.
        """
        s_idx = self.state_index(state)
        # Negation yields a fresh array, so the in-place update is safe.
        grad_row = -softmax(self.theta[s_idx])
        grad_row[action] += 1.0
        return s_idx, grad_row

def train(feedback_every, noise, seed, m=10, k=6, eta=0.1, grid_size=8, num_coins=3,
          num_iters=3000):
    """Train a tabular softmax policy with a policy-gradient update.

    Each iteration samples ``m`` episodes under the current policy, turns the
    noisy binned feedback into per-step rewards (the change in feedback since
    the previous feedback point, zero on steps without feedback), computes
    reward-to-go, and takes one gradient-ascent step using the mean return
    over all sampled steps as baseline.

    Args:
        feedback_every: Feedback is queried when ``env.t`` is a multiple of
            this value, and always on the final step of an episode.
        noise: Label-noise level passed to the environment.
        seed: Seed for NumPy's global random generator.
        m: Number of episodes sampled per iteration (must be >= 1).
        k: Number of feedback bins.
        eta: Step size of the gradient update.
        grid_size: Side length of the grid, used for both the environment
            and the policy table so the two always agree.
        num_coins: Only used to compute ``d = 4 + num_coins`` passed to the
            environment.
        num_iters: Number of policy-update iterations.

    Returns:
        ``(policy, avg_true_rewards, avg_coins, avg_length)`` where the three
        lists hold one per-iteration mean over the ``m`` episodes. The "true
        reward" is the noiseless feedback bin at the end of each episode.

    Raises:
        ValueError: If ``m`` is smaller than 1.
    """
    if m < 1:
        raise ValueError("m must be at least 1")

    np.random.seed(seed)
    d = 4 + num_coins

    env = GridWorld(k=k, d=d, noise=noise, size=grid_size, feedback_every=feedback_every)
    policy = Policy(grid_size=grid_size, action_dim=4)

    avg_true_rewards, avg_coins, avg_length = [], [], []

    for g in range(num_iters):
        rollout_trajectories = []
        true_rewards_this_iter = []

        # Sample m trajectories under the current policy to approximate the
        # expectation in the policy gradient.
        for i in range(m):
            s = env.reset()
            traj = {"states": [], "actions": [], "human_rewards": [], "true_rewards": [],
                    "step_rewards": [], "steps": 0, "coins": 0}
            done = False
            previous_feedback = 0

            while not done:
                a, _ = policy.act(s)
                traj["states"].append(s)
                traj["actions"].append(a)
                s, done = env.step(a)

                if env.t % env.feedback_every == 0 or done:
                    feedback, true_reward = env.get_feedback()
                    traj["human_rewards"].append(feedback)
                    traj["true_rewards"].append(true_reward)

                    # Reward is the increment in feedback since the last query.
                    current_step_reward = feedback - previous_feedback
                    previous_feedback = feedback
                else:
                    current_step_reward = 0.0

                traj["step_rewards"].append(current_step_reward)

            # This extra query draws from the random generator; it is kept so
            # that results for a given seed stay the same.
            true_rewards_this_iter.append(env.get_feedback()[1])

            traj["steps"] = env.t
            traj["coins"] = env.collected

            # Reward-to-go, accumulated backwards then flipped into time order.
            returns = []
            G_t = 0
            for r in reversed(traj["step_rewards"]):
                G_t = r + env.gamma * G_t
                returns.append(G_t)
            returns.reverse()

            rollout_trajectories.append((traj, returns))

        # With these m rollouts, estimate the policy gradient.
        grad_theta = np.zeros_like(policy.theta)

        all_returns = [ret for _, returns in rollout_trajectories for ret in returns]
        b = float(np.mean(all_returns))  # constant baseline

        for traj, returns in rollout_trajectories:
            for state, action, G_t in zip(traj["states"], traj["actions"], returns):
                advantage = G_t - b
                s_idx, grad_row = policy.grad_log_prob(state, action)
                grad_theta[s_idx] += grad_row * advantage

        grad_theta = grad_theta / len(rollout_trajectories)
        policy.theta = policy.theta + eta * grad_theta

        # Logging.
        avg_true_rewards.append(np.mean(true_rewards_this_iter))
        avg_coins.append(np.mean([traj["coins"] for traj, _ in rollout_trajectories]))
        avg_length.append(np.mean([traj["steps"] for traj, _ in rollout_trajectories]))

    return policy, avg_true_rewards, avg_coins, avg_length


# Experiment sweep: train once per (feedback interval, seed) pair and keep
# the per-iteration true-reward curve of every run.
feedbacks = [1, 5, 10, 15, 25, 50]
noise = 0.5
num_seeds = 5

all_results = {fb: [] for fb in feedbacks}

for fb in feedbacks:
    curves_for_fb = all_results[fb]
    for seed in range(num_seeds):
        print(f"Training with feedback every {fb} steps, seed {seed}")
        run_output = train(feedback_every=fb, noise=noise, seed=seed)
        _, avg_true_rewards, _, avg_length = run_output
        curves_for_fb.append(avg_true_rewards)  # one curve per seed


In [None]:
# Aggregate the per-seed curves of each feedback interval into a mean and a
# standard-deviation curve (both taken across seeds).
mean_curves, std_curves = {}, {}

for fb in feedbacks:
    arr = np.asarray(all_results[fb])  # shape (seeds, T)
    mean_curves[fb] = arr.mean(axis=0)
    std_curves[fb] = arr.std(axis=0)


In [None]:
import matplotlib.pyplot as plt
from scipy.signal import savgol_filter
import numpy as np

# Plot one smoothed mean learning curve per feedback interval.
plt.figure(figsize=(12, 6))
palette = plt.cm.tab10(np.linspace(0, 1, len(feedbacks)))

for fb, line_color in zip(feedbacks, palette):
    # Savitzky-Golay smoothing: window length 51, polynomial order 3.
    smoothed = savgol_filter(mean_curves[fb], 51, 3)
    plt.plot(smoothed, linewidth=2, color=line_color, label=f"feedback_every={fb}")

plt.xlabel("Iteration")
plt.ylabel("True Reward")
plt.title(f"Averaged Learning Curves Across {num_seeds} Seeds, Noise={noise}")
plt.grid()
plt.legend()
plt.show()
