In [None]:
import cvxpy as cp
import numpy as np
import math


class GridWorld:
    """Stochastic grid world that emits quantized, noisy end-of-episode feedback.

    Cells are addressed as ``(x, y)`` where ``x`` is the row and ``y`` the
    column.  The agent starts at ``(0, 7)`` and uses actions 0=up, 1=down,
    2=left, 3=right.  An episode ends when ``horizon`` steps have elapsed or
    the agent stands on the goal or the danger cell.  Moves into the wall cell
    are rejected (the agent stays where it is).

    Parameters
    ----------
    k : int
        Number of feedback bins; labels are ``0 .. k-1``.
    d : int
        Feature dimension.  Stored on the instance but not used by this class.
    size : int
        Side length of the square grid.
    danger, goal, wall : sequence of two ints
        Coordinates of the danger, goal and wall cells.
    coins : sequence of (int, int)
        Coordinates of the coins; each coin can be collected once per episode.
    horizon : int
        Maximum number of steps per episode.
    noise : float
        Label-noise level used by ``get_feedback_and_features``.  It does not
        affect the action slip probabilities in ``step``, which are fixed.
    """

    _START = (0, 7)
    # Reward multiplier indexed by the number of coins collected so far.
    # NOTE(review): indexing by coin count raises IndexError once more than
    # three coins have been collected.
    _COIN_WEIGHTS = (0.1, 1.0, 2.0, 3.0)
    # Upper edge of the scaled-reward range that is split into k bins.
    _SCALED_REWARD_MAX = 32.92

    def __init__(self,k,d,size=8,danger=[7,1],goal=[4,5],wall=[2,5],coins=[(1,6),(4,2),(5,5)], horizon=50, noise=0.1):
        self.noise = noise
        self.k=k
        self.d=d
        self.size = size
        self.horizon = horizon
        self.goal = tuple(goal)
        self.danger = tuple(danger)
        self.wall = tuple(wall)
        self._init_coins = tuple(map(tuple, coins))
        # Initialise all per-episode state so that every method is usable
        # straight after construction, not only after an explicit reset().
        self.reset()

    def reset(self):
        """Start a new episode and return the start position."""
        self.done = 0
        self.pos = self._START
        self.t = 0
        self.collected = 0
        self.collected_coins = set()
        self.coins = set(self._init_coins)
        return self.pos

    def step(self, intended_action):
        """Apply one (possibly slipped) action.

        The intended action is executed with probability 0.91; each of the
        other three actions is executed with probability 0.03.

        Returns
        -------
        (pos, done) : the new position and whether the episode has ended.
        """
        probs = np.full(4, 0.03)
        probs[intended_action] = 0.91
        action = np.random.choice(4, p=probs)
        x, y = self.pos
        if action == 0:
            x = max(0, x-1)             # up
        elif action == 1:
            x = min(self.size-1, x+1)   # down
        elif action == 2:
            y = max(0, y-1)             # left
        elif action == 3:
            y = min(self.size-1, y+1)   # right
        # The wall cell cannot be entered; the agent keeps its position.
        if (x, y) != self.wall:
            self.pos = (x, y)
        if self.pos in self.coins:
            self.collected += 1
            self.collected_coins.add(self.pos)
            self.coins.remove(self.pos)
        self.t += 1
        self.done = ((self.t >= self.horizon) or (self.pos==self.goal) or (self.pos==self.danger))
        return self.pos, self.done

    def _reward_bin(self):
        """Return the noise-free feedback bin (0 .. k-1) for the current state.

        The reward is zero on the danger cell; otherwise it combines a
        coin/goal term with a time penalty.  The doubled reward is quantized
        into ``k`` equal-width bins over ``[0, _SCALED_REWARD_MAX]``; values
        outside that range are clamped to the nearest bin.
        """
        weights = self._COIN_WEIGHTS
        true_reward = (1-(self.pos==self.danger))*(weights[self.collected]*(self.collected + 1.32*(self.pos==self.goal)) - 5*(self.t-14)/self.horizon + 5*36/50)
        scaled_reward = 2*true_reward
        edges = np.linspace(0, self._SCALED_REWARD_MAX, self.k+1)
        # side="right" yields the bin i with edges[i] <= value < edges[i+1].
        idx = int(np.searchsorted(edges, scaled_reward, side="right")) - 1
        # Clamp: rewards above the range map to the top bin, rewards below the
        # range map to the bottom bin (previously they fell through to k-1).
        return min(max(idx, 0), self.k - 1)

    def get_feedback_and_features(self):
        """Return ``(noisy_label, features)`` for the current state.

        The true bin is reported with probability ``1 - noise + noise/k``; the
        remaining probability mass is spread evenly over the other bins.
        """
        feedback = self._reward_bin()

        # label noise
        p_correct = 1-self.noise+self.noise/self.k
        rem = 1-p_correct
        # With a single bin there is no other label to spread the noise over.
        rem_distributed = rem / (self.k - 1) if self.k > 1 else 0.0
        probs = [p_correct if i == feedback else rem_distributed for i in range(self.k)]

        feedback_list = [i for i in range(self.k)]
        feedback_given = np.random.choice(feedback_list,p=probs)
        return feedback_given,self._features()

    def true_return(self):
        """Return the noise-free feedback bin for the current state."""
        return self._reward_bin()

    def _features(self):
        """return trajectory features phi(tau)

        Layout: [Manhattan distance to goal, Manhattan distance to danger,
        at_goal flag, at_danger flag, one collected-flag per initial coin].
        """
        x, y = self.pos
        xg, yg = self.goal
        xd, yd = self.danger
        dist_to_goal = abs(x-xg) + abs(y-yg)
        dist_to_danger = abs(x-xd) + abs(y-yd)
        at_danger = int(self.pos == self.danger)
        at_goal = int(self.pos == self.goal and (at_danger==0))
        coin_indicator = [int(c in self.collected_coins) for c in self._init_coins]
        return np.array([dist_to_goal, dist_to_danger, at_goal, at_danger] + coin_indicator, dtype=float)
        

def softmax(logits):
    """Numerically stable softmax over all entries of ``logits``."""
    # Shifting by the maximum keeps np.exp from overflowing without changing
    # the normalised result.
    shifted = logits - np.max(logits)
    weights = np.exp(shifted)
    total = np.sum(weights)
    return weights / total

class Policy:
    """Tabular softmax policy with one logit per (grid cell, action) pair."""

    def __init__(self, grid_size, action_dim):
        self.grid_size = grid_size
        self.action_dim = action_dim
        self.state_dim = grid_size * grid_size
        # All-ones logits make the initial policy uniform in every state.
        self.theta = np.ones((self.state_dim, self.action_dim))

    def state_index(self, state):
        """Flatten a (row, column) state into a row index of ``theta``."""
        return self.grid_size * state[0] + state[1]

    def act(self, state):
        """Sample an action for ``state``; return ``(action, action_probs)``."""
        action_probs = softmax(self.theta[self.state_index(state)])
        chosen = np.random.choice(len(action_probs), p=action_probs)
        return chosen, action_probs

    def grad_log_prob(self, state, action):
        """Return (state_index, grad_row) with grad_row shape (action_dim,)
           grad_row[j] = 1{j==action} - pi(j|s)"""
        s_idx = self.state_index(state)
        # Start from -pi(.|s) and add the indicator of the taken action.
        grad_row = -softmax(self.theta[s_idx])
        grad_row[action] += 1.0
        return s_idx, grad_row
    


class RewardModel:
    """Multinomial-logistic model of the feedback label given trajectory features.

    Parameters
    ----------
    k : int
        Number of feedback labels (``0 .. k-1``).
    d : int
        Feature dimension.
    C : float
        Scale of the optimism bonus used by ``optimistic_reward``.
    """

    def __init__(self, k,d,C=0.0):
        self.k = k
        self.d = d
        self.W = np.zeros((k,d))
        self.C = C

    def estimate_W(self, X, Y, reg=1e-3):
        """Fit ``W`` by L2-regularised multinomial logistic regression.

        Parameters
        ----------
        X : array of shape (n, d)
            One feature vector per labelled trajectory.
        Y : array of shape (n,)
            Labels in ``0 .. k-1``.
        reg : float
            Weight of the squared Frobenius-norm penalty.

        Returns
        -------
        The updated weight matrix (also stored in ``self.W``).  If there is no
        data, or the solver returns no solution, the previous ``self.W`` is
        kept and returned.
        """
        X = np.asarray(X)
        if X.size == 0:
            # Nothing to fit; also avoids dividing the loss by n == 0.
            return self.W
        n = X.shape[0]
        W = cp.Variable((self.k, self.d))  # optimization variable, not vectorized, shape(k,d)
        loss = 0
        for i in range(n):
            phi = X[i]
            yi = int(Y[i])
            logits = W @ phi
            ## yi cannot be more than (k-1), so if assigning deterministic rewards without crafting W*, be careful
            loss += -(logits[yi] - cp.log_sum_exp(logits))

        loss = loss/n + reg*cp.norm(W, "fro")**2
        prob = cp.Problem(cp.Minimize(loss))
        prob.solve(solver=cp.MOSEK)

        if W.value is None:
            # The solver did not produce a solution.  Keep the last usable
            # estimate instead of storing None, which would break every later
            # call to reward_probabilities().
            import warnings
            warnings.warn(
                f"estimate_W: solver finished with status {prob.status!r}; keeping previous W",
                RuntimeWarning,
            )
            return self.W

        self.W = W.value
        return self.W

    def reward_probabilities(self, phi):
        """estimate P(y|tau)"""
        logits = self.W @ phi
        logits-= np.max(logits)  # stabilise the exponentials
        exp_logits = np.exp(logits)
        return exp_logits/np.sum(exp_logits)

    def reward_estimate(self,phi):
        """returns the average, expected reward given the reward probabilities"""
        # Compute the label distribution once rather than once per label.
        probs = self.reward_probabilities(phi)
        return np.sum(np.arange(self.k) * probs)

    def optimistic_reward(self, phi, n_samples):
        """
        optimism term included

        Returns the expected label plus a bonus ``C / sqrt(max(n_samples, 1))``,
        capped at the largest label ``k - 1``.
        """
        base = self.reward_estimate(phi)
        n = max(n_samples, 1)
        bonus = self.C / np.sqrt(n)
        optimistic = base + bonus
        return min(optimistic, self.k - 1)


###----------Training loop-------------###
def _run_episode(env, policy):
    """Play one episode of ``env`` under ``policy``, starting from a reset.

    Returns the list of states visited before each action and the list of
    actions taken.  The environment is left in its final state so the caller
    can read ``env.t`` / ``env.collected`` and query feedback.
    """
    states, actions = [], []
    s = env.reset()
    done = False
    while not done:
        a, _ = policy.act(s)
        states.append(s)
        actions.append(a)
        s, done = env.step(a)
    return states, actions


def train(N=20,m=50,k=6,eta=0.1,epsilon=0.1,grid_size=8,danger=[7,1],goal=[0,7], wall=[2,5] ,horizon=50,coins=None,seed=0,noise=0.1):
    """Alternate policy-gradient updates with reward-model refits.

    Each of the ``N`` outer iterations performs a number of policy-gradient
    steps (10 for the first 100 iterations, 100 afterwards).  Every step
    samples ``m`` trajectories and uses the optimistic reward estimate, with a
    mean baseline, as the return.  After the policy steps, one more trajectory
    is rolled out, its noisy label is added to the data set, and the reward
    model is refit on all labels collected so far.

    Parameters
    ----------
    N : int
        Number of outer iterations (one labelled trajectory each).
    m : int
        Trajectories sampled per policy-gradient step; must be at least 1.
    k : int
        Number of feedback labels.
    eta : float
        Policy-gradient step size.
    epsilon : float
        Accepted for backward compatibility; not used.
    grid_size, danger, goal, wall, horizon, coins, noise :
        Forwarded to ``GridWorld``.  ``coins=None`` selects the default three
        coins.
    seed : int
        Seed for NumPy's global random generator.

    Returns
    -------
    (policy, reward_model, avg_true_rewards, avg_est_rewards, W, steps, queries)
        ``steps`` is the total number of policy-gradient steps and ``queries``
        the number of labels added to the reward-model data set.
    """
    if m < 1:
        raise ValueError("m must be at least 1")
    np.random.seed(seed)
    queries = 0
    steps = 0
    if coins is None:
        coins=[(1,6),(4,2),(5,5)]
    d = 4+len(coins)  # 4 position features plus one indicator per coin
    env = GridWorld(k,d,size=grid_size, danger=danger, goal=goal, wall=wall, coins=coins, horizon=horizon, noise=noise)
    policy = Policy(grid_size=grid_size, action_dim=4)

    ## initialize weights w_0 (the constructor starts W at zeros)
    reward_model = RewardModel(k, d, C=10.0)

    all_data_X, all_data_Y = [], []
    avg_true_rewards,avg_coins,avg_est_rewards = [], [], []
    for n in range(N):
        print(n)
        avg_true_reward_this_iter = 0
        avg_est_reward_this_iter = 0
        # Short schedule for the first 100 iterations, long schedule from
        # iteration 100 onwards (n == 100 previously matched neither branch).
        G_range = 10 if n < 100 else 100
        # The data set only grows once per outer iteration, so the optimism
        # bonus is constant across the inner steps.
        n_samples = max(len(all_data_X), 1)

        for g in range(G_range):
            steps+=1
            rollout_trajectories = []
            rewards = []
            for i in range(m): ## sample trajectories under current policy pi to approximate the theoretical expectation
                states, actions = _run_episode(env, policy)
                traj = {"states": states, "actions": actions, "steps": env.t, "coins": env.collected}
                y, phi = env.get_feedback_and_features()
                phi = np.array(phi, dtype=float)
                rollout_trajectories.append((traj, phi, y))
                rewards.append(env.true_return())

            ## now with these m rollouts, approximate the expectation of estimated reward under policy pi
            grad_theta = np.zeros_like(policy.theta)
            R_hats = [reward_model.optimistic_reward(phi, n_samples) for _, phi, _ in rollout_trajectories]
            b = float(np.mean(R_hats))  # baseline
            avg_est_reward_this_iter = b

            for (traj, _phi, _y), r_hat in zip(rollout_trajectories, R_hats):
                advantage = r_hat - b
                for state, action in zip(traj["states"], traj["actions"]):
                    s_idx, grad_row = policy.grad_log_prob(state, action)
                    grad_theta[s_idx] += grad_row*advantage

            grad_theta = grad_theta/len(rollout_trajectories)
            policy.theta += eta*grad_theta
            avg_true_reward_this_iter = np.mean(rewards)

        # Query one label for a fresh trajectory under the updated policy.
        _run_episode(env, policy)
        y,phi = env.get_feedback_and_features()
        all_data_X.append(phi)
        all_data_Y.append(y)
        queries += 1

        # update estimate of weight matrix W (a single solve per iteration)
        reward_model.W = reward_model.estimate_W(np.array(all_data_X),np.array(all_data_Y), reg = 1e-3)

        # Coins collected by the last trajectory of the last policy step.
        coins_this_iter = rollout_trajectories[-1][0]["coins"]

        ## storing some info
        avg_est_rewards.append(avg_est_reward_this_iter)
        avg_true_rewards.append(avg_true_reward_this_iter)
        avg_coins.append(coins_this_iter)

        print(f"Iter {n:02d}: avg_estimated_reward={avg_est_rewards[-1]:.3f}, avg_true_reward={avg_true_rewards[-1]:.3f},coins_this_episode={avg_coins[-1]:.2f}")

    return policy, reward_model, avg_true_rewards, avg_est_rewards, reward_model.W,steps,queries


# Single training run; the returned policy is used by the evaluation cell.
(
    trained_policy,
    trained_reward_model,
    avg_true,
    avg_est,
    W,
    steps,
    queries,
) = train(
    N=150,
    m=20,
    k=6,
    eta=0.5,
    epsilon=1e-2,
    grid_size=8,
    danger=[7,1],
    goal=[4,5],
    wall=[2,5],
    coins=[(1,6),(4,2),(5,5)],
    seed=2,
    noise=0.1,
)
print("Training complete.")



In [None]:
# Alias for the policy returned by the training run; the evaluation rollout
# in this cell samples its actions through pi.act(...).
pi = trained_policy
import matplotlib.pyplot as plt

def plot_gridworld_path(env, states):
    """Draw the grid of ``env`` with the visited ``states`` and special cells.

    ``states`` is a sequence of (row, column) cells.  Rows are plotted on the
    vertical axis (inverted, so row 0 is at the top) and columns on the
    horizontal axis.  Goal, danger, wall and coin cells are read from ``env``.
    """
    n_cells = env.size
    # matplotlib wants x = column and y = row.
    path_cols = [cell[1] for cell in states]
    path_rows = [cell[0] for cell in states]
    fig, ax = plt.subplots(figsize=(6, 6))

    # Cell boundaries sit half a unit off the integer cell centres.
    for k in range(n_cells + 1):
        boundary = k - 0.5
        ax.axhline(boundary, linewidth=0.6)
        ax.axvline(boundary, linewidth=0.6)
    ax.set_xlim(-0.5, n_cells - 0.5)
    ax.set_ylim(-0.5, n_cells - 0.5)
    ax.set_aspect('equal', adjustable='box')
    ax.invert_yaxis()

    ax.plot(path_cols, path_rows, marker='o', linewidth=1.5, label='path')
    if states:
        ax.scatter(path_cols[0], path_rows[0], marker='o', s=100, label='start')
        ax.scatter(path_cols[-1], path_rows[-1], marker='D', s=100, label='end')

    goal_row, goal_col = env.goal
    danger_row, danger_col = env.danger
    ax.scatter(goal_col, goal_row, marker='^', s=140, label='goal')
    ax.scatter(danger_col, danger_row, marker='X', s=140, label='danger')

    wall_row, wall_col = env.wall
    ax.scatter(wall_col, wall_row, marker='s', s=140, label='wall')

    # Coin layers, drawn in this order; empty layers are skipped so they do
    # not add a legend entry.
    coin_layers = (
        (list(env._init_coins), dict(marker='s', s=80, alpha=0.25, label='coin tiles (all)')),
        (list(env.coins), dict(marker='s', s=80, label='coins (remaining)')),
        (list(env.collected_coins), dict(marker='*', s=180, label='coins (collected)')),
    )
    for cells, style in coin_layers:
        if not cells:
            continue
        ax.scatter(
            [cell[1] for cell in cells],
            [cell[0] for cell in cells],
            **style
        )

    ax.set_xticks(range(n_cells))
    ax.set_yticks(range(n_cells))
    ax.set_xlabel('y (column)')
    ax.set_ylabel('x (row)')
    ax.set_title('GridWorld trajectory')
    ax.legend(loc='upper left', fontsize=8)
    plt.show()


    
# Roll the trained policy out once in a fresh environment, report the coins,
# goal and danger cells it reaches, then plot the path.
env = GridWorld(
    2, 7,
    size=8,
    danger=[7,1],
    goal=[4,5],
    wall=[2,5],
    coins=[(1,6),(4,2),(5,5)],
    horizon=50,
) ##set value of k here
traj = {"states": [], "actions": [], "steps": 0, "coins": 0}
c = {(1,6), (4,2), (5,5)}  # coin cells that have not been reported yet

s = env.reset()
done = False
while not done:
    a, _ = pi.act(s)
    traj["states"].append(s)
    traj["actions"].append(a)
    s, done = env.step(a)
    if s in c:
        print(f"coin found at position: {s}")
        c.remove(s)
    if s == env.goal:
        print("goal reached!")
    if s == env.danger:
        print("danger zone")

print(env.t)

plot_gridworld_path(env, traj["states"])

In [None]:
import numpy as np
import matplotlib.pyplot as plt

N_EPISODES = 200
seeds = [0, 1, 2, 3, 4]

# One learning curve per seed (avg_true for every outer training iteration),
# trained with label noise 0.1.
all_curves_01 = []

for seed in seeds:
    policy, rm, avg_true, avg_est, W, steps, queries = train(
        N=N_EPISODES,
        m=20,
        k=6,
        eta=0.5,
        grid_size=8,
        danger=(7, 1),
        goal=(4, 5),
        wall=(2, 5),
        coins=[(1, 6), (4, 2), (5, 5)],
        seed=seed,
        noise=0.1
    )
    all_curves_01.append(np.array(avg_true, dtype=float))

# Rows are seeds, columns are training iterations.
all_curves_01 = np.stack(all_curves_01, axis=0)

mean_curve_01 = all_curves_01.mean(axis=0)
std_curve_01  = all_curves_01.std(axis=0)

x_01 = np.arange(N_EPISODES)

plt.figure(figsize=(7,4))
# Plot against x_01, which this cell defines; the bare name `x` is only
# created by a later cell and is undefined here on a fresh kernel.
plt.plot(x_01, mean_curve_01, label="mean avg_true over seeds")
plt.fill_between(x_01, mean_curve_01 - std_curve_01, mean_curve_01 + std_curve_01,
                 alpha=0.2, label="±1 std over seeds")
plt.xlabel("Episode")
plt.ylabel("Average feedback per episode")
plt.title("avg feedback per episode averaged over seeds")
plt.grid(True)
plt.legend()
plt.show()


In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Seed sweep at label noise 0.5: train once per seed, then plot the mean
# learning curve with a one-standard-deviation band.
N_EPISODES = 200
seeds = [0, 1, 2, 3, 4]

all_curves_05 = []
for seed in seeds:
    policy, rm, avg_true, avg_est, W, steps, queries = train(
        N=N_EPISODES, m=20, k=6, eta=0.5, grid_size=8,
        danger=(7, 1), goal=(4, 5), wall=(2, 5),
        coins=[(1, 6), (4, 2), (5, 5)],
        seed=seed, noise=0.5,
    )
    all_curves_05.append(np.array(avg_true, dtype=float))

# Rows are seeds, columns are training iterations.
all_curves_05 = np.stack(all_curves_05, axis=0)
mean_curve_05 = all_curves_05.mean(axis=0)
std_curve_05 = all_curves_05.std(axis=0)
x_05 = np.arange(N_EPISODES)

fig, ax = plt.subplots(figsize=(7,4))
ax.plot(x_05, mean_curve_05, label="mean avg_true over seeds")
ax.fill_between(
    x_05,
    mean_curve_05 - std_curve_05,
    mean_curve_05 + std_curve_05,
    alpha=0.2,
    label="±1 std over seeds",
)
ax.set_xlabel("Episode")
ax.set_ylabel("Average feedback per episode")
ax.set_title("avg feedback per episode averaged over seeds")
ax.grid(True)
ax.legend()
plt.show()


In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Seed sweep at label noise 0.8; same protocol as the other noise levels.
N_EPISODES = 200
seeds = [0, 1, 2, 3, 4]

all_curves_08 = []

for seed in seeds:
    # Only avg_true is kept; the other return values are overwritten per seed.
    policy, rm, avg_true, avg_est, W, steps, queries = train(
        N=N_EPISODES,
        m=20,
        k=6,
        eta=0.5,
        grid_size=8,
        danger=(7, 1),
        goal=(4, 5),
        wall=(2, 5),
        coins=[(1, 6), (4, 2), (5, 5)],
        seed=seed,
        noise=0.8,
    )
    all_curves_08.append(np.array(avg_true, dtype=float))

# Shape (number of seeds, N_EPISODES).
all_curves_08 = np.stack(all_curves_08, axis=0)

mean_curve_08 = np.mean(all_curves_08, axis=0)
std_curve_08 = np.std(all_curves_08, axis=0)

x_08 = np.arange(N_EPISODES)
lower_08 = mean_curve_08 - std_curve_08
upper_08 = mean_curve_08 + std_curve_08

plt.figure(figsize=(7,4))
plt.plot(x_08, mean_curve_08, label="mean avg_true over seeds")
plt.fill_between(x_08, lower_08, upper_08, alpha=0.2, label="±1 std over seeds")
plt.xlabel("Episode")
plt.ylabel("Average feedback per episode")
plt.title("avg feedback per episode averaged over seeds")
plt.grid(True)
plt.legend()
plt.show()


In [None]:
# Re-aggregate the noise-0.8 curves and draw the mean ± std plot again,
# without retraining.
all_curves_08 = np.stack(all_curves_08, axis=0)
mean_curve_08 = all_curves_08.mean(axis=0)
std_curve_08 = all_curves_08.std(axis=0)
x_08 = np.arange(N_EPISODES)

fig, ax = plt.subplots(figsize=(7,4))
ax.plot(x_08, mean_curve_08, label="mean avg_true over seeds")
ax.fill_between(
    x_08,
    mean_curve_08 - std_curve_08,
    mean_curve_08 + std_curve_08,
    alpha=0.2,
    label="±1 std over seeds",
)
ax.set_xlabel("Episode")
ax.set_ylabel("Average feedback per episode")
ax.set_title("avg feedback per episode averaged over seeds")
ax.grid(True)
ax.legend()
plt.show()

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Per-noise-level aggregation: stack the per-seed curves, then take the mean
# and standard deviation across seeds (axis 0).
all_curves_01 = np.stack(all_curves_01, axis=0)
all_curves_05 = np.stack(all_curves_05, axis=0)
all_curves_08 = np.stack(all_curves_08, axis=0)

mean_01, std_01 = all_curves_01.mean(axis=0), all_curves_01.std(axis=0)
mean_05, std_05 = all_curves_05.mean(axis=0), all_curves_05.std(axis=0)
mean_08, std_08 = all_curves_08.mean(axis=0), all_curves_08.std(axis=0)

N_EPISODES = mean_01.shape[0]   # or whatever you used before
x = np.arange(N_EPISODES)

plt.figure(figsize=(7, 4))

# One line plus a shaded ±1 std band per noise level, in increasing noise order.
curves_by_noise = (
    ("noise = 0.1", mean_01, std_01),
    ("noise = 0.5", mean_05, std_05),
    ("noise = 0.8", mean_08, std_08),
)
for curve_label, curve_mean, curve_std in curves_by_noise:
    plt.plot(x, curve_mean, label=curve_label)
    plt.fill_between(x, curve_mean - curve_std, curve_mean + curve_std, alpha=0.15)

plt.xlabel("Episode")
plt.ylabel("Average feedback per episode")
plt.title("Average feedback per episode (mean ± 1 std over seeds)")
plt.grid(True)
plt.legend()
plt.tight_layout()
plt.show()


In [None]:
import numpy as np

# Persist the aggregated curves, one .npy file per array, in the current
# working directory.
arrays_to_save = (
    ('mean_01.npy', mean_01),
    ('std_01.npy', std_01),
    ('mean_05.npy', mean_05),
    ('std_05.npy', std_05),
    ('mean_08.npy', mean_08),
    ('std_08.npy', std_08),
)
for out_name, out_values in arrays_to_save:
    np.save(out_name, out_values)
