2.3 a):

State space:    - 4 continuous variables:
                    - Cart Position; Limits: [-4.8, 4.8]
                    - Cart Velocity; Limits: [-Inf, Inf]
                    - Pole Angle; Limits: [-24 deg, 24 deg]
                    - Pole Angular Velocity; Limits: [-Inf, Inf]
            
Action space:   - 2 discrete actions:
                    - Push cart to the left
                    - Push cart to the right
                    
Environment dynamics:   - Start: all variables (Cart Position, Cart Velocity, Pole Angle, Pole Angular Velocity) 
                        start with a random value between -0.05 and +0.05
                        - End: 
                            1. Pole is no longer balanced (angle greater than +-12 degrees)
                            2. Cart reaches the edge of the envrionment (Cart Position greater than +-2.4)
                            3. Maximum episode length is met (in this case 500 time steps)
                            
Reward structure:   - Option 1: +1 for each time step the pole is balanced
                    - Option 2: 0 for all time steps, -1 when the pole falls (and the episode terminates, see above)

2.3 b):

In [1]:
import gymnasium as gym
import numpy as np
import matplotlib.pyplot as plt
import random
import time

In [None]:
"""
Simple Simulation of the CartPole environment; 
code structure taken from gym documentation. The loop chooses a random action at each time step.
The action is then executed and the environment is rendered. Rendering is delayed by 0.1 seconds after each step,
as otherwise the simulation terminates after <1 second and cannot actually be observed by humans.
"""
env = gym.make("CartPole-v1", render_mode = "human") #Create the environment

observation, info = env.reset() #obligatory reset

episode_over = False
while not episode_over:
    action = env.action_space.sample() #sample random action (either 0 or 1, left or right)
    observation, reward, terminated, truncated, info = env.step(action) #action is performed
    
    print(observation)
    
    env.render() #environment is rendered
    time.sleep(0.1) #delay
    
    #if either a termination condition is met or the maximum episode length is reached, the loop needs to end
    episode_over = terminated or truncated 

env.close() #close the environment

In [None]:
def simple_policy(observation, borders=False, even=False, lowAngSpeed=False, lowCartSpeed=False):
    """
    Simple policy which uses the given observation to choose whether to push the cart left or right.
    """
    if borders:
        if observation[0] < -2.3:
            return 1
        elif observation[0] > 2.3:
            return 0
    
    if lowAngSpeed:
        if observation[3] < -1.5 or observation[3] > 1.5:
            return 0 if observation[3] < 0 else 1
        
    if lowCartSpeed:
        if observation[1] < -1.5 or observation[1] > 1.5:
            return 0 if observation[1] < 0 else 1
    
    if even: 
        if -0.05 < observation[2] < 0.05: #equates to ~3 degrees
            return random.choice([0, 1])
    
    return 0 if observation[2] < 0 else 1 #if the pole leans left, push cart left, otherwise push cart right

In [None]:
def run_env(borders=False, even=False, lowAngSpeed=False, lowCartSpeed=False, render=False):
    """
    Execution of the CartPole environment using own policies; rest as above.
    """
    env = gym.make("CartPole-v1") #Create the environment
    
    observation, info = env.reset() #obligatory reset
    
    reward_cum = 0
    episode_over = False
    while not episode_over:
        action = simple_policy(observation, borders=borders, even=even, lowAngSpeed=lowAngSpeed, lowCartSpeed=lowCartSpeed)
        observation, reward, terminated, truncated, info = env.step(action) #action is performed
        
        #print(observation) if observation[1] < -1.5 or observation[1] > 1.5 else None
        reward_cum += reward
        
        if render: 
            env.render() #environment is rendered
        
        #if either a termination condition is met or the maximum episode length is reached, the loop needs to end
        episode_over = terminated or truncated 
    
    env.close() #close the environment
    return reward_cum


In [None]:
def run_episodes(n=100):
    
    mean_rewards = []
    
    #simplest form
    mean_rewards.append(np.mean([run_env(borders=False, even=False, lowAngSpeed=False, lowCartSpeed=False) for _ in range(n)]))
    
    #form that generally tries to keep pole upright
    mean_rewards.append(np.mean([run_env(borders=False, even=True, lowAngSpeed=False, lowCartSpeed=False) for _ in range(n)]))
    
    #form that also tries to stay in bounds
    mean_rewards.append(np.mean([run_env(borders=True, even=True, lowAngSpeed=False, lowCartSpeed=False) for _ in range(n)]))
    
    #form that keeps angular speed within limits
    mean_rewards.append(np.mean([run_env(borders=True, even=True, lowAngSpeed=True, lowCartSpeed=False) for _ in range(n)]))
    
    #form that keeps angular speed within limits, but is not concerned with borders
    mean_rewards.append(np.mean([run_env(borders=False, even=True, lowAngSpeed=True, lowCartSpeed=False) for _ in range(n)]))
    
    #form that also keeps cart speed within limits
    mean_rewards.append(np.mean([run_env(borders=True, even=True, lowAngSpeed=True, lowCartSpeed=True) for _ in range(n)]))
    
    return mean_rewards


In [None]:
num_policies = 6
num_runs = 10

mean_reward_array = np.ndarray((num_runs, num_policies))
for i in range(num_runs):
    mean_rewards = run_episodes(n=200)
    mean_reward_array[i] = mean_rewards

print(mean_reward_array)

In [None]:
for i in range(num_policies):
    plt.plot(np.transpose(mean_reward_array)[i], label=f'Policy {i+1}')

plt.xlabel('Policy Index')
plt.ylabel('Mean Reward')
plt.title('Mean Rewards for Different Policies')
plt.legend()
plt.show()

It can be clearly seen that the simplest approach, i.e. moving in the opposite direction of the pole's lean is the worst policy.
When watching the actual simulation, it can be seen that this very quickly results in overcorrection on the agents part, 
with the pole oscillating faster and faster, leading to a fall.
This is slightly improved by just choosing a random direction when the pole is (nearly) upright. Choosing a random direction 
is an approach to simulate "doing nothing" in a sense, as the pole is already balanced and no action is required.
A significant improvement is achieved when correcting for the angular velocity of the pole, as this directly combats the oscillation
and therefore the tendency to overcorrect.
Interestingly, it can be seen that ensuring that the cart stays within its positional bounds does not significantly improve the 
performance of the agent. This is likely to the fact that the cart needs a long time to move to the outer edges, meaning that 
the case in which the cart reaches the edge is rare and therefore does not significantly impact the overall performance.
Lastly, taking the cart velocity into consideration does decrease the agents performance. This is probably because it tends to 
unnecessarily change the direction of the cart, which leads to additional angular velocity for the pole, rather than helping to 
keep it low and the pole upright.

2.3 c):

In [2]:
def transform(observation):
    """
    Transform the observation into a binary array, where each value is 1 if the corresponding value in the observation is positive
    and 0 otherwise. This is done to reduce the state space from 4 continuous variables to 4 binary variables, 
    resulting in a total of 16 discrete states.
    """
    return np.array([1 if obs >= 0 else 0 for obs in observation])

In [3]:
def bandit_policy(observation, bandits, epsilon=0.1):
    bandit = bandits[tuple(transform(observation))]

    if random.random() < epsilon or bandit[0] == 0 or bandit[1] == 0:
        ret = random.choice([0, 1])
    else:
        ret = 1 if bandit[3]/bandit[1] > bandit[2]/bandit[0] else 0
    
    return bandit, ret
        

In [19]:
def update_bandit(bandit, action, reward):
    print(reward)
    bandit[action] += 1
    bandit[action+2] += reward
    
    return bandit
    

In [13]:
def run_env_bandits(bandits, render=False):
    if render:
        env = gym.make("CartPole-v1", render_mode="human") #Create the environment
    else:
        env = gym.make("CartPole-v1") #Create the environment
    
    observation, info = env.reset() #obligatory reset
    
    reward_cum = np.ndarray((2, 2, 2, 2))
    reward_cum.fill(0)
    
    episode_over = False
    while not episode_over:
        transformed_observation = transform(observation)
        print(transformed_observation)
        
        bandit, action = bandit_policy(transformed_observation, bandits)
        observation, reward, terminated, truncated, info = env.step(action) #action is performed
        reward_cum[tuple(transformed_observation)] += reward
        
        bandits[tuple(transformed_observation)] = update_bandit(bandit, action, reward)
        
        if render: 
            env.render() #environment is rendered
            time.sleep(1)
        
        #if either a termination condition is met or the maximum episode length is reached, the loop needs to end
        episode_over = terminated or truncated 
    
    env.close() #close the environment
    return reward_cum


In [20]:
bandits = np.ndarray((2, 2, 2, 2, 4))
bandits.fill(random.choice([0,1]))

rewards = [[] for _ in range(16)]
#print(rewards)

test = np.ndarray((2, 2, 2, 2))

num = 10000
for _ in range(num):
    #print("bandits before")
    #print(bandits)
    test += run_env_bandits(bandits, render=False)
    #print(bandits)

#print(rewards)
print(test)
print(bandits)

[1 0 0 0]
1.0
[1 0 0 1]
1.0
[1 0 0 1]
1.0
[1 0 1 1]
1.0
[1 0 1 1]
1.0
[1 0 1 1]
1.0
[1 0 1 1]
1.0
[0 0 1 1]
1.0
[0 0 1 1]
1.0
[0 0 1 1]
1.0
[0 0 1 1]
1.0
[1 1 0 0]
1.0
[1 0 0 1]
1.0
[1 0 0 1]
1.0
[1 0 0 1]
1.0
[1 0 0 1]
1.0
[0 0 1 1]
1.0
[0 0 1 1]
1.0
[0 0 1 1]
1.0
[0 0 1 1]
1.0
[0 0 1 1]
1.0
[0 1 0 0]
1.0
[0 0 0 1]
1.0
[0 0 0 1]
1.0
[0 0 1 1]
1.0
[0 0 1 1]
1.0
[0 0 1 1]
1.0
[0 0 1 1]
1.0
[0 0 1 1]
1.0
[0 0 1 1]
1.0
[0 0 1 1]
1.0
[1 0 1 0]
1.0
[1 0 1 1]
1.0
[1 0 1 1]
1.0
[1 0 1 1]
1.0
[1 0 1 1]
1.0
[1 0 1 1]
1.0
[0 0 1 1]
1.0
[0 0 1 1]
1.0
[0 1 1 0]
1.0
[0 0 1 1]
1.0
[0 0 1 1]
1.0
[0 0 1 1]
1.0
[0 0 1 1]
1.0
[0 0 1 1]
1.0
[0 0 1 1]
1.0
[0 0 1 1]
1.0
[0 0 1 1]
1.0
[0 0 1 1]
1.0
[0 0 1 1]
1.0
[0 0 1 1]
1.0
[0 0 1 1]
1.0
[0 0 1 1]
1.0
[0 0 1 1]
1.0
[0 0 1 1]
1.0
[0 0 1 1]
1.0
[0 0 1 1]
1.0
[0 1 1 0]
1.0
[0 0 1 1]
1.0
[0 0 1 1]
1.0
[0 0 1 1]
1.0
[0 0 1 1]
1.0
[0 0 1 1]
1.0
[0 0 1 1]
1.0
[0 0 1 1]
1.0
[0 0 1 1]
1.0
[0 0 1 1]
1.0
[0 0 1 1]
1.0
[0 0 1 1]
1.0
[0 0 1 1]
1.0
[0 0 1 1]
1.0
[0 0 1