# Pendulum
## This notebook describes a solution to the problem of swinging a pendulum up to the vertical position and holding it there, using a temporal-difference Q-learning algorithm

### Workspace
The workspace is described by 4 values: the Cartesian coordinates x and y, the angle theta in radians, and tau, the torque applied to the pendulum.


### Action Space
Torque (tau) applied to the pendulum, a continuous value in the range from -2.0 to 2.0.

### Observation Space
The observation space contains 3 values: the x and y coordinates and the angular velocity of the pendulum.
- x = cos(theta), a continuous value from -1.0 to 1.0
- y = sin(theta), a continuous value from -1.0 to 1.0
- The angular velocity is a continuous value from -8.0 to 8.0


### Changes I made:
I reduced the observation from 3 values to 2 by recovering the angle theta from its cosine and sine (the known x and y values), and I discretized the continuous values.

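### I used the built-in reward function, which is defined as:
### $r = -(\theta^2 + 0.1 \cdot \dot{\theta}^2 + 0.001 \cdot \tau^2)$
where $\theta$ is the pendulum angle, $\dot{\theta}$ its angular velocity and $\tau$ the applied torque.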

## My solution is based on this pseudocode
![Pseudo code](./q-learning.jpg)

In [22]:
import gymnasium as gym
import numpy as np
from matplotlib import pyplot as plt

# When True, training resumes from a Q-table loaded from disk and starts with a
# lower exploration rate; when False, it starts from a randomly initialised
# Q-table with epsilon = 1.0.
PRELOAD = True

In [8]:
# Training environment. To watch the pendulum while it trains, create the
# environment with rendering enabled instead:
# env = gym.make('Pendulum-v1', render_mode="human", max_episode_steps=600); env.metadata['render_fps'] = 60
env = gym.make('Pendulum-v1', max_episode_steps=600)

# Q-learning hyper-parameters.
learning_rate = 0.1   # step size of every Q-value update
discount_rate = 0.95  # weight of the best next-state Q-value in the target

# Epsilon-greedy exploration schedule: epsilon is multiplied by epsilon_decay
# until it drops to epsilon_min. A preloaded table starts with less exploration.
epsilon_decay = 0.999
if not PRELOAD:
    epsilon = 1.0
    epsilon_min = 0.1
else:
    epsilon = 0.15
    epsilon_min = 0.05

episodes = 150000
total_reward = 0  # reward summed over all episodes

# Number of discrete torques: 41 evenly spaced values over [-2, 2] (step 0.1).
# Indices 0..19 -> [-2.0, -0.1], index 20 -> 0.0, indices 21..40 -> [0.1, 2.0].
action_space_size = 41

# Number of grid points per state dimension: [angle, angular velocity].
# Angle: 63 points over [-pi, pi] (step 2*pi/62, about 0.101); index 31 -> 0.
# Angular velocity: 161 points over [-8, 8] (step 0.1); index 80 -> 0.
observation_space_size = [63, 161]

# Per-episode reward history, used for the learning-curve plot.
total_reward_for_episode = []

In [9]:
# Discrete torque values the agent can choose from.
action_space = np.linspace(-2, 2, num=action_space_size)

# Grid edges used to discretise an observation: [angle edges, velocity edges].
observation_space = [np.linspace(-np.pi, np.pi, num=observation_space_size[0]),
                     np.linspace(-8.0, 8.0, num=observation_space_size[1])]

# Q-table layout: (angle bin, angular-velocity bin, action index).
expected_q_shape = tuple(observation_space_size) + (action_space_size,)

if not PRELOAD:
    # The reward is never positive, so start from small negative estimates.
    q_table = np.random.uniform(low=-2, high=-0, size=expected_q_shape)
else:
    q_table = np.load("D:/Programming/MachineLearning2/Zadanie1/TD_q_table_pendulum.npy")
    # Fail early with a clear message instead of mis-indexing (or raising an
    # IndexError in the middle of training) when the saved table was produced
    # with a different discretisation.
    if q_table.shape != expected_q_shape:
        raise ValueError(
            "Loaded Q-table has shape {} but the configured discretisation "
            "requires {}".format(q_table.shape, expected_q_shape)
        )

In [25]:
def get_discrete_action(state):
    """Return the Q-values of all discrete actions for an observation.

    Despite its name, this does not pick an action: it returns the slice of
    ``q_table`` for the discretised state, and callers apply ``np.argmax``.

    Args:
        state: Observation indexed as ``state[0]``, ``state[1]`` (the two
            angle components, cos and sin per the notebook description) and
            ``state[2]`` (angular velocity).

    Returns:
        ``q_table[angle_bin, velocity_bin]``, one Q-value per action.
    """
    angle_edges = observation_space[0]
    velocity_edges = observation_space[1]

    # NOTE(review): np.arctan2 takes (y, x) but receives (cos, sin) here, so
    # the value is pi/2 - theta (wrapped) rather than theta. It is still a
    # one-to-one encoding of the angle; changing the order would make
    # previously saved Q-tables incompatible.
    angle = np.arctan2(state[0], state[1])

    # digitize() - 1 is the index of the last grid edge that is <= the value.
    angle_bin = np.digitize(angle, angle_edges) - 1
    velocity_bin = np.digitize(state[2], velocity_edges) - 1
    return q_table[angle_bin, velocity_bin]

In [26]:
def get_indexes(state):
    """Map an observation to its ``[angle_bin, velocity_bin]`` Q-table indices.

    Args:
        state: Observation indexed as ``state[0]``, ``state[1]`` (the two
            angle components, cos and sin per the notebook description) and
            ``state[2]`` (angular velocity).

    Returns:
        A two-element list: the angle bin and the angular-velocity bin.
    """
    # NOTE(review): np.arctan2 takes (y, x) but receives (cos, sin) here, so
    # the value is pi/2 - theta (wrapped) rather than theta. It is still a
    # one-to-one encoding of the angle; changing the order would make
    # previously saved Q-tables incompatible.
    angle = np.arctan2(state[0], state[1])

    # digitize() - 1 is the index of the last grid edge that is <= the value.
    return [
        np.digitize(angle, observation_space[0]) - 1,
        np.digitize(state[2], observation_space[1]) - 1,
    ]

In [12]:
# Tabular Q-learning: run `episodes` episodes, updating q_table after every step.
for episode in range(1, episodes + 1):
    # env.reset() returns (observation, info); only the observation is used.
    state, info = env.reset()
    done = False
    episode_reward = 0

    while not done:
        index_state, index_velocity = get_indexes(state)

        # Epsilon-greedy choice: exploit the best known action, or explore
        # with probability epsilon.
        if np.random.random() > epsilon:
            action = np.argmax(q_table[index_state, index_velocity])
        else:
            action = np.random.randint(0, action_space_size)

        # The environment expects a one-element action holding the torque.
        observation, reward, terminated, truncated, info = env.step([action_space[action]])

        total_reward += reward
        episode_reward += reward

        # Q-learning update:
        # Q(s, a) += lr * (r + gamma * max_a' Q(s', a') - Q(s, a))
        new_index_state, new_index_velocity = get_indexes(observation)
        current_q = q_table[index_state, index_velocity, action]
        best_next_q = np.max(q_table[new_index_state, new_index_velocity])
        td_target = reward + discount_rate * best_next_q
        q_table[index_state, index_velocity, action] = (
            current_q + learning_rate * (td_target - current_q)
        )
        state = observation

        # Decay exploration. NOTE(review): this runs once per step, not once
        # per episode, so epsilon reaches epsilon_min after a few thousand
        # steps - confirm this is intended.
        if epsilon > epsilon_min:
            epsilon *= epsilon_decay

        # The episode ends on termination or when the step limit is hit. The
        # next iteration of the outer loop resets the environment.
        done = terminated or truncated

    # Record the episode so the learning curve can be plotted afterwards.
    total_reward_for_episode.append(episode_reward)

    # Progress is printed after every episode; wrap these prints in
    # `if episode % 1000 == 0:` to report less often.
    print("Episode: {}".format(episode))
    print("Total reward = {}".format(total_reward))
    print("Average reward = {}".format(total_reward/episode))

Episode: 1
Total reward = -243.1905086026395
Average reward = -243.1905086026395


KeyboardInterrupt: 

In [None]:
# Learning curve: total reward collected in each training episode.
axes = plt.gca()
axes.plot(total_reward_for_episode)
axes.set_xlabel("Episode")
axes.set_ylabel("Reward")
plt.show()

In [None]:
# Persist the learned Q-table; np.save appends ".npy" because the name has no
# such suffix.
# NOTE(review): the loading cell reads ".../TD_q_table_pendulum.npy", whereas
# this writes ".../q_table_pendulum.npy" - confirm whether the file is renamed
# by hand or the two names should match.
np.save("D:/Programming/MachineLearning2/Zadanie1/q_table_pendulum", q_table)

# TESTING

In [46]:
# Stand-alone experiment: a coarser discretisation trained from scratch.
env = gym.make('Pendulum-v1', max_episode_steps=600)

# Q-learning hyper-parameters.
learning_rate = 0.2   # step size of every Q-value update
discount_rate = 0.95  # weight of the best next-state Q-value in the target

# Epsilon-greedy exploration schedule: epsilon is multiplied by epsilon_decay
# until it drops to epsilon_min.
epsilon = 1.0
epsilon_min = 0.1
epsilon_decay = 0.999

total_reward = 0  # reward summed over all episodes

# Number of discrete torques: 21 evenly spaced values over [-2, 2] (step 0.2).
# Indices 0..9 -> [-2.0, -0.2], index 10 -> 0.0, indices 11..20 -> [0.2, 2.0].
action_space_size = 21

# Number of grid points per state dimension: [angle, angular velocity].
# Angle: 31 points over [-pi, pi] (step 2*pi/30, about 0.209); index 15 -> 0.
# Angular velocity: 81 points over [-8, 8] (step 0.2); index 40 -> 0.
observation_space_size = [31, 81]

episodes = 50000
# Per-episode reward history.
total_reward_for_episode = []

# Discrete torque values the agent can choose from.
action_space = np.linspace(-2, 2, num=action_space_size)
# Grid edges used to discretise an observation: [angle edges, velocity edges].
observation_space = [np.linspace(-np.pi, np.pi, num=observation_space_size[0]),
                     np.linspace(-8.0, 8.0, num=observation_space_size[1])]

# Q-table layout: (angle bin, angular-velocity bin, action index), initialised
# with small negative values.
q_table = np.random.uniform(low=-2, high=-0, size=(observation_space_size + [action_space_size]))

def get_discrete_action(state):
    """Return the Q-values of all discrete actions for an observation.

    Despite its name, this does not pick an action: it returns the slice of
    ``q_table`` for the discretised state, and callers apply ``np.argmax``.

    Args:
        state: Observation indexed as ``state[0]``, ``state[1]`` (the two
            angle components, cos and sin per the notebook description) and
            ``state[2]`` (angular velocity).

    Returns:
        ``q_table[angle_bin, velocity_bin]``, one Q-value per action.
    """
    angle_edges = observation_space[0]
    velocity_edges = observation_space[1]

    # NOTE(review): np.arctan2 takes (y, x) but receives (cos, sin) here, so
    # the value is pi/2 - theta (wrapped) rather than theta. It is still a
    # one-to-one encoding of the angle; changing the order would make
    # previously saved Q-tables incompatible.
    angle = np.arctan2(state[0], state[1])

    # digitize() - 1 is the index of the last grid edge that is <= the value.
    angle_bin = np.digitize(angle, angle_edges) - 1
    velocity_bin = np.digitize(state[2], velocity_edges) - 1
    return q_table[angle_bin, velocity_bin]

def get_indexes(state):
    """Map an observation to its ``[angle_bin, velocity_bin]`` Q-table indices.

    Args:
        state: Observation indexed as ``state[0]``, ``state[1]`` (the two
            angle components, cos and sin per the notebook description) and
            ``state[2]`` (angular velocity).

    Returns:
        A two-element list: the angle bin and the angular-velocity bin.
    """
    # NOTE(review): np.arctan2 takes (y, x) but receives (cos, sin) here, so
    # the value is pi/2 - theta (wrapped) rather than theta. It is still a
    # one-to-one encoding of the angle; changing the order would make
    # previously saved Q-tables incompatible.
    angle = np.arctan2(state[0], state[1])

    # digitize() - 1 is the index of the last grid edge that is <= the value.
    return [
        np.digitize(angle, observation_space[0]) - 1,
        np.digitize(state[2], observation_space[1]) - 1,
    ]

# Tabular Q-learning: run `episodes` episodes, updating q_table after every step.
for episode in range(1, episodes + 1):
    # env.reset() returns (observation, info); only the observation is used.
    state, info = env.reset()
    done = False
    episode_reward = 0

    while not done:
        index_state, index_velocity = get_indexes(state)

        # Epsilon-greedy choice: exploit the best known action, or explore
        # with probability epsilon.
        if np.random.random() > epsilon:
            action = np.argmax(q_table[index_state, index_velocity])
        else:
            action = np.random.randint(0, action_space_size)

        # The environment expects a one-element action holding the torque.
        observation, reward, terminated, truncated, info = env.step([action_space[action]])

        total_reward += reward
        episode_reward += reward

        # Q-learning update:
        # Q(s, a) += lr * (r + gamma * max_a' Q(s', a') - Q(s, a))
        new_index_state, new_index_velocity = get_indexes(observation)
        current_q = q_table[index_state, index_velocity, action]
        best_next_q = np.max(q_table[new_index_state, new_index_velocity])
        td_target = reward + discount_rate * best_next_q
        q_table[index_state, index_velocity, action] = (
            current_q + learning_rate * (td_target - current_q)
        )
        state = observation

        # Decay exploration. NOTE(review): this runs once per step, not once
        # per episode, so epsilon reaches epsilon_min after a few thousand
        # steps - confirm this is intended.
        if epsilon > epsilon_min:
            epsilon *= epsilon_decay

        # The episode ends on termination or when the step limit is hit.
        done = terminated or truncated

    total_reward_for_episode.append(episode_reward)

    # Periodic progress report.
    if episode % 10000 == 0:
        print("Episode: {}".format(episode))
        print("Total reward = {}".format(total_reward))
        print("Average reward = {}".format(total_reward/episode))

print("First episode reward: {}".format(total_reward_for_episode[0]))
print("Last episode reward: {}".format(total_reward_for_episode[-1]))


Episode: 10000
Total reward = -15763026.519013282
Average reward = -1576.302651901328
Episode: 20000
Total reward = -21875445.327487525
Average reward = -1093.7722663743762
Episode: 30000
Total reward = -27076109.09935655
Average reward = -902.5369699785517
Episode: 40000
Total reward = -31955342.164262284
Average reward = -798.8835541065571
Episode: 50000
Total reward = -36921107.9489731
Average reward = -738.422158979462
First episode reward: -4874.387506418005
Last episode reward: -137.03893604444082


## Test data in tabular form

| Episodes | LR       | DR       | Avg. reward | Total reward | action space size | observation space size | First episode reward | Last episode reward |
|-------|---|---|---|---|---|------------------------|----------------------|---------------------|
| 10000 | 0.1 | 0.9 | -2921 | -29214268    | 41 | 63, 161                | -3536                |                     |
| 20000 | 0.1 | 0.9 | -2631 | -52637180    | 41 | 63, 161                |
| 30000 | 0.1 | 0.9 | -2422 | -72678361    | 41 | 63, 161                |
| 40000 | 0.1 | 0.9 | -2244 | -89771691    | 41 | 63, 161                |
| 50000 | 0.1 | 0.9 | -2091 | -104558145   | 41 | 63, 161                |                      | -607                |

| Episodes | LR    | DR    | Avg. reward | Total reward | action space size | observation space size | First episode reward | Last episode reward |
|----------|-------|-------|-------------|--------------|-------------------|------------------------|----------------------|---------------------|
| 10000    | 0.1   | 0.9   | -2144       | -21443769    | 21                | 31, 81                 | -3693                |                     |
| 20000    | 0.1   | 0.9   | -1545       | -30907904    | 21                | 31, 81                 |
| 30000    | 0.1   | 0.9   | -1299       | -38977437    | 21                | 31, 81                 |
| 40000    | 0.1   | 0.9   | -1156       | -46276821    | 21                | 31, 81                 |
| 50000    | 0.1   | 0.9   | -1066       | -53308885    | 21                | 31, 81                 |                      | -906                |

| Episodes | LR  | DR   | Avg. reward | Total reward | action space size | observation space size | First episode reward | Last episode reward |
|-------|-----|------|-------------|--------------|---|------------------------|----------------------|---------------------|
| 10000 | 0.2 | 0.95 | -2739       | -27392880    | 41 | 63, 161                | -2823                |                     |
| 20000 | 0.2 | 0.95 | -2305       | -46105374    | 41 | 63, 161                |
| 30000 | 0.2 | 0.95 | -2001       | -60039006    | 41 | 63, 161                |
| 40000 | 0.2 | 0.95 | -1769       | -70791353    | 41 | 63, 161                |
| 50000 | 0.2 | 0.95 | -1586       | -79316870    | 41 | 63, 161                |                      | -869                |

| Episodes | LR  | DR   | Avg. reward | Total reward | action space size | observation space size | First episode reward | Last episode reward |
|-------|-----|------|-------------|--------------|-------------------|------------------------|----------------------|---------------------|
| 10000 | 0.2 | 0.95 | -1576       | -15763026    | 21                | 31, 81                | -4874                |                     |
| 20000 | 0.2 | 0.95 | -1093       | -21875445    | 21                | 31, 81                |
| 30000 | 0.2 | 0.95 | -902        | -27076109    | 21                | 31, 81                |
| 40000 | 0.2 | 0.95 | -798        | -31955342    | 21                | 31, 81                |
| 50000 | 0.2 | 0.95 | -738        | -36921107    | 21                | 31, 81                |                      | -137                |