<a href="https://colab.research.google.com/github/HafizAQ/ML_Practices/blob/main/RL_from_Scratch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Courtesy Reference: https://medium.com/@wangdk93/reinforcement-learning-from-scratch-ee9b7218e70d

#Reinforcement Learning from Scratch

In [2]:
!pip install --upgrade matplotlib

Collecting matplotlib
  Downloading matplotlib-3.9.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Downloading matplotlib-3.9.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (8.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.3/8.3 MB[0m [31m44.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: matplotlib
  Attempting uninstall: matplotlib
    Found existing installation: matplotlib 3.8.0
    Uninstalling matplotlib-3.8.0:
      Successfully uninstalled matplotlib-3.8.0
Successfully installed matplotlib-3.9.3


In [3]:
!pip show matplotlib

Name: matplotlib
Version: 3.9.3
Summary: Python plotting package
Home-page: https://matplotlib.org
Author: John D. Hunter, Michael Droettboom
Author-email: Unknown <matplotlib-users@python.org>
License: License agreement for matplotlib versions 1.3.0 and later
         
         1. This LICENSE AGREEMENT is between the Matplotlib Development Team
         ("MDT"), and the Individual or Organization ("Licensee") accessing and
         otherwise using matplotlib software in source or binary form and its
         associated documentation.
         
         2. Subject to the terms and conditions of this License Agreement, MDT
         hereby grants Licensee a nonexclusive, royalty-free, world-wide license
         to reproduce, analyze, test, perform and/or display publicly, prepare
         derivative works, distribute, and otherwise use matplotlib
         alone or in any derivative version, provided, however, that MDT's
         License Agreement and MDT's notice of copyright, i.e., "C

In [None]:
# from matplotlib.dates import epoch2num
from matplotlib.dates import num2date
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from collections import deque
import random
from torch.distributions import Categorical  # Import Categorical

bounds = [100, 100]  # Map size


def generate_simulation_data(num_samples):
    data = []
    bounds = [100, 100]  # Map size
    rock_radius = 5  # radius of stone

    for _ in range(num_samples):
        rock_position = np.random.uniform(rock_radius, bounds[0] - rock_radius, size=2) #Make sure that the starting position of the car is at least a certain distance from the stone. Here we set it to twice the radius of the stone
        min_distance = rock_radius * 2
        car_position = np.random.uniform(0, bounds, size=2)
        while np.linalg.norm(car_position - rock_position) < min_distance:
            car_position = np.random.uniform(0, bounds, size=2) # Each sample includes cart location and stone location
        data.append((car_position, rock_position))
    return data


# Generate simulation data
num_samples = 1000
simulation_data = generate_simulation_data(num_samples)


# %%
class CarRockEnv:

    def __init__(self,
                 rock_position,
                 rock_radius,
                 bounds=bounds,
                 max_steps=100,
                 number_of_rocks=20):
        self.rock_position = np.array(rock_position)
        self.rock_radius = rock_radius
        self.max_steps = max_steps
        self.step_count = 0
        self.number_of_rocks = number_of_rocks
        self.bounds = bounds
        self.move_map = {0: [1, 0], 1: [-1, 0], 2: [0, 1], 3: [0, -1]}
        # Initialization code here

    def reset(self):
        # Place multiple rocks randomly in the environment
        self.rocks = [np.random.uniform(0, 100, size=2) for _ in range(self.number_of_rocks)]
        self.car_position = np.array([0, 0])  # Reset car position
        return self.state()

    def state(self):
        return np.concatenate((self.car_position / self.bounds, self.rock_position / self.bounds))

    def step(self, action):
        # Apply action to update car position
        self.car_position += self.move_map[action]
        self.car_position = np.clip(self.car_position, [0, 0], self.bounds)
        reward = -0.1  # Small penalty for taking a step, to encourage efficiency

        # Determine the minimum distance to any rock
        min_distance = min(np.linalg.norm(self.car_position - rock) for rock in self.rocks)
        max_safe_distance = 30  # Define a max safe distance threshold

        # Check collision and distance penalties
        if min_distance < self.rock_radius:
            reward -= 10  # Significant penalty for collision
            done = True  # End the episode on collision
        elif min_distance > max_safe_distance:
            reward -= 5  # Penalty for being too far from the nearest rock
            done = False  # Continue the episode
        else:
            # Optionally reward being within a safe range
            reward += 1 - (min_distance / max_safe_distance)  # Scale reward based on closeness
            done = False

        return self.state(), reward, done, {}

    def render(self):
        # Optional: Implement rendering to visualize the environment
        pass


# %%
class PolicyNetwork(nn.Module):

    def __init__(self, input_dim, output_dim):
        super().__init__()
        self.network = nn.Sequential(
            nn.Linear(input_dim, 128),
            nn.ReLU(),
            nn.Linear(128, output_dim))
        self.softmax = nn.Softmax(dim = -1)

    def forward(self, status):
        output = self.network(status)
        a = self.softmax(output)
        return a


# %%


def compute_discounted_rewards(rewards, gamma=0.99):
    discounted_rewards = []
    R = 0
    for r in reversed(rewards):
        R = r + gamma * R
        discounted_rewards.insert(0, R)
    return discounted_rewards


def evaluate_policy(policy_net, env, episodes=10):
    total_rewards = 0
    for i in range(episodes):
        state = env.reset()
        done = False
        episode_reward = 0
        while not done:
            state_tensor = torch.FloatTensor(state).unsqueeze(0)
            with torch.no_grad():
                action_probs = policy_net(state_tensor)
            action = torch.argmax(action_probs).item()
            next_state, reward, done, _ = env.step(action)
            episode_reward += reward
            state = next_state
        total_rewards += episode_reward

    average_reward = total_rewards / episodes
    return average_reward

def train(env, policy, optimizer, episodes=1000):

    for episode in range(episodes):
        print("episode", episode)
        state = env.reset()
        log_probs = []
        rewards = []
        done = False

        while not done:
            state_tensor = torch.FloatTensor(state).unsqueeze(0)  # car pos, rock pos
            probs = policy(state_tensor)
            m = Categorical(probs)
            action = m.sample()
            state, reward, done, _ = env.step(action.item())

            log_probs.append(m.log_prob(action))
            rewards.append(reward)

            if done:
                discounted_rewards = compute_discounted_rewards(rewards)
                policy_loss = []
                for log_prob, Gt in zip(log_probs, discounted_rewards):
                    policy_loss.append(-log_prob * Gt) # why ?
                optimizer.zero_grad()
                policy_loss = torch.cat(policy_loss).sum()
                policy_loss.backward()
                optimizer.step()  # what is this, why
                print("action", int(action))
                print("Loss", int(policy_loss))

                if episode % 50 == 0:
                    print(f"Episode {episode}, Total Reward: {sum(rewards)}")
                break
    return policy


# %%
# Initialize the environment and policy
env = CarRockEnv(rock_position=[50, 50], rock_radius=5)
policy = PolicyNetwork(input_dim=4, output_dim=4)
optimizer = optim.Adam(policy.parameters(), lr=1e-4)


train(env, policy, optimizer)

evaluate_policy(policy_net=policy, env=env, episodes=10)

episode 0
action 0
Loss -19666
Episode 0, Total Reward: -444.11267070088417
episode 1
action 1
Loss 16761
episode 2
action 0
Loss 37112
episode 3
action 0
Loss 1947
episode 4
action 2
Loss 101
episode 5
action 2
Loss 16574
episode 6
action 2
Loss -35
episode 7
action 0
Loss 8956
episode 8
action 0
Loss 3273
episode 9
action 2
Loss -81
episode 10
action 0
Loss 749
episode 11
action 0
Loss 2886
episode 12
