Explanation:
Reinforcement Learning (RL) is a type of machine learning where an agent learns to make decisions by interacting with an environment. The agent receives rewards or penalties based on the actions it takes, aiming to maximize cumulative rewards over time. Key concepts include:

Agent: The decision-maker.

Environment: The world with which the agent interacts.

State: A representation of the current situation of the environment.

Action: A choice the agent can make in a given state.

Reward: The feedback the agent receives after taking an action.

Policy: A strategy that the agent follows to determine actions based on the state.

Q-Learning: A popular RL algorithm that learns a policy by iteratively updating Q-values, which estimate the expected cumulative (discounted) reward of taking an action in a given state and acting greedily afterwards.

In [2]:
import numpy as np
import random

# Define the environment
class SimpleEnvironment:
    """A one-dimensional corridor of ``size`` cells.

    The agent starts in cell 0. Action 1 moves one cell to the right and
    action 0 moves one cell to the left; moves that would leave the corridor
    are ignored, as is any other action value. Being in the last cell
    (``size - 1``) yields a reward of 1 and ends the episode.
    """

    def __init__(self, size=5):
        self.size = size
        self.state = 0  # Current cell index; every episode begins at 0

    def reset(self):
        """Put the agent back in cell 0 and return that state."""
        self.state = 0
        return self.state

    def step(self, action):
        """Apply ``action`` and return ``(state, reward, done)``."""
        goal = self.size - 1

        if action == 1:
            # Move right unless already at the right-hand wall.
            if self.state < goal:
                self.state += 1
        elif action == 0:
            # Move left unless already at the left-hand wall.
            if self.state > 0:
                self.state -= 1

        # The episode ends (with reward 1) exactly when the goal is occupied.
        done = self.state == goal
        reward = 1 if done else 0
        return self.state, reward, done

# Q-Learning algorithm
class QLearningAgent:
    """Tabular Q-learning agent with an epsilon-greedy policy.

    Args:
        n_states: Number of discrete states (rows of the Q-table).
        n_actions: Number of discrete actions (columns of the Q-table).
        alpha: Learning rate applied to each temporal-difference update.
        gamma: Discount factor applied to the next state's best Q-value.
        epsilon: Probability of picking a random action instead of the
            greedy one.
    """

    def __init__(self, n_states, n_actions, alpha=0.1, gamma=0.9, epsilon=0.1):
        self.q_table = np.zeros((n_states, n_actions))
        self.alpha = alpha  # Learning rate
        self.gamma = gamma  # Discount factor
        self.epsilon = epsilon  # Exploration factor

    def choose_action(self, state):
        """Return an action index for ``state`` using epsilon-greedy selection.

        With probability ``epsilon`` a uniformly random action is drawn from
        all actions in the Q-table; otherwise the action with the highest
        Q-value is returned (ties resolve to the lowest index, per
        ``np.argmax``).
        """
        if random.random() < self.epsilon:
            # Explore over every action the table knows about, not just 0/1.
            return random.randrange(self.q_table.shape[1])
        # Exploit: convert to a plain int so callers never see a NumPy scalar.
        return int(np.argmax(self.q_table[state]))

    def update_q_table(self, state, action, reward, next_state, done=False):
        """Apply one Q-learning update for the observed transition.

        Args:
            state: State in which ``action`` was taken.
            action: Action index that was taken.
            reward: Reward received for the transition.
            next_state: State reached after the action.
            done: Set to True when ``next_state`` is terminal, so no future
                value is bootstrapped from it. Defaults to False, which
                always bootstraps from ``next_state``.
        """
        predict = self.q_table[state, action]
        future = 0.0 if done else np.max(self.q_table[next_state])
        target = reward + self.gamma * future
        self.q_table[state, action] += self.alpha * (target - predict)

# Main training loop
# Build a 5-cell corridor and an agent whose Q-table has one row per cell
# and one column per action (0 = left, 1 = right).
env = SimpleEnvironment(size=5)
agent = QLearningAgent(n_states=5, n_actions=2)

# Run 100 episodes; each one starts from the reset state and continues until
# the environment reports that the episode is finished.
for episode in range(100):
    state = env.reset()

    while True:
        action = agent.choose_action(state)
        next_state, reward, done = env.step(action)
        # Learn from the transition before advancing to the new state.
        agent.update_q_table(state, action, reward, next_state)
        state = next_state
        if done:
            break

print("Trained Q-Table:")
print(agent.q_table)

Trained Q-Table:
[[0.17445926 0.72442911]
 [0.08726795 0.80905147]
 [0.34888804 0.89977085]
 [0.26769819 0.99997344]
 [0.         0.        ]]
