<a href="https://colab.research.google.com/github/Mehak-shahani/my-projects-/blob/main/Q_learning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Importing necessary libraries
import gym  # For simulation environment
import numpy as np  # For numerical operations
import random  # For random operations in exploration-exploitation
import torch  # For neural networks and tensors
import torch.nn as nn  # For building deep neural networks
import torch.optim as optim  # For optimization
import matplotlib.pyplot as plt  # For plotting graphs


In [2]:
# Create the CartPole environment from OpenAI Gym
env = gym.make('CartPole-v1')

# Seed the random sources used for exploration and for the environment so that
# a fresh "Restart & Run All" reproduces the same training run.
SEED = 42
random.seed(SEED)
env.action_space.seed(SEED)

# Reset to an initial state and show it (no action is taken in this cell).
reset_result = env.reset(seed=SEED)
# Depending on the gym release, reset() returns either the observation alone
# or an (observation, info) tuple; keep only the observation in `state`.
state = reset_result[0] if isinstance(reset_result, tuple) else reset_result
# NOTE: gym reports this call as deprecated; the recommended way to get a
# human-mode window is gym.make('CartPole-v1', render_mode='human').
env.render()  # Show the environment's visual


  deprecation(
  deprecation(
If you want to render in human mode, initialize the environment in this way: gym.make('EnvName', render_mode='human') and don't call the render method.
See here for more information: https://www.gymlibrary.ml/content/api/[0m
  deprecation(
  from pkg_resources import resource_stream, resource_exists
Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages
  declare_namespace(pkg)
Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages
  declare_namespace(pkg)
Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-name

In [6]:
# Define the Q-learning agent class
class QLearningAgent:
    """Tabular Q-learning agent with an epsilon-greedy policy.

    States and actions are integer indices into ``q_table``, a
    ``(state_space, action_space)`` array of action-value estimates that
    starts at zero.

    Args:
        action_space: Number of discrete actions (columns of the Q-table).
        state_space: Number of discrete states (rows of the Q-table).
        learning_rate: Step size applied to each temporal-difference error.
        discount_factor: Weight given to the estimated value of the next state.
        exploration_rate: Initial probability of picking a random action.
        exploration_decay: Factor the exploration rate is multiplied by on
            every call to ``decay_exploration``.
    """

    def __init__(self, action_space, state_space, learning_rate=0.1, discount_factor=0.99, exploration_rate=1.0, exploration_decay=0.995):
        self.action_space = action_space
        self.state_space = state_space
        self.learning_rate = learning_rate
        self.discount_factor = discount_factor
        self.exploration_rate = exploration_rate
        self.exploration_decay = exploration_decay
        self.q_table = np.zeros((state_space, action_space))  # Initialize the Q-table with zeros

    def choose_action(self, state):
        """Return an action index for ``state`` using epsilon-greedy selection.

        With probability ``exploration_rate`` a uniformly random action is
        returned; otherwise the action with the highest Q-value for ``state``
        (ties resolve to the lowest index, as with ``np.argmax``).
        """
        if random.uniform(0, 1) < self.exploration_rate:
            # Exploration: sample from the agent's own action count rather than
            # a global environment, so the class works with any environment.
            return random.randrange(int(self.action_space))
        # Exploitation: action with the highest Q-value
        return int(np.argmax(self.q_table[state]))

    def update_q_table(self, state, action, reward, next_state, done=False):
        """Apply one Q-learning update for the observed transition.

        Args:
            state: Index of the state the action was taken in.
            action: Index of the action taken.
            reward: Reward received for the transition.
            next_state: Index of the resulting state.
            done: Set to True when ``next_state`` is terminal; the update then
                uses the reward alone instead of bootstrapping from
                ``next_state``. Defaults to False, which always bootstraps.
        """
        future_value = 0.0 if done else np.max(self.q_table[next_state])
        td_target = reward + self.discount_factor * future_value
        td_error = td_target - self.q_table[state, action]
        self.q_table[state, action] += self.learning_rate * td_error

    def decay_exploration(self):
        """Multiply the exploration rate by ``exploration_decay``."""
        self.exploration_rate *= self.exploration_decay

In [7]:
# Q-table dimensions. CartPole observations are continuous, so training maps
# each observation onto one of `state_space` integer indices; the number of
# columns comes straight from the environment's discrete action space.
action_space = env.action_space.n  # Number of possible actions in the environment
state_space = 100  # Number of discrete state indices (rows of the Q-table)

# Build the agent with a zero-initialised (state_space x action_space) Q-table
agent = QLearningAgent(action_space, state_space)

In [10]:
# Train the Q-learning agent
#
# NOTE: this cell keeps training whatever `agent` currently exists, so running
# it again continues from the already-decayed exploration rate. Re-run the cell
# that creates the agent first to start from scratch.
episodes = 1000

# Bin edges are computed once instead of on every step. Using the 9 interior
# edges of each range gives exactly N_BINS bins per feature, so np.digitize
# returns 0..N_BINS-1 (values outside the range fall into the outermost bins)
# and the flat index below always lies in 0..99, inside the 100-row Q-table.
N_BINS = 10
POSITION_EDGES = np.linspace(-2.4, 2.4, N_BINS + 1)[1:-1]
VELOCITY_EDGES = np.linspace(-3, 3, N_BINS + 1)[1:-1]


def discretize_state(observation):
    """Map a continuous observation to a flat Q-table row index.

    Only the first two observation components are used, each split into
    N_BINS bins; the result is ``first_bin * N_BINS + second_bin``.

    NOTE(review): the remaining observation components are ignored. For
    CartPole these are presumably the pole angle and angular velocity, which
    would explain why the logged rewards stay flat - confirm against the
    environment documentation.
    """
    position_bin = int(np.digitize(observation[0], POSITION_EDGES))
    velocity_bin = int(np.digitize(observation[1], VELOCITY_EDGES))
    return position_bin * N_BINS + velocity_bin


for episode in range(episodes):
    # reset() returns the observation alone or an (observation, info) tuple,
    # depending on the gym release.
    reset_result = env.reset()
    observation = reset_result[0] if isinstance(reset_result, tuple) else reset_result
    state = discretize_state(observation)
    total_reward = 0

    done = False
    while not done:
        action = agent.choose_action(state)  # Choose action based on policy

        # step() returns 4 values on older gym releases and 5 on newer ones
        # (terminated and truncated reported separately).
        step_result = env.step(action)
        if len(step_result) == 5:
            next_observation, reward, terminated, truncated, info = step_result
            done = terminated or truncated
        else:
            next_observation, reward, done, info = step_result
        next_state = discretize_state(next_observation)

        agent.update_q_table(state, action, reward, next_state)  # Update Q-table
        state = next_state  # Move to the next state

        total_reward += reward  # Accumulate reward

    # Decay the exploration rate after each episode
    agent.decay_exploration()
    # Print progress every 100 episodes
    if (episode + 1) % 100 == 0:
        print(f"Episode {episode + 1}: Total Reward = {total_reward}, Exploration Rate = {agent.exploration_rate:.4f}")


Episode 100: Total Reward = 23.0, Exploration Rate = 0.0040
Episode 200: Total Reward = 22.0, Exploration Rate = 0.0024
Episode 300: Total Reward = 15.0, Exploration Rate = 0.0015
Episode 400: Total Reward = 20.0, Exploration Rate = 0.0009
Episode 500: Total Reward = 15.0, Exploration Rate = 0.0005
Episode 600: Total Reward = 23.0, Exploration Rate = 0.0003
Episode 700: Total Reward = 17.0, Exploration Rate = 0.0002
Episode 800: Total Reward = 20.0, Exploration Rate = 0.0001
Episode 900: Total Reward = 21.0, Exploration Rate = 0.0001
Episode 1000: Total Reward = 15.0, Exploration Rate = 0.0000
