# Practical Work - Implicit Q-Learning

<div class="alert alert-info">

...

<br>

...

<br>

...

</div>

### Imports and auxiliary settings

In [1]:
!apt update
!pip install swig
!pip install gym gym[box2d]
!pip install stable-baselines3[extra]

[33m0% [Working][0m            Get:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,626 B]
[33m0% [Connecting to archive.ubuntu.com] [Connecting to security.ubuntu.com (185.125.190.39)] [1 InRele[0m[33m0% [Connecting to archive.ubuntu.com] [Connecting to security.ubuntu.com (185.125.190.39)] [Connecti[0m                                                                                                    Hit:2 http://archive.ubuntu.com/ubuntu jammy InRelease
[33m0% [Waiting for headers] [Waiting for headers] [Connecting to ppa.launchpadcontent.net] [Waiting for[0m                                                                                                    Get:3 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [119 kB]
Get:4 http://security.ubuntu.com/ubuntu jammy-security InRelease [110 kB]
Hit:5 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
Hit:6 http://archive.ubuntu.com/ubuntu jammy-backpor

In [2]:
import gym
import numpy as np
import pickle
import random

from stable_baselines3 import PPO
from stable_baselines3.common.env_util import make_vec_env

from tqdm import tqdm

# Set up Google Drive mount to store your results

In [3]:
# Flip to False when running outside Colab or when results need not be
# persisted to Google Drive.
use_google_drive = True

if use_google_drive:
    # Imported behind the flag so the rest of the notebook does not depend on
    # the google.colab package being importable.
    import google.colab.drive as drive

    drive.mount('/content/drive')

  and should_run_async(code)


Mounted at /content/drive


# Data Collection with Online RL

In [40]:
def save_dataset(dataset, filename):
    """Serialize `dataset` with pickle and write it to `filename`.

    Args:
        dataset: Any picklable object (in this notebook, a list of transition tuples).
        filename: Destination path; an existing file is overwritten.
    """
    with open(filename, 'wb') as sink:
        pickle.Pickler(sink).dump(dataset)

def load_dataset(filename):
    """Read back an object previously written with pickle.

    Args:
        filename: Path of the pickle file to read.

    Returns:
        The unpickled object.

    Note:
        Unpickling can execute arbitrary code; only load files you created
        yourself or otherwise trust.
    """
    with open(filename, 'rb') as source:
        loaded = pickle.Unpickler(source).load()
    return loaded

# Create Environment

In [41]:
# Create and wrap the environment
env_id = 'LunarLanderContinuous-v2'

# Seed the sources of randomness this notebook draws from directly, so that
# re-running it from a fresh kernel regenerates the same random transitions.
SEED = 42
random.seed(SEED)       # used by random.sample when building the mixed dataset
np.random.seed(SEED)    # used for the epsilon draws of the noisy policy

# Vectorized environment with a single sub-environment: observations, rewards
# and done flags returned by it carry a leading dimension of size 1.
env = make_vec_env(env_id, n_envs=1, seed=SEED)
env.action_space.seed(SEED)  # action_space.sample() draws from its own RNG

# NOTE(review): PPO training takes its own `seed` argument; unless it is set,
# the expert-derived datasets will still differ between runs.

## 1. Random Dataset

In [42]:
def generate_random_dataset(env, num_episodes=1000):
    """Collect transitions by sampling actions uniformly from the action space.

    Args:
        env: Vectorized environment with a single sub-environment (everything
            it returns is indexed with ``[0]`` here).
        num_episodes: Number of episodes to roll out.

    Returns:
        list of ``(obs, action, reward, next_obs, done)`` tuples with the
        leading batch dimension removed.
    """
    tqdm.write('Generating Random Dataset...')
    random_dataset = []

    for _ in tqdm(range(num_episodes), desc='Random policy steps'):
        obs = env.reset()
        done = [False]

        while not done[0]:
            # Sample an action from the action space of the environment
            action = [env.action_space.sample()]
            # Step through the environment with the action
            new_obs, reward, done, info = env.step(action)

            # A vectorized env resets itself when an episode ends, so on the
            # last step `new_obs` already belongs to the NEXT episode. The real
            # final observation is exposed through the info dict.
            next_obs = new_obs[0]
            if done[0]:
                next_obs = info[0].get('terminal_observation', next_obs)

            # Append the experience to the dataset
            random_dataset.append((obs[0], action[0], reward[0], next_obs, done[0]))
            obs = new_obs

    return random_dataset

In [43]:
# Generate Random Dataset
random_dataset = generate_random_dataset(env, num_episodes=1000)

# Save the Dataset
save_dataset(random_dataset, 'Random_dataset.pkl')

# Move dataset to Google Drive (if using Colab)
!cp Random_dataset.pkl /content/drive/MyDrive/Master-AI/Practical-Work

tqdm.write('Random_dataset.pkl saved.')

Generating Random Dataset...


Random policy steps: 100%|██████████| 1000/1000 [01:03<00:00, 15.77it/s]


Random_dataset.pkl saved.


## 2. Expert Dataset

In [44]:
def generate_expert_dataset(env, model, num_episodes=100):
    """Collect transitions by rolling out `model` deterministically.

    Args:
        env: Vectorized environment with a single sub-environment (everything
            it returns is indexed with ``[0]`` here).
        model: Policy object exposing ``predict(obs, deterministic=True)`` that
            returns ``(action, state)``.
        num_episodes: Number of episodes to roll out.

    Returns:
        list of ``(obs, action, reward, next_obs, done)`` tuples with the
        leading batch dimension removed.
    """
    tqdm.write('Generating Expert Dataset...')
    expert_dataset = []

    for _ in tqdm(range(num_episodes), desc='Expert policy steps'):
        obs = env.reset()
        # Index the flag explicitly instead of relying on the truth value of a
        # one-element array.
        done = [False]

        while not done[0]:
            action, _state = model.predict(obs, deterministic=True)
            new_obs, reward, done, info = env.step(action)

            # A vectorized env resets itself when an episode ends, so on the
            # last step `new_obs` already belongs to the NEXT episode. The real
            # final observation is exposed through the info dict.
            next_obs = new_obs[0]
            if done[0]:
                next_obs = info[0].get('terminal_observation', next_obs)

            # Remove the additional dimension
            expert_dataset.append((obs[0], action[0], reward[0], next_obs, done[0]))
            obs = new_obs

    return expert_dataset

In [45]:
# Train a model (PPO or DDPG) to convergence
expert_model = PPO("MlpPolicy", env, verbose=0)
expert_model.learn(total_timesteps=200000)  # Increase timesteps as needed for convergence

# Generate Expert Dataset
expert_dataset = generate_expert_dataset(env, expert_model, num_episodes=200)

# Save the Dataset
save_dataset(expert_dataset, 'Expert_dataset.pkl')

# Move dataset to Google Drive (if using Colab)
!cp Expert_dataset.pkl /content/drive/MyDrive/Master-AI/Practical-Work

tqdm.write('Expert_dataset.pkl saved.')

Generating Expert Dataset...


Expert policy steps: 100%|██████████| 200/200 [05:54<00:00,  1.77s/it]


Expert_dataset.pkl saved.


## 3. Mixed Dataset

In [46]:
# Load Random and Expert Datasets
random_dataset = load_dataset('Random_dataset.pkl')
expert_dataset = load_dataset('Expert_dataset.pkl')

tqdm.write('Generating Mixed Dataset...')

# Mix datasets: 80% random and 20% expert
mixed_dataset = random.sample(random_dataset, int(0.8 * len(random_dataset))) + \
                random.sample(expert_dataset, int(0.2 * len(expert_dataset)))

save_dataset(mixed_dataset, 'Mixed_dataset.pkl')

# Move dataset to Google Drive (if using Colab)
!cp Mixed_dataset.pkl /content/drive/MyDrive/Master-AI/Practical-Work

tqdm.write('Mixed_dataset.pkl saved.')

Generating Mixed Dataset...
Mixed_dataset.pkl saved.


## 4. Noisy Dataset

In [47]:
def generate_noisy_dataset(env, model, epsilon=0.2, num_episodes=100):
    """Collect transitions with an epsilon-random version of `model`.

    At every step a uniformly random action is taken with probability
    `epsilon`; otherwise the deterministic action of `model` is used.

    Args:
        env: Vectorized environment with a single sub-environment (everything
            it returns is indexed with ``[0]`` here).
        model: Policy object exposing ``predict(obs, deterministic=True)`` that
            returns ``(action, state)``.
        epsilon: Probability of replacing the model action by a random one.
        num_episodes: Number of episodes to roll out.

    Returns:
        list of ``(obs, action, reward, next_obs, done)`` tuples with the
        leading batch dimension removed.
    """
    tqdm.write(f'Generating Noisy Dataset...')
    noisy_dataset = []

    for _ in tqdm(range(num_episodes), desc='Noisy policy steps'):
        obs = env.reset()
        # Index the flag explicitly instead of relying on the truth value of a
        # one-element array.
        done = [False]

        while not done[0]:
            if np.random.random() < epsilon:
                # Wrapped in a list to mimic the batch dimension of predict().
                action = [env.action_space.sample()]
            else:
                action, _state = model.predict(obs, deterministic=True)
            new_obs, reward, done, info = env.step(action)

            # A vectorized env resets itself when an episode ends, so on the
            # last step `new_obs` already belongs to the NEXT episode. The real
            # final observation is exposed through the info dict.
            next_obs = new_obs[0]
            if done[0]:
                next_obs = info[0].get('terminal_observation', next_obs)

            noisy_dataset.append((obs[0], action[0], reward[0], next_obs, done[0]))
            obs = new_obs

    return noisy_dataset

In [48]:
# Generate Noisy Dataset using the expert model
noisy_dataset = generate_noisy_dataset(env, expert_model, num_episodes=200)

# Save the Dataset
save_dataset(noisy_dataset, 'Noisy_dataset.pkl')

# Move dataset to Google Drive (if using Colab)
!cp Noisy_dataset.pkl /content/drive/MyDrive/Master-AI/Practical-Work

tqdm.write('Noisy_dataset.pkl saved.')

Generating Noisy Dataset...


Noisy policy steps: 100%|██████████| 200/200 [07:32<00:00,  2.26s/it]


Noisy_dataset.pkl saved.


## 5. Replay Dataset

In [49]:
def generate_replay_dataset(env, model, total_timesteps=100000):
    """Collect a fixed number of transitions by rolling out `model`.

    Unlike the episode-based generators, this one runs for exactly
    `total_timesteps` environment steps, starting a new episode whenever the
    current one ends. Actions come from ``model.predict(..., deterministic=True)``;
    the data is not taken from a replay buffer filled during training.

    Args:
        env: Vectorized environment with a single sub-environment (everything
            it returns is indexed with ``[0]`` here).
        model: Policy object exposing ``predict(obs, deterministic=True)`` that
            returns ``(action, state)``.
        total_timesteps: Number of transitions to record.

    Returns:
        list of ``(obs, action, reward, next_obs, done)`` tuples with the
        leading batch dimension removed.
    """
    tqdm.write(f'Generating Replay Dataset...')
    replay_dataset = []
    obs = env.reset()

    for _ in tqdm(range(total_timesteps), desc='Replay policy steps'):
        action, _state = model.predict(obs, deterministic=True)
        new_obs, reward, done, info = env.step(action)

        # A vectorized env resets itself when an episode ends, so on the last
        # step `new_obs` already belongs to the NEXT episode. The real final
        # observation is exposed through the info dict.
        next_obs = new_obs[0]
        if done[0]:
            next_obs = info[0].get('terminal_observation', next_obs)

        replay_dataset.append((obs[0], action[0], reward[0], next_obs, done[0]))
        # Start the following step from a fresh episode once this one is over.
        obs = new_obs if not done[0] else env.reset()

    return replay_dataset

In [50]:
# Generate Replay Dataset
replay_dataset = generate_replay_dataset(env, expert_model, total_timesteps=100000)

# Save the Dataset
save_dataset(replay_dataset, 'Replay_dataset.pkl')

# Move dataset to Google Drive (if using Colab)
!cp Replay_dataset.pkl /content/drive/MyDrive/Master-AI/Practical-Work

tqdm.write('Replay_dataset.pkl saved.')


Generating Replay Dataset...


Replay policy steps: 100%|██████████| 100000/100000 [05:27<00:00, 305.08it/s]


Replay_dataset.pkl saved.


---------------------------------

In [51]:
# Close the environments
# Data collection is finished at this point; the cell below only reads the
# pickled datasets back from disk and does not use `env`.
env.close()

# Load and check datasets

In [52]:
# Files written by the generation cells, in the order they were produced
dataset_filenames = [
    'Random_dataset.pkl',
    'Expert_dataset.pkl',
    'Mixed_dataset.pkl',
    'Noisy_dataset.pkl',
    'Replay_dataset.pkl',
]

# Sanity check: report the size of every dataset and the shapes/types found in
# its first transition.
for filename in dataset_filenames:
    dataset = load_dataset(filename)

    print(f"Checking dataset: {filename}", f"Number of entries: {len(dataset)}", sep="\n")

    # Only a non-empty dataset has a first transition to inspect
    if len(dataset) > 0:
        obs, action, reward, new_obs, done = dataset[0]
        print(f"First entry - obs: {obs.shape}, action: {type(action)}, reward: {type(reward)}, new_obs: {new_obs.shape}, done: {type(done)}")

    # Two newlines in total, leaving a blank gap before the next report
    print(end="\n\n")

Checking dataset: Random_dataset.pkl
Number of entries: 109894
First entry - obs: (8,), action: <class 'numpy.ndarray'>, reward: <class 'numpy.float32'>, new_obs: (8,), done: <class 'numpy.bool_'>


Checking dataset: Expert_dataset.pkl
Number of entries: 106565
First entry - obs: (8,), action: <class 'numpy.ndarray'>, reward: <class 'numpy.float32'>, new_obs: (8,), done: <class 'numpy.bool_'>


Checking dataset: Mixed_dataset.pkl
Number of entries: 109228
First entry - obs: (8,), action: <class 'numpy.ndarray'>, reward: <class 'numpy.float32'>, new_obs: (8,), done: <class 'numpy.bool_'>


Checking dataset: Noisy_dataset.pkl
Number of entries: 133605
First entry - obs: (8,), action: <class 'numpy.ndarray'>, reward: <class 'numpy.float32'>, new_obs: (8,), done: <class 'numpy.bool_'>


Checking dataset: Replay_dataset.pkl
Number of entries: 100000
First entry - obs: (8,), action: <class 'numpy.ndarray'>, reward: <class 'numpy.float32'>, new_obs: (8,), done: <class 'numpy.bool_'>


