# Practical Work - Implicit Q-Learning

<div class="alert alert-info">

...

<br>

...

<br>

...

</div>

### Imports and auxiliary settings

In [1]:
!apt update
!pip install swig
!pip install gym gym[box2d]
!pip install stable-baselines3[extra]

[33m0% [Working][0m            Hit:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease
[33m0% [Connecting to archive.ubuntu.com (185.125.190.36)] [Connecting to security.ubuntu.com (185.125.1[0m                                                                                                    Hit:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
[33m0% [Waiting for headers] [Waiting for headers] [Connected to ppa.launchpadcontent.net (185.125.190.8[0m                                                                                                    Hit:3 http://security.ubuntu.com/ubuntu jammy-security InRelease
Hit:4 http://archive.ubuntu.com/ubuntu jammy InRelease
Hit:5 http://archive.ubuntu.com/ubuntu jammy-updates InRelease
Hit:6 http://archive.ubuntu.com/ubuntu jammy-backports InRelease
Hit:7 https://ppa.launchpadcontent.net/c2d4u.team/c2d4u4.0+/ubuntu jammy InRelease
Hit:8 https://ppa.launchpadcontent.net/dea

In [2]:
import gym
import numpy as np
import pickle

from stable_baselines3 import PPO, DDPG
from stable_baselines3.common.env_util import make_vec_env

from tqdm import tqdm

# Setup Google Drive mount to store your results

In [3]:
use_google_drive = True
if use_google_drive:
    from google.colab import drive
    drive.mount('/content/drive')

  and should_run_async(code)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Data Collection with Online RL

In [4]:
# Function the generate a Dataset
def generate_dataset(env, model, num_episodes=100):
    dataset = []
    for _ in range(num_episodes):
        obs = env.reset()
        done = False
        while not done:
            action, _ = model.predict(obs, deterministic=True)
            new_obs, reward, done, info = env.step(action)
            # Remove the additional dimension
            dataset.append((obs[0], action[0], reward[0], new_obs[0], done[0]))
            obs = new_obs
    return dataset

# Function to save dataset to a file
def save_dataset(dataset, filename):
    with open(filename, 'wb') as f:
        pickle.dump(dataset, f)

In [5]:
# Create and wrap the environment
env_id = 'LunarLanderContinuous-v2'
env = make_vec_env(env_id, n_envs=1)

In [6]:
# Initialize PPO and DDPG models
ppo_model = PPO("MlpPolicy", env, verbose=0)
ddpg_model = DDPG("MlpPolicy", env, verbose=0)

# Train models and collect datasets
for name, model in [('PPO', ppo_model), ('DDPG', ddpg_model)]:
    for training_step in tqdm([50000, 100000], desc=f'Training {name}'):
        model.learn(total_timesteps=training_step)
        tqdm.write(f'\nGenerating dataset for {name} after {training_step} steps...')
        dataset = generate_dataset(env, model, num_episodes=100)
        save_dataset(dataset, f'{name}_{training_step}_dataset.pkl')
        tqdm.write(f'\nDataset {name}_{training_step}_dataset.pkl saved.')

Training PPO:   0%|          | 0/2 [02:56<?, ?it/s]


Generating dataset for PPO after 50000 steps...


Training PPO:  50%|█████     | 1/2 [08:01<08:01, 481.65s/it]


Dataset PPO_50000_dataset.pkl saved.


Training PPO:  50%|█████     | 1/2 [15:21<08:01, 481.65s/it]


Generating dataset for PPO after 100000 steps...


Training PPO: 100%|██████████| 2/2 [19:55<00:00, 597.91s/it]



Dataset PPO_100000_dataset.pkl saved.


Training DDPG:   0%|          | 0/2 [18:46<?, ?it/s]


Generating dataset for DDPG after 50000 steps...


Training DDPG:  50%|█████     | 1/2 [22:48<22:48, 1368.21s/it]


Dataset DDPG_50000_dataset.pkl saved.


Training DDPG:  50%|█████     | 1/2 [1:10:52<22:48, 1368.21s/it]


Generating dataset for DDPG after 100000 steps...


Training DDPG: 100%|██████████| 2/2 [1:13:51<00:00, 2215.59s/it]


Dataset DDPG_100000_dataset.pkl saved.





In [9]:
# Random policy dataset generation
tqdm.write('\nGenerating dataset for Random policy...')
random_dataset = []
for _ in tqdm(range(1000), desc='Random policy steps'):
    obs = env.reset()
    done = [False]
    while not done[0]:
        # Sample an action from the action space of the environment
        action = [env.action_space.sample()]
        # Step through the environment with the action
        new_obs, reward, done, _ = env.step(action)
        # Append the experience to the dataset
        random_dataset.append((obs[0], action[0], reward[0], new_obs[0], done[0]))
        obs = new_obs

save_dataset(random_dataset, 'Random_1000_dataset.pkl')
tqdm.write('Random_1000_dataset.pkl saved.')


Generating dataset for Random policy...


Random policy steps: 100%|██████████| 1000/1000 [01:03<00:00, 15.84it/s]


Random_1000_dataset.pkl saved.


In [10]:
# Close the environments
env.close()

# Move Datasets to Drive-Folder:

In [11]:
# Move datasets to Google Drive (if using Colab)
!cp *.pkl /content/drive/MyDrive/Master-AI/Practical-Work

# Load and check datasets

In [12]:
def load_dataset(filename):
    with open(filename, 'rb') as f:
        return pickle.load(f)

In [13]:
# List of dataset filenames
dataset_filenames = ['PPO_50000_dataset.pkl', 'PPO_100000_dataset.pkl',
                     'DDPG_50000_dataset.pkl', 'DDPG_100000_dataset.pkl',
                     'Random_1000_dataset.pkl']

# Load and check each dataset
for filename in dataset_filenames:
    dataset = load_dataset(filename)

    print(f"Checking dataset: {filename}")
    print(f"Number of entries: {len(dataset)}")

    # Check the first entry
    if len(dataset) > 0:
        obs, action, reward, new_obs, done = dataset[0]
        print(f"First entry - obs: {obs.shape}, action: {type(action)}, reward: {type(reward)}, new_obs: {new_obs.shape}, done: {type(done)}")

    print("\n")

Checking dataset: PPO_50000_dataset.pkl
Number of entries: 74596
First entry - obs: (1, 8), action: <class 'numpy.ndarray'>, reward: <class 'numpy.ndarray'>, new_obs: (1, 8), done: <class 'numpy.ndarray'>


Checking dataset: PPO_100000_dataset.pkl
Number of entries: 68737
First entry - obs: (1, 8), action: <class 'numpy.ndarray'>, reward: <class 'numpy.ndarray'>, new_obs: (1, 8), done: <class 'numpy.ndarray'>


Checking dataset: DDPG_50000_dataset.pkl
Number of entries: 63571
First entry - obs: (1, 8), action: <class 'numpy.ndarray'>, reward: <class 'numpy.ndarray'>, new_obs: (1, 8), done: <class 'numpy.ndarray'>


Checking dataset: DDPG_100000_dataset.pkl
Number of entries: 53042
First entry - obs: (1, 8), action: <class 'numpy.ndarray'>, reward: <class 'numpy.ndarray'>, new_obs: (1, 8), done: <class 'numpy.ndarray'>


Checking dataset: Random_1000_dataset.pkl
Number of entries: 110110
First entry - obs: (8,), action: <class 'numpy.ndarray'>, reward: <class 'numpy.float32'>, new_obs: 