In [1]:
import os
os.chdir('../')
os.getcwd()

'e:\\github_clone\\Deep-Inverse-Reinforcement-Learning'

In [2]:
import torch
import numpy as np
from IRL import RewardNet, StateActionDataset, TransitionStorage
from IRL.data import load_and_read_transitions
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from datasets import load_dataset

In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [4]:
dataset = load_dataset('NathanGavenski/LunarLander-v2')
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['obs', 'actions', 'rewards', 'episode_starts'],
        num_rows: 383994
    })
})


In [5]:
full_dataset = dataset['train'].train_test_split(test_size=0.02, shuffle=True)
dataset_train = full_dataset['train']
dataset_valid = full_dataset['test']

print(dataset_train)
print(dataset_valid)

Dataset({
    features: ['obs', 'actions', 'rewards', 'episode_starts'],
    num_rows: 376314
})
Dataset({
    features: ['obs', 'actions', 'rewards', 'episode_starts'],
    num_rows: 7680
})


In [6]:
def prepare_data(data):
    # Extract 'obs', 'actions', and 'rewards' columns
    obs = data['obs']
    actions = data['actions']
    rewards = data['rewards']
    
    # Combine 'obs' and 'actions' into single feature vectors
    X = [torch.tensor(obs[i] + [actions[i]], dtype=torch.float32) for i in range(len(obs))]
    X = torch.stack(X).to(device)
    
    # Convert rewards to tensor
    y = torch.tensor(rewards, dtype=torch.float32).to(device)
    
    # Scale rewards to range [-1, 1]
    y_min = y.min()
    y_max = y.max()
    y = 2 * (y - y_min) / (y_max - y_min) - 1  # Scaling formula to bring values between -1 and 1
    
    return X, y

In [7]:
X_train, y_train = prepare_data(dataset_train)
X_valid, y_valid = prepare_data(dataset_valid)

In [8]:
print(f"Features (X-train) shape: {X_train.shape}")
print(f"Labels (y-train) shape: {y_train.shape}")

print(f"Features (X-test) shape: {X_valid.shape}")
print(f"Labels (y-test) shape: {y_valid.shape}")

Features (X-train) shape: torch.Size([376314, 9])
Labels (y-train) shape: torch.Size([376314])
Features (X-test) shape: torch.Size([7680, 9])
Labels (y-test) shape: torch.Size([7680])


In [9]:
class RewardDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

In [10]:
train_dataset = RewardDataset(X_train, y_train)
valid_dataset = RewardDataset(X_valid, y_valid)

In [11]:
train_dataloader = DataLoader(train_dataset, batch_size=64, shuffle=True, drop_last=False)
valid_dataloader = DataLoader(valid_dataset, batch_size=64, shuffle=False, drop_last=False)

In [12]:
rewardNet = RewardNet(lr=0.0005)

In [40]:
loss_history = []
score_history = []
for t in range(50):
    print(f"Epoch {t + 1}\n-------------------------------")
    epoch_loss_history, epoch_score_history = rewardNet.train_loop(train_dataloader)
    loss_history.extend(epoch_loss_history)
    score_history.extend(epoch_score_history)

Epoch 1
-------------------------------
loss: 0.395079  [    0/376314]
loss: 0.041163  [64000/376314]
loss: 0.000738  [128000/376314]
loss: 0.002334  [192000/376314]
loss: 0.000626  [256000/376314]
loss: 0.000688  [320000/376314]
Epoch 2
-------------------------------
loss: 0.001632  [    0/376314]
loss: 0.001976  [64000/376314]
loss: 0.000555  [128000/376314]
loss: 0.000471  [192000/376314]
loss: 0.001031  [256000/376314]
loss: 0.000396  [320000/376314]
Epoch 3
-------------------------------
loss: 0.000742  [    0/376314]
loss: 0.040596  [64000/376314]
loss: 0.000969  [128000/376314]
loss: 0.000591  [192000/376314]
loss: 0.000945  [256000/376314]
loss: 0.000467  [320000/376314]
Epoch 4
-------------------------------
loss: 0.000595  [    0/376314]
loss: 0.040430  [64000/376314]
loss: 0.000478  [128000/376314]
loss: 0.001310  [192000/376314]
loss: 0.000594  [256000/376314]
loss: 0.001097  [320000/376314]
Epoch 5
-------------------------------
loss: 0.041536  [    0/376314]
loss: 0.0

In [15]:
data = torch.tensor(X_valid[30], dtype=torch.float32, device=device)
rewardNet.model(data).item()

  data = torch.tensor(X_valid[30], dtype=torch.float32, device=device)


-0.6727458834648132

In [42]:
y_valid[30]

tensor(-0.6917, device='cuda:0')

In [44]:
rewardNet.save_model("artifacts\\rewardModel.pth")

# Evaluation

In [13]:
rewardNet.load_model("artifacts\\rewardModel.pth")

In [14]:
rewardNet.validate_loop(valid_dataloader)

Validate Error: 
 RMSE accuracy: 17.7%, Avg loss: 0.031234 
MAE : 0.159845 
R_squared : -2.248877
