In [2]:
import os

# Move the working directory one level above the folder the notebook was
# started in (per the recorded output this is the repository root).
# The starting directory is remembered on the first run so that re-executing
# this cell always lands in the same place instead of climbing one more
# level each time, which is what a bare relative chdir would do.
if "_NOTEBOOK_START_DIR" not in globals():
    _NOTEBOOK_START_DIR = os.getcwd()
os.chdir(os.path.join(_NOTEBOOK_START_DIR, os.pardir))
os.getcwd()

'e:\\github_clone\\Deep-Inverse-Reinforcement-Learning'

In [26]:
import torch
import numpy as np
from IRL import RewardNet, StateActionDataset, TransitionStorage
from IRL.data import load_and_read_transitions
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader

In [22]:
# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [4]:
# Stand-alone TransitionStorage instance.
# NOTE(review): `store` is not referenced by any other cell visible in this
# notebook (load_and_read_transitions builds its own storage) - confirm
# whether this cell is still needed.
store = TransitionStorage()

In [7]:
def load_and_read_transitions(filename):
    """
    Load stored transitions from ``filename`` and return their four components.

    A fresh ``TransitionStorage`` is created, asked to load ``filename`` from
    disk, and then queried with ``get_data()``.

    NOTE(review): this definition shadows the ``load_and_read_transitions``
    imported from ``IRL.data`` in the imports cell - confirm which one is
    meant to be used.

    Args:
        filename: Path handed to ``TransitionStorage.load_from_disk``.

    Returns:
        Tuple ``(observations, actions, rewards, next_states)`` exactly as
        unpacked from ``TransitionStorage.get_data()``.
    """
    storage = TransitionStorage()
    storage.load_from_disk(filename)

    # Unpack explicitly so the result is always a 4-tuple in this order.
    obs, acts, rews, nxt_states = storage.get_data()
    return obs, acts, rews, nxt_states

# Build the path with os.path.join: a literal backslash is only a directory
# separator on Windows, so the hard-coded form cannot find the file elsewhere.
observations, actions, rewards, next_states = load_and_read_transitions(
    os.path.join("data", "transitions.npz")
)

Data loaded from data\transitions.npz


In [9]:
def normalize_data(data, min_val=None, max_val=None):
    """
    Linearly rescale ``data`` to the range [-1, 1].

    Each value is mapped with ``2 * (x - min_val) / (max_val - min_val) - 1``.

    Args:
        data: Array-like of values. When the bounds are not given they are
            computed along axis 0, i.e. per column for 2D input and over the
            whole vector for 1D input.
        min_val: Optional lower bound(s); scalar or array-like (list, tuple,
            ndarray) broadcastable against ``data``. Defaults to the minimum
            of ``data`` along axis 0.
        max_val: Optional upper bound(s), same conventions as ``min_val``.
            Defaults to the maximum of ``data`` along axis 0.

    Returns:
        numpy.ndarray of the rescaled values. Entries whose range
        (``max_val - min_val``) is zero are divided by 1 instead, so a
        constant column maps to -1 rather than producing NaN/inf.
    """
    data = np.asarray(data)

    if min_val is None:
        min_val = np.min(data, axis=0)
    if max_val is None:
        max_val = np.max(data, axis=0)

    # Coerce the bounds to arrays of at least one dimension. This covers Python
    # and NumPy scalars as well as lists/tuples (plain lists cannot be
    # subtracted from each other) and 0-d arrays.
    min_val = np.atleast_1d(np.asarray(min_val))
    max_val = np.atleast_1d(np.asarray(max_val))

    # Avoid division by zero: treat a zero range as 1.
    range_val = max_val - min_val
    range_val = np.where(range_val == 0, 1, range_val)

    return 2 * (data - min_val) / range_val - 1

In [16]:
# Rescale the observations, the actions and the rewards to [-1, 1].
# NOTE(review): the min/max used here come from the full dataset (this runs
# before the train/test split) and are not kept anywhere - confirm that is
# intended.
normalized_observations = normalize_data(observations)

# Actions and rewards are reshaped to a single column before normalization.
action_column = actions.reshape(-1, 1)
normalized_actions = normalize_data(action_column)

reward_column = rewards.reshape(-1, 1)
normalized_rewards = normalize_data(reward_column).flatten()  # back to 1D

# Model input = normalized observation columns followed by the action column.
inputs = np.hstack((normalized_observations, normalized_actions))
print(f"input : {inputs.shape} -- output : {normalized_rewards.shape}")

input : (855412, 9) -- output : (855412,)


In [21]:
# Hold out 10% of the samples for evaluation. A fixed seed makes the split
# (and therefore everything trained and evaluated on it) reproducible across
# kernel restarts; without it every run shuffles differently.
SPLIT_SEED = 42
x_train, x_test, y_train, y_test = train_test_split(
    inputs, normalized_rewards, test_size=0.1, random_state=SPLIT_SEED
)
print(f"Train {x_train.shape, y_train.shape} --- Test : {x_test.shape, y_test.shape}")

Train ((769870, 9), (769870,)) --- Test : ((85542, 9), (85542,))


In [23]:
# Copy each split into a float32 tensor on the selected device.
# NOTE(review): none of these tensors is referenced by a later cell in view
# (the datasets are built from the NumPy arrays) - confirm they are needed.
X_train_tensor, X_test_tensor, y_train_tensor, y_test_tensor = (
    torch.tensor(split, device=device, dtype=torch.float32)
    for split in (x_train, x_test, y_train, y_test)
)

In [25]:
# Wrap the (inputs, targets) pair of each split in a StateActionDataset:
# the training split first, then the held-out split used for validation.
train_dataset, valid_dataset = (
    StateActionDataset(features, targets)
    for features, targets in ((x_train, y_train), (x_test, y_test))
)

In [27]:
# Both loaders use the same batch size; only the training loader reshuffles
# its samples each epoch. The final partial batch is kept in both.
BATCH_SIZE = 64
train_dataloader = DataLoader(
    train_dataset, batch_size=BATCH_SIZE, shuffle=True, drop_last=False
)
valid_dataloader = DataLoader(
    valid_dataset, batch_size=BATCH_SIZE, shuffle=False, drop_last=False
)

In [24]:
# Instantiate the reward network with a learning rate of 3e-4.
# NOTE(review): no device is passed here; presumably RewardNet picks its own
# device internally - confirm it matches `device`.
rewardNet = RewardNet(lr=0.0003)

In [28]:
# Train for a fixed number of epochs, appending the two sequences returned
# by each train_loop call to the running loss and score histories.
NUM_EPOCHS = 100
loss_history, score_history = [], []
for epoch in range(1, NUM_EPOCHS + 1):
    print(f"Epoch {epoch}\n-------------------------------")
    epoch_losses, epoch_scores = rewardNet.train_loop(train_dataloader)
    loss_history += epoch_losses
    score_history += epoch_scores

Epoch 1
-------------------------------
loss: 0.003705  [    0/769870]
loss: 0.000208  [64000/769870]
loss: 0.000242  [128000/769870]
loss: 0.000260  [192000/769870]
loss: 0.000749  [256000/769870]
loss: 0.000312  [320000/769870]
loss: 0.000131  [384000/769870]
loss: 0.000584  [448000/769870]
loss: 0.000247  [512000/769870]
loss: 0.000263  [576000/769870]
loss: 0.000759  [640000/769870]
loss: 0.001284  [704000/769870]
loss: 0.000113  [768000/769870]
Epoch 2
-------------------------------
loss: 0.000151  [    0/769870]
loss: 0.001145  [64000/769870]
loss: 0.000858  [128000/769870]
loss: 0.000366  [192000/769870]
loss: 0.000765  [256000/769870]
loss: 0.000441  [320000/769870]
loss: 0.000372  [384000/769870]
loss: 0.000235  [448000/769870]
loss: 0.000216  [512000/769870]
loss: 0.000409  [576000/769870]
loss: 0.000277  [640000/769870]
loss: 0.000257  [704000/769870]
loss: 0.000585  [768000/769870]
Epoch 3
-------------------------------
loss: 0.000172  [    0/769870]
loss: 0.000546  [6400

In [36]:
# Spot-check the trained network on one held-out row (test index 30).
sample_features = x_test[30]
data = torch.tensor(sample_features, device=device, dtype=torch.float32)
rewardNet.model(data)

tensor([0.0031], device='cuda:0', grad_fn=<AddBackward0>)

In [37]:
# Normalized target reward for the same held-out row (test index 30) that was
# fed to the model in the previous cell, for a side-by-side comparison.
y_test[30]

0.005660925805412775

In [38]:
# Make sure the output folder exists so a long training run is not lost to a
# missing directory, and build the path with os.path.join because a literal
# backslash is only a directory separator on Windows.
os.makedirs("artifact", exist_ok=True)
rewardNet.save_model(os.path.join("artifact", "r2.pth"))