# [  ]

In [1]:
import fatbot as fb
import torch.nn as nn
import numpy as np
import matplotlib.pyplot as plt
import os

# Setup

### RL Algorithm

In [2]:
sbalgo =                fb.PPO         #<----- model args depend on this
model_name =            'ppo_model'    #<----- stores data in this folder
os.makedirs(model_name, exist_ok=True)

### Reward Scheme

In [4]:
reward_scheme = 'hull'
delta_reward = True

### Hyperparams

In [5]:
gamma =                 0.99
horizon =               300
total_timesteps =       10_000 #<--- for training
model_version =         'baseZ'
model_path =            os.path.join(model_name, model_version)

# learning rate scheduling
start_lr, end_lr = 0.00050, 0.000040
lr_mapper=fb.REMAP((-0.2,1), (start_lr, end_lr)) # set learn rate schedluer
def lr_schedule(progress):
  #progress_precent = 100*(1-progress)
  #lr = lr_mapper.in2map(1-progress)
  #if int(progress_precent) % 10 == 0:
  #  print(f'Progress: {progress} ~~> {progress_precent:.3f} %,  {lr = }')  
  return lr_mapper.in2map(1-progress) #lr

# Training

### prepare

In [6]:
db = fb.db.db6
# initial state distribution - uniformly sample from all listed states
initial_state_keys =  db.isd['S'] # [db.isd[db.isd_keys[0]]] #[v for k,v in db.isd.items()] 
permute_states =        False
print(f'Total Initial States: {len(initial_state_keys)}')

# build training env
training_env = db.envF(False, 20, 0, horizon, reward_scheme, delta_reward, 
                         initial_state_keys, False)

#<---- optinally check
fb.check_env(training_env) 

Total Initial States: 6
[*] World Created :: world_db6 :: Dim: ( X=60.0, Y=60.0, H=300 )
Delta-Reward: [True],  Delta-Action: [False]
Imaging: [False],  History: [False]



  logger.warn(


In [7]:
training_env.reset()

array([-20.,  20.,   0.,   0.,   1.,   0.,   0.,  20.,   0.,   0.,   2.,
         0.,  20.,  20.,   0.,   0.,   1.,   0., -20., -20.,   0.,   0.,
         1.,   0.,   0., -20.,   0.,   0.,   2.,   0.,  20., -20.,   0.,
         0.,   1.,   0.], dtype=float32)

### perform training

In [8]:
# start training
training_start_time = fb.common.now()
print(f'Training @ [{model_path}]')
model = sbalgo(policy=      'MlpPolicy', 
        env=                training_env, 
        learning_rate =     lr_schedule,
        n_steps=            1024*10,
        batch_size =        1024,
        n_epochs =          20,
        gamma =             gamma,
        gae_lambda=         0.95,
        clip_range=         0.20, 
        clip_range_vf=      None, 
        normalize_advantage=True, 
        ent_coef=           0.0, 
        vf_coef=            0.5, 
        max_grad_norm=      0.5, 
        use_sde=            False, 
        sde_sample_freq=    -1, 
        target_kl=          None, 
        tensorboard_log=    None, 
        create_eval_env=    False, 
        verbose=            0, 
        seed=               None, 
        device=             'cpu', 
        _init_setup_model=  True,
        policy_kwargs=dict(
                        activation_fn=  nn.LeakyReLU, 
                        net_arch=[dict(
                            pi=[512, 512, 300], 
                            vf=[512, 512, 300])])) #256, 256, 256, 128, 128

model.learn(total_timesteps=total_timesteps,log_interval=int(0.1*total_timesteps))
model.save(model_path)
training_end_time = fb.common.now()
print(f'Finished!, Time-Elapsed:[{training_end_time-training_start_time}]')

Training @ [ppo_model/baseZ]
Finished!, Time-Elapsed:[0:00:28.953694]


# Testing

In [9]:
model = sbalgo.load(model_path)
model, model_path

(<stable_baselines3.ppo.ppo.PPO at 0x7f7110724e80>, 'ppo_model/baseZ')

### prepare

In [13]:

# initial state distribution - uniformly sample from all listed states
initial_state_keys =  db.isd['S'] # [db.isd[db.isd_keys[0]]] #[v for k,v in db.isd.items()] 
permute_states =        False
print(f'Total Initial States: {len(initial_state_keys)}')

# build training env
testing_env = db.envF(True, 20, 0, 50, reward_scheme, delta_reward, 
                         initial_state_keys, False)

#<---- optinally check
fb.check_env(testing_env) 

Total Initial States: 6
[*] World Created :: world_db6 :: Dim: ( X=60.0, Y=60.0, H=50 )
Delta-Reward: [True],  Delta-Action: [False]
Imaging: [False],  History: [True]



  logger.warn(


### perform testing

In [14]:
print(f'Testing @ [{model_path}]')
average_return, total_steps, _ = fb.TEST(
    env=            testing_env, 
    model=          sbalgo.load(model_path), #<---- use None for random
    episodes=       1,
    steps=          0,
    deterministic=  True,
    render_as=      'T', # use None for no plots, use '' (empty string) to plot inline
    save_dpi=       'figure',
    make_video=     False,
    video_fps=      2,
)
print(f'{average_return=}, {total_steps=}')




Testing @ [ppo_model/baseZ]
[.] Testing for [1] episodes @ [inf] steps

[++] Begin Epoch: Running for 1 episodes

[+] Begin Episode: 1 of 1
[x] End Episode: 1] :: Return: -1.1040287017822266, Steps: 50
[--] End Epoch [1] episodes :: Avg Return: -1.1040287017822266, Total Steps: 50.0


ValueError: too many values to unpack (expected 3)