# Train RL model on Poker Hand Selection Task 

## Import packages and configure environment
Run tests first in the test notebooks if you face errors

In [1]:
import warnings
warnings.filterwarnings('ignore')

# environment
import gym
import HandClassificationEnv
import treys
import numpy as np

# agent and training
import stable_baselines
from stable_baselines import PPO2
from stable_baselines.common.policies import MlpPolicy
from stable_baselines.common.vec_env import DummyVecEnv

# plotting and logging
from stable_baselines.bench import Monitor
# from stable_baselines.results_plotter import load_results, ts2xy
# from stable_baselines.common.noise import AdaptiveParamNoiseSpec, NormalActionNoise
from stable_baselines.common.callbacks import BaseCallback
# import matplotlib.pyplot as plt
import time 


## Train the model 

In [None]:
POLICY = MlpPolicy
POLICY_NAME = 'MlpPolicy'
ENVIRONMENT = 'HandClassificationEnv-v1'
TIMESTEPS = 2000000
NETWORK_ARCH = [160,160,160,160]
LOG_INTERVAL = 2000

START_TIME = time.asctime().replace(' ', '-')
TENSORBOARD_DIR = f'./logs/{ENVIRONMENT}-{POLICY_NAME}-{START_TIME}-tensorboard'
MODEL_DIR = f'./models/{ENVIRONMENT}-{POLICY_NAME}-PPO2-{START_TIME}-{TIMESTEPS}-{NETWORK_ARCH}-model-folder.zip'
LOAD_EXISTING_MODEL = False
LOAD_DIR = None

def train(policy=POLICY, environment=ENVIRONMENT, timesteps=TIMESTEPS, load_existing=LOAD_EXISTING_MODEL, log_interval=LOG_INTERVAL):
    
    print(f"[INFO] STARTING TRAINING: {START_TIME} {ENVIRONMENT}-{POLICY_NAME}-PPO2")
    print(f"[INFO] NETWORK ARCH {NETWORK_ARCH}")
    
    # configure the environment 
    env = gym.make(ENVIRONMENT)
    
    print(f"[INFO] LOAD EXISTING MODEL? {LOAD_EXISTING_MODEL}")
    if LOAD_EXISTING_MODEL: 
        model.load(LOAD_DIR)
        print(f"[INFO] LOADED MODEL FROM {LOAD_DIR}")

    else:  
        # Custom MLP policy of two layers of size 32 each with tanh activation function
        policy_kwargs = dict(net_arch=NETWORK_ARCH)
        model = PPO2(policy, env, verbose=0, policy_kwargs=policy_kwargs)
        print(f"[INFO] Training for TIMESTEPS {TIMESTEPS})
        
    model.learn(total_timesteps=timesteps, log_interval=LOG_INTERVAL)
    
    model.save(save_path=MODEL_DIR, cloudpickle=False)
    print(f"[INFO] MODEL SAVED TO {MODEL_DIR}")
    
    
    return model


model = train()

[INFO] STARTING TRAINING: Sat-Apr-11-18:54:52-2020 HandClassificationEnv-v1-MlpPolicy-PPO2
[INFO] TIMESTEPS 2000000
[INFO] NETWORK ARCH [160, 160, 160, 160]
[INFO] LOAD EXISTING MODEL? False
[INFO] NEW MODEL <stable_baselines.ppo2.ppo2.PPO2 object at 0x146b99828>


In [None]:
1

## Evaluate the model 

In [15]:
# what is the average reward for random actions? 
from collections import Counter

def get_mean_reward_random():
    env = gym.make(ENVIRONMENT)
    
    reward_sum = 0
    for i in range(10000):
        random_action = env.action_space.sample()
        obs, r, done, _ = env.step(random_action)
        reward_sum+=r
        env.reset()
            
    return reward_sum/10000

print("Mean Random: ", get_mean_reward_random())

def evaluate(model, num_steps=1000):
    """
    Evaluate a RL agent
    :param model: (BaseRLModel object) the RL Agent
    :param num_steps: (int) number of timesteps to evaluate it
    :return: (float) Mean reward for the last 100 episodes
    """
    env = gym.make(ENVIRONMENT)
    obs = env._get_obs()
    for t in range(3):
        episode_rewards = 0
        predictions = []
        for i in range(num_steps):
          # _states are only useful when using LSTM policies
            action, _states = model.predict(obs)

            obs, reward, done, info = env.step(action)
            episode_rewards += reward
            predictions.append(action)

            obs = env.reset()

        # Compute mean reward for the last 100000 episodes
        print(f"[INFO] Mean reward, trial {t}:", episode_rewards/10000)
        print(f"[INFO] Predictions distribution: {Counter(predictions)}")
            

# print("Mean reward random: ", get_mean_reward_random())
# model.load(load_path='models/HandMakerEnv-v1-MlpPolicy-Sat-Apr-11-14:17:49-2020-model-folder.zip')

evaluate(model)

Mean Random:  -0.896
[INFO] Mean reward, trial 0: -0.0134
[INFO] Predictions distribution: Counter({8: 947, 7: 46, 6: 4, 2: 2, 0: 1})
[INFO] Mean reward, trial 1: -0.0106
[INFO] Predictions distribution: Counter({8: 955, 7: 41, 6: 3, 2: 1})
[INFO] Mean reward, trial 2: -0.0096
[INFO] Predictions distribution: Counter({8: 959, 7: 39, 6: 1, 2: 1})


In [None]:
1