# Train RL model on Poker Hand Selection Task 

## Import packages and configure environment
Run tests first in the test notebooks if you face errors

In [27]:
import warnings
warnings.filterwarnings('ignore')

# environment
import gym
import HandMakerEnv
import treys
import numpy as np

# agent and training
import stable_baselines
from stable_baselines import PPO2
from stable_baselines.common.policies import MlpPolicy
from stable_baselines.common.vec_env import DummyVecEnv

# plotting and logging
from stable_baselines.bench import Monitor
from stable_baselines.results_plotter import load_results, ts2xy
from stable_baselines.common.noise import AdaptiveParamNoiseSpec, NormalActionNoise
from stable_baselines.common.callbacks import BaseCallback
import matplotlib.pyplot as plt
import time 



## Train the model 

In [2]:
POLICY = 'MlpPolicy'
ENVIRONMENT = 'HandMakerEnv-v1'
TIMESTEPS = 1000000
LOG_INTERVAL = 2500

START_TIME = time.asctime().replace(' ', '-')
TENSORBOARD_DIR = f'./logs/{ENVIRONMENT}-{POLICY}-{START_TIME}-tensorboard'
MODEL_DIR = f'./models/{ENVIRONMENT}-{POLICY}-{START_TIME}-model-folder.zip'
LOAD_EXISTING_MODEL = False
LOAD_DIR = None

def train(policy=POLICY, environment=ENVIRONMENT, timesteps=TIMESTEPS, load_existing=LOAD_EXISTING_MODEL, log_interval=LOG_INTERVAL):
    
    print(f"[INFO] STARTING TRAINING: {START_TIME} {ENVIRONMENT}-{POLICY}")
    print(f"[INFO] TIMESTEPS {TIMESTEPS}")
    
    # configure the environment 
    env = gym.make(ENVIRONMENT)
    
    if LOAD_EXISTING_MODEL: model.load(LOAD_DIR)
    else:  model = PPO2(policy, env, verbose=0)
        
    model.learn(total_timesteps=timesteps, log_interval=LOG_INTERVAL)
    
    model.save(save_path=MODEL_DIR, cloudpickle=False)
    
    return model


model = train()

[INFO] STARTING TRAINING: Sat-Apr-11-14:54:26-2020 HandMakerEnv-v1-MlpPolicy
[INFO] TIMESTEPS 50000
Wrapping the env in a DummyVecEnv.
-------------------------------------
| approxkl           | 0.0005621726 |
| clipfrac           | 0.001953125  |
| explained_variance | -1.09        |
| fps                | 217          |
| n_updates          | 1            |
| policy_entropy     | 9.010526     |
| policy_loss        | -0.016644834 |
| serial_timesteps   | 128          |
| time_elapsed       | 1.79e-05     |
| total_timesteps    | 128          |
| value_loss         | 0.42558858   |
-------------------------------------


## Evaluate the model 

In [8]:
# what is the average reward for random actions? 

def get_mean_reward_random():
    env = gym.make(ENVIRONMENT)
    
    reward_sum = 0
    for _ in range(10000):
        obs, r, done, _ = env.step(env.action_space.sample())
        reward_sum+=r
    return reward_sum/10000

def evaluate(model, num_steps=1000):
    """
    Evaluate a RL agent
    :param model: (BaseRLModel object) the RL Agent
    :param num_steps: (int) number of timesteps to evaluate it
    :return: (float) Mean reward for the last 100 episodes
    """
    env = gym.make(ENVIRONMENT)
    for t in range(3):
        episode_rewards = 0
        obs = env.reset()
        for i in range(num_steps):
          # _states are only useful when using LSTM policies
            action, _states = model.predict(obs)

            obs, reward, done, info = env.step(action)
            episode_rewards += reward

            obs = env.reset(mode)

        # Compute mean reward for the last 100000 episodes
        print(f"Mean reward, trial {t}:", episode_rewards/10000)
            

# print("Mean reward random: ", get_mean_reward_random())
# model.load(load_path='models/HandMakerEnv-v1-MlpPolicy-Sat-Apr-11-14:17:49-2020-model-folder.zip')

evaluate(model)

Mean reward, trial 0: 0.024554301795765197
Mean reward, trial 1: 0.024332216563923918
Mean reward, trial 2: 0.0243640444920933
Five %: 0.9983333333333333


In [86]:
flush_cards = [treys.Card.new(f"{n}h") for n in [2,3,4,5,6]] + [treys.Card.new(f"{n}d") for n in [2,3,4,5,7]] + [treys.Card.new(f"{n}c") for n in [2, 'Q', 'J']]
flush_obs = np.array([item for sublist in [[int(i) for i in y] for y in [f'{a:032b}' for a in flush_cards]] for item in sublist])


In [87]:
env = gym.make(ENVIRONMENT)
env.card_ints = flush_cards
action, _states = model.predict(flush_obs)
obs, r, done, info = env.step(action)
print(r)

0.17180380595014744
