# Train RL model on Poker Hand Selection Task 

## Import packages and configure environment
If you run into errors, run the tests in the test notebooks first.

In [1]:
import warnings
warnings.filterwarnings('ignore')

# environment
import gym
import HandClassificationEnv
import treys
import numpy as np

# agent and training
import stable_baselines
from stable_baselines import PPO2
from stable_baselines.common.policies import MlpPolicy
from stable_baselines.common.vec_env import DummyVecEnv

# plotting and logging
from stable_baselines.bench import Monitor
# from stable_baselines.results_plotter import load_results, ts2xy
# from stable_baselines.common.noise import AdaptiveParamNoiseSpec, NormalActionNoise
from stable_baselines.common.callbacks import BaseCallback
# import matplotlib.pyplot as plt
import time 


## Train the model 

In [7]:
# --- Environment and policy -------------------------------------------------
ENVIRONMENT = 'HandClassificationEnv-v1'      # gym id handed to gym.make
POLICY, POLICY_NAME = MlpPolicy, 'MlpPolicy'  # policy class + the label used in paths and log lines

# --- Training hyperparameters -----------------------------------------------
TIMESTEPS = 500_000
NETWORK_ARCH = [160, 160, 160, 45, 45]        # passed to the policy as net_arch
LOG_INTERVAL = 250                            # log_interval given to model.learn

# --- Resuming from a saved model --------------------------------------------
LOAD_EXISTING_MODEL, LOAD_DIR = False, None

# --- Run-specific output locations ------------------------------------------
# Timestamp of this run with every space turned into a dash, so it can be embedded in a path.
START_TIME = '-'.join(time.asctime().split(' '))
TENSORBOARD_DIR = f'./logs/tb/{ENVIRONMENT}-{POLICY_NAME}tensorboard/'
MODEL_DIR = f'./models/{ENVIRONMENT}-{POLICY_NAME}-PPO2-{START_TIME}-{TIMESTEPS}-model-folder'

def train(policy=POLICY, environment=ENVIRONMENT, timesteps=TIMESTEPS, load_existing=LOAD_EXISTING_MODEL, log_interval=LOG_INTERVAL):
    """
    Train a PPO2 agent (new or resumed) and save it to MODEL_DIR.

    :param policy: policy class given to PPO2 when a new model is created
        (not used when ``load_existing`` is True)
    :param environment: (str) id of the gym environment to train on
    :param timesteps: (int) total number of timesteps passed to ``model.learn``
    :param load_existing: (bool) if True, resume from the model stored at LOAD_DIR
        instead of building a new one
    :param log_interval: (int) forwarded to ``model.learn`` as ``log_interval``
    :return: the trained PPO2 model
    :raises ValueError: if ``load_existing`` is True but LOAD_DIR is not set

    Note: NETWORK_ARCH, TENSORBOARD_DIR, MODEL_DIR and LOAD_DIR are read from the
    module-level configuration, so the output folder names reflect the configured
    ENVIRONMENT / TIMESTEPS rather than the arguments passed here.
    """

    print(f"[INFO] STARTING TRAINING: {START_TIME} {environment}-{POLICY_NAME}-PPO2")
    print(f"[INFO] NETWORK ARCH {NETWORK_ARCH}")
    print(f"TENSORBOARD DIR {TENSORBOARD_DIR}")

    # configure the environment
    env = gym.make(environment)

    print(f"[INFO] LOAD EXISTING MODEL? {load_existing}")
    if load_existing:
        if not LOAD_DIR:
            raise ValueError("LOAD_DIR must point to a saved model when load_existing is True")
        # No model instance exists yet at this point, so the model has to be
        # created through the PPO2.load classmethod and bound to the fresh env.
        model = PPO2.load(LOAD_DIR, env=env, tensorboard_log=TENSORBOARD_DIR)
        print(f"[INFO] LOADED MODEL FROM {LOAD_DIR}")

    else:
        # Custom MLP policy whose layer sizes are taken from NETWORK_ARCH
        policy_kwargs = dict(net_arch=NETWORK_ARCH)
        model = PPO2(policy, env, verbose=0, policy_kwargs=policy_kwargs, tensorboard_log=TENSORBOARD_DIR)

    print(f"[INFO] Training for TIMESTEPS {timesteps}")
    model.learn(total_timesteps=timesteps, log_interval=log_interval, tb_log_name=f"t-{timesteps}-PPO2")

    model.save(save_path=MODEL_DIR, cloudpickle=False)
    print(f"[INFO] MODEL SAVED TO {MODEL_DIR}")

    return model


model = train()

[INFO] STARTING TRAINING: Sun-Apr-12-16:58:38-2020 HandClassificationEnv-v1-MlpPolicy-PPO2
[INFO] NETWORK ARCH [160, 160, 160, 45, 45]
TENSORBOARD DIR {TENSORBOARD_DIR}
[INFO] LOAD EXISTING MODEL? False
[INFO] Training for TIMESTEPS 500000
[INFO] MODEL SAVED TO ./models/HandClassificationEnv-v1-MlpPolicy-PPO2-Sun-Apr-12-16:58:38-2020-500000-model-folder


## Evaluate the model 

In [8]:
# what is the average reward for random actions? 
from collections import Counter

def get_mean_reward_random(num_trials=10000):
    """
    Baseline: mean reward obtained by acting uniformly at random.

    Every trial is a single step: the environment is reset, one action is
    sampled from the action space, and the reward of that step is recorded.

    :param num_trials: (int) number of one-step trials to average over
    :return: (float) mean reward over ``num_trials`` trials
    :raises ValueError: if ``num_trials`` is not positive
    """
    if num_trials <= 0:
        raise ValueError("num_trials must be a positive integer")

    env = gym.make(ENVIRONMENT)

    reward_sum = 0
    for _ in range(num_trials):
        # reset before stepping so step() is never called on an uninitialised episode
        env.reset()
        random_action = env.action_space.sample()
        _obs, reward, _done, _info = env.step(random_action)
        reward_sum += reward

    return reward_sum / num_trials

print(f"Mean reward 10 000 trails with random action: {get_mean_reward_random()} \n")

Mean reward 10 000 trails with random action: -0.7776 



In [9]:
LOG_DIR = f"./logs/{MODEL_DIR}/evaluation.txt"
def evaluate(model, num_steps=50000, log_dir=LOG_DIR):
    """
    Evaluate a RL agent on one-step episodes and write a classification report.

    For every step the environment's ``rank_class - 1`` is recorded as the
    reference label and the agent's action as the prediction (the ``- 1``
    presumably maps 1-based rank classes onto 0-based actions -- confirm
    against the environment). The mean reward and the report are written to
    ``log_dir`` and also printed.

    :param model: (BaseRLModel object) the RL Agent
    :param num_steps: (int) number of timesteps to evaluate it
    :param log_dir: (str) path of the text file the classification report is written to
    :raises ValueError: if ``num_steps`` is not positive
    """
    import os
    from sklearn.metrics import classification_report

    if num_steps <= 0:
        raise ValueError("num_steps must be a positive integer")

    env = gym.make(ENVIRONMENT)
    # start from a freshly reset episode, the same way every later step does
    obs = env.reset()

    total_reward = 0
    predictions = []
    actual_states = []

    for _ in range(num_steps):
        # store the reference label of the current observation
        actual_states.append(env.rank_class-1)
        # make a prediction
        action, _states = model.predict(obs)
        # observe the reward
        obs, reward, done, info = env.step(action)
        # accumulate the reward and keep the action for the report
        total_reward += reward
        predictions.append(action)

        # get a new observation for a new prediction
        obs = env.reset()

    # Mean reward over all evaluated steps; the report is built once and reused.
    mean_reward = total_reward/num_steps
    report = classification_report(actual_states, predictions)

    # Honour the caller-supplied path; a bare filename has no directory to create.
    report_folder = os.path.dirname(log_dir)
    if report_folder:
        os.makedirs(report_folder, exist_ok=True)
    with open(log_dir, "w") as f:
        f.write(f"[INFO] Mean reward: {mean_reward} \n")
        f.write(f"{report}")

    print(f"[INFO] Mean reward: {mean_reward} \n")
    print(f"[INFO] Predictions distribution: {Counter(predictions)} \n")
    print(f"[INFO] Actual states distribution: {Counter(actual_states)} \n")
    print(report)

evaluate(model)

[INFO] Mean reward: 0.97768 

[INFO] Predictions distribution: Counter({8: 25380, 7: 21245, 6: 2387, 5: 981, 1: 7}) 

[INFO] Actual states distribution: Counter({8: 25113, 7: 21070, 6: 2376, 5: 1043, 4: 212, 3: 96, 2: 78, 1: 10, 0: 2}) 

             precision    recall  f1-score   support

          0       0.00      0.00      0.00         2
          1       0.00      0.00      0.00        10
          2       0.00      0.00      0.00        78
          3       0.00      0.00      0.00        96
          4       0.00      0.00      0.00       212
          5       0.97      0.91      0.94      1043
          6       0.98      0.98      0.98      2376
          7       0.99      1.00      0.99     21070
          8       0.99      1.00      0.99     25113

avg / total       0.98      0.99      0.98     50000

