# Intro

Purpose: create a game environment with a deck of cards. The player draws cards and must discard a card when holding more than the hand size limit allows.

The number of cards of each set in the deck, the goal (how many cards complete each set), and the hand size are all parameters.

We give a reward for completing a set, and a large negative reward at the end (when the deck runs out of cards) if not all the sets have been completed.

# Environment

In [1]:
import numpy as np
import gym
import random

In [42]:
class CardSetFinder(gym.Env):
    """Card-collecting game exposed as a gym environment.

    A shuffled deck holds ``cardset_tot[i]`` cards of each type ``i``. The
    hand is refilled from the deck up to ``hand_limit`` cards. Whenever the
    hand holds at least ``cardset_goal[i]`` cards of a type whose set has not
    been collected yet, those cards are removed from the hand and the set is
    marked as collected. An action discards one card of the chosen type.

    The episode is won once every set is collected, and lost when the deck is
    empty while some set is still missing.

    Observation layout (flat integer vector of length ``3 * sets_num``):
        state[i * 3 + 0]: cards of type i left in the deck (0..cardset_tot[i])
        state[i * 3 + 1]: cards of type i in hand (0..hand_limit)
        state[i * 3 + 2]: 1 if set i has been collected, else 0

    Not every combination inside these bounds is a reachable state (e.g. if
    all cards of a type are still in the deck, none can be in hand).

    Args:
        cardset_tot: number of cards of each type in the deck.
        cardset_goal: number of cards of each type needed to complete its set.
        hand_limit: maximum number of cards held in hand.

    Raises:
        ValueError: if ``cardset_tot`` and ``cardset_goal`` differ in length.
    """

    metadata = {'render.modes': ['human']}

    def __init__(
            self,
            cardset_tot=(10, 10),
            cardset_goal=(3, 3),
            hand_limit=3):

        self.reward_for_set = 3
        self.reward_for_invalid_action = -5
        self.reward_for_win = 10
        self.reward_for_lose = -10

        # Copy the sequences so the instance never aliases a caller's list
        # (or a shared default argument).
        self.cardset_tot = list(cardset_tot)
        self.cardset_goal = list(cardset_goal)
        if len(self.cardset_tot) != len(self.cardset_goal):
            raise ValueError(
                "cardset_tot and cardset_goal must have the same length, got "
                f"{len(self.cardset_tot)} and {len(self.cardset_goal)}")
        self.hand_limit = hand_limit
        self.sets_num = len(self.cardset_tot)

        self.won = False
        self.lost = False

        # An action discards one card of a given type (not a specific card),
        # encoded as a discrete value in [0, sets_num).
        # NB: an action is invalid when no card of that type is in hand.
        self.action_space = gym.spaces.Discrete(self.sets_num)

        # The state is one flat vector, three consecutive numbers per set
        # (see the class docstring for the layout).
        observation_space_low = np.zeros(self.sets_num * 3, dtype=int)
        observation_space_high = np.array(
            [[tot, self.hand_limit, 1] for tot in self.cardset_tot],
            dtype=int,
        ).ravel()

        self.observation_space = gym.spaces.Box(
            low=observation_space_low,
            high=observation_space_high,
            dtype=int)

    def create_deck(self):
        """Build a fresh deck with every card of every type, then shuffle it."""
        self.deck = [
            card_type
            for card_type, count in enumerate(self.cardset_tot)
            for _ in range(count)
        ]
        random.shuffle(self.deck)

    def draw_cards(self):
        """Draw until the hand limit is met or the deck runs out of cards."""
        missing = self.hand_limit - sum(self.hand)
        # range() of a non-positive number is empty, so a full hand draws nothing.
        for _ in range(min(missing, len(self.deck))):
            self.draw_card()

    def draw_card(self):
        """Move the top card of the deck (the last list element) to the hand."""
        current_card = self.deck.pop()
        self.hand[current_card] += 1

    def _draw_and_collect(self):
        """Refill the hand and collect sets until no new set appears.

        Collecting a set frees hand slots, and the cards drawn afterwards can
        complete another set, hence the loop.

        Returns:
            Total number of sets collected during this call.
        """
        total_found = 0
        while True:
            self.draw_cards()
            found_sets = self.check_hand_for_sets()
            if found_sets == 0:
                return total_found
            total_found += found_sets

    def step(self, action):
        """Discard one card of type ``action`` and advance the game.

        Args:
            action: card type to discard, a value in [0, sets_num).

        Returns:
            Tuple ``(state, reward, done, info)``. An invalid action (no card
            of that type in hand) changes nothing and only yields the
            invalid-action penalty.
        """
        reward = 0
        done = False
        info = {}

        if self.hand[action] == 0:
            # Nothing changes, only the invalid-action penalty is returned.
            reward = self.reward_for_invalid_action
        else:
            self.discard_card(action)

            reward += self._draw_and_collect() * self.reward_for_set

            self.calc_state()

            # At this point the game may be over.
            self.check_if_won()
            if self.won:
                reward += self.reward_for_win
            self.check_if_lost()
            if self.lost:
                reward += self.reward_for_lose

            done = self.won or self.lost

        return self.state, reward, done, info

    def reset(self):
        """Start a new episode and return the initial state.

        Sets completed by the opening draw are collected automatically and
        give no reward.
        """
        # Clear the end-of-game flags left over from the previous episode.
        self.won = False
        self.lost = False

        # state: 3 * sets_num zeroes, in the observation_space format
        self.state = np.zeros(self.sets_num * 3, dtype=int)
        # deck: list of card types in random order
        self.create_deck()
        # hand: per card type, the number of cards currently held
        self.hand = np.zeros(self.sets_num, dtype=int)

        self._draw_and_collect()
        self.calc_state()

        return self.state

    def check_hand_for_sets(self):
        """Collect every not-yet-collected set that the hand completes.

        Returns:
            Number of sets collected by this call.
        """
        found_sets = 0
        for i in range(self.sets_num):
            # Only sets that are not collected yet are considered.
            if self.state[i * 3 + 2] == 0 and self.check_hand_for_set(i):
                found_sets += 1
        return found_sets

    def check_hand_for_set(self, set_num):
        """Collect set ``set_num`` if the hand holds enough of its cards.

        On success the goal number of cards is removed from the hand and the
        set's collected flag in the state is set to 1.

        Returns:
            True if the set was collected, False otherwise.
        """
        if self.hand[set_num] < self.cardset_goal[set_num]:
            return False
        # Remove the cards that form the set from the hand.
        self.hand[set_num] -= self.cardset_goal[set_num]
        # Mark the set as collected.
        self.state[set_num * 3 + 2] = 1
        return True

    def calc_state(self):
        """Refresh the deck and hand counts in the state vector.

        The collected flags (every third value) are not touched here; they
        are maintained by ``check_hand_for_set``.
        """
        for i in range(self.sets_num):
            self.state[i * 3 + 0] = self.deck.count(i)
            self.state[i * 3 + 1] = self.hand[i]

    def discard_card(self, set_to_discard):
        """Remove one card of type ``set_to_discard`` from the hand."""
        self.hand[set_to_discard] -= 1

    def check_if_won(self):
        """Set ``self.won``: the game is won when every set is collected."""
        # Every third value of the state is a collected flag.
        self.won = all(
            self.state[i * 3 + 2] == 1 for i in range(self.sets_num))

    def check_if_lost(self):
        """Set ``self.lost``: lost when the deck is empty and the game is not won."""
        self.lost = (not self.won) and len(self.deck) == 0

In [43]:
env = CardSetFinder()

In [44]:
env.reset()

array([8, 2, 0, 9, 1, 0])

In [45]:
env.step(0)

(array([7, 2, 0, 9, 1, 0]), 0, False, {})

In [46]:
state, reward, done, info = env.step(0)

In [47]:
done

False

In [48]:
type(done)

bool

# Testing the Environment

Check that steps work as expected: cards are drawn properly, end states are calculated correctly, etc. Also check that an invalid action is penalized.

In [24]:
env = CardSetFinder()

In [25]:
env.reset()

array([[9, 1, 0],
       [8, 2, 0]])

In [26]:
env.step(0)

(array([[8, 1, 0],
        [8, 2, 0]]), 0, False, {})

In [27]:
env.step(0)

(array([[6, 2, 0],
        [6, 1, 1]]), 3, False, {})

This is just a series of actions to check that they work properly.

In the next part: check that two sets completed one after the other are both collected within a single step.

In [45]:
env = CardSetFinder()

In [49]:
env.reset()

array([[9, 1, 0],
       [8, 2, 0]])

In [57]:
env.hand = [3,0]

In [58]:
env.deck = [1,1,1,0]

In [59]:
env.state = np.array([[3,0,0],[1,3,0]])

In [60]:
env.step(1)

(array([[3, 0, 0],
        [1, 3, 0]]), -5, False, {})

In [61]:
env.step(0)

(array([[0, 0, 1],
        [0, 0, 1]]), 16, True, {})

# Model Train

In [49]:
env = CardSetFinder()

In [50]:
from stable_baselines.common.env_checker import check_env

In [51]:
check_env(env)

In [52]:
from stable_baselines.common.vec_env import DummyVecEnv

In [77]:
from stable_baselines.common.policies import MlpPolicy
from stable_baselines import PPO2

In [58]:
from stable_baselines.deepq.policies import MlpPolicy
from stable_baselines import DQN

In [78]:
# Build a PPO2 agent with an MLP policy on the environment and train it
# for one million timesteps.
# NOTE(review): `MlpPolicy` is imported twice above (from
# stable_baselines.common.policies and from stable_baselines.deepq.policies);
# whichever import ran last is the one used here. PPO2 presumably needs the
# common.policies version - confirm when running the cells top to bottom.
model = PPO2(MlpPolicy, env, verbose=False,)
model.learn(total_timesteps=1000000)

<stable_baselines.ppo2.ppo2.PPO2 at 0x7fa8ec26e390>

In [80]:
# Roll out one episode with the trained model, printing every transition.
obs = env.reset()
print(env.state)
# Cap the rollout at 20 steps so it ends even if the episode never finishes.
for i in range(20):
    # Ask the model for an action given the current observation.
    action, _states = model.predict(obs)
    print(action)
    # Apply the action; `dones` is the episode-finished flag returned by step().
    obs, rewards, dones, info = env.step(action)
    print(obs, rewards, dones)
    if dones:
        break

[9 1 0 8 2 0]
0
[8 1 0 5 2 1] 3 False
1
[7 2 0 5 1 1] 0 False
0
[6 2 0 5 1 1] 0 False
0
[6 1 0 4 2 1] 0 False
0
[6 0 0 3 3 1] 0 False
1
[5 1 0 3 2 1] 0 False
1
[4 2 0 3 1 1] 0 False
1
[2 1 1 1 2 1] 13 True


In [81]:
obs = env.reset()

In [82]:
action, _states = model.predict(obs)

In [83]:
action

1

In [84]:
obs

array([9, 1, 0, 8, 2, 0])

In [86]:
type(_states)

NoneType