In [3]:
!pip uninstall gym -y
!pip install gymnasium
!pip install shimmy
!pip install stable-baselines3






[notice] A new release of pip available: 22.2.1 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip





[notice] A new release of pip available: 22.2.1 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip





[notice] A new release of pip available: 22.2.1 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [348]:
import random
import gymnasium as gym
import numpy as np
from collections import Counter
def initialize_deck():
    suits = ["♥", "♦", "♣", "♠"]
    ranks = ["2", "3", "4", "5", "6", "7", "8", "9", "T", "J", "Q", "K", "A"]
    deck = [f"{rank}{suit}" for suit in suits for rank in ranks]
    random.shuffle(deck)
    return deck
# Helper functions for poker hand evaluation
def is_straight(hand):
    if len(hand) != 5:
        return False  # Straights only apply to 5-card hands
    ranks = "A23456789TJQKA"
    rank_indices = [ranks.index(card[0]) for card in hand]
    rank_sequence = list(range(min(rank_indices), min(rank_indices) + 5))
    return rank_indices == rank_sequence or ranks.endswith(''.join([card[0] for card in hand]))

def is_flush(hand):
    if len(hand) != 5:
        return False  # Flushes only apply to 5-card hands
    suits = [card[1] for card in hand]
    return len(set(suits)) == 1

def is_straight_flush(hand):
    return is_straight(hand) and is_flush(hand)
    
def is_four_of_a_kind(hand):
    ranks = [card[0] for card in hand]
    rank_counts = Counter(ranks)
    return 4 in rank_counts.values()

def is_full_house(hand):
    ranks = [card[0] for card in hand]
    rank_counts = Counter(ranks)
    return set(rank_counts.values()) == {2, 3}

def is_three_of_a_kind(hand):
    ranks = [card[0] for card in hand]
    rank_counts = Counter(ranks)
    return 3 in rank_counts.values() and not is_full_house(hand)

def is_two_pair(hand):
    ranks = [card[0] for card in hand]
    rank_counts = Counter(ranks)
    return list(rank_counts.values()).count(2) == 2

def is_one_pair(hand):
    ranks = [card[0] for card in hand]
    rank_counts = Counter(ranks)
    return list(rank_counts.values()).count(2) == 1 and not is_three_of_a_kind(hand) and not is_full_house(hand)

def best_poker_hand(hand):
    hand.sort(key=lambda card: "A23456789TJQK".index(card[0]))
    if is_straight_flush(hand):
        return "Straight Flush"
    elif is_four_of_a_kind(hand):
        return "Four of a Kind"
    elif is_full_house(hand):
        return "Full House"
    elif is_flush(hand):
        return "Flush"
    elif is_straight(hand):
        return "Straight"
    elif is_three_of_a_kind(hand):
        return "Three of a Kind"
    elif is_two_pair(hand):
        return "Two Pair"
    elif is_one_pair(hand):
        return "One Pair"
    else:
        return "High Card"
        
def refined_score_hand(hand):
    # Base chip values and multipliers for hand types
    hand_values = {
        "High Card": (5, 1),
        "One Pair": (10, 2),
        "Two Pair": (20, 2),
        "Three of a Kind": (30, 3),
        "Straight": (30, 4),
        "Flush": (35, 4),
        "Full House": (40, 4),
        "Four of a Kind": (60, 7),
        "Straight Flush": (100, 8)
    }

    # Pip values for cards
    pip_values = {
        "2": 2, "3": 3, "4": 4, "5": 5, "6": 6, "7": 7, "8": 8, "9": 9, "T": 10,
        "J": 10, "Q": 10, "K": 10, "A": 11
    }

    # Determine the hand type
    hand_type = best_poker_hand(hand)

    # Calculate the contributing pip value of the hand based on the hand type
    ranks = [card[0] for card in hand]
    rank_counts = Counter(ranks)
    contributing_pip_value = 0

    if hand_type in ["Four of a Kind", "Three of a Kind", "One Pair"]:
        for rank, count in rank_counts.items():
            if (hand_type == "Four of a Kind" and count == 4) or \
               (hand_type == "Three of a Kind" and count == 3) or \
               (hand_type == "One Pair" and count == 2):
                contributing_pip_value += pip_values[rank] * count
    elif hand_type == "Two Pair":
        for rank, count in rank_counts.items():
            if count == 2:
                contributing_pip_value += pip_values[rank] * count
    elif hand_type == "Full House":
        for rank, count in rank_counts.items():
            contributing_pip_value += pip_values[rank] * count
    if hand_type == "High Card":
        # For a "High Card" hand, only consider the pip value of the highest card
        high_card = max(hand, key=lambda card: pip_values[card[0]])
        contributing_pip_value = pip_values[high_card[0]]
    elif hand_type in ["Straight", "Flush", "Straight Flush"]:
        contributing_pip_value = sum(pip_values[card[0]] for card in hand)
    
    # Get the base chip value and multiplier for the hand type
    base_value, multiplier = hand_values[hand_type]

    # Calculate the final score
    final_score = (contributing_pip_value + base_value) * multiplier

    return final_score, hand_type, hand


In [711]:
from gymnasium import spaces
from gymnasium.spaces import MultiBinary, Discrete, Tuple
from gymnasium.utils import seeding
import itertools

class CardGameEnv(gym.Env):
    def __init__(self):
        super(CardGameEnv, self).__init__()
        # 8 binary actions for each card in hand, and 1 binary action to decide play or discard
        self.action_space = MultiBinary(53)
        self.max_plays = 4
        self.max_discards = 3
        # Calculate the total size of the observation vector
        total_size = (52 * 3) + (self.max_plays + 1) + (self.max_discards + 1)
        # Use a single MultiBinary space for the entire observation
        self.observation_space = spaces.MultiBinary(total_size)

        self.deck = initialize_deck()
        self.hand_size = 8
        self.hand = []
        self.discarded_cards = [] 
        self.round_score = 0
        self.plays_made = 0
        self.discards_made = 0
        self.win_score = 300
                # Define a mapping of card strings to row indices
        suits = ["♥", "♦", "♣", "♠"]
        ranks = ["2", "3", "4", "5", "6", "7", "8", "9", "T", "J", "Q", "K", "A"]
        self.card_to_row = {f"{rank}{suit}": index for index, (rank, suit) in enumerate(itertools.product(ranks, suits))}
        self.row_to_card = {index: card for card, index in self.card_to_row.items()}


    def draw_hand(self):
        self.hand = random.sample(self.deck, self.hand_size)
        for card in self.hand:
            self.deck.remove(card)
            
    def play_cards(self, selected_cards):
        # Remove played cards from hand and process play
        pass

    def discard_cards(self, selected_cards):
        # Add discarded cards to the discard pile and remove from hand
        pass

    def encode_state(self):
        # Initialize a matrix for all 52 cards with 3 columns for each state

        # Example encoding process
        #card_states = np.concatenate([np.zeros(52, dtype=int) for _ in range(3)])  # Placeholder for actual card states
        
        state_matrix = np.zeros((52, 3), dtype=int)
        
        
        # Mark cards in the deck, in hand, and discarded
        for card in self.deck:
            row_index = self.card_to_row[card]
            state_matrix[row_index] = [1,0,0]  # Mark as in the deck
        for card in self.hand:
            row_index = self.card_to_row[card]
            state_matrix[row_index] = [0,1,0]  # Mark as in the hand
        for card in self.discarded_cards:
            row_index = self.card_to_row[card]
            state_matrix[row_index] = [0,0,1]  # Mark as discarded
    
        play_count_encoded = np.zeros(self.max_plays + 1, dtype=int)
        discard_count_encoded = np.zeros(self.max_discards + 1, dtype=int)
        
        # Assuming self.plays_made and self.discards_made track the counts of plays/discards made
        play_count_encoded[self.plays_made] = 1
        discard_count_encoded[self.discards_made] = 1
        
        # Combine everything into a single observation vector
        observation = np.concatenate([state_matrix.flatten(), play_count_encoded, discard_count_encoded])

    
        # Return the complete state as a tuple, including card states, plays, and discards
        return observation
    
        
    def validate_action(self,mode, card_selections):
        print(f"card_selections total: {sum(card_selections)}")
        if sum(card_selections)>5 or sum(card_selections)<1 or mode == 0 and self.max_discards==self.discards_made:
            return False
        else:
            return True
        
    def step(self, action):
        reward=0
        terminated = False
        truncated = False
        mode = action[-1]  # The last element indicates play (1) or discard (0)
        card_selections = action[:-1]  # The rest of the elements indicate card selections        #need to write this function to make sure only 1-5 cards in hand are selected
        if not self.validate_action(mode,card_selections):
            return self.encode_state(), -sum(card_selections)*10, False,truncated, {"reason": "Invalid action"}
        # Calculate the number of selected cards correctly
        num_selected_cards = sum(card_selections)
        
        # Process the selected cards based on mode (play or discard)
        selected_cards_indices = [i for i, selected in enumerate(card_selections) if selected == 1]
        print(selected_cards_indices)
        selected_cards = [self.row_to_card[i] for i in selected_cards_indices]
        print(selected_cards)
        if mode == 0:  # Discard mode
            if self.discards_made < self.max_discards:
                self.discards_made += 1
                # Add the selected cards to the discarded pile
                self.discarded_cards.extend(selected_cards)
                # Remove the discarded cards from the hand
                self.hand = [card for card in self.hand if card not in selected_cards]
            else:
                reward = -10  # Penalize invalid actions
        elif mode == 1:  # Play mode
            if self.plays_made < self.max_plays:
                self.plays_made += 1
                # Played cards are treated like discards except for points
                self.discarded_cards.extend(selected_cards)
                # Assume play_score is calculated correctly from played_cards
                play_score, _, _ = refined_score_hand(selected_cards)
                self.round_score += play_score
                reward = play_score  # Update reward based on play score
                if self.round_score >= self.win_score:
                    terminated = True
                    reward = 1000  # Reward for winning           
                #TODO? add negative reward for plays made without winning game
                # Remove the played cards from the hand
                self.hand = [card for card in self.hand if card not in selected_cards]
        
        # Draw cards to refill hand to 8
        needed_cards = self.hand_size - len(self.hand)
        if needed_cards > 0 and len(self.deck) >= needed_cards:
            new_cards = random.sample(self.deck, needed_cards)
            self.hand += new_cards
            for card in new_cards:
                self.deck.remove(card)
        
        # Check if the game is over due to max plays without reaching win score
        if self.plays_made >= self.max_plays and self.round_score < self.win_score:
            terminated = True
            reward = -1000  # Penalty for not reaching the win score within the play limit
        
        return self.encode_state(), reward, terminated ,truncated , {}

    #def seed(self, seed=None):
    #    # Assuming `self.np_random` is your environment's RNG
    #    self.np_random, seed = seeding.np_random(seed)
    #    # Additionally, seed other random number generators used by the environment if any
    #    return [seed]
    def reset(self,seed=None,**kwargs):
        #if seed is not None:
        #    self.seed(seed)  # Seed the environment's RNG with the provided seed
        self.deck = initialize_deck()
        self.discarded_cards = [] 
        self.draw_hand()
        self.round_score = 0
        self.plays_made = 0
        self.discards_made = 0
        return self.encode_state(), {}

    def render(self, mode='human'):
        print(f"Hand: {self.hand}, Round Score: {self.round_score}, Plays Made: {self.plays_made}, Discards Made: {self.discards_made}")


In [712]:
env = CardGameEnv()
print(f"Initial Deck Size: {len(env.deck)}")
env.draw_hand()
print(f"Hand: {env.hand}, Hand Size: {len(env.hand)}, Remaining Deck Size: {len(env.deck)}")

Initial Deck Size: 52
Hand: ['9♣', '6♠', 'K♣', '5♣', '4♣', '6♦', 'Q♦', 'K♦'], Hand Size: 8, Remaining Deck Size: 44


In [713]:
        suits = ["♥", "♦", "♣", "♠"]
        ranks = ["2", "3", "4", "5", "6", "7", "8", "9", "T", "J", "Q", "K", "A"]
       
print({f"{rank}{suit}": index for index, (rank, suit) in enumerate(itertools.product(ranks, suits))})

{'2♥': 0, '2♦': 1, '2♣': 2, '2♠': 3, '3♥': 4, '3♦': 5, '3♣': 6, '3♠': 7, '4♥': 8, '4♦': 9, '4♣': 10, '4♠': 11, '5♥': 12, '5♦': 13, '5♣': 14, '5♠': 15, '6♥': 16, '6♦': 17, '6♣': 18, '6♠': 19, '7♥': 20, '7♦': 21, '7♣': 22, '7♠': 23, '8♥': 24, '8♦': 25, '8♣': 26, '8♠': 27, '9♥': 28, '9♦': 29, '9♣': 30, '9♠': 31, 'T♥': 32, 'T♦': 33, 'T♣': 34, 'T♠': 35, 'J♥': 36, 'J♦': 37, 'J♣': 38, 'J♠': 39, 'Q♥': 40, 'Q♦': 41, 'Q♣': 42, 'Q♠': 43, 'K♥': 44, 'K♦': 45, 'K♣': 46, 'K♠': 47, 'A♥': 48, 'A♦': 49, 'A♣': 50, 'A♠': 51}


In [714]:
initial_state = env.reset()
print(f"Hand: {env.hand} Initial State: {initial_state}, Initial Round Score: {env.round_score}")

Hand: ['4♣', 'T♥', '3♥', '7♥', '2♥', 'T♠', '6♠', '6♦'] Initial State: (array([0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1,
       0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0,
       0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0,
       1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1,
       0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0,
       0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0,
       1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1,
       0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0]), {}), Initial Round Score: 0


In [715]:
#step test

import numpy as np

# Initialize the environment
env = CardGameEnv()
env.reset()

# Manually set up a known state for testing
env.hand = ['4♥', '3♦', '4♣', '5♠', '6♥', '7♦', '8♣', '9♠']  # Example hand
env.deck = [card for card in initialize_deck() if card not in env.hand]  # Reset deck excluding the hand
env.discarded_cards = []  # Start with no discarded cards

# Also manually set plays and discards made if necessary
env.plays_made = 0
env.discards_made = 0

# Create an action to discard '4♣' and '6♥'
# Assuming card_to_row maps cards to their positions in the flattened state, and it's available here
card_selection = np.zeros(52, dtype=int)
suits = ["♥", "♦", "♣", "♠"]
ranks = ["2", "3", "4", "5", "6", "7", "8", "9", "T", "J", "Q", "K", "A"]
card_to_row = {f"{rank}{suit}": index for index, (rank, suit) in enumerate(itertools.product(ranks, suits))}
print(card_to_row)
card_selection[card_to_row['4♣']] = 1  # Mark '4♣' for discard
card_selection[card_to_row['4♥']] = 1  # Mark '6♥' for discard
action_type = 1  # 0 for discard
action = np.concatenate([card_selection,[action_type]])
print(action)


{'2♥': 0, '2♦': 1, '2♣': 2, '2♠': 3, '3♥': 4, '3♦': 5, '3♣': 6, '3♠': 7, '4♥': 8, '4♦': 9, '4♣': 10, '4♠': 11, '5♥': 12, '5♦': 13, '5♣': 14, '5♠': 15, '6♥': 16, '6♦': 17, '6♣': 18, '6♠': 19, '7♥': 20, '7♦': 21, '7♣': 22, '7♠': 23, '8♥': 24, '8♦': 25, '8♣': 26, '8♠': 27, '9♥': 28, '9♦': 29, '9♣': 30, '9♠': 31, 'T♥': 32, 'T♦': 33, 'T♣': 34, 'T♠': 35, 'J♥': 36, 'J♦': 37, 'J♣': 38, 'J♠': 39, 'Q♥': 40, 'Q♦': 41, 'Q♣': 42, 'Q♠': 43, 'K♥': 44, 'K♦': 45, 'K♣': 46, 'K♠': 47, 'A♥': 48, 'A♦': 49, 'A♣': 50, 'A♠': 51}
[0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1]


In [716]:
env.hand

['4♥', '3♦', '4♣', '5♠', '6♥', '7♦', '8♣', '9♠']

In [717]:
new_state, reward, terminated ,truncated, _ = env.step(action)


card_selections total: 2
[8, 10]
['4♥', '4♣']


In [718]:
print(new_state)
print(reward)
env.hand

[1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 0 1 0 1 0 0 1 0 0 0 0 1 1 0 0 0 0 1 1 0 0 1
 0 0 1 0 0 1 0 0 0 1 0 0 1 0 0 1 0 1 0 0 1 0 0 1 0 0 0 1 0 1 0 0 1 0 0 1 0
 0 1 0 0 0 1 0 1 0 0 1 0 0 1 0 0 1 0 0 0 1 0 0 1 0 1 0 0 1 0 0 1 0 0 1 0 0
 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1
 0 0 1 0 0 1 0 0 0 1 0 0 0 1 0 0 0]
36


['3♦', '5♠', '6♥', '7♦', '8♣', '9♠', '6♦', 'T♥']

In [719]:
env = CardGameEnv()
initial_observation = env.reset()
print(initial_observation)  # This should output your observation vector without errors.

(array([1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1,
       0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1,
       0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0,
       1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1,
       0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0,
       0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0,
       1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1,
       0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0]), {})


In [720]:
!pip install stable-baselines3
from stable_baselines3 import PPO
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.evaluation import evaluate_policy

# Assuming your environment class is named CardGameEnv and is properly imported
# from your_environment_file import CardGameEnv




[notice] A new release of pip available: 22.2.1 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [721]:
# Create the environment
environment = make_vec_env(lambda: CardGameEnv(), n_envs=10)


In [722]:
model = PPO("MlpPolicy", environment, verbose=1,ent_coef=0.2,learning_rate=0.03,clip_range=0.2)

Using cpu device


In [723]:
print(model)

<stable_baselines3.ppo.ppo.PPO object at 0x0000029610DCA440>


In [725]:
model.learn(total_timesteps=100000)


card_selections total: 24.0
card_selections total: 24.0
card_selections total: 24.0
card_selections total: 24.0
card_selections total: 24.0
card_selections total: 24.0
card_selections total: 24.0
card_selections total: 24.0
card_selections total: 24.0
card_selections total: 24.0
card_selections total: 24.0
card_selections total: 24.0
card_selections total: 24.0
card_selections total: 24.0
card_selections total: 24.0
card_selections total: 24.0
card_selections total: 24.0
card_selections total: 24.0
card_selections total: 24.0
card_selections total: 24.0
card_selections total: 24.0
card_selections total: 24.0
card_selections total: 24.0
card_selections total: 24.0
card_selections total: 24.0
card_selections total: 24.0
card_selections total: 24.0
card_selections total: 24.0
card_selections total: 24.0
card_selections total: 24.0
card_selections total: 24.0
card_selections total: 24.0
card_selections total: 24.0
card_selections total: 24.0
card_selections total: 24.0
card_selections tota

<stable_baselines3.ppo.ppo.PPO at 0x29610dca440>

In [669]:
#evaluate the model
mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=10)
print(f"Mean reward: {mean_reward}, Std reward: {std_reward}")

card_selections total: 0.0
card_selections total: 0.0
card_selections total: 0.0
card_selections total: 0.0
card_selections total: 0.0
card_selections total: 0.0
card_selections total: 0.0
card_selections total: 0.0
card_selections total: 0.0
card_selections total: 0.0
Mean reward: 0.0, Std reward: 0.0


In [670]:
# Save the model
model.save("ppo_cardgame")

# Load the model
model = PPO.load("ppo_cardgame")
