<a href="https://colab.research.google.com/github/Jenishbh/Blackjack-Using-Reinforcement-Learning/blob/main/Blackjack_Monte_Carlo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import gym

# Create the Blackjack-v1 environment once at module level. play_game() below
# reads this global instead of receiving the environment as an argument.
env = gym.make('Blackjack-v1')

def play_game(strategy, verbose=True):
    """Play one episode on the module-level ``env`` and return its final reward.

    Args:
        strategy: Callable that receives the current state and returns the
            action to pass to ``env.step``.
        verbose: When true, print the state before each move, the move taken
            (action ``1`` is shown as "Hit", anything else as "Stand") and
            the reward once the episode ends.

    Returns:
        The reward returned by the last ``env.step`` call of the episode.
    """
    state = env.reset()
    while True:
        if verbose:
            print(f"Current State: {state}")
        action = strategy(state)
        # Legacy gym step signature: (observation, reward, done, info).
        state, reward, done, _info = env.step(action)
        if verbose:
            print(f"Action Taken: {'Hit' if action == 1 else 'Stand'}, New State: {state}")
        if done:
            break
    if verbose:
        print(f"Game end. Reward: {reward}\n")
    return reward

def simple_strategy(state):
    """Threshold policy: stand (0) on a player total of 17 or more, else hit (1).

    ``state`` is unpacked as a 3-tuple whose first element is the player's
    total; the other two elements are ignored.
    """
    player_total, _dealer_card, _usable_ace = state
    if player_total >= 17:
        return 0
    return 1

# Smoke-test the environment: play a single verbose game with the
# hit-below-17 strategy. In a notebook cell the returned reward is echoed.
play_game(simple_strategy)


Current State: (13, 10, False)
Action Taken: Hit, New State: (23, 10, False)
Game end. Reward: -1.0



  deprecation(
  deprecation(
  if not isinstance(terminated, (bool, np.bool8)):


-1.0

In [None]:
import tensorflow as tf
from tensorflow.keras import layers
import numpy as np
import random
from collections import deque

# DQN Agent
class DQNAgent:
    """Deep Q-learning agent with a replay buffer and epsilon-greedy exploration.

    Q-values are approximated by a small fully connected Keras network (two
    hidden layers of 24 ReLU units and one linear output per action).
    Transitions are kept in a bounded deque and sampled uniformly at random
    for training.

    Args:
        state_size: Number of features in one state vector.
        action_size: Number of discrete actions.
        memory_size: Maximum number of stored transitions; once full, the
            oldest transition is dropped for every new one.
        gamma: Discount rate applied to the bootstrapped next-state value.
        epsilon: Initial probability of taking a random action.
        epsilon_min: Once epsilon is at or below this value, replay() stops
            decaying it.
        epsilon_decay: Factor epsilon is multiplied by after each replay().
        learning_rate: Learning rate of the Adam optimizer.
    """

    def __init__(self, state_size, action_size, *, memory_size=2000,
                 gamma=0.95, epsilon=1.0, epsilon_min=0.01,
                 epsilon_decay=0.995, learning_rate=0.001):
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen=memory_size)
        self.gamma = gamma    # Discount rate
        self.epsilon = epsilon  # Exploration rate, decays in replay()
        self.epsilon_min = epsilon_min
        self.epsilon_decay = epsilon_decay
        self.learning_rate = learning_rate
        self.model = self._build_model()

    def _build_model(self):
        """Build and compile the Q-network (MSE loss, Adam optimizer)."""
        model = tf.keras.Sequential([
            layers.Dense(24, input_dim=self.state_size, activation='relu'),
            layers.Dense(24, activation='relu'),
            layers.Dense(self.action_size, activation='linear')
        ])
        model.compile(loss='mse', optimizer=tf.keras.optimizers.Adam(learning_rate=self.learning_rate))
        return model

    def remember(self, state, action, reward, next_state, done):
        """Append one transition to the replay buffer."""
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        """Choose an action epsilon-greedily and return it as a Python int.

        With probability ``epsilon`` a uniformly random action is returned;
        otherwise the action with the highest predicted Q-value for ``state``
        (a batch holding a single state) is returned.
        """
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        # verbose=0 suppresses the progress bar Keras prints per predict call.
        q_values = self.model.predict(state, verbose=0)
        return int(np.argmax(q_values[0]))

    def replay(self, batch_size):
        """Train on one random minibatch of stored transitions.

        The whole minibatch is evaluated with two ``predict`` calls and
        fitted with a single ``fit`` call (one gradient step on
        ``batch_size`` samples) instead of three Keras calls per sample.
        Does nothing when ``batch_size`` is not positive or the buffer holds
        fewer than ``batch_size`` transitions. After training, epsilon is
        decayed while it is above ``epsilon_min``.
        """
        if batch_size <= 0 or len(self.memory) < batch_size:
            return

        minibatch = random.sample(self.memory, batch_size)
        # vstack accepts states stored either as (1, state_size) rows or as
        # flat (state_size,) vectors and yields a (batch_size, state_size) array.
        states = np.vstack([transition[0] for transition in minibatch])
        actions = np.array([transition[1] for transition in minibatch], dtype=int)
        rewards = np.array([transition[2] for transition in minibatch], dtype=float)
        next_states = np.vstack([transition[3] for transition in minibatch])
        dones = np.array([transition[4] for transition in minibatch], dtype=bool)

        next_q = self.model.predict(next_states, verbose=0)
        # Copy so the target matrix is safely writable.
        target_q = np.array(self.model.predict(states, verbose=0))

        # Terminal transitions use the raw reward; the others bootstrap from
        # the best predicted next-state value.
        targets = np.where(dones, rewards, rewards + self.gamma * np.max(next_q, axis=1))
        # Only the taken action's Q-value is changed, so the loss on the
        # other outputs is zero.
        target_q[np.arange(batch_size), actions] = targets

        self.model.fit(states, target_q, batch_size=batch_size, epochs=1, verbose=0)

        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay


# Parameters
# A Blackjack state has three components: player's sum, dealer's card, usable ace.
state_size = 3
# Size of the environment's discrete action space (hit or stand).
action_size = env.action_space.n

# Build the agent; train_agent() uses this module-level `agent` and `state_size`.
agent = DQNAgent(state_size, action_size)




In [None]:
def train_agent(epochs=500, batch_size=32):
    """Train the module-level ``agent`` on the module-level ``env``.

    Each episode is played to completion while every transition is stored in
    the agent's memory. When the episode ends a summary line is printed, and
    one replay step is run once the memory holds more than ``batch_size``
    transitions.

    Args:
        epochs: Number of episodes to play.
        batch_size: Minibatch size passed to ``agent.replay``.
    """
    for e in range(epochs):
        # States are reshaped into a single-row batch, the form the agent expects.
        current = np.reshape(env.reset(), [1, state_size])
        finished = False
        while not finished:
            chosen = agent.act(current)
            upcoming, reward, finished, _ = env.step(chosen)
            upcoming = np.reshape(upcoming, [1, state_size])
            agent.remember(current, chosen, reward, upcoming, finished)
            current = upcoming
        print(f"episode: {e+1}/{epochs}, reward: {reward}, epsilon: {agent.epsilon}")
        if len(agent.memory) > batch_size:
            agent.replay(batch_size)

# Run training with the defaults (500 episodes, minibatches of 32); one
# summary line is printed per episode.
train_agent()


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
episode: 385/500, reward: -1.0, epsilon: 0.013558238831322046
episode: 386/500, reward: 1.0, epsilon: 0.013490447637165436
episode: 387/500, reward: 1.0, epsilon: 0.013422995398979608
episode: 388/500, reward: -1.0, epsilon: 0.01335588042198471
episode: 389/500, reward: -1.0, epsilon: 0.013289101019874787
episode: 390/500, reward: 1.0, epsilon: 0.013222655514775413
episode: 391/500, reward: 1.0, epsilon: 0.013156542237201536
episode: 392/500, reward: -1.0, epsilon: 0.013090759526015528
episode: 393/500, reward: -1.0, epsilon: 0.01302530572838545
episode: 394/500, reward: -1.0, epsilon: 0.012960179199743523
episode: 395/500, reward: -1.0, epsilon: 0.012895378303744804
episode: 396/500, reward: -1.0, epsilon: 0.01283090141222608
episode: 397/500, reward: -1.0, epsilon: 0.012766746905164949
episode: 398/500, reward: -1.0, epsilon: 0.012702913170639124
episode: 399/500, reward: -1.0, epsilon: 0.012639398604785928
episode: 400

In [None]:
import gym
import numpy as np
from collections import defaultdict
import matplotlib.pyplot as plt

def softmax(x):
    """Return the softmax of ``x``, normalised along axis 0."""
    # Subtracting the maximum leaves the result unchanged but keeps exp()
    # from overflowing on large scores.
    shifted = x - np.max(x)
    weights = np.exp(shifted)
    normaliser = weights.sum(axis=0)
    return weights / normaliser

def get_probs(Q_s, policy_type='softmax', tau=1.0, epsilon=0.1):
    """Return action probabilities for one state's action values.

    Args:
        Q_s: Sequence or array of action values for a single state.
        policy_type: ``'softmax'`` selects a softmax (Boltzmann) policy; any
            other value selects the epsilon-greedy policy.
        tau: Softmax temperature; the action values are divided by it before
            the softmax. Must be positive. Ignored for epsilon-greedy.
        epsilon: Exploration probability for epsilon-greedy, spread evenly
            over all actions. Ignored for softmax.

    Returns:
        A NumPy array of probabilities, one entry per action.

    Raises:
        ValueError: If the softmax policy is requested with ``tau <= 0``.
    """
    if policy_type == 'softmax':
        if tau <= 0:
            # tau == 0 would divide by zero and produce NaN/inf probabilities;
            # a negative tau would favour the worst action.
            raise ValueError(f"tau must be positive, got {tau!r}")
        return softmax(np.asarray(Q_s, dtype=float) / tau)

    # Epsilon-greedy fallback: every action gets epsilon / n, and the greedy
    # action receives the remaining 1 - epsilon on top of that.
    action_values = np.asarray(Q_s)
    n_actions = len(action_values)
    policy_s = np.full(n_actions, epsilon / n_actions)
    policy_s[np.argmax(action_values)] += 1.0 - epsilon
    return policy_s

def monte_carlo_control(env, num_episodes, gamma=1.0, tau=1.0):
    """Estimate action values with first-visit Monte Carlo under a softmax policy.

    Episodes are generated by sampling actions from ``get_probs`` with the
    softmax policy over the current estimates. After each episode, every
    (state, action) pair is updated once, using the return that follows its
    first occurrence, and its estimate becomes the running mean of all
    returns recorded for that pair.

    Args:
        env: Environment using the legacy gym interface (``reset()`` returns
            a state, ``step()`` returns a 4-tuple) with hashable states.
        num_episodes: Number of episodes to sample.
        gamma: Discount factor applied when accumulating returns.
        tau: Softmax temperature forwarded to ``get_probs``.

    Returns:
        A defaultdict mapping each state to an array of action-value estimates.
    """
    n_actions = env.action_space.n
    action_ids = np.arange(n_actions)
    Q = defaultdict(lambda: np.zeros(n_actions))
    returns_sum = defaultdict(float)
    returns_count = defaultdict(float)

    for _ in range(num_episodes):
        # Roll out one episode, recording visited pairs and their rewards.
        visited, rewards = [], []
        state = env.reset()
        done = False
        while not done:
            probs = get_probs(Q[state], policy_type='softmax', tau=tau)
            action = np.random.choice(action_ids, p=probs)
            next_state, reward, done, _ = env.step(action)
            visited.append((state, action))
            rewards.append(reward)
            state = next_state

        # Index of the earliest step at which each pair appears.
        first_seen = {}
        for step, pair in enumerate(visited):
            first_seen.setdefault(pair, step)

        # Walk the episode backwards so G is the discounted return from `step`.
        G = 0
        for step in range(len(visited) - 1, -1, -1):
            G = gamma * G + rewards[step]
            pair = visited[step]
            if first_seen[pair] != step:
                continue  # not the first visit of this pair in the episode
            returns_sum[pair] += G
            returns_count[pair] += 1
            visited_state, taken_action = pair
            Q[visited_state][taken_action] = returns_sum[pair] / returns_count[pair]

    return Q

# Create a Blackjack-v1 environment (rebinding the module-level `env`) and
# learn action values from 5,000,000 sampled episodes.
env = gym.make('Blackjack-v1')
Q = monte_carlo_control(env, num_episodes=5000000, tau=0.6)  # tau is the softmax temperature; adjust as needed

def _percentage(count, total):
    """Return ``count`` as a percentage of ``total``, or 0.0 when ``total`` is not positive."""
    return (count / total) * 100 if total > 0 else 0.0


def play_blackjack(env, Q, num_games=100000):
    """Play ``num_games`` games greedily with respect to ``Q`` and print a summary.

    In every state the action with the highest value in ``Q`` is taken; a
    state that is not a key of ``Q`` gets a random action from
    ``env.action_space.sample()``. A game counts as a win when its final
    reward is positive, a loss when it is negative and a draw otherwise.

    The figure printed as "Total accuracy (Win/Loss ratio)" is
    wins / (wins + losses) in percent, i.e. the win rate with draws excluded.

    Args:
        env: Environment using the legacy gym interface (``reset()`` returns
            a state, ``step()`` returns a 4-tuple).
        Q: Mapping from state to an array of action values.
        num_games: Number of games to play. With zero or fewer games nothing
            is played and all percentages are reported as 0.

    Returns:
        Dict with the integer counts under the keys 'Win', 'Lose' and 'Draw'.
    """
    outcomes = {'Win': 0, 'Lose': 0, 'Draw': 0}
    for _ in range(num_games):
        state = env.reset()
        done = False
        while not done:
            action = np.argmax(Q[state]) if state in Q else env.action_space.sample()
            state, reward, done, _ = env.step(action)
        if reward > 0:
            outcomes['Win'] += 1
        elif reward < 0:
            outcomes['Lose'] += 1
        else:
            outcomes['Draw'] += 1

    # Calculate percentages; _percentage guards against a zero denominator.
    total_games = num_games
    win_percentage = _percentage(outcomes['Win'], total_games)
    loss_percentage = _percentage(outcomes['Lose'], total_games)
    draw_percentage = _percentage(outcomes['Draw'], total_games)
    total_accuracy = _percentage(outcomes['Win'], outcomes['Win'] + outcomes['Lose'])

    # Print detailed results
    print(f"Out of {total_games} games:")
    print(f"Wins: {outcomes['Win']} ({win_percentage:.2f}%)")
    print(f"Losses: {outcomes['Lose']} ({loss_percentage:.2f}%)")
    print(f"Draws: {outcomes['Draw']} ({draw_percentage:.2f}%)")
    print(f"Total accuracy (Win/Loss ratio): {total_accuracy:.2f}%")

    return outcomes

# Evaluate the learned action values with the default number of games; the
# summary is printed and the win/lose/draw counts are kept in `results`.
results = play_blackjack(env, Q)


Out of 100000 games:
Wins: 43544 (43.54%)
Losses: 47934 (47.93%)
Draws: 8522 (8.52%)
Total accuracy (Win/Loss ratio): 47.60%
