In [21]:
#GOAL: Implement q-learning into the agent

import random
import numpy as np

# Define the game environment just like before in ex1 

def game_result(inputA, inputB, total_reward):
    if inputA not in ['Q1', 'Q2', 'Q3', 'Q4']:
        raise ValueError("Invalid input. Choose from Q1, Q2, Q3, or Q4.")
    if inputB not in ['play', 'leave']:
        raise ValueError("Invalid input. Choose either 'play' or 'leave'.")

    # Define the probabilities and rewards based on the inputA
    probabilities = [0.9, 0.75, 0.5, 0.1]
    rewards = [100, 1000, 10000, 50000]
    quitting_amounts = [0, 100, 1100, 11100]

    # Determine the result based on the inputs
    index = ['Q1', 'Q2', 'Q3', 'Q4'].index(inputA)
    if inputB == 'play':
        # Play starting with Q1
        if random.random() < probabilities[index]:
            result = "You won!"
            p = probabilities[index]
            rew = rewards[index]
        else:
            result = "You lost! Game Over."
            p = 0
            rew = 0
            return result, p, rew, total_reward
    else:
        result = "You quit. Game Over."
        p = 0
        rew = quitting_amounts[index]
        return result, p, rew, total_reward

    total_reward += rew
    return result, p, rew, total_reward


# Set hyperparameters
alpha = 0.5  # Learning rate
gamma = 0.9  # Discount factor
num_episodes = 1000 #number of episodes, if you wanna change how many times it runs change this

# Define the states and actions
states = ['Q1', 'Q2', 'Q3', 'Q4']
actions = ['play', 'leave']
num_states = len(states)
num_actions = len(actions)

# now set the Q-table
Q = np.zeros((num_states, num_actions))

# do the Q-learning algorithm, (note: Q-learning is a reinforcement learning algorithm that uses action-value estimates to maximize rewards and make optimal decisions)
for episode in range(num_episodes):
    total_reward = 0
    quarter = random.choice(states)
    state = states.index(quarter)

    while True:
        epsilon = 0.1  # this is the exploration rate
        if random.uniform(0, 1) < epsilon:
            action = random.choice(actions)
        else:
            action = actions[np.argmax(Q[state])]
        #set the inputs
        inputA = states[state]
        inputB = action
        #while loop
        while True:
            result, probabilities, rewards, total_reward = game_result(inputA, inputB, total_reward)
            if next_state is None or next_state in states:
                break
        #if not none
        if next_state is not None:
            next_state_index = states.index(next_state)
            Q[state, actions.index(action)] += alpha * (rewards + gamma * np.max(Q[next_state_index]) - Q[state, actions.index(action)])

        state = next_state_index
        total_reward += rewards
        #game over !
        if "Game Over" in result or next_state_index is None:
            break

    # Print the results for each episode
    print("Episode:", episode)
    print("Total Reward:", total_reward)
    print()

# Print the learned Q-values
print("Learned Q-values:")
print(Q)


Episode: 0
Total Reward: 0

Episode: 1
Total Reward: 0

Episode: 2
Total Reward: 4000

Episode: 3
Total Reward: 0

Episode: 4
Total Reward: 14300

Episode: 5
Total Reward: 2000

Episode: 6
Total Reward: 22000

Episode: 7
Total Reward: 6200

Episode: 8
Total Reward: 22000

Episode: 9
Total Reward: 2200

Episode: 10
Total Reward: 0

Episode: 11
Total Reward: 0

Episode: 12
Total Reward: 104000

Episode: 13
Total Reward: 0

Episode: 14
Total Reward: 200

Episode: 15
Total Reward: 20000

Episode: 16
Total Reward: 22000

Episode: 17
Total Reward: 36000

Episode: 18
Total Reward: 18100

Episode: 19
Total Reward: 0

Episode: 20
Total Reward: 0

Episode: 21
Total Reward: 200

Episode: 22
Total Reward: 4000

Episode: 23
Total Reward: 12200

Episode: 24
Total Reward: 6000

Episode: 25
Total Reward: 0

Episode: 26
Total Reward: 20000

Episode: 27
Total Reward: 0

Episode: 28
Total Reward: 16200

Episode: 29
Total Reward: 112100

Episode: 30
Total Reward: 0

Episode: 31
Total Reward: 0

Episode: 3