### Working 

In [2]:
import gym
import gym_sokoban
import numpy as np
import random
from PIL import Image
import pickle

# Set random seed for reproducibility
random.seed(42)
np.random.seed(42)

# Initialize the Sokoban environment with a specific seed
env = gym.make('Sokoban-large-v1')
env.seed(42)

# Define Q-learning parameters
# Learning Rate (α): Balances new learning with retained knowledge. Higher values adapt quickly; lower values ensure stable learning.
# Discount Factor (γ): Determines the importance of future rewards. Higher values promote long-term planning; lower values focus on immediate rewards.
# Exploration Rate (ε): Manages the exploration-exploitation trade-off. Higher values encourage exploration; lower values exploit known strategies.
# Number of Episodes: More episodes provide more learning opportunities, leading to better performance but requiring more training time.
alpha = 0.1  # Learning rate
gamma = 0.99  # Discount factor
epsilon = 0.1  # Exploration rate
num_episodes = 1000  # Number of training episodes
q_table_filename = "q_table_large.pkl"
# Initialize the Q-table
q_table = {}

def get_state_key(state):
    return str(state.tolist())

def choose_action(state, epsilon):
    state_key = get_state_key(state)
    if random.uniform(0, 1) < epsilon or state_key not in q_table:
        return env.action_space.sample()  # Explore action space
    return np.argmax(q_table[state_key])  # Exploit learned values
# ALREADY TRAINED PLEASE LOAD THE PICKLE BELOW q_table.pkl
# Training the Q-learning algorithm
for episode in range(num_episodes):
    state = env.reset()
    done = False

    while not done:
        state_key = get_state_key(state)
        action = choose_action(state, epsilon)

        next_state, reward, done, info = env.step(action)
        next_state_key = get_state_key(next_state)
        print(reward, done, info)
        # Initialize Q-values for new states
        if state_key not in q_table:
            q_table[state_key] = np.zeros(env.action_space.n)
        if next_state_key not in q_table:
            q_table[next_state_key] = np.zeros(env.action_space.n)

        # Update Q-values
        best_next_action = np.argmax(q_table[next_state_key])
        td_target = reward + gamma * q_table[next_state_key][best_next_action]
        td_delta = td_target - q_table[state_key][action]
        q_table[state_key][action] += alpha * td_delta

        state = next_state

    if (episode + 1) % 100 == 0:
        print(f"Episode {episode + 1}/{num_episodes} completed.")

# Save the Q-table to a file
with open(q_table_filename, 'wb') as f:
    pickle.dump(q_table, f)
print(f"Q-table saved to {q_table_filename}")

print("Training completed.")

# Load the Q-table from the file
with open(q_table_filename, 'rb') as f:
    q_table = pickle.load(f)
print(f"Q-table loaded from {q_table_filename}")

# Function to save image as JPG
def save_image_as_jpg(image, filename):
    img = Image.fromarray(image)
    img.save(filename, 'JPEG')
    
# # Testing the Q-learning algorithm
state = env.reset()
# img = env.render(mode='rgb_array')
# save_image_as_jpg(img, f"large")
done = False
total_reward = 0
steps = 0
while not done:
    if steps == 0:
        img = env.render(mode='rgb_array')
        save_image_as_jpg(img, f"q_table_solver_{steps}")
    state_key = get_state_key(state)
    action = np.argmax(q_table[state_key])
    state, reward, done, info = env.step(action)
    total_reward += reward
    steps+=1
    img = env.render(mode='rgb_array')
    save_image_as_jpg(img, f"q_table_solver_{steps}")
    print(f"Step: {steps}, current reward {round(total_reward,2)}")

img = env.render(mode='rgb_array')
print(f"Total reward during test: {total_reward}")
print(f"Level Completed. Steps: {steps}")

  logger.warn(


0.9 False {'action.name': 'move right', 'action.moved_player': False, 'action.moved_box': False}
-0.1 False {'action.name': 'push right', 'action.moved_player': True, 'action.moved_box': True}
-0.1 False {'action.name': 'no operation', 'action.moved_player': False, 'action.moved_box': False}
-0.1 False {'action.name': 'push up', 'action.moved_player': True, 'action.moved_box': True}
-0.1 False {'action.name': 'no operation', 'action.moved_player': False, 'action.moved_box': False}
-0.1 False {'action.name': 'push up', 'action.moved_player': False, 'action.moved_box': False}
-0.1 False {'action.name': 'push down', 'action.moved_player': True, 'action.moved_box': False}
-0.1 False {'action.name': 'no operation', 'action.moved_player': False, 'action.moved_box': False}
-0.1 False {'action.name': 'push up', 'action.moved_player': True, 'action.moved_box': False}
-0.1 False {'action.name': 'push left', 'action.moved_player': True, 'action.moved_box': False}
-0.1 False {'action.name': 'no op

KeyboardInterrupt: 