In [10]:
!pip install shimmy>=2.0

In [1]:
import gymnasium as gym 
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import VecFrameStack
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_atari_env
import os
from gym import Env
from gym.spaces import Discrete, Box
import numpy as np
import random

In [2]:
import pygame
import sys
import time
import math
import copy

In [3]:
def find_random_zero(array):
    """
    Finds a random zero in a 2D array and returns its position (i, j).
    
    :param array: A 2D array (list of lists).
    :return: Tuple (i, j) where array[i][j] == 0, or None if no zero is found.
    """
    zero_positions = [(i, j) for i in range(len(array)) for j in range(len(array[i])) if array[i][j] == 0]
    
    if zero_positions:
        return random.choice(zero_positions)
    else:
        return None

In [4]:
def NewState(state, action):
    MergedValue = 0
    NumMoved = 0
    if action == 0:
        for i in range(1,4):
            for j in range(0,4):
                if state[i][j] != 0:
                    for k in range(1,5):
                        if i-k >= 0:
                            if state[i-k][j] != 0:
                                if state[i-k][j] == state[i][j]:
                                    NewValue = state[i-k][j] * 2
                                    state[i-k][j] = NewValue
                                    state[i][j] = 0
                                    MergedValue += NewValue
                                    NumMoved += 1
                                else:
                                    if k != 1:
                                        state[i-k+1][j] = state[i][j]
                                        state[i][j] = 0
                                        NumMoved += 1
                                break
                        else:
                            state[0][j] = state[i][j]
                            state[i][j] = 0
                            NumMoved += 1
                            break
                        
    elif action == 1:
        for i in range(0,4):
            for j in range(2,-1,-1):
                if state[i][j] != 0:
                    for k in range(1,5):
                        if j+k <= 3:
                            if state[i][j+k] != 0:
                                if state[i][j+k] == state[i][j]:
                                    NewValue = state[i][j+k] * 2
                                    state[i][j+k] = NewValue
                                    state[i][j] = 0
                                    MergedValue += NewValue
                                    NumMoved += 1
                                else:
                                    if k != 1:
                                        state[i][j+k-1] = state[i][j]
                                        state[i][j] = 0
                                        NumMoved += 1
                                break
                        else:
                            state[i][3] = state[i][j]
                            state[i][j] = 0
                            NumMoved += 1
                            break
    
    elif action == 2:
        for i in range(2,-1,-1):
            for j in range(0,4):
                if state[i][j] != 0:
                    for k in range(1,5):
                        if i+k <=3:
                            if state[i+k][j] != 0:
                                if state[i+k][j] == state[i][j]:
                                    NewValue = state[i+k][j] * 2
                                    state[i+k][j] = NewValue
                                    state[i][j] = 0
                                    MergedValue += NewValue
                                    NumMoved += 1
                                else:
                                    if k != 1:
                                        state[i+k-1][j] = state[i][j]
                                        state[i][j] = 0
                                        NumMoved += 1
                                break
                        else:
                            state[3][j] = state[i][j]
                            state[i][j] = 0
                            NumMoved += 1
                            break
    elif action == 3:
        for i in range(0,4):
                for j in range(1,4):
                    if state[i][j] != 0:
                        for k in range(1,5):
                            if j-k >= 0:
                                if state[i][j-k] != 0:
                                    if state[i][j-k] == state[i][j]:
                                        NewValue = state[i][j-k] * 2
                                        state[i][j-k] = NewValue
                                        state[i][j] = 0
                                        MergedValue += NewValue
                                        NumMoved += 1
                                    else:
                                        if k != 1:
                                            state[i][j-k+1] = state[i][j]
                                            state[i][j] = 0
                                            NumMoved += 1
                                    break
                            else:
                                state[i][0] = state[i][j]
                                state[i][j] = 0
                                NumMoved += 1
                                break
    
    return state, MergedValue, NumMoved

In [5]:
class GameEnv(Env):
    def __init__(self):

        self.action_space = Discrete(4)
    
        self.observation_space = Box(low=0, high=1, shape=(4,4,12), dtype=np.int32)
 
        
        self.state = []
    
        
        self.largest =  0
        
    def step(self, action):
        # Apply action
  
        OldState = copy.deepcopy(self.state)
        
        self.state, MergedValue, NumMoved = NewState(self.state, action)

        random_zero_position = find_random_zero(self.state)
        if random_zero_position:
            if random.random() < 0.9:
                num = 2
            else:
                num = 4
            i, j = random_zero_position
            self.state[i][j] = num
        # Calculate reward
        
        CurrMax = max(map(max, self.state))
        
        if CurrMax > self.largest: 
            reward = 1
            self.largest = CurrMax
        else: 
            reward = 0
            
        reward += MergedValue
        reward -= NumMoved
        
       
        if OldState == self.state: 
            done = True
        else:
            done = False
        
        info = {}
        
        # Return step information
        
        obs = copy.deepcopy(self.state)
        
        for i in range(4):
            for j in range(4):
                obs[i][j] = [0] * 12
                if self.state[i][j] != 0:
                    index = int(math.log2(self.state[i][j]))
                    obs[i][j][index] = 1
                else:
                    obs[i][j][0] = 1

        
        return obs, reward, done, info

    def render(self):
        
        pygame.init()
        width, height = 400, 400
        cell_size = width // 4
        
        screen = pygame.display.set_mode((width, height))
        pygame.display.set_caption("2048 Game")
        
        screen.fill((255, 255, 255))
        
        for row in range(4):
            for col in range(4):
                rect = pygame.Rect(col * cell_size, row * cell_size, cell_size, cell_size)
                pygame.draw.rect(screen, (0, 0, 0), rect, 3)  # Draw the grid line
                font = pygame.font.Font(None, 36)
                text = font.render(str(self.state[row][col]), True, (0, 0, 0))
                text_rect = text.get_rect(center=(col * cell_size + cell_size // 2, row * cell_size + cell_size // 2))
                screen.blit(text, text_rect)

        # Update the display
        pygame.display.flip()
    
    def reset(self):
        # Reset board
        InitTable = [
            [0,0,0,0],
            [0,0,0,0],
            [0,0,0,0],
            [0,0,0,0]
                    ]
        
        numbers1 = [0,1,2,3]
        numbers2 = [0,1,2,3]
        
        r1 = random.randint(0,3)
        r2 = random.randint(0,3)
        
        if random.random() < 0.9:
            value1 = 2
        else:
            value1 = 4
        
        InitTable[r1][r2] = value1
        
        numbers1.remove(r1)
        numbers2.remove(r2)
        
        r1 = random.randint(0,2)
        r2 = random.randint(0,2)
        
        if random.random() < 0.9:
            value2 = 2
        else:
            value2 = 4
        
        InitTable[numbers1[r1]][numbers2[r2]] = value2
        
        self.state = InitTable
        # Reset shower time
        self.largest =  max(map(max, self.state))
        
        obs = copy.deepcopy(self.state)
        
        for i in range(4):
            for j in range(4):
                obs[i][j] = [0] * 12
                if self.state[i][j] != 0:
                    index = int(math.log2(self.state[i][j]))
                    obs[i][j][index] = 1
                else:
                    obs[i][j][0] = 1
        
        return obs
    
    def close(self):
        pygame.quit()
        sys.exit()

In [7]:
env = GameEnv()

In [8]:
model = PPO("MlpPolicy", env, verbose=1, )

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.




In [24]:
model.learning_rate

0.0003

In [11]:
model.get_parameters()

{'policy': OrderedDict([('mlp_extractor.policy_net.0.weight',
               tensor([[ 0.1091,  0.2359, -0.0654,  ..., -0.1436,  0.0510,  0.0608],
                       [ 0.0281,  0.1331,  0.1148,  ..., -0.3203, -0.2780,  0.1374],
                       [ 0.0349,  0.1665,  0.2267,  ...,  0.2661,  0.0371,  0.1629],
                       ...,
                       [ 0.0338,  0.0635, -0.1941,  ..., -0.0524, -0.4068,  0.1326],
                       [ 0.0536, -0.0183, -0.0860,  ..., -0.1782, -0.2284,  0.0413],
                       [ 0.0860,  0.0530,  0.1379,  ...,  0.1261,  0.2125, -0.0694]])),
              ('mlp_extractor.policy_net.0.bias',
               tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
                       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
                       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])),
              ('mlp

In [26]:
model.learn(total_timesteps=20000000)

NameError: name 'model' is not defined

In [11]:
episodes = 1000
totalScore = 0
for episode in range(1, episodes+1):
    state = env.reset()
    done = False
    score = 0
    
    while not done:
        env.render()
        time.sleep(0.1)
        action, _ = model.predict(state, deterministic=False)
        state, reward, done, info = env.step(action)
        score += reward

    totalScore += score

print(totalScore/1000)

KeyboardInterrupt: 

In [None]:
episodes = 1000
totalScore = 0
for episode in range(1, episodes+1):
    state = env.reset()
    done = False
    score = 0
    
    while not done:
        action, _ = model.predict(state, deterministic=False)
        state, reward, done, info = env.step(action)
        score += reward

    totalScore += score

print(totalScore/1000)

In [None]:
episodes = 1000
totalScore = 0
for episode in range(1, episodes+1):
    state = env.reset()
    done = False
    score = 0
    
    while not done:
   
        action, _ = model.predict(state, deterministic=False)
        state, reward, done, info = env.step(action)
        score += reward

    totalScore += score

print(totalScore/1000)


KeyboardInterrupt



In [9]:
PPOpath = os.path.join('SavedModels', 'PPO2048Second')

In [10]:
model.save(PPOpath)

In [10]:
model = PPO.load(PPOpath, env)

Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


Exception: code() takes at most 16 arguments (18 given)
Exception: code() takes at most 16 arguments (18 given)


In [13]:
episodes = 1000
totalScore = 0
for episode in range(1, episodes+1):
    state = env.reset()
    done = False
    score = 0
    
    while not done:
        env.render()
        time.sleep(0.1)
        action = env.action_space.sample()
        state, reward, done, info = env.step(action)
        score += reward

    totalScore += score

print(totalScore/1000)

KeyboardInterrupt: 

In [42]:
vec_env = model.get_env()
obs = vec_env.reset()
while True:
    action, _states = model.predict(obs, deterministic=False)
    obs, rewards, dones, info = vec_env.step(action)

[[[0 0 0 0 0 0 0 0 0 0 0]
  [0 0 0 0 0 0 0 0 0 0 0]
  [0 0 0 0 0 0 0 0 0 0 0]
  [0 0 0 0 0 0 0 0 0 0 0]
  [0 0 0 0 0 0 0 0 0 0 0]
  [0 0 0 0 0 0 0 0 0 0 0]
  [0 0 0 0 0 0 0 0 0 0 0]
  [0 0 0 0 0 0 0 0 0 0 0]
  [0 0 0 0 0 0 0 0 0 0 0]
  [0 0 0 0 0 0 0 0 0 0 0]
  [0 0 0 0 0 0 0 0 0 0 0]
  [0 0 0 0 0 0 0 0 0 0 0]
  [0 0 0 0 0 0 0 0 0 0 0]
  [0 0 0 0 0 0 0 0 0 0 0]
  [0 0 0 0 0 0 0 0 0 0 0]
  [0 0 0 0 0 0 0 0 0 0 0]]]
[[[0 0 0 0 0 0 0 0 0 0 0]
  [0 0 0 0 0 0 0 0 0 0 0]
  [0 0 0 0 0 0 0 0 0 0 0]
  [0 0 0 0 0 0 0 0 0 0 0]
  [0 0 0 0 0 0 0 0 0 0 0]
  [0 0 0 0 0 0 0 0 0 0 0]
  [0 0 0 0 0 0 0 0 0 0 0]
  [0 0 0 0 0 0 0 0 0 0 0]
  [0 0 0 0 0 0 0 0 0 0 0]
  [0 0 0 0 0 0 0 0 0 0 0]
  [0 0 0 0 0 0 0 0 0 0 0]
  [0 0 0 0 0 0 0 0 0 0 0]
  [0 0 0 0 0 0 0 0 0 0 0]
  [0 0 0 0 0 0 0 0 0 0 0]
  [0 0 0 0 0 0 0 0 0 0 0]
  [0 0 0 0 0 0 0 0 0 0 0]]]
[[[0 0 0 0 0 0 0 0 0 0 0]
  [0 0 0 0 0 0 0 0 0 0 0]
  [0 0 0 0 0 0 0 0 0 0 0]
  [0 0 0 0 0 0 0 0 0 0 0]
  [0 0 0 0 0 0 0 0 0 0 0]
  [0 0 0 0 0 0 0 0 0 0 0]
  [0 0 0

KeyboardInterrupt: 

In [47]:
env.close()

SystemExit: 