# RF approach to play the wordle

In [1]:
import gym
import gym_wordle
import random
import numpy as np
from gym_wordle.utils import to_array, to_english
from stable_baselines3 import DQN, A2C, PPO
from stable_baselines3.common.env_util import make_vec_env

In [2]:
# https://stackoverflow.com/questions/51002045/how-to-make-jupyter-notebook-to-run-on-gpu
# https://www.techentice.com/how-to-make-jupyter-notebook-to-run-on-gpu/
import tensorflow as tf
tf.test.is_built_with_cuda()
# https://stackoverflow.com/questions/57814535/assertionerror-torch-not-compiled-with-cuda-enabled-in-spite-upgrading-to-cud
# https://stackoverflow.com/questions/57238344/i-have-a-gpu-and-cuda-installed-in-windows-10-but-pytorchs-torch-cuda-is-availa
import torch
print("torch.cuda.is_available():    {}".format(torch.cuda.is_available()))
print("torch.cuda.current_device():  {}".format(torch.cuda.current_device()))

torch.cuda.is_available():    True
torch.cuda.current_device():  0


## Testing the environment

In [3]:
env = gym.make("Wordle-v0")
env = env.unwrapped 
env.reset()
env.action_space

Discrete(12972)

In [4]:
env.observation_space
# character flag codes
# no_char = 0
# right_pos = 1
# wrong_pos = 2
# wrong_char = 3


Box([[0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]], [[26 26 26 26 26  4  4  4  4  4]
 [26 26 26 26 26  4  4  4  4  4]
 [26 26 26 26 26  4  4  4  4  4]
 [26 26 26 26 26  4  4  4  4  4]
 [26 26 26 26 26  4  4  4  4  4]
 [26 26 26 26 26  4  4  4  4  4]], (6, 10), int64)

In [2]:
env = gym.make("Wordle-v0")
env.reset()
solution = env.solution
print(to_english(env.solution_space[solution]),solution)
# act = env.action_space.sample()
# step = env.step(env.solution)
word = "cocky"
act = env.unwrapped.action_space.index_of(to_array(word))
obs, reward, done, _ = env.step(act)
print(obs)
print("Reward: {}".format(reward))
act = env.unwrapped.action_space.index_of(to_array(word))
obs, reward, done, _ = env.step(act)
print(obs)
print("Reward: {}".format(reward))
act = env.unwrapped.action_space.index_of(to_array(word))
obs, reward, done, _ = env.step(act)
print(obs)
print("Reward: {}".format(reward))

growl 924
[[3 2 3 3 3 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]]
Reward: 2
[[3 2 3 3 3 0 0 0 0 0]
 [3 2 3 3 3 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]]
Reward: -5
[[3 2 3 3 3 0 0 0 0 0]
 [3 2 3 3 3 0 0 0 0 0]
 [3 2 3 3 3 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]]
Reward: -10


## Random solving

In [14]:
# bazowe uczenie (na farta)
env = gym.make('Wordle-v0')

success = 0
i = 0
limit = 100_000
reward = -1

while True:

    if i % 10_000 == 0:
        print("Iteration: {}".format(i))
    if i == limit:
        break

    env.reset()
    done = False
    while not done:

        # make a random guess
        act = env.action_space.sample()

        # take a step
        obs, reward, done, info = env.step(act)

    if reward > 30:
        env.render()
        success += 1
        
    i += 1

env.close()
print("========== Summary ==========")
print("Number of successes: {}".format(success))
print("Success rate: {}%".format(success/i))

Iteration: 0
Iteration: 10000
Iteration: 20000
Iteration: 30000
Iteration: 40000
Iteration: 50000
Iteration: 60000
Iteration: 70000
Iteration: 80000
Iteration: 90000
Iteration: 100000
Number of successes: 0
Success rate: 0.0%


In [2]:
def play_single_game(model):
    env = gym.make("Wordle-v0")
    obs = env.reset()

    print("Solution:",to_english(env.solution_space[env.solution]))

    done = False
    while not done:
        action, _states = model.predict(obs)
        print("Word:",to_english(env.action_space[action]))
        obs, rewards, done, info = env.step(action)
        print("Reward: {}".format(rewards))
        env.render()

def play_n_games(model, n: int, threshold: int):
    env = gym.make('Wordle-v0')

    success = 0
    i = 0
    limit = n

    while True:

        if i % 10_000 == 0:
            print("Iteration: {}".format(i))
        if i == limit:
            break

        obs = env.reset()
        done = False
        
        while not done:
            action, _states = model.predict(obs)
            obs, reward, done, info = env.step(action)

        if reward > threshold:
            # env.render()
            success += 1
            
        i += 1

    env.close()
    print("========== Summary ==========")
    print("Number of successes: {}".format(success))
    print("Success rate: {}%".format(success/i))

## Deep Q Network

In [2]:
# https://stable-baselines3.readthedocs.io/en/master/modules/dqn.html
# stable_baselines3.dqn.MlpPolicy - Policy class with Q-Value Net and target net for DQN
# https://www.reddit.com/r/reinforcementlearning/comments/gkbt45/how_do_you_decide_the_discount_factor/

# Parallel environments
def train_DQN(epoch: int):
    # Create a wrapped, monitored VecEnv
    # https://stable-baselines3.readthedocs.io/en/master/common/env_util.html
    env = make_vec_env("Wordle-v0", n_envs=10)

    """Deep Q-Network (DQN)

    Default hyperparameters are taken from the Nature paper,
    except for the optimizer and learning rate
    that were taken from Stable Baselines defaults.

    Paper: https://arxiv.org/abs/1312.5602, https://www.nature.com/articles/nature14236
    """

    model = DQN(
        policy="MlpPolicy",          # the policy model to use (MlpPolicy, CnnPolicy, MultiInputPolicy)
        env=env,                     # the environment to learn from (if registered in Gym, can be str)
        learning_rate=0.02,          # the learning rate (alpha), it can be a function of the current progress remaining (from 1 to 0)
        buffer_size=10000,           # size of the replay buffer
        learning_starts=epoch//100,  # how many steps of the model to collect transitions for before learning starts
        gamma=0.05,                  # the discount factor
        train_freq=4,                # update the model every train_freq steps
        target_update_interval=1000, # update the target network every target_update_interval environment steps
        exploration_fraction=0.75,   # fraction of entire training period over which the exploration rate is reduced (epsilon_start)
        exploration_final_eps=0.3,   # final value of random action probability (epsilon_end)
        verbose=1,                   # 0 for no output, 1 for info messages, 2 for debug messages
        device='cuda',               # device (cpu, cuda, …) on which the code should be run
    )
    try:
        
        # Return a trained model
        model.learn(
            total_timesteps=epoch,   # the total number of samples (env steps) to train on
            log_interval=epoch//1000 # the number of timesteps before logging.
        )
    except KeyboardInterrupt:
        pass
    # Save all the attributes of the object and the model parameters in a zip-file
    model.save("wordle_dqn")

    return model

train_DQN(15_000_000)
# CPU => 8m 33.7s (1_000_000);
# CUDA => 3m 35.7s (1_000_000);
# CUDA => 35m 22.1s (10_000_000);
# CUDA => 62m 54.9s (15_000_000);
# CUDA => 85m 13.6s (25_000_000); # przy oryginalnych wagach - przeuczenie (znaczne)

Using cuda device
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 6        |
|    ep_rew_mean      | -3.81    |
|    exploration_rate | 0.994    |
| time/               |          |
|    episodes         | 15000    |
|    fps              | 11488    |
|    time_elapsed     | 7        |
|    total_timesteps  | 90000    |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 6        |
|    ep_rew_mean      | -2.4     |
|    exploration_rate | 0.989    |
| time/               |          |
|    episodes         | 30000    |
|    fps              | 9576     |
|    time_elapsed     | 18       |
|    total_timesteps  | 179980   |
| train/              |          |
|    learning_rate    | 0.02     |
|    loss             | 6.16     |
|    n_updates        | 749      |
----------------------------------
----------------------------------
| rollout/            |          |
| 

<stable_baselines3.dqn.dqn.DQN at 0x24c3e449f90>

In [15]:
# Load the model from a zip-file.
model = DQN.load("wordle_dqn_15_000_000", device='cuda')
# Warning: load re-creates the model from scratch,
# it does not update it in-place!
# For an in-place load use set_parameters instead.
model

<stable_baselines3.dqn.dqn.DQN at 0x2102fc4dc90>

In [16]:
play_single_game(model)

Solution: adorn
Word: roopy
Reward: 9
BBACC
     
     
     
     
     
Word: goors
Reward: 12
BBACC
CBAAC
     
     
     
     
Word: lomed
Reward: 4
BBACC
CBAAC
CBCCB
     
     
     
Word: mikra
Reward: 7
BBACC
CBAAC
CBCCB
CCCAB
     
     
Word: lomed
Reward: -5
BBACC
CBAAC
CBCCB
CCCAB
CBCCB
     
Word: tesla
Reward: -35
BBACC
CBAAC
CBCCB
CCCAB
CBCCB
CCCCB


In [8]:
play_n_games(model, n=100_000, threshold=100)

Iteration: 0
Iteration: 10000
Iteration: 20000
Iteration: 30000
Iteration: 40000
Iteration: 50000
Iteration: 60000
Iteration: 70000
Iteration: 80000
Iteration: 90000
Iteration: 100000
Number of successes: 14
Success rate: 0.00014%


## Advantage Actor Critic

In [2]:
# https://stable-baselines3.readthedocs.io/en/master/modules/a2c.html
def train_A2C():
    # Parallel environments
    env = make_vec_env("Wordle-v0", n_envs=2)

    model = A2C("MlpPolicy", env, verbose=1)
    model.learn(total_timesteps=1e7, log_interval=1000)
    model.save("wordle_a2c")
    return model

train_A2C()
# CUDA => 33m 39.3s (1_000_000);

Using cuda device
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 6        |
|    ep_rew_mean        | -17.8    |
| time/                 |          |
|    fps                | 469      |
|    iterations         | 1000     |
|    time_elapsed       | 21       |
|    total_timesteps    | 10000    |
| train/                |          |
|    entropy_loss       | -9.33    |
|    explained_variance | 0.643    |
|    learning_rate      | 0.0007   |
|    n_updates          | 999      |
|    policy_loss        | -24.2    |
|    value_loss         | 14.5     |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 6        |
|    ep_rew_mean        | -17.9    |
| time/                 |          |
|    fps                | 477      |
|    iterations         | 2000     |
|    time_elapsed       | 41       |
|    total_timesteps    | 20000    |
| train/            

<stable_baselines3.a2c.a2c.A2C at 0x25e3596a5f0>

In [18]:
model = A2C.load("wordle_a2c")
model

<stable_baselines3.a2c.a2c.A2C at 0x2103ccceb60>

In [19]:
play_single_game(model)

Solution: drawl
Word: strae
Reward: 4
CCBBC
     
     
     
     
     
Word: arret
Reward: 7
CCBBC
BACCC
     
     
     
     
Word: alien
Reward: 4
CCBBC
BACCC
BBCCC
     
     
     
Word: shear
Reward: 4
CCBBC
BACCC
BBCCC
CCCBB
     
     
Word: morae
Reward: 4
CCBBC
BACCC
BBCCC
CCCBB
CCBBC
     
Word: siren
Reward: -35
CCBBC
BACCC
BBCCC
CCCBB
CCBBC
CCBCC


In [5]:
play_n_games(model, n=100_000, threshold=100)

Iteration: 0
Iteration: 10000
Iteration: 20000
Iteration: 30000
Iteration: 40000
Iteration: 50000
Iteration: 60000
Iteration: 70000
Iteration: 80000
Iteration: 90000
Iteration: 100000
Number of successes: 185
Success rate: 0.00185%


## Proximal Policy Optimization

In [3]:
# https://stable-baselines3.readthedocs.io/en/master/modules/ppo.html
def train_PPO():
    # Parallel environments
    env = make_vec_env("Wordle-v0", n_envs=4)

    model = PPO("MlpPolicy", env, verbose=1)
    model.learn(total_timesteps=1e7, log_interval=1000)
    model.save("wordle_ppo")
    return model

train_PPO()
# CUDA => 28m 26.8s (1_000_000);

Using cuda device


<stable_baselines3.ppo.ppo.PPO at 0x224c5606ef0>

In [4]:
model = PPO.load("wordle_ppo")
model

<stable_baselines3.ppo.ppo.PPO at 0x224b8dcaf80>

In [8]:
play_single_game(model)

Solution: endow
Word: trail
Reward: 0
CCCCC
     
     
     
     
     
Word: saine
Reward: 4
CCCCC
CCCBB
     
     
     
     
Word: pored
Reward: 6
CCCCC
CCCBB
CBCBB
     
     
     
Word: soler
Reward: 4
CCCCC
CCCBB
CBCBB
CBCBC
     
     
Word: crone
Reward: 6
CCCCC
CCCBB
CBCBB
CBCBC
CCBBB
     
Word: hutch
Reward: -35
CCCCC
CCCBB
CBCBB
CBCBC
CCBBB
CCCCC


In [6]:
play_n_games(model, n=100_000, threshold=100)

Iteration: 0
Iteration: 10000
Iteration: 20000
Iteration: 30000
Iteration: 40000
Iteration: 50000
Iteration: 60000
Iteration: 70000
Iteration: 80000
Iteration: 90000
Iteration: 100000
Number of successes: 370
Success rate: 0.0037%
